Tue, 6 Apr 2004 03:11:41 -0700

Update of /cvs/core/icu-sword/source/common
In directory www:/tmp/cvs-serv8911/source/common

Modified Files:
	Makefile.in brkiter.cpp caniter.cpp chariter.cpp charstr.h 
	cmemory.c cmemory.h common.dsp common.rc common.vcproj 
	cstring.c cstring.h dbbi.cpp iculserv.cpp iculserv.h 
	icunotif.cpp icunotif.h icuserv.cpp icuserv.h locid.cpp 
	locmap.c locmap.h mutex.h normlzr.cpp propname.cpp propname.h 
	putil.c rbbi.cpp rbbicst.pl rbbidata.cpp rbbidata.h 
	rbbinode.cpp rbbinode.h rbbirb.cpp rbbirb.h rbbirpt.h 
	rbbirpt.txt rbbiscan.cpp rbbiscan.h rbbisetb.cpp rbbisetb.h 
	rbbistbl.cpp rbbitblb.cpp rbbitblb.h resbund.cpp schriter.cpp 
	sprpimpl.h uassert.h ubidi.c ubidiln.c ubrk.cpp uchar.c 
	uchriter.cpp ucln_cmn.c ucln_cmn.h ucmndata.c ucmndata.h 
	ucmp8.c ucnv.c ucnv2022.c ucnv_bld.c ucnv_bld.h ucnv_cb.c 
	ucnv_cnv.c ucnv_cnv.h ucnv_imp.h ucnv_io.c ucnv_io.h 
	ucnv_lmb.c ucnv_u16.c ucnv_u32.c ucnv_u7.c ucnv_u8.c 
	ucnvbocu.c ucnvhz.c ucnvisci.c ucnvlat1.c ucnvmbcs.c 
	ucnvmbcs.h ucnvscsu.c udata.c udatamem.c udatamem.h uhash.c 
	uhash.h uidna.cpp uiter.cpp uloc.c umapfile.c umutex.c 
	umutex.h unames.c unifilt.cpp unifunct.cpp uniset.cpp 
	unistr.cpp unorm.cpp unormimp.h uobject.cpp uprops.c uprops.h 
	uresbund.c uresdata.c uresdata.h uresimp.h uset.cpp 
	usetiter.cpp ustr_imp.h ustrenum.cpp ustring.c ustrtrns.c 
	util.cpp utrie.c utrie.h uvector.cpp uvector.h uvectr32.cpp 
	uvectr32.h 
Added Files:
	UTRACIMP.H locbased.cpp locbased.h msvcres.h parsepos.cpp 
	ruleiter.cpp ruleiter.h uarrsort.c uarrsort.h ucnv_ext.c 
	ucnv_ext.h ucol_swp.c ucol_swp.h udataswp.c udataswp.h 
	usprep.cpp ustack.cpp utrace.c 
Removed Files:
	digitlst.cpp digitlst.h mutex.cpp nameprep.cpp nameprep.h 
	strprep.cpp strprep.h symtable.h 
Log Message:
ICU 2.8 sync

--- NEW FILE: UTRACIMP.H ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utracimp.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003aug06
*   created by: Markus W. Scherer
*
*   Internal header for ICU tracing/logging.
*
*
*   Various notes:
*   - using a trace level variable to only call trace functions
*     when the level is sufficient
*   - using the same variable for tracing on/off to never make a function
*     call when off
*   - the function number is put into a local variable by the entry macro
*     and used implicitly to avoid copy&paste/typing mistakes by the developer
*   - the application must call utrace_setFunctions() and pass in
*     implementations for the trace functions
*   - ICU trace macros call ICU functions that route through the function
*     pointers if they have been set;
*     this avoids an indirection at the call site
*     (which would cost more code for another check and for the indirection)
*
*   ### TODO Issues:
*   - Verify that va_list is portable among compilers for the same platform.
*     va_list should be portable because printf() would fail otherwise!
*   - Should enum values like UTraceLevel be passed into int32_t-type arguments,
*     or should enum types be used?
*/

#ifndef __UTRACIMP_H__
#define __UTRACIMP_H__

#include "unicode/utrace.h"
#include <stdarg.h>

U_CDECL_BEGIN

/**
 * \var utrace_level
 * Trace level variable. Negative for "off".
 * Use only via UTRACE_ macros.
 *
 * The linkage specifier in front of the declaration is selected by the
 * preprocessor: U_EXPORT when UTRACE_IMPL is defined, plain U_CFUNC when
 * U_COMMON_IMPLEMENTATION is non-zero, and U_CFUNC U_IMPORT otherwise.
 * NOTE(review): presumably UTRACE_IMPL is defined only by the file that
 * defines the variable -- confirm against utrace.c.
 * @internal
 */
#ifdef UTRACE_IMPL
U_EXPORT int32_t
#elif U_COMMON_IMPLEMENTATION
U_CFUNC int32_t
#else
U_CFUNC U_IMPORT int32_t
#endif
utrace_level;


/**
 *   Traced Function Exit return types.
 *   Flags indicating the number and types of varargs included in a call
 *   to utrace_exit().
 *   Bits 0-3:  The function return type (extract with UTRACE_EXITV_MASK).
 *              If not UTRACE_EXITV_NONE, the return value is the first
 *              variable parameter.
 *   Bit    4:  Flag for presence of a UErrorCode status param
 *              (UTRACE_EXITV_STATUS).
 *   @internal
 */
typedef enum UTraceExitVal {
    /** The traced function returns no value  @internal */
    UTRACE_EXITV_NONE   = 0,
    /** The traced function returns an int32_t, or compatible, type.  @internal */
    UTRACE_EXITV_I32    = 1,
    /** The traced function returns a pointer  @internal */
    UTRACE_EXITV_PTR    = 2,
    /** The traced function returns a UBool  @internal */
    UTRACE_EXITV_BOOL   = 3,
    /** Mask to extract the return type values from a UTraceExitVal  @internal */
    UTRACE_EXITV_MASK   = 0xf,
    /** Bit indicating that the traced function includes a UErrorCode parameter  @internal */
    UTRACE_EXITV_STATUS = 0x10
} UTraceExitVal;

/**
 * Trace function for the entry point of a function.
 * Do not use directly, use UTRACE_ENTRY or UTRACE_ENTRY_OC instead; those
 * macros call this function only when utrace_level is high enough.
 * @param fnNumber The UTraceFunctionNumber for the current function.
 * @internal
 */
U_CAPI void U_EXPORT2
utrace_entry(int32_t fnNumber);

/**
 * Trace function for each exit point of a function.
 * Do not use directly, use UTRACE_EXIT* instead.
 * @param fnNumber The UTraceFunctionNumber for the current function.
 * @param returnType A combination of UTraceExitVal flags describing the
 *        variable arguments that follow: the type of the value returned by
 *        the function (bits 0-3) and whether a UErrorCode is included
 *        (UTRACE_EXITV_STATUS).
 * @param ... The return value (unless the type is UTRACE_EXITV_NONE),
 *        followed by the UErrorCode value at function exit if
 *        UTRACE_EXITV_STATUS is set. See the UTRACE_EXIT* macros.
 * @internal
 */
U_CAPI void U_EXPORT2
utrace_exit(int32_t fnNumber, int32_t returnType, ...);


/**
 * Trace function used inside functions that have a UTRACE_ENTRY() statement.
 * Do not use directly, use the UTRACE_DATA0() .. UTRACE_DATA9() macros instead.
 *
 * @param utraceFnNumber The number of the current function, from the local
 *        variable of the same name (with the UTRACE_TRACED_ENTRY flag bit
 *        already masked off by the macros).
 * @param level The trace level for this message.
 * @param fmt The trace format string.
 * @param ... The data items for the inserts in the format string.
 *
 * @internal
 */
U_CAPI void U_EXPORT2
utrace_data(int32_t utraceFnNumber, int32_t level, const char *fmt, ...);

U_CDECL_END

#if U_ENABLE_TRACING

/**
 * Boolean expression to see if ICU tracing is turned on
 * to at least the specified level.
 * Compares the global utrace_level against the argument; no function call
 * is made.
 * @param level The trace level to test for.
 * @internal
 */
#define UTRACE_LEVEL(level) (utrace_level>=(level))

/**
  *  Flag bit in utraceFnNumber, the local variable that is added to each
  *  function with tracing code and that contains the function number.
  *
  *  The flag is set if the function's entry is traced, which will cause the
  *  function's exit to also be traced.  utraceFnNumber is unconditionally
  *  set at entry, whether or not the entry is traced, so that it will
  *  always be available for error trace output.
  *  @internal
  */
#define UTRACE_TRACED_ENTRY 0x80000000

/**
 * Trace statement for the entry point of a function.
 * Stores the function number in a local variable.
 * In C code, must be placed immediately after the last variable declaration.
 * Must be matched with UTRACE_EXIT() at all function exit points.
 *
 * Tracing should start with UTRACE_ENTRY after checking for
 * U_FAILURE at function entry, so that if a function returns immediately
 * because of a pre-existing error condition, it does not show up in the trace,
 * consistent with ICU's error handling model.
 *
 * The macro argument is expanded exactly once (into the initializer of
 * utraceFnNumber); the call to utrace_entry() reuses the local variable,
 * before the UTRACE_TRACED_ENTRY flag is OR-ed into it.
 *
 * @param fnNumber The UTraceFunctionNumber for the current function.
 * @internal
 */
#define UTRACE_ENTRY(fnNumber) \
    int32_t utraceFnNumber=(fnNumber); \
    if(utrace_level>=UTRACE_INFO) { \
        utrace_entry(utraceFnNumber); \
        utraceFnNumber |= UTRACE_TRACED_ENTRY; \
    }


/**
 * Trace statement for the entry point of open and close functions.
 * Produces trace output at a less verbose setting than plain UTRACE_ENTRY
 * (the entry is traced when utrace_level>=UTRACE_OPEN_CLOSE).
 * Stores the function number in a local variable.
 * In C code, must be placed immediately after the last variable declaration.
 * Must be matched with UTRACE_EXIT() at all function exit points.
 *
 * The macro argument is expanded exactly once (into the initializer of
 * utraceFnNumber); the call to utrace_entry() reuses the local variable,
 * before the UTRACE_TRACED_ENTRY flag is OR-ed into it.
 *
 * @param fnNumber The UTraceFunctionNumber for the current function.
 * @internal
 */
#define UTRACE_ENTRY_OC(fnNumber) \
    int32_t utraceFnNumber=(fnNumber); \
    if(utrace_level>=UTRACE_OPEN_CLOSE) { \
        utrace_entry(utraceFnNumber); \
        utraceFnNumber |= UTRACE_TRACED_ENTRY; \
    }

/**
 * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
 * statement, for functions that report neither a return value nor a
 * UErrorCode to the trace.
 *
 * Takes no arguments. Calls utrace_exit() with UTRACE_EXITV_NONE, and only
 * if the function's entry was traced (UTRACE_TRACED_ENTRY is set in
 * utraceFnNumber).
 *
 * @internal
 */
#define UTRACE_EXIT() \
    {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \
        utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_NONE); \
    }}

/**
 * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
 * statement, and that returns a value.
 * Calls utrace_exit() with UTRACE_EXITV_I32 if the function's entry was traced.
 *
 * @param val       The function's return value, int32_t or compatible type.
 *
 * @internal 
 */
#define UTRACE_EXIT_VALUE(val) \
    {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \
        utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_I32, val); \
    }}

/**
 * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
 * statement, and that reports a UErrorCode but no return value.
 * Calls utrace_exit() with UTRACE_EXITV_STATUS if the function's entry was
 * traced.
 *
 * @param status    The function's ICU UErrorCode value at function exit.
 *                  0==U_ZERO_ERROR indicates success,
 *                  positive values an error (see u_errorName()),
 *                  negative values an informational status.
 *
 * @internal
 */
#define UTRACE_EXIT_STATUS(status) \
    {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \
        utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, UTRACE_EXITV_STATUS, status); \
    }}

/**
 * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
 * statement, and that returns a value and reports a UErrorCode.
 * Calls utrace_exit() with (UTRACE_EXITV_I32 | UTRACE_EXITV_STATUS) if the
 * function's entry was traced; the value is passed before the status.
 *
 * @param val       The function's return value, int32_t or compatible type.
 * @param status    The function's ICU UErrorCode value at function exit.
 *
 * @internal
 */
#define UTRACE_EXIT_VALUE_STATUS(val, status) \
    {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \
        utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (UTRACE_EXITV_I32 | UTRACE_EXITV_STATUS), val, status); \
    }}

/**
 * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
 * statement, and that returns a pointer and reports a UErrorCode.
 * Calls utrace_exit() with (UTRACE_EXITV_PTR | UTRACE_EXITV_STATUS) if the
 * function's entry was traced; the pointer is passed before the status.
 *
 * @param ptr       The function's return value, a pointer.
 * @param status    The function's ICU UErrorCode value at function exit.
 *
 * @internal
 */
#define UTRACE_EXIT_PTR_STATUS(ptr, status) \
    {if(utraceFnNumber & UTRACE_TRACED_ENTRY) { \
        utrace_exit(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (UTRACE_EXITV_PTR | UTRACE_EXITV_STATUS), ptr, status); \
    }}

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes no data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @internal
 */
#define UTRACE_DATA0(level, fmt) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes one data argument.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a     The data argument, passed to utrace_data() after fmt.
 * @internal
 */
#define UTRACE_DATA1(level, fmt, a) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY , (level), (fmt), (a)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes two data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b   The data arguments, passed to utrace_data() after fmt in
 *              this order.
 * @internal
 */
#define UTRACE_DATA2(level, fmt, a, b) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY , (level), (fmt), (a), (b)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes three data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c The data arguments, passed to utrace_data() after fmt in
 *              this order.
 * @internal
 */
#define UTRACE_DATA3(level, fmt, a, b, c) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes four data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d The data arguments, passed to utrace_data() after fmt in
 *              this order.
 * @internal
 */
#define UTRACE_DATA4(level, fmt, a, b, c, d) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes five data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d,e The data arguments, passed to utrace_data() after fmt in
 *              this order.
 * @internal
 */
#define UTRACE_DATA5(level, fmt, a, b, c, d, e) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes six data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d,e,f The data arguments, passed to utrace_data() after fmt
 *              in this order.
 * @internal
 */
#define UTRACE_DATA6(level, fmt, a, b, c, d, e, f) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes seven data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d,e,f,g The data arguments, passed to utrace_data() after fmt
 *              in this order.
 * @internal
 */
#define UTRACE_DATA7(level, fmt, a, b, c, d, e, f, g) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes eight data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d,e,f,g,h The data arguments, passed to utrace_data() after
 *              fmt in this order.
 * @internal
 */
#define UTRACE_DATA8(level, fmt, a, b, c, d, e, f, g, h) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g), (h)); \
    }

/**
 * Trace statement used inside functions that have a UTRACE_ENTRY() statement.
 * Takes nine data arguments.
 * The number of arguments for this macro must match the number of inserts
 * in the format string. Vector inserts count as two arguments.
 * Calls utrace_data() if the level is high enough.
 * The level argument appears twice in the expansion (once in the test, once
 * in the call), so it should not have side effects.
 * @param level The trace level for this message.
 * @param fmt   The trace format string.
 * @param a,b,c,d,e,f,g,h,i The data arguments, passed to utrace_data() after
 *              fmt in this order.
 * @internal
 */
#define UTRACE_DATA9(level, fmt, a, b, c, d, e, f, g, h, i) \
    if(UTRACE_LEVEL(level)) { \
        utrace_data(utraceFnNumber & ~UTRACE_TRACED_ENTRY, (level), (fmt), (a), (b), (c), (d), (e), (f), (g), (h), (i)); \
    }

#else

/*
 * When tracing is disabled (U_ENABLE_TRACING is zero), the following macros
 * become empty, so that trace statements expand to nothing and their
 * arguments are never evaluated.
 * UTRACE_LEVEL() is the exception: it expands to the constant 0 so that it
 * can still be used as a boolean expression.
 */

#define UTRACE_LEVEL(level) 0
#define UTRACE_ENTRY(fnNumber)
#define UTRACE_ENTRY_OC(fnNumber)
#define UTRACE_EXIT()
#define UTRACE_EXIT_VALUE(val)
#define UTRACE_EXIT_STATUS(status)
#define UTRACE_EXIT_VALUE_STATUS(val, status)
#define UTRACE_EXIT_PTR_STATUS(ptr, status)
#define UTRACE_DATA0(level, fmt)
#define UTRACE_DATA1(level, fmt, a)
#define UTRACE_DATA2(level, fmt, a, b)
#define UTRACE_DATA3(level, fmt, a, b, c)
#define UTRACE_DATA4(level, fmt, a, b, c, d)
#define UTRACE_DATA5(level, fmt, a, b, c, d, e)
#define UTRACE_DATA6(level, fmt, a, b, c, d, e, f)
#define UTRACE_DATA7(level, fmt, a, b, c, d, e, f, g)
#define UTRACE_DATA8(level, fmt, a, b, c, d, e, f, g, h)
#define UTRACE_DATA9(level, fmt, a, b, c, d, e, f, g, h, i)

#endif

#endif

--- NEW FILE: locbased.cpp ---
/*
**********************************************************************
* Copyright (c) 2004, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: January 16 2004
* Since: ICU 2.8
**********************************************************************
*/
#include "locbased.h"
#include "cstring.h"

U_NAMESPACE_BEGIN

/**
 * Returns the requested locale as a Locale object.
 * The ID is obtained from getLocaleID(); if that yields a null pointer
 * (for example because of an error), a Locale built from the empty string
 * is returned instead.
 */
Locale LocaleBased::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    const char* localeID = getLocaleID(type, status);
    if (localeID == 0) {
        localeID = "";
    }
    return Locale(localeID);
}

/**
 * Returns the aliased valid or actual locale ID string.
 * Returns NULL without touching status if status already indicates a
 * failure; returns NULL and sets U_ILLEGAL_ARGUMENT_ERROR if type is neither
 * ULOC_VALID_LOCALE nor ULOC_ACTUAL_LOCALE.
 */
const char* LocaleBased::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    const char* id = NULL;

    if (!U_FAILURE(status)) {
        if (type == ULOC_VALID_LOCALE) {
            id = valid;
        } else if (type == ULOC_ACTUAL_LOCALE) {
            id = actual;
        } else {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
    }

    return id;
}

void LocaleBased::setLocaleIDs(const char* validID, const char* actualID) {
    if (validID != 0) {
        uprv_strcpy(valid, validID);
    }
    if (actualID != 0) {
        uprv_strcpy(actual, actualID);
    }
}

U_NAMESPACE_END

--- NEW FILE: locbased.h ---
/*
**********************************************************************
* Copyright (c) 2004, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: January 16 2004
* Since: ICU 2.8
**********************************************************************
*/
#ifndef LOCBASED_H
#define LOCBASED_H

#include "unicode/locid.h"
#include "unicode/uobject.h"

/**
 * Macro to declare a locale LocaleBased wrapper object for the given
 * object, which must have two members named `validLocale' and
 * `actualLocale'.
 * Note that the expansion already ends with a semicolon.
 * @param varname the name of the LocaleBased variable to declare
 * @param objname the object whose validLocale/actualLocale members are wrapped
 */
#define U_LOCALE_BASED(varname, objname) \
  LocaleBased varname((objname).validLocale, (objname).actualLocale);

U_NAMESPACE_BEGIN

/**
 * A utility class that unifies the implementation of getLocale() by
 * various ICU services.  This class is likely to be removed in the
 * ICU 3.0 time frame in favor of an integrated approach with the
 * services framework.
 *
 * The object does not own any storage: it only keeps the two pointers
 * passed to its constructor.
 * @since ICU 2.8
 */
class U_COMMON_API LocaleBased : public UMemory {

 public:

    /**
     * Construct a LocaleBased wrapper around the two pointers.  These
     * will be aliased for the lifetime of this object.
     */
    inline LocaleBased(char* validAlias, char* actualAlias);

    /**
     * Construct a LocaleBased wrapper around the two const pointers.
     * These will be aliased for the lifetime of this object.
     * The constness is cast away internally, so setLocaleIDs() must not
     * be called on an object constructed this way unless the underlying
     * buffers are actually writable.
     */
    inline LocaleBased(const char* validAlias, const char* actualAlias);

    /**
     * Return locale meta-data for the service object wrapped by this
     * object.  Either the valid or the actual locale may be
     * retrieved.
     * @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
     * @param status input-output error code
     * @return the indicated locale
     */
    Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;

    /**
     * Return the locale ID for the service object wrapped by this
     * object.  Either the valid or the actual locale may be
     * retrieved.
     * @param type either ULOC_VALID_LOCALE or ULOC_ACTUAL_LOCALE
     * @param status input-output error code
     * @return the indicated locale ID
     */
    const char* getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;

    /**
     * Set the locale meta-data for the service object wrapped by this
     * object.  If either parameter is zero, it is ignored.
     * @param valid the ID of the valid locale
     * @param actual the ID of the actual locale
     */
    void setLocaleIDs(const char* valid, const char* actual);

 private:

    /** Aliased buffer holding the valid locale ID; not owned. */
    char* valid;
    
    /** Aliased buffer holding the actual locale ID; not owned. */
    char* actual;
};

/* Aliases the two writable buffers; nothing is copied. */
inline LocaleBased::LocaleBased(char* validAlias, char* actualAlias)
    : valid(validAlias),
      actual(actualAlias)
{
}

/*
 * Aliases the two read-only buffers; nothing is copied.
 * The members are declared non-const, so constness has to be cast away here.
 */
inline LocaleBased::LocaleBased(const char* validAlias,
                                const char* actualAlias)
    : valid((char*)validAlias),
      actual((char*)actualAlias)
{
}

U_NAMESPACE_END

#endif

--- NEW FILE: msvcres.h ---
//{{NO_DEPENDENCIES}}
// Copyright (c) 2003 International Business Machines
// Corporation and others. All Rights Reserved.
//
// Used by common.rc and other .rc files.
// Do not edit with Microsoft Developer Studio because it will modify this
// header the wrong way. This is here to prevent Visual Studio .NET from
// unnecessarily building the resource files when it's not needed.
//
#include "unicode/uversion.h"


--- NEW FILE: parsepos.cpp ---
/*
**********************************************************************
*   Copyright (C) 2003-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/parsepos.h"

U_NAMESPACE_BEGIN

/* Expands to the RTTI support code for ParsePosition. */
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ParsePosition)

/* Destructor; there is nothing to release. */
ParsePosition::~ParsePosition() {}

/*
 * Returns a heap-allocated copy of this object, made with the copy
 * constructor.
 */
ParsePosition *
ParsePosition::clone() const {
    ParsePosition *copy = new ParsePosition(*this);
    return copy;
}

U_NAMESPACE_END

--- NEW FILE: ruleiter.cpp ---
/*
**********************************************************************
* Copyright (c) 2003, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: September 24 2003
* Since: ICU 2.8
**********************************************************************
*/
#include "ruleiter.h"
#include "unicode/parsepos.h"
#include "unicode/unistr.h"
#include "unicode/symtable.h"
#include "uprops.h"

U_NAMESPACE_BEGIN

/*
 * Constructs an iterator over theText, starting at the index stored in
 * thePos.  The text and the position are kept by reference, the symbol table
 * by pointer (may be 0).  The iterator starts outside of any variable
 * expansion.
 *
 * bufPos is initialized explicitly even though it is meaningless while
 * buf == 0: getPos() copies it unconditionally, so leaving it uninitialized
 * would read an indeterminate value.
 */
RuleCharacterIterator::RuleCharacterIterator(const UnicodeString& theText, const SymbolTable* theSym,
                      ParsePosition& thePos) :
    text(theText),
    pos(thePos),
    sym(theSym),
    buf(0),
    bufPos(0)
{}

/*
 * The iterator is at its end only when it is not inside a variable
 * expansion and the position equals the length of the text.
 */
UBool RuleCharacterIterator::atEnd() const {
    if (buf != 0) {
        return FALSE;
    }
    return pos.getIndex() == text.length();
}

/*
 * Returns the next code point according to the given options and advances
 * past it.
 *
 * If ec already indicates a failure, DONE is returned immediately and
 * isEscaped is left untouched.  Otherwise isEscaped is reset to FALSE and
 * only set to TRUE when an escape sequence was parsed.
 * Sets U_UNDEFINED_VARIABLE if a variable name is not found in the symbol
 * table, and U_MALFORMED_UNICODE_ESCAPE if unescaping fails; DONE is
 * returned in both cases.
 */
UChar32 RuleCharacterIterator::next(int32_t options, UBool& isEscaped, UErrorCode& ec) {
    if (U_FAILURE(ec)) return DONE;

    UChar32 c = DONE;
    isEscaped = FALSE;

    // Loop until a character to be returned is found; a dereferenced
    // variable and skipped whitespace restart the loop.
    for (;;) {
        // Fetch the raw code point and step over it.
        c = _current();
        _advance(UTF_CHAR_LENGTH(c));

        // Variable references are only recognized in the main text
        // (buf == 0), i.e. not inside the value of another variable.
        if (c == SymbolTable::SYMBOL_REF && buf == 0 &&
            (options & PARSE_VARIABLES) != 0 && sym != 0) {
            UnicodeString name = sym->parseReference(text, pos, text.length());
            // If name is empty there was an isolated SYMBOL_REF;
            // return it.  Caller must be prepared for this.
            if (name.length() == 0) {
                break;
            }
            // Switch to iterating over the variable's value.
            bufPos = 0;
            buf = sym->lookup(name);
            if (buf == 0) {
                ec = U_UNDEFINED_VARIABLE;
                return DONE;
            }
            // Handle empty variable value
            if (buf->length() == 0) {
                buf = 0;
            }
            continue;
        }

        if ((options & SKIP_WHITESPACE) != 0 &&
            uprv_isRuleWhiteSpace(c)) {
            continue;
        }

        if (c == 0x5C /*'\\'*/ && (options & PARSE_ESCAPES) != 0) {
            // Unescape from a copy of the remaining characters, then skip
            // the number of code units that the escape sequence consumed.
            UnicodeString s;
            int32_t offset = 0;
            c = lookahead(s).unescapeAt(offset);
            jumpahead(offset);
            isEscaped = TRUE;
            if (c < 0) {
                ec = U_MALFORMED_UNICODE_ESCAPE;
                return DONE;
            }
        }

        break;
    }

    return c;
}

/*
 * Saves the complete iterator state (text index, variable buffer and the
 * index within it) into p.
 */
void RuleCharacterIterator::getPos(RuleCharacterIterator::Pos& p) const {
    p.pos    = pos.getIndex();
    p.buf    = buf;
    p.bufPos = bufPos;
}

/*
 * Restores the complete iterator state (text index, variable buffer and the
 * index within it) from p.
 */
void RuleCharacterIterator::setPos(const RuleCharacterIterator::Pos& p) {
    pos.setIndex(p.pos);
    buf    = p.buf;
    bufPos = p.bufPos;
}

/*
 * Advances past rule white space if SKIP_WHITESPACE is set in options;
 * does nothing otherwise.
 */
void RuleCharacterIterator::skipIgnored(int32_t options) {
    if ((options & SKIP_WHITESPACE) == 0) {
        return;
    }

    UChar32 ch = _current();
    while (uprv_isRuleWhiteSpace(ch)) {
        _advance(UTF_CHAR_LENGTH(ch));
        ch = _current();
    }
}

/*
 * Extracts the not-yet-iterated characters into result: the rest of the
 * variable value when inside a variable expansion, otherwise the rest of
 * the text.
 */
UnicodeString& RuleCharacterIterator::lookahead(UnicodeString& result) const {
    if (buf == 0) {
        text.extract(pos.getIndex(), 0x7FFFFFFF, result);
        return result;
    }

    buf->extract(bufPos, 0x7FFFFFFF, result);
    return result;
}

/*
 * Public wrapper around _advance(): moves the position forward by count
 * 16-bit code units.
 */
void RuleCharacterIterator::jumpahead(int32_t count) {
    _advance(count);
}

/*
 * Builds "<text before position>|<text from position on>" in result.
 * Only the position in the main text is used; a position inside a variable
 * value is not shown.
 */
UnicodeString& RuleCharacterIterator::toString(UnicodeString& result) const {
    const int32_t index = pos.getIndex();

    text.extract(0, index, result);
    result.append((UChar) 0x7C /*'|'*/);
    result.append(text, index, 0x7FFFFFFF);
    return result;
}

/*
 * Returns the code point at the current position without advancing:
 * from the variable value when inside a variable expansion, otherwise from
 * the text, or DONE when the text position is at or past the end.
 */
UChar32 RuleCharacterIterator::_current() const {
    if (buf != 0) {
        return buf->char32At(bufPos);
    }

    int index = pos.getIndex();
    if (index < text.length()) {
        return text.char32At(index);
    }
    return (UChar32)DONE;
}

/*
 * Advances the position by count 16-bit code units.
 *
 * Inside a variable expansion only bufPos moves; once it reaches the end of
 * the variable value the expansion is left (buf is reset to 0) and iteration
 * continues in the main text.  Outside of a variable expansion the text
 * position moves and is clamped to the length of the text.
 *
 * The end-of-variable test uses >= rather than ==, so that an overshooting
 * count (for example from a jumpahead() call) also ends the expansion,
 * consistent with the clamping done for the main text.
 */
void RuleCharacterIterator::_advance(int32_t count) {
    if (buf != 0) {
        bufPos += count;
        if (bufPos >= buf->length()) {
            buf = 0;
        }
    } else {
        pos.setIndex(pos.getIndex() + count);
        if (pos.getIndex() > text.length()) {
            pos.setIndex(text.length());
        }
    }
}

U_NAMESPACE_END

//eof

--- NEW FILE: ruleiter.h ---
/*
**********************************************************************
* Copyright (c) 2003, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: September 24 2003
* Since: ICU 2.8
**********************************************************************
*/
#ifndef _RULEITER_H_
#define _RULEITER_H_

#include "unicode/utypes.h"

U_NAMESPACE_BEGIN

class UnicodeString;
class ParsePosition;
class SymbolTable;

/**
 * An iterator that returns 32-bit code points.  This class is deliberately
 * <em>not</em> related to any of the ICU character iterator classes
 * in order to minimize complexity.
 * @author Alan Liu
 * @since ICU 2.8
 */
class U_COMMON_API RuleCharacterIterator {

    // TODO: Ideas for later.  (Do not implement if not needed, lest the
    // code coverage numbers go down due to unused methods.)
    // 1. Add a copy constructor, operator==() method.
    // 2. Rather than return DONE, throw an exception if the end
    // is reached -- this is an alternate usage model, probably not useful.

private:
    /**
     * Text being iterated.  Held by reference, not copied.
     */    
    const UnicodeString& text;

    /**
     * Position of iterator.  Held by reference, so the caller's
     * ParsePosition is updated as the iterator advances.
     */
    ParsePosition& pos;

    /**
     * Symbol table used to parse and dereference variables.  May be 0.
     */
    const SymbolTable* sym;
    
    /**
     * Current variable expansion, or 0 if none.
     */
    const UnicodeString* buf;

    /**
     * Position within buf.  Meaningless if buf == 0.
     */
    int32_t bufPos;

public:
    /**
     * Value returned when there are no more characters to iterate.
     */
    enum { DONE = -1 };

    /**
     * Bitmask option to enable parsing of variable names.  If (options &
     * PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
     * its value.  Variables are parsed using the SymbolTable API.
     */
    enum { PARSE_VARIABLES = 1 };

    /**
     * Bitmask option to enable parsing of escape sequences.  If (options &
     * PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
     * to its value.  Escapes are parsed using UnicodeString::unescapeAt().
     */
    enum { PARSE_ESCAPES   = 2 };

    /**
     * Bitmask option to enable skipping of whitespace.  If (options &
     * SKIP_WHITESPACE) != 0, then whitespace characters will be silently
     * skipped, as if they were not present in the input.  Whitespace
     * characters are defined by uprv_isRuleWhiteSpace().
     */
    enum { SKIP_WHITESPACE = 4 };

    /**
     * Constructs an iterator over the given text, starting at the given
     * position.
     * @param text the text to be iterated
     * @param sym the symbol table, or null if there is none.  If sym is null,
     * then variables will not be dereferenced, even if the PARSE_VARIABLES
     * option is set.
     * @param pos upon input, the index of the next character to return.  If a
     * variable has been dereferenced, then pos will <em>not</em> increment as
     * characters of the variable value are iterated.
     */
    RuleCharacterIterator(const UnicodeString& text, const SymbolTable* sym,
                          ParsePosition& pos);
    
    /**
     * Returns true if this iterator has no more characters to return.
     */
    UBool atEnd() const;

    /**
     * Returns the next character using the given options, or DONE if there
     * are no more characters, and advance the position to the next
     * character.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     * @param isEscaped output parameter set to TRUE if the character
     * was escaped
     * @param ec input-output error code.  An error will only be set by
     * this routine if options includes PARSE_VARIABLES and an unknown
     * variable name is seen, or if options includes PARSE_ESCAPES and
     * an invalid escape sequence is seen.
     * @return the current 32-bit code point, or DONE
     */
    UChar32 next(int32_t options, UBool& isEscaped, UErrorCode& ec);

    /**
     * Returns true if this iterator is currently within a variable expansion.
     */
    inline UBool inVariable() const;

    /**
     * An opaque object representing the position of a RuleCharacterIterator.
     */
    struct Pos {
    private:
        const UnicodeString* buf;
        int32_t pos;
        int32_t bufPos;
        friend class RuleCharacterIterator;
    };

    /**
     * Sets an object which, when later passed to setPos(), will
     * restore this iterator's position.  Usage idiom:
     *
     * RuleCharacterIterator iterator = ...;
     * RuleCharacterIterator::Pos pos;
     * iterator.getPos(pos);
     * for (;;) {
     *   iterator.getPos(pos);
     *   int c = iterator.next(...);
     *   ...
     * }
     * iterator.setPos(pos);
     *
     * @param p a position object to be set to this iterator's
     * current position.
     */
    void getPos(Pos& p) const;

    /**
     * Restores this iterator to the position it had when getPos()
     * set the given object.
     * @param p a position object previously set by getPos()
     */
    void setPos(const Pos& p);

    /**
     * Skips ahead past any ignored characters, as indicated by the given
     * options.  This is useful in conjunction with the lookahead() method.
     *
     * Currently, this only has an effect for SKIP_WHITESPACE.
     * @param options one or more of the following options, bitwise-OR-ed
     * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
     */
    void skipIgnored(int32_t options);

    /**
     * Returns a string containing the remainder of the characters to be
     * returned by this iterator, without any option processing.  If the
     * iterator is currently within a variable expansion, this will only
     * extend to the end of the variable expansion.  This method is provided
     * so that iterators may interoperate with string-based APIs.  The typical
     * sequence of calls is to call skipIgnored(), then call lookahead(), then
     * parse the string returned by lookahead(), then call jumpahead() to
     * resynchronize the iterator.
     * @param result a string to receive the characters to be returned
     * by future calls to next()
     * @return a reference to result
     */
    UnicodeString& lookahead(UnicodeString& result) const;

    /**
     * Advances the position by the given number of 16-bit code units.
     * This is useful in conjunction with the lookahead() method.
     * @param count the number of 16-bit code units to jump over
     */
    void jumpahead(int32_t count);

    /**
     * Returns a string representation of this object, consisting of the
     * characters being iterated, with a '|' marking the current position.
     * Position within an expanded variable is <em>not</em> indicated.
     * @param result output parameter to receive a string
     * representation of this object
     * @return a reference to result
     */
    UnicodeString& toString(UnicodeString& result) const;
    
private:
    /**
     * Returns the current 32-bit code point without parsing escapes, parsing
     * variables, or skipping whitespace.
     * @return the current 32-bit code point
     */
    UChar32 _current() const;
    
    /**
     * Advances the position by the given amount.
     * @param count the number of 16-bit code units to advance past
     */
    void _advance(int32_t count);
};

/* A variable expansion is in progress exactly when buf is set. */
inline UBool RuleCharacterIterator::inVariable() const {
    return !(buf == 0);
}

U_NAMESPACE_END

#endif // _RULEITER_H_
//eof

--- NEW FILE: uarrsort.c ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uarrsort.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003aug04
*   created by: Markus W. Scherer
*
*   Internal function for sorting arrays.
*/

#include "unicode/utypes.h"
#include "cmemory.h"
#include "uarrsort.h"

/* Tuning constants for the sorting implementations in this file. */
enum {
    /*
     * Length threshold for choosing insertion sort over QuickSort:
     * uprv_sortArray() and subQuickSort() compare (sub-)array lengths
     * against this value and hand short ranges to the insertion sort.
     */
    MIN_QSORT=9, /* from Knuth */
    /*
     * Largest item size, in bytes, for which the temporary item variables
     * are kept in a stack buffer; larger items use uprv_malloc().
     */
    STACK_ITEM_SIZE=200
};

/* UComparator convenience implementations ---------------------------------- */

/*
 * UComparator for arrays of uint16_t.
 * The context pointer is not used.
 * Returns the difference left-right of the two values widened to int32_t:
 * negative if *left<*right, 0 if equal, positive if *left>*right.
 */
U_CAPI int32_t U_EXPORT2
uprv_uint16Comparator(const void *context, const void *left, const void *right) {
    const uint16_t *pLeft=(const uint16_t *)left;
    const uint16_t *pRight=(const uint16_t *)right;
    int32_t l=(int32_t)*pLeft;
    int32_t r=(int32_t)*pRight;

    /* both operands are in 0..0xffff, so the subtraction cannot overflow */
    return l-r;
}

/*
 * UComparator for arrays of int32_t.
 * The context pointer is not used.
 *
 * The values are compared directly rather than subtracted: the difference
 * of two int32_t values does not always fit into an int32_t
 * (e.g. INT32_MIN-1), which is undefined behavior in C and would
 * typically yield a result with the wrong sign.
 *
 * Returns -1 if *left<*right, 0 if they are equal, 1 if *left>*right.
 */
U_CAPI int32_t U_EXPORT2
uprv_int32Comparator(const void *context, const void *left, const void *right) {
    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;

    if(l<r) {
        return -1;
    } else if(l==r) {
        return 0;
    } else /* l>r */ {
        return 1;
    }
}

/*
 * UComparator for arrays of uint32_t.
 * The context pointer is not used.
 * Returns -1 if *left<*right, 0 if they are equal, 1 if *left>*right.
 */
U_CAPI int32_t U_EXPORT2
uprv_uint32Comparator(const void *context, const void *left, const void *right) {
    const uint32_t lhs=*(const uint32_t *)left;
    const uint32_t rhs=*(const uint32_t *)right;

    /*
     * Combine the two relational results instead of subtracting the values:
     * (lhs-rhs) would not fit into the int32_t result.
     */
    return (int32_t)(lhs>rhs)-(int32_t)(lhs<rhs);
}

/* Straight insertion sort from Knuth vol. III, pg. 81 ---------------------- */

/*
 * Insertion-sorts the items array[start..limit[ in place.
 *
 * array    base of the item storage, addressed in bytes
 * start    index of the first item to sort (inclusive)
 * limit    index after the last item to sort (exclusive)
 * itemSize size of one item in bytes
 * cmp      comparison callback, called as cmp(context, item1, item2)
 * context  passed through to cmp unchanged
 * pv       caller-provided scratch space for one item (itemSize bytes)
 *
 * An item is only moved in front of items that compare strictly greater,
 * so items that compare equal keep their relative order.
 */
static void
doInsertionSort(char *array, int32_t start, int32_t limit, int32_t itemSize,
                UComparator *cmp, const void *context, void *pv) {
    int32_t next, hole;

    for(next=start+1; next<limit; ++next) {
        /* take the next unsorted item out of the array */
        uprv_memcpy(pv, array+next*itemSize, itemSize);

        /* shift larger items of the sorted prefix up by one slot */
        hole=next;
        while(hole>start && cmp(context, pv, array+(hole-1)*itemSize)<0) {
            uprv_memcpy(array+hole*itemSize, array+(hole-1)*itemSize, itemSize);
            --hole;
        }

        /* drop the item into the hole unless it never moved */
        if(hole!=next) {
            uprv_memcpy(array+hole*itemSize, pv, itemSize);
        }
    }
}

/*
 * Insertion-sorts the whole array of length items.
 * Provides the scratch item that doInsertionSort() needs: a stack buffer
 * for items of up to STACK_ITEM_SIZE bytes, otherwise a uprv_malloc() block.
 * Sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR, without sorting, if that
 * allocation fails.
 */
static void
insertionSort(char *array, int32_t length, int32_t itemSize,
              UComparator *cmp, const void *context, UErrorCode *pErrorCode) {
    UAlignedMemory stackItem[STACK_ITEM_SIZE/sizeof(UAlignedMemory)+1];
    void *heapItem=NULL;
    void *scratch=stackItem;

    /* items that do not fit into the stack buffer get a heap-allocated scratch item */
    if(itemSize>STACK_ITEM_SIZE) {
        heapItem=uprv_malloc(itemSize);
        if(heapItem==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        scratch=heapItem;
    }

    doInsertionSort(array, 0, length, itemSize, cmp, context, scratch);

    if(heapItem!=NULL) {
        uprv_free(heapItem);
    }
}

/* QuickSort ---------------------------------------------------------------- */

/*
 * Sorts the items array[start..limit[ in place with QuickSort.
 *
 * This implementation is semi-recursive:
 * It recurses for the smaller sub-array to shorten the recursion depth,
 * and loops for the larger sub-array.
 *
 * Ranges for which start+MIN_QSORT>=limit are handed to doInsertionSort().
 *
 * array    base of the item storage, addressed in bytes
 * start    index of the first item to sort (inclusive)
 * limit    index after the last item to sort (exclusive)
 * itemSize size of one item in bytes
 * cmp      comparison callback, called as cmp(context, item1, item2)
 * context  passed through to cmp unchanged
 * px       scratch space for one item; holds the pivot copy, and is also
 *          passed to doInsertionSort() as its scratch item
 * pw       scratch space for one item; used as the temporary when swapping
 *
 * Loosely after QuickSort algorithms in
 * Niklaus Wirth
 * Algorithmen und Datenstrukturen mit Modula-2
 * B.G. Teubner Stuttgart
 * 4. Auflage 1986 (4th edition 1986)
 * ISBN 3-519-02260-5
 */
static void
subQuickSort(char *array, int32_t start, int32_t limit, int32_t itemSize,
             UComparator *cmp, const void *context,
             void *px, void *pw) {
    int32_t left, right;

    /* start and left are inclusive, limit and right are exclusive */
    do {
        /* short range: finish it with insertion sort and stop looping */
        if((start+MIN_QSORT)>=limit) {
            doInsertionSort(array, start, limit, itemSize, cmp, context, px);
            break;
        }

        left=start;
        right=limit;

        /* x=array[middle] */
        /* the pivot is copied out because the swaps below may move the middle item */
        uprv_memcpy(px, array+((start+limit)/2)*itemSize, itemSize);

        /* partition: move left and right towards each other, swapping misplaced pairs */
        do {
            while(/* array[left]<x */
                  cmp(context, array+left*itemSize, px)<0
            ) {
                ++left;
            }
            while(/* x<array[right-1] */
                  cmp(context, px, array+(right-1)*itemSize)<0
            ) {
                --right;
            }

            /* swap array[left] and array[right-1] via w; ++left; --right */
            if(left<right) {
                --right;

                /* left==right here means both indexes met on one item: nothing to swap */
                if(left<right) {
                    uprv_memcpy(pw, array+left*itemSize, itemSize);
                    uprv_memcpy(array+left*itemSize, array+right*itemSize, itemSize);
                    uprv_memcpy(array+right*itemSize, pw, itemSize);
                }

                ++left;
            }
        } while(left<right);

        /* sort sub-arrays */
        /* recurse into the shorter part, continue the loop with the longer one */
        if((right-start)<(limit-left)) {
            /* sort [start..right[ */
            if(start<(right-1)) {
                subQuickSort(array, start, right, itemSize, cmp, context, px, pw);
            }

            /* sort [left..limit[ */
            start=left;
        } else {
            /* sort [left..limit[ */
            if(left<(limit-1)) {
                subQuickSort(array, left, limit, itemSize, cmp, context, px, pw);
            }

            /* sort [start..right[ */
            limit=right;
        }
    } while(start<(limit-1)); /* loop while the remaining range has at least two items */
}

/*
 * QuickSorts the whole array of length items.
 * Provides the two scratch items (pivot and swap temporary) that
 * subQuickSort() needs, stored back to back: in a stack buffer for items of
 * up to STACK_ITEM_SIZE bytes, otherwise in one uprv_malloc() block.
 * Sets *pErrorCode to U_MEMORY_ALLOCATION_ERROR, without sorting, if that
 * allocation fails.
 */
static void
quickSort(char *array, int32_t length, int32_t itemSize,
            UComparator *cmp, const void *context, UErrorCode *pErrorCode) {
    UAlignedMemory stackItems[(2*STACK_ITEM_SIZE)/sizeof(UAlignedMemory)+1];
    void *heapItems=NULL;
    char *pivot=(char *)stackItems;

    /* items that do not fit into the stack buffer get heap-allocated scratch items */
    if(itemSize>STACK_ITEM_SIZE) {
        heapItems=uprv_malloc(2*itemSize);
        if(heapItems==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        pivot=(char *)heapItems;
    }

    /* the swap temporary directly follows the pivot item */
    subQuickSort(array, 0, length, itemSize,
                 cmp, context, pivot, pivot+itemSize);

    if(heapItems!=NULL) {
        uprv_free(heapItems);
    }
}

/* uprv_sortArray() API ----------------------------------------------------- */

/*
 * Entry point: validates the arguments, then dispatches to insertion sort
 * (short arrays, or whenever a stable sort is requested) or to QuickSort.
 * The array is handled as char * so that array+i*itemSize addresses item i.
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL array with length>0,
 * a negative length, a non-positive itemSize, or a NULL cmp.
 */
U_CAPI void U_EXPORT2
uprv_sortArray(void *array, int32_t length, int32_t itemSize,
               UComparator *cmp, const void *context,
               UBool sortStable, UErrorCode *pErrorCode) {
    char *items;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }
    if(cmp==NULL || itemSize<=0 || length<0 || (array==NULL && length>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* nothing to do for empty and single-item arrays */
    if(length<=1) {
        return;
    }

    items=(char *)array;
    if(sortStable || length<MIN_QSORT) {
        /* could add heapSort or similar for stable sorting of longer arrays */
        insertionSort(items, length, itemSize, cmp, context, pErrorCode);
    } else {
        quickSort(items, length, itemSize, cmp, context, pErrorCode);
    }
}

--- NEW FILE: uarrsort.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uarrsort.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003aug04
*   created by: Markus W. Scherer
*
*   Internal function for sorting arrays.
*/

#ifndef __UARRSORT_H__
#define __UARRSORT_H__

#include "unicode/utypes.h"

U_CDECL_BEGIN
/**
 * Function type for comparing two items as part of sorting an array or similar.
 * Callback function for uprv_sortArray().
 *
 * @param context Application-specific pointer, passed through by uprv_sortArray().
 * @param left    Pointer to the "left" item.
 * @param right   Pointer to the "right" item.
 * @return 32-bit signed integer comparison result:
 *                <0 if left<right
 *               ==0 if left==right
 *                >0 if left>right
 *
 * @internal
 */
typedef int32_t U_CALLCONV
UComparator(const void *context, const void *left, const void *right);
U_CDECL_END

/**
 * Array sorting function.
 * Uses a UComparator for comparing array items to each other, and simple
 * memory copying to move items.
 *
 * The function returns immediately, without touching the array, if
 * pErrorCode is NULL or already indicates a failure on entry.
 * Arrays with fewer than two items are left unchanged.
 *
 * @param array      The array to be sorted. May be NULL only if length is 0.
 * @param length     The number of items in the array. Must not be negative.
 * @param itemSize   The size in bytes of each array item. Must be positive.
 * @param cmp        UComparator function used to compare two items each.
 *                   Must not be NULL.
 * @param context    Application-specific pointer, passed through to the UComparator.
 * @param sortStable If true, a stable sorting algorithm must be used.
 *                   (The implementation in uarrsort.c then uses insertion sort
 *                   regardless of the array length.)
 * @param pErrorCode ICU in/out UErrorCode parameter.
 *                   Set to U_ILLEGAL_ARGUMENT_ERROR if one of the argument
 *                   constraints above is violated, and to
 *                   U_MEMORY_ALLOCATION_ERROR if temporary storage for
 *                   large items cannot be allocated.
 *
 * @internal
 */
U_CAPI void U_EXPORT2
uprv_sortArray(void *array, int32_t length, int32_t itemSize,
               UComparator *cmp, const void *context,
               UBool sortStable, UErrorCode *pErrorCode);

/**
 * Convenience UComparator implementation for uint16_t arrays.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
uprv_uint16Comparator(const void *context, const void *left, const void *right);

/**
 * Convenience UComparator implementation for int32_t arrays.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
uprv_int32Comparator(const void *context, const void *left, const void *right);

/**
 * Convenience UComparator implementation for uint32_t arrays.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
uprv_uint32Comparator(const void *context, const void *left, const void *right);

#endif

--- NEW FILE: ucnv_ext.c ---
/*
******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnv_ext.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun13
*   created by: Markus W. Scherer
*
*   Conversion extensions
*/

#include "unicode/utypes.h"
[...1036 lines suppressed...]
                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                                pErrorCode);
                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
                                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
                                  UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
                        ) {
                            uset_add(set, c);
                        }
                    } while((++c&0xf)!=0);
                } else {
                    c+=16; /* empty stage 3 block */
                }
            }
        } else {
            c+=1024; /* empty stage 2 block */
        }
    }
}

#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

--- NEW FILE: ucnv_ext.h ---
/*
******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnv_ext.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun13
*   created by: Markus W. Scherer
*
*   Conversion extensions
*/

#ifndef __UCNV_EXT_H__
#define __UCNV_EXT_H__

#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "ucnv_cnv.h"

/*
 * See icuhtml/design/conversion/conversion_extensions.html
 *
 * Conversion extensions serve two purposes:
 * 1. They support m:n mappings.
 * 2. They support extension-only conversion files that are used together
 *    with the regular conversion data in base files.
 *
 * A base file may contain an extension table (explicitly requested or
 * implicitly generated for m:n mappings), but its extension table is not
 * used when an extension-only file is used.
 *
 * It is an error if a base file contains any regular (not extension) mapping
 * from the same sequence as a mapping in the extension file
 * because the base mapping would hide the extension mapping.
 *
 *
 * Data for conversion extensions:
 *
 * One set of data structures per conversion direction (to/from Unicode).
 * The data structures are sorted by input units to allow for binary search.
 * Input sequences of more than one unit are handled like contraction tables
 * in collation:
 * The lookup value of a unit points to another table that is to be searched
 * for the next unit, recursively.
 *
 * For conversion from Unicode, the initial code point is looked up in
 * a 3-stage trie for speed,
 * with an additional table of unique results to save space.
 *
 * Long output strings are stored in separate arrays, with length and index
 * in the lookup tables.
 * Output results also include a flag distinguishing roundtrip from
 * (reverse) fallback mappings.
 *
 * Input Unicode strings must not begin or end with unpaired surrogates
 * to avoid problems with matches on parts of surrogate pairs.
 *
 * Mappings from multiple characters (code points or codepage state
 * table sequences) must be searched preferring the longest match.
 * For this to work and be efficient, the variable-width table must contain
 * all mappings that contain prefixes of the multiple characters.
 * If an extension table is built on top of a base table in another file
 * and a base table entry is a prefix of a multi-character mapping, then
 * this is an error.
 *
 *
 * Implementation note:
 *
 * Currently, the parser and several checks in the code limit the number
 * of UChars or bytes in a mapping to
 * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
 * which are output value limits in the data structure.
 *
 * For input, this is not strictly necessary - it is a hard limit only for the
 * buffers in UConverter that are used to store partial matches.
 *
 * Input sequences could otherwise be arbitrarily long if partial matches
 * need not be stored (i.e., if a sequence does not span several buffers with too
 * many units before the last buffer), although then results would differ
 * depending on whether partial matches exceed the limits or not,
 * which depends on the pattern of buffer sizes.
 *
 *
 * Data structure:
 *
 * int32_t indexes[>=32];
 *
 *   Array of indexes and lengths etc. The length of the array is at least 32.
 *   The actual length is stored in indexes[0] to be forward compatible.
 *
 *   Each index to another array is the number of bytes from indexes[].
 *   Each length of an array is the number of array base units in that array.
 *
 *   Some of the structures may not be present, in which case their indexes
 *   and lengths are 0.
 *
 *   Usage of indexes[i]:
 *   [0]  length of indexes[]
 *
 *   // to Unicode table
 *   [1]  index of toUTable[] (array of uint32_t)
 *   [2]  length of toUTable[]
 *   [3]  index of toUUChars[] (array of UChar)
 *   [4]  length of toUUChars[]
 *
 *   // from Unicode table, not for the initial code point
 *   [5]  index of fromUTableUChars[] (array of UChar)
 *   [6]  index of fromUTableValues[] (array of uint32_t)
 *   [7]  length of fromUTableUChars[] and fromUTableValues[]
 *   [8]  index of fromUBytes[] (array of char)
 *   [9]  length of fromUBytes[]
 *
 *   // from Unicode trie for initial-code point lookup
 *   [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
 *   [11] length of stage 1 portion of fromUStage12[]
 *   [12] length of fromUStage12[]
 *   [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
 *   [14] length of fromUStage3[]
 *   [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
 *   [16] length of fromUStage3b[]
 *
 *   [17] Bit field containing numbers of bytes:
 *        31..24 reserved, 0
 *        23..16 maximum input bytes
 *        15.. 8 maximum output bytes
 *         7.. 0 maximum bytes per UChar
 *
 *   [18] Bit field containing numbers of UChars:
 *        31..24 reserved, 0
 *        23..16 maximum input UChars
 *        15.. 8 maximum output UChars
 *         7.. 0 maximum UChars per byte
 *
 *   [19] Bit field containing flags:
 *               (extension table unicodeMask)
 *         1     UCNV_HAS_SURROGATES flag for the extension table
 *         0     UCNV_HAS_SUPPLEMENTARY flag for the extension table
 *
 *   [20]..[30] reserved, 0
 *   [31] number of bytes for the entire extension structure
 *   [>31] reserved; there are indexes[0] indexes
 *
 *
 * uint32_t toUTable[];
 *
 *   Array of byte/value pairs for lookups for toUnicode conversion.
 *   The array is partitioned into sections like collation contraction tables.
 *   Each section contains one word with the number of following words and
 *   a default value for when the lookup in this section yields no match.
 *
 *   A section is sorted in ascending order of input bytes,
 *   allowing for fast linear or binary searches.
 *   The builder may store entries for a contiguous range of byte values
 *   (compare difference between the first and last one with count),
 *   which then allows for direct array access.
 *   The builder should always do this for the initial table section.
 *
 *   Entries may have 0 values, see below.
 *   No two entries in a section have the same byte values.
 *
 *   Each uint32_t contains an input byte value in bits 31..24 and the
 *   corresponding lookup value in bits 23..0.
 *   Interpret the value as follows:
 *     if(value==0) {
 *       no match, see below
 *     } else if(value<0x1f0000) {
 *       partial match - use value as index to the next toUTable section
 *       and match the next unit; (value indexes toUTable[value])
 *     } else {
 *       if(bit 23 set) {
 *         roundtrip;
 *       } else {
 *         fallback;
 *       }
 *       unset value bit 23;
 *       if(value<=0x2fffff) {
 *         (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
 *       } else {
 *         bits 17..0 (value&0x3ffff) is an index to
 *           the result UChars in toUUChars[]; (0 indexes toUUChars[0])
 *         length of the result=((value>>18)-12); (length=0..19)
 *       }
 *     }
 *
 *   The first word in a section contains the number of following words in the
 *   input byte position (bits 31..24, number=1..0xff).
 *   The value of the initial word is used when the current byte is not found
 *   in this section.
 *   If the value is not 0, then it represents a result as above.
 *   If the value is 0, then the search has to return a shorter match with an
 *   earlier default value as the result, or result in "unmappable" even for the
 *   initial bytes.
 *   If the value is 0 for the initial toUTable entry, then the initial byte
 *   does not start any mapping input.
 *
 *
 * UChar toUUChars[];
 *
 *   Contains toUnicode mapping results, stored as sequences of UChars.
 *   Indexes and lengths stored in the toUTable[].
 *
 *
 * UChar fromUTableUChars[];
 * uint32_t fromUTableValues[];
 *
 *   The fromUTable is split into two arrays, but works otherwise much like
 *   the toUTable. The array is partitioned into sections like collation
 *   contraction tables and toUTable.
 *   A row in the table consists of same-index entries in fromUTableUChars[]
 *   and fromUTableValues[].
 *
 *   Interpret a value as follows:
 *     if(value==0) {
 *       no match, see below
 *     } else if(value<=0xffffff) { (bits 31..24 are 0)
 *       partial match - use value as index to the next fromUTable section
 *       and match the next unit; (value indexes fromUTable[value])
 *     } else {
 *       if(value==0x80000001) {
 *         return no mapping, but request for <subchar1>;
 *       }
 *       if(bit 31 set) {
 *         roundtrip;
 *       } else {
 *         fallback;
 *       }
 *       // bits 30..29 reserved, 0
 *       length=(value>>24)&0x1f; (bits 28..24)
 *       if(length==1..3) {
 *         bits 23..0 contain 1..3 bytes, padded with 00s on the left;
 *       } else {
 *         bits 23..0 (value&0xffffff) is an index to
 *           the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
 *       }
 *     }
 *       
 *   The first pair in a section contains the number of following pairs in the
 *   UChar position (16 bits, number=1..0xffff).
 *   The value of the initial pair is used when the current UChar is not found
 *   in this section.
 *   If the value is not 0, then it represents a result as above.
 *   If the value is 0, then the search has to return a shorter match with an
 *   earlier default value as the result, or result in "unmappable" even for the
 *   initial UChars.
 *
 *   If the from Unicode trie is present, then the from Unicode search tables
 *   are not used for initial code points.
 *   In this case, the first entries (index 0) in the tables are not used
 *   (reserved, set to 0) because a value of 0 is used in trie results
 *   to indicate no mapping.
 *
 *
 * uint16_t fromUStage12[];
 *
 *   Stages 1 & 2 of a trie that maps an initial code point.
 *   Indexes in stage 1 are all offset by the length of stage 1 so that the
 *   same array pointer can be used for both stages.
 *   If (c>>10)>=(length of stage 1) then c does not start any mapping.
 *   Same bit distribution as for regular conversion tries.
 *
 *
 * uint16_t fromUStage3[];
 * uint32_t fromUStage3b[];
 *
 *   Stage 3 of the trie. The first array simply contains indexes to the second,
 *   which contains words in the same format as fromUTableValues[].
 *   Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
 *   and 16-bit entries in stage 3 allow for 64k stage 3b entries.
 *   The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
 *
 *   Two arrays are used because it is expected that more than half of the stage 3
 *   entries will be zero. The 16-bit index stage 3 array saves space even
 *   considering storing a total of 6 bytes per non-zero entry in both arrays
 *   together.
 *   Using a stage 3 granularity of >1 diminishes the compactability in that stage
 *   but provides a larger effective addressing space in stage 2.
 *   All but the final result stage use 16-bit entries to save space.
 *
 *   fromUStage3b[] contains a zero for "no mapping" at its index 0,
 *   and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
 *   (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
 *   and all other items are unique non-zero results.
 *
 *   The default value of a fromUTableValues[] section that is referenced
 *   _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1,
 *   but this value must not occur anywhere else in fromUTableValues[]
 *   because "no mapping" is always a property of a single code point,
 *   never of multiple.
 *
 *
 * char fromUBytes[];
 *
 *   Contains fromUnicode mapping results, stored as sequences of chars.
 *   Indexes and lengths stored in the fromUTableValues[].
 */
/*
 * Slot numbers within indexes[], as laid out in the data structure
 * description above. The values are spelled out explicitly so that they
 * can be matched against that description at a glance.
 */
enum {
    UCNV_EXT_INDEXES_LENGTH=0,          /* length of indexes[] */

    /* to Unicode table */
    UCNV_EXT_TO_U_INDEX=1,              /* index of toUTable[] */
    UCNV_EXT_TO_U_LENGTH=2,             /* length of toUTable[] */
    UCNV_EXT_TO_U_UCHARS_INDEX=3,       /* index of toUUChars[] */
    UCNV_EXT_TO_U_UCHARS_LENGTH=4,      /* length of toUUChars[] */

    /* from Unicode table, not for the initial code point */
    UCNV_EXT_FROM_U_UCHARS_INDEX=5,     /* index of fromUTableUChars[] */
    UCNV_EXT_FROM_U_VALUES_INDEX=6,     /* index of fromUTableValues[] */
    UCNV_EXT_FROM_U_LENGTH=7,           /* length of both fromUTable arrays */
    UCNV_EXT_FROM_U_BYTES_INDEX=8,      /* index of fromUBytes[] */
    UCNV_EXT_FROM_U_BYTES_LENGTH=9,     /* length of fromUBytes[] */

    /* from Unicode trie for initial-code point lookup */
    UCNV_EXT_FROM_U_STAGE_12_INDEX=10,  /* index of fromUStage12[] */
    UCNV_EXT_FROM_U_STAGE_1_LENGTH=11,  /* length of the stage 1 portion */
    UCNV_EXT_FROM_U_STAGE_12_LENGTH=12, /* length of fromUStage12[] */
    UCNV_EXT_FROM_U_STAGE_3_INDEX=13,   /* index of fromUStage3[] */
    UCNV_EXT_FROM_U_STAGE_3_LENGTH=14,  /* length of fromUStage3[] */
    UCNV_EXT_FROM_U_STAGE_3B_INDEX=15,  /* index of fromUStage3b[] */
    UCNV_EXT_FROM_U_STAGE_3B_LENGTH=16, /* length of fromUStage3b[] */

    /* bit fields */
    UCNV_EXT_COUNT_BYTES=17,            /* numbers of bytes */
    UCNV_EXT_COUNT_UCHARS=18,           /* numbers of UChars */
    UCNV_EXT_FLAGS=19,                  /* extension table unicodeMask flags */

    UCNV_EXT_RESERVED_INDEX=20,         /* 20, moves with additional indexes */

    UCNV_EXT_SIZE=31,                   /* number of bytes for the entire extension structure */
    UCNV_EXT_INDEXES_MIN_LENGTH=32      /* indexes[] has at least this many entries */
};

/*
 * Get the pointer to an extension array from indexes[index]:
 * indexes[index] is a byte offset from the start of indexes[] itself.
 */
#define UCNV_EXT_ARRAY(indexes, index, itemType) \
    ((const itemType *)(((const char *)(indexes))+((indexes)[(index)])))

/* bits 7..0 of indexes[UCNV_EXT_COUNT_BYTES]: maximum bytes per UChar */
#define UCNV_GET_MAX_BYTES_PER_UCHAR(indexes) \
    (((indexes)[UCNV_EXT_COUNT_BYTES])&0xff)

/* internal API ------------------------------------------------------------- */

U_CFUNC UBool
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
                        int32_t firstLength,
                        const char **src, const char *srcLimit,
                        UChar **target, const UChar *targetLimit,
                        int32_t **offsets, int32_t srcIndex,
                        UBool flush,
                        UErrorCode *pErrorCode);

U_CFUNC UChar32
ucnv_extSimpleMatchToU(const int32_t *cx,
                       const char *source, int32_t length,
                       UBool useFallback);

U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
                         UErrorCode *pErrorCode);


U_CFUNC UBool
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
                          UChar32 cp,
                          const UChar **src, const UChar *srcLimit,
                          char **target, const char *targetLimit,
                          int32_t **offsets, int32_t srcIndex,
                          UBool flush,
                          UErrorCode *pErrorCode);

U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
                         UChar32 cp, uint32_t *pValue,
                         UBool useFallback);

U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,
                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                           UErrorCode *pErrorCode);

U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
                      USet *set,
                      UConverterUnicodeSet which,
                      UErrorCode *pErrorCode);

/* toUnicode helpers -------------------------------------------------------- */

/*
 * Constants and accessors for the uint32_t words of toUTable[]:
 * each word has an input byte in bits 31..24 and a lookup value in bits 23..0
 * (see the toUTable[] description above).
 */

/* position of the input byte, and mask for the 24-bit lookup value */
#define UCNV_EXT_TO_U_BYTE_SHIFT 24
#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
/* values below MIN_CODE_POINT are partial-match indexes; code points are stored with this offset */
#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
/* largest value (after clearing the roundtrip flag) that still encodes a code point */
#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
/* bit 23 of the value: set for roundtrip mappings, clear for fallbacks */
#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
/* for string results: bits 17..0 are the index into toUUChars[], length=(value>>18)-12 */
#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
#define UCNV_EXT_TO_U_LENGTH_SHIFT 18
#define UCNV_EXT_TO_U_LENGTH_OFFSET 12

/* maximum number of indexed UChars */
#define UCNV_EXT_MAX_UCHARS 19

/* combine an input byte and a 24-bit lookup value into one table word */
#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))

/* split a table word into its input byte and its lookup value */
#define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)

/*
 * Partial match: the value is the index of the next toUTable section.
 * Note that a value of 0 ("no match") also satisfies IS_PARTIAL
 * and has to be distinguished by the caller.
 */
#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
#define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)

/* test for, and clear, the roundtrip flag of a result value */
#define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
#define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)

/* use after masking off the roundtrip flag */
#define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
#define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)

/* for results that are not single code points: index into toUUChars[] and number of UChars */
#define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
#define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)

/* fromUnicode helpers ------------------------------------------------------ */

/* most trie constants are shared with ucnvmbcs.h */

/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
#define UCNV_EXT_STAGE_3_GRANULARITY 4

/*
 * trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10
 *
 * stage12[s1Index] is the offset of the stage 2 block, which is indexed with
 * bits 9..4 of c; the stage 2 entry is left-shifted to get the start of the
 * stage 3 block, which is indexed with bits 3..0 of c.
 */
#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
    (stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]

/*
 * Layout of a fromUTableValues[]/fromUStage3b[] result value:
 * bit 31 roundtrip flag, bits 30..29 reserved, bits 28..24 length,
 * bits 23..0 data (the bytes themselves or an index into fromUBytes[]).
 */
#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff

/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001

/* at most 3 bytes in the lower part of the value */
#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3

/* maximum number of indexed bytes */
#define UCNV_EXT_MAX_BYTES 0x1f

/* partial match (bits 31..24 are all 0): the value is the index of the next fromUTable section */
#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)

/* test for, and clear, the roundtrip flag of a result value */
#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)

/*
 * use after masking off the roundtrip flag
 *
 * The whole replacement list is parenthesized so that the cast expression
 * behaves as a single operand wherever the macro is used (e.g. with sizeof).
 */
#define UCNV_EXT_FROM_U_GET_LENGTH(value) ((int32_t)(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES))

/* get bytes or bytes index */
#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)

#endif

--- NEW FILE: ucol_swp.c ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucol_swp.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003sep10
*   created by: Markus W. Scherer
*
*   Swap collation binaries.
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/udata.h" /* UDataInfo */
#include "cmemory.h"
#include "utrie.h"
#include "udataswp.h"
#include "ucol_imp.h"
#include "ucol_swp.h"

/*
 * Swap a header-less collation binary, inside a resource bundle or ucadata.icu.
 *
 * The binary starts with a UCATableHeader whose size field gives the total
 * size of the binary. Other header fields are offsets and counts that
 * locate the pieces which are swapped individually below.
 *
 * ds         swapper with the input/output properties and swap functions
 * inData     the collation binary (starting with the UCATableHeader)
 * length     number of bytes available at inData, or -1 to preflight:
 *            then only the header is examined and nothing is written
 * outData    output buffer; only written if length>=0;
 *            may be the same as inData (in-place swapping)
 * pErrorCode in/out ICU error code; nothing is done if it already
 *            indicates a failure
 *
 * Returns the size field of the header; returns 0 if an error is detected
 * during argument or header checking.
 * Errors set here:
 *   U_ILLEGAL_ARGUMENT_ERROR   bad arguments
 *   U_INDEX_OUTOFBOUNDS_ERROR  length is shorter than the header or the size field
 *   U_UNSUPPORTED_ERROR        wrong magic value or format version (needs 2.3 or higher minor)
 *   U_INVALID_FORMAT_ERROR     endianness/charset in the header differ from the swapper's input
 * Errors from the individual swap calls are reported through pErrorCode
 * but do not change the return value.
 */
U_CAPI int32_t U_EXPORT2
ucol_swapBinary(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode) {
    const uint8_t *inBytes;
    uint8_t *outBytes;

    const UCATableHeader *inHeader;
    UCATableHeader *outHeader;
    /* local copy of the header fields, converted to platform endianness */
    UCATableHeader header={ 0 };

    uint32_t count;

    /* argument checking in case we were not called from ucol_swap() */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* the out pointers are only cast here; they are dereferenced only if length>=0 */
    inBytes=(const uint8_t *)inData;
    outBytes=(uint8_t *)outData;

    inHeader=(const UCATableHeader *)inData;
    outHeader=(UCATableHeader *)outData;

    /*
     * The collation binary must contain at least the UCATableHeader,
     * starting with its size field.
     * sizeof(UCATableHeader)==42*4 in ICU 2.8
     * check the length against the header size before reading the size field
     */
    if(length<0) {
        /* preflighting: trust that the input is long enough */
        header.size=udata_readInt32(ds, inHeader->size);
    } else if((length<(42*4) || length<(header.size=udata_readInt32(ds, inHeader->size)))) {
        /* the size field is read only if length covers the fixed header size */
        udata_printError(ds, "ucol_swapBinary(): too few bytes (%d after header) for collation data\n",
                         length);
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    /* verify that this is a collation binary with a supported format version */
    header.magic=ds->readUInt32(inHeader->magic);
    if(!(
        header.magic==UCOL_HEADER_MAGIC &&
        inHeader->formatVersion[0]==2 &&
        inHeader->formatVersion[1]>=3
    )) {
        udata_printError(ds, "ucol_swapBinary(): magic 0x%08x or format version %02x.%02x is not a collation binary\n",
                         header.magic,
                         inHeader->formatVersion[0], inHeader->formatVersion[1]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /* the platform properties stored in the binary must match the swapper's input side */
    if(inHeader->isBigEndian!=ds->inIsBigEndian || inHeader->charSetFamily!=ds->inCharset) {
        udata_printError(ds, "ucol_swapBinary(): endianness %d or charset %d does not match the swapper\n",
                         inHeader->isBigEndian, inHeader->charSetFamily);
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return 0;
    }

    if(length>=0) {
        /* copy everything, takes care of data that needs no swapping */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, header.size);
        }

        /* swap the necessary pieces in the order of their occurrence in the data */

        /*
         * read more of the UCATableHeader (the size field was read above)
         * all fields are read before anything is swapped because
         * in-place swapping would make the input header unreadable
         */
        header.options=                 ds->readUInt32(inHeader->options);
        header.UCAConsts=               ds->readUInt32(inHeader->UCAConsts);
        header.contractionUCACombos=    ds->readUInt32(inHeader->contractionUCACombos);
        header.mappingPosition=         ds->readUInt32(inHeader->mappingPosition);
        header.expansion=               ds->readUInt32(inHeader->expansion);
        header.contractionIndex=        ds->readUInt32(inHeader->contractionIndex);
        header.contractionCEs=          ds->readUInt32(inHeader->contractionCEs);
        header.contractionSize=         ds->readUInt32(inHeader->contractionSize);
        header.endExpansionCE=          ds->readUInt32(inHeader->endExpansionCE);
        header.expansionCESize=         ds->readUInt32(inHeader->expansionCESize);
        header.endExpansionCECount=     udata_readInt32(ds, inHeader->endExpansionCECount);
        header.contractionUCACombosSize=udata_readInt32(ds, inHeader->contractionUCACombosSize);

        /* swap the 32-bit integers in the header: everything before the jamoSpecial field */
        ds->swapArray32(ds, inHeader, (int32_t)((const char *)&inHeader->jamoSpecial-(const char *)inHeader),
                           outHeader, pErrorCode);

        /* set the output platform properties */
        outHeader->isBigEndian=ds->outIsBigEndian;
        outHeader->charSetFamily=ds->outCharset;

        /* swap the options: the bytes from the options offset up to the expansion offset */
        if(header.options!=0) {
            ds->swapArray32(ds, inBytes+header.options, header.expansion-header.options,
                               outBytes+header.options, pErrorCode);
        }

        /* swap the expansions */
        if(header.mappingPosition!=0 && header.expansion!=0) {
            if(header.contractionIndex!=0) {
                /* expansions bounded by contractions */
                count=header.contractionIndex-header.expansion;
            } else {
                /* no contractions: expansions bounded by the main trie */
                count=header.mappingPosition-header.expansion;
            }
            ds->swapArray32(ds, inBytes+header.expansion, (int32_t)count,
                               outBytes+header.expansion, pErrorCode);
        }

        /* swap the contractions; contractionSize counts entries in both arrays */
        if(header.contractionSize!=0) {
            /* contractionIndex: UChar[] */
            ds->swapArray16(ds, inBytes+header.contractionIndex, header.contractionSize*2,
                               outBytes+header.contractionIndex, pErrorCode);

            /* contractionCEs: CEs[] */
            ds->swapArray32(ds, inBytes+header.contractionCEs, header.contractionSize*4,
                               outBytes+header.contractionCEs, pErrorCode);
        }

        /* swap the main trie: it extends from mappingPosition up to endExpansionCE */
        if(header.mappingPosition!=0) {
            count=header.endExpansionCE-header.mappingPosition;
            utrie_swap(ds, inBytes+header.mappingPosition, (int32_t)count,
                          outBytes+header.mappingPosition, pErrorCode);
        }

        /* swap the max expansion table; endExpansionCECount counts 32-bit entries */
        if(header.endExpansionCECount!=0) {
            ds->swapArray32(ds, inBytes+header.endExpansionCE, header.endExpansionCECount*4,
                               outBytes+header.endExpansionCE, pErrorCode);
        }

        /* expansionCESize, unsafeCP, contrEndCP: uint8_t[], no need to swap */

        /* swap UCA constants */
        if(header.UCAConsts!=0) {
            /*
             * if UCAConsts!=0 then contractionUCACombos because we are swapping
             * the UCA data file, and we know that the UCA contains contractions
             */
            /* NOTE(review): count is assigned here but the call repeats the expression instead of using it */
            count=header.contractionUCACombos-header.UCAConsts;
            ds->swapArray32(ds, inBytes+header.UCAConsts, header.contractionUCACombos-header.UCAConsts,
                               outBytes+header.UCAConsts, pErrorCode);
        }

        /* swap UCA contractions */
        if(header.contractionUCACombosSize!=0) {
            /*
             * contractionUCACombosWidth is read directly from the input header
             * NOTE(review): presumably a single-byte field that needs no swapping - confirm in ucol_imp.h
             */
            count=header.contractionUCACombosSize*inHeader->contractionUCACombosWidth*U_SIZEOF_UCHAR;
            ds->swapArray16(ds, inBytes+header.contractionUCACombos, (int32_t)count,
                               outBytes+header.contractionUCACombos, pErrorCode);
        }
    }

    /* the size is returned for both preflighting and swapping */
    return header.size;
}

/*
 * Swap ICU collation data like ucadata.icu:
 * a standard ICU data header followed by a header-less collation binary.
 *
 * The data header is swapped with udata_swapDataHeader(), the data format
 * ("UCol") and format version (2.3 or a higher minor version) are verified,
 * and the rest is handed to ucol_swapBinary().
 *
 * ds         swapper with the input/output properties and swap functions
 * inData     the collation data, starting with the ICU data header
 * length     number of bytes at inData, or -1 for preflighting
 * outData    output buffer; can be NULL when preflighting
 * pErrorCode in/out ICU error code
 *
 * Returns the data header size plus the collation binary size,
 * or 0 if an error occurred (U_UNSUPPORTED_ERROR for a wrong data format,
 * or whatever the called functions set).
 */
U_CAPI int32_t U_EXPORT2
ucol_swap(const UDataSwapper *ds,
          const void *inData, int32_t length, void *outData,
          UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize, collationSize;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /*
     * check data format and format version
     * (the UDataInfo is taken from 4 bytes into the data header)
     */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x55 &&   /* dataFormat="UCol" */
        pInfo->dataFormat[1]==0x43 &&
        pInfo->dataFormat[2]==0x6f &&
        pInfo->dataFormat[3]==0x6c &&
        pInfo->formatVersion[0]==2 &&
        pInfo->formatVersion[1]>=3
    )) {
        udata_printError(ds, "ucol_swap(): data format %02x.%02x.%02x.%02x (format version %02x.%02x) is not a collation file\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /*
     * outData can be NULL when preflighting (length<0):
     * pass NULL on rather than doing pointer arithmetic on a NULL pointer,
     * which has undefined behavior
     */
    collationSize=ucol_swapBinary(ds,
                        (const char *)inData+headerSize,
                        length>=0 ? length-headerSize : -1,
                        outData!=NULL ? (char *)outData+headerSize : NULL,
                        pErrorCode);
    if(U_SUCCESS(*pErrorCode)) {
        return headerSize+collationSize;
    } else {
        return 0;
    }
}

/*
 * Swap inverse UCA collation data (invuca.icu):
 * a standard ICU data header followed by an InverseUCATableHeader,
 * the inverse table and the continuation table.
 *
 * ds         swapper with the input/output properties and swap functions
 * inData     the inverse UCA data, starting with the ICU data header
 * length     number of bytes at inData, or -1 for preflighting
 * outData    output buffer; can be NULL when preflighting;
 *            may be the same as inData (in-place swapping)
 * pErrorCode in/out ICU error code
 *
 * Returns the data header size plus the byteSize field of the
 * InverseUCATableHeader; returns 0 if an error is detected while checking
 * the headers (U_UNSUPPORTED_ERROR for a wrong data format or version,
 * U_INDEX_OUTOFBOUNDS_ERROR if length is too short).
 */
U_CAPI int32_t U_EXPORT2
ucol_swapInverseUCA(const UDataSwapper *ds,
                    const void *inData, int32_t length, void *outData,
                    UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    const InverseUCATableHeader *inHeader;
    InverseUCATableHeader *outHeader;
    /* local copy of the header fields, converted to platform endianness */
    InverseUCATableHeader header={ 0 };

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /*
     * check data format and format version
     * (the UDataInfo is taken from 4 bytes into the data header)
     */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x49 &&   /* dataFormat="InvC" */
        pInfo->dataFormat[1]==0x6e &&
        pInfo->dataFormat[2]==0x76 &&
        pInfo->dataFormat[3]==0x43 &&
        pInfo->formatVersion[0]==2 &&
        pInfo->formatVersion[1]>=1
    )) {
        udata_printError(ds, "ucol_swapInverseUCA(): data format %02x.%02x.%02x.%02x (format version %02x.%02x) is not an inverse UCA collation file\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    /*
     * outData can be NULL when preflighting (length<0):
     * avoid pointer arithmetic on a NULL pointer, which has undefined behavior
     */
    outBytes= outData!=NULL ? (uint8_t *)outData+headerSize : NULL;

    inHeader=(const InverseUCATableHeader *)inBytes;
    outHeader=(InverseUCATableHeader *)outBytes;

    /*
     * The inverse UCA collation binary must contain at least the InverseUCATableHeader,
     * starting with its size field.
     * sizeof(InverseUCATableHeader)==8*4 in ICU 2.8
     * check the length against the header size before reading the size field
     */
    if(length<0) {
        /* preflighting: trust that the input is long enough */
        header.byteSize=udata_readInt32(ds, inHeader->byteSize);
    } else if(
        ((length-headerSize)<(8*4) ||
         (uint32_t)(length-headerSize)<(header.byteSize=udata_readInt32(ds, inHeader->byteSize)))
    ) {
        /* report the number of bytes after the data header, as the message says */
        udata_printError(ds, "ucol_swapInverseUCA(): too few bytes (%d after header) for inverse UCA collation data\n",
                         length-headerSize);
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if(length>=0) {
        /* copy everything, takes care of data that needs no swapping */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, header.byteSize);
        }

        /* swap the necessary pieces in the order of their occurrence in the data */

        /*
         * read more of the InverseUCATableHeader (the byteSize field was read above)
         * before swapping, because in-place swapping makes the input unreadable
         */
        header.tableSize=   ds->readUInt32(inHeader->tableSize);
        header.contsSize=   ds->readUInt32(inHeader->contsSize);
        header.table=       ds->readUInt32(inHeader->table);
        header.conts=       ds->readUInt32(inHeader->conts);

        /* swap the 32-bit integers in the header: the first five 32-bit fields */
        ds->swapArray32(ds, inHeader, 5*4, outHeader, pErrorCode);

        /* swap the inverse table; tableSize counts uint32_t[3] rows */
        ds->swapArray32(ds, inBytes+header.table, header.tableSize*3*4,
                           outBytes+header.table, pErrorCode);

        /* swap the continuation table; contsSize counts UChars */
        ds->swapArray16(ds, inBytes+header.conts, header.contsSize*U_SIZEOF_UCHAR,
                           outBytes+header.conts, pErrorCode);
    }

    return headerSize+header.byteSize;
}

#endif /* #if !UCONFIG_NO_COLLATION */

--- NEW FILE: ucol_swp.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucol_swp.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003sep10
*   created by: Markus W. Scherer
*
*   Swap collation binaries.
*/

#ifndef __UCOL_SWP_H__
#define __UCOL_SWP_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "udataswp.h"

/**
 * Swap a header-less collation binary, inside a resource bundle or ucadata.icu.
 * See udataswp.h.
 *
 * @param ds Swapper with the input/output properties and swap functions.
 * @param inData The collation binary, starting with its table header.
 * @param length Number of bytes at inData, or -1 to only determine the size
 *               (preflighting) without writing anything.
 * @param outData Output buffer, used only if length>=0; may equal inData.
 * @param pErrorCode ICU error code; must not be NULL and must indicate
 *                   success on input, otherwise nothing is done.
 * @return The size stored in the collation binary's header,
 *         or 0 if the arguments or the header are not acceptable.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucol_swapBinary(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode);

/**
 * Swap ICU collation data like ucadata.icu. See udataswp.h.
 * The data consists of an ICU data header followed by a collation binary.
 *
 * @param ds Swapper with the input/output properties and swap functions.
 * @param inData The collation data, starting with the ICU data header.
 * @param length Number of bytes at inData, or -1 for preflighting.
 * @param outData Output buffer, used only if length>=0.
 * @param pErrorCode ICU error code; must not be NULL and must indicate
 *                   success on input, otherwise nothing is done.
 * @return The size of the data header plus the size of the collation binary,
 *         or 0 if an error occurred.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucol_swap(const UDataSwapper *ds,
          const void *inData, int32_t length, void *outData,
          UErrorCode *pErrorCode);

/**
 * Swap inverse UCA collation data (invuca.icu). See udataswp.h.
 * The data consists of an ICU data header followed by the inverse UCA table
 * header and its tables.
 *
 * @param ds Swapper with the input/output properties and swap functions.
 * @param inData The inverse UCA data, starting with the ICU data header.
 * @param length Number of bytes at inData, or -1 for preflighting.
 * @param outData Output buffer, used only if length>=0.
 * @param pErrorCode ICU error code; must not be NULL and must indicate
 *                   success on input, otherwise nothing is done.
 * @return The size of the data header plus the byte size stored in the
 *         inverse UCA table header, or 0 if the headers are not acceptable.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucol_swapInverseUCA(const UDataSwapper *ds,
                    const void *inData, int32_t length, void *outData,
                    UErrorCode *pErrorCode);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif

--- NEW FILE: udataswp.c ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  udataswp.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun05
*   created by: Markus W. Scherer
*
*   Definitions for ICU data transformations for different platforms,
*   changing between big- and little-endian data and/or between
*   charset families (ASCII<->EBCDIC).
*/

#include <stdarg.h>
#include "unicode/utypes.h"
#include "unicode/udata.h" /* UDataInfo */
#include "ucmndata.h" /* DataHeader */
#include "cmemory.h"
#include "udataswp.h"

/* swapping primitives ------------------------------------------------------ */

/*
 * Reverse the byte order of each 16-bit unit while copying from inData
 * to outData (which may be the same buffer).
 * length counts bytes; it must not be negative and must be a multiple of 2.
 * Returns length, or 0 with U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
 * Does nothing and returns 0 if pErrorCode is NULL or already a failure.
 */
static int32_t U_CALLCONV
uprv_swapArray16(const UDataSwapper *ds,
                 const void *inData, int32_t length, void *outData,
                 UErrorCode *pErrorCode) {
    const uint16_t *src;
    uint16_t *dest;
    int32_t i, unitCount;
    uint16_t unit;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || outData==NULL || length<0 || (length&1)!=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    src=(const uint16_t *)inData;
    dest=(uint16_t *)outData;
    unitCount=length/2;

    /* each unit is read before it is written, so in-place swapping works */
    for(i=0; i<unitCount; ++i) {
        unit=src[i];
        dest[i]=(uint16_t)((unit<<8)|(unit>>8));
    }

    return length;
}

/*
 * Counterpart of uprv_swapArray16() for equal input and output endianness:
 * same argument checks, but the 16-bit units are copied unchanged.
 * Nothing is copied if length is 0 or if inData and outData are the same.
 * Returns length, or 0 with U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
 */
static int32_t U_CALLCONV
uprv_copyArray16(const UDataSwapper *ds,
                 const void *inData, int32_t length, void *outData,
                 UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || outData==NULL || length<0 || (length&1)!=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* length is known to be non-negative here */
    if(length==0 || inData==outData) {
        return length;
    }

    uprv_memcpy(outData, inData, length);
    return length;
}

/*
 * Reverse the byte order of each 32-bit unit while copying from inData
 * to outData (which may be the same buffer).
 * length counts bytes; it must not be negative and must be a multiple of 4.
 * Returns length, or 0 with U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
 * Does nothing and returns 0 if pErrorCode is NULL or already a failure.
 */
static int32_t U_CALLCONV
uprv_swapArray32(const UDataSwapper *ds,
                 const void *inData, int32_t length, void *outData,
                 UErrorCode *pErrorCode) {
    const uint32_t *src;
    uint32_t *dest;
    int32_t i, unitCount;
    uint32_t unit;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || outData==NULL || length<0 || (length&3)!=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    src=(const uint32_t *)inData;
    dest=(uint32_t *)outData;
    unitCount=length/4;

    /* each unit is read before it is written, so in-place swapping works */
    for(i=0; i<unitCount; ++i) {
        unit=src[i];
        dest[i]=(uint32_t)(
            (unit<<24) |
            ((unit<<8)&0xff0000) |
            ((unit>>8)&0xff00) |
            (unit>>24));
    }

    return length;
}

/*
 * Counterpart of uprv_swapArray32() for equal input and output endianness:
 * same argument checks, but the 32-bit units are copied unchanged.
 * Nothing is copied if length is 0 or if inData and outData are the same.
 * Returns length, or 0 with U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
 */
static int32_t U_CALLCONV
uprv_copyArray32(const UDataSwapper *ds,
                 const void *inData, int32_t length, void *outData,
                 UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || outData==NULL || length<0 || (length&3)!=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* length is known to be non-negative here */
    if(length==0 || inData==outData) {
        return length;
    }

    uprv_memcpy(outData, inData, length);
    return length;
}

/* return x with its two bytes exchanged */
static uint16_t U_CALLCONV
uprv_readSwapUInt16(uint16_t x) {
    uint16_t high=(uint16_t)(x<<8);
    uint16_t low=(uint16_t)(x>>8);

    return (uint16_t)(high|low);
}

/* return x unchanged */
static uint16_t U_CALLCONV
uprv_readDirectUInt16(uint16_t x) {
    return x;
}

/* return x with its four bytes in reverse order */
static uint32_t U_CALLCONV
uprv_readSwapUInt32(uint32_t x) {
    return (uint32_t)(
        (x>>24) |
        ((x>>8)&0xff00) |
        ((x<<8)&0xff0000) |
        (x<<24));
}

/* return x unchanged */
static uint32_t U_CALLCONV
uprv_readDirectUInt32(uint32_t x) {
    return x;
}

/* store x at p with its two bytes exchanged */
static void U_CALLCONV
uprv_writeSwapUInt16(uint16_t *p, uint16_t x) {
    uint16_t high=(uint16_t)(x<<8);
    uint16_t low=(uint16_t)(x>>8);

    *p=(uint16_t)(high|low);
}

/* store x at p unchanged */
static void U_CALLCONV
uprv_writeDirectUInt16(uint16_t *p, uint16_t x) {
    *p=x;
}

/* store x at p with its four bytes in reverse order */
static void U_CALLCONV
uprv_writeSwapUInt32(uint32_t *p, uint32_t x) {
    *p=(uint32_t)(
        (x>>24) |
        ((x>>8)&0xff00) |
        ((x<<8)&0xff0000) |
        (x<<24));
}

/* store x at p unchanged */
static void U_CALLCONV
uprv_writeDirectUInt32(uint32_t *p, uint32_t x) {
    *p=x;
}

/* signed variant of ds->readUInt16(): convert one int16_t from the input data */
U_CAPI int16_t U_EXPORT2
udata_readInt16(const UDataSwapper *ds, int16_t x) {
    uint16_t converted=ds->readUInt16((uint16_t)x);

    return (int16_t)converted;
}

/* signed variant of ds->readUInt32(): convert one int32_t from the input data */
U_CAPI int32_t U_EXPORT2
udata_readInt32(const UDataSwapper *ds, int32_t x) {
    uint32_t converted=ds->readUInt32((uint32_t)x);

    return (int32_t)converted;
}

/**
 * Swap a block of invariant-character, NUL-terminated strings.
 * Only the bytes up to and including the last NUL are passed to the
 * swapper's swapInvChars function; any bytes after the last NUL (padding)
 * are copied unchanged.
 * Returns length (including the padding bytes), or 0 on failure.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
udata_swapInvStringBlock(const UDataSwapper *ds,
                         const void *inData, int32_t length, void *outData,
                         UErrorCode *pErrorCode) {
    const char *bytes;
    int32_t end;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* search backwards for the last NUL; end is the index just after it */
    bytes=(const char *)inData;
    for(end=length; end>0; --end) {
        if(bytes[end-1]==0) {
            break;
        }
    }

    /* convert the strings proper */
    ds->swapInvChars(ds, inData, end, outData, pErrorCode);

    /* carry over the padding bytes, if any, unless swapping in place */
    if(end<length && inData!=outData) {
        uprv_memcpy((char *)outData+end, bytes+end, length-end);
    }

    /* on success, the result is the length including padding bytes */
    return U_SUCCESS(*pErrorCode) ? length : 0;
}

/*
 * Forward a printf-style message to the swapper's printError function,
 * together with its printErrorContext.
 * Does nothing if the swapper has no printError function.
 */
U_CAPI void U_EXPORT2
udata_printError(const UDataSwapper *ds,
                 const char *fmt,
                 ...) {
    va_list args;

    if(ds->printError==NULL) {
        /* no error output requested */
        return;
    }

    va_start(args, fmt);
    ds->printError(ds->printErrorContext, fmt, args);
    va_end(args);
}

/* swap a data header ------------------------------------------------------- */

/*
 * Check and swap the standard ICU data header at the start of inData.
 *
 * With length<0 (preflighting) the header is only checked.
 * With length>0 the header is copied to outData, the output endianness and
 * charset family are stored, the 16-bit header fields are swapped, and the
 * invariant-character string after the UDataInfo is converted.
 *
 * Returns the header size, or 0 on failure:
 * U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 * U_UNSUPPORTED_ERROR if the initial bytes do not look like ICU data,
 * U_INDEX_OUTOFBOUNDS_ERROR if the sizes in the header are inconsistent.
 */
U_CAPI int32_t U_EXPORT2
udata_swapDataHeader(const UDataSwapper *ds,
                     const void *inData, int32_t length, void *outData,
                     UErrorCode *pErrorCode) {
    const DataHeader *inHeader;
    DataHeader *outHeader;
    const char *copyright;
    uint16_t headerSize, infoSize;
    int32_t copyrightCapacity, copyrightLength;

    /* validate the arguments */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* the data must be long enough for a DataHeader and begin with the magic bytes */
    inHeader=(const DataHeader *)inData;
    if( (length>=0 && length<sizeof(DataHeader)) ||
        inHeader->dataHeader.magic1!=0xda ||
        inHeader->dataHeader.magic2!=0x27 ||
        inHeader->info.sizeofUChar!=2
    ) {
        udata_printError(ds, "udata_swapDataHeader(): initial bytes do not look like ICU data\n");
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /* the sizes are stored in input endianness */
    headerSize=ds->readUInt16(inHeader->dataHeader.headerSize);
    infoSize=ds->readUInt16(inHeader->info.size);

    if( headerSize<sizeof(DataHeader) ||
        infoSize<sizeof(UDataInfo) ||
        headerSize<(sizeof(inHeader->dataHeader)+infoSize) ||
        (length>=0 && length<headerSize)
    ) {
        udata_printError(ds, "udata_swapDataHeader(): header size mismatch - headerSize %d infoSize %d length %d\n",
                         headerSize, infoSize, length);
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if(length<=0) {
        /* preflighting: nothing to write */
        return headerSize;
    }

    /* start with a plain copy because most of the fields are single bytes */
    if(inData!=outData) {
        uprv_memcpy(outData, inData, headerSize);
    }
    outHeader=(DataHeader *)outData;

    outHeader->info.isBigEndian = ds->outIsBigEndian;
    outHeader->info.charsetFamily = ds->outCharset;

    /* headerSize: one 16-bit field */
    ds->swapArray16(ds, &inHeader->dataHeader.headerSize, 2, &outHeader->dataHeader.headerSize, pErrorCode);

    /* UDataInfo size and reservedWord: two adjacent 16-bit fields */
    ds->swapArray16(ds, &inHeader->info.size, 4, &outHeader->info.size, pErrorCode);

    /* the copyright statement follows the UDataInfo; infoSize becomes its offset */
    infoSize+=sizeof(inHeader->dataHeader);
    copyright=(const char *)inData+infoSize;
    copyrightCapacity=headerSize-infoSize;

    /* measure the string up to its NUL or the end of the header */
    copyrightLength=0;
    while(copyrightLength<copyrightCapacity && copyright[copyrightLength]!=0) {
        ++copyrightLength;
    }

    /* convert the string contents */
    ds->swapInvChars(ds, copyright, copyrightLength, (char *)outData+infoSize, pErrorCode);

    return headerSize;
}

/* API functions ------------------------------------------------------------ */

/*
 * Allocate a UDataSwapper and fill in its function pointers
 * for the given input and output endianness and charset family.
 * Fields that are not set explicitly, including printError, remain zero.
 *
 * Returns NULL on failure:
 * U_ILLEGAL_ARGUMENT_ERROR if a charset value is greater than U_EBCDIC_FAMILY,
 * U_MEMORY_ALLOCATION_ERROR if the allocation fails.
 */
U_CAPI UDataSwapper * U_EXPORT2
udata_openSwapper(UBool inIsBigEndian, uint8_t inCharset,
                  UBool outIsBigEndian, uint8_t outCharset,
                  UErrorCode *pErrorCode) {
    UDataSwapper *ds;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(inCharset>U_EBCDIC_FAMILY || outCharset>U_EBCDIC_FAMILY) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* get a zeroed swapper object */
    ds=uprv_malloc(sizeof(UDataSwapper));
    if(ds==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uprv_memset(ds, 0, sizeof(UDataSwapper));

    /* remember the properties */
    ds->inIsBigEndian=inIsBigEndian;
    ds->inCharset=inCharset;
    ds->outIsBigEndian=outIsBigEndian;
    ds->outCharset=outCharset;

    /* reading single values: input endianness versus platform endianness */
    if(inIsBigEndian==U_IS_BIG_ENDIAN) {
        ds->readUInt16=uprv_readDirectUInt16;
        ds->readUInt32=uprv_readDirectUInt32;
    } else {
        ds->readUInt16=uprv_readSwapUInt16;
        ds->readUInt32=uprv_readSwapUInt32;
    }

    /* writing single values: output endianness versus platform endianness */
    if(outIsBigEndian==U_IS_BIG_ENDIAN) {
        ds->writeUInt16=uprv_writeDirectUInt16;
        ds->writeUInt32=uprv_writeDirectUInt32;
    } else {
        ds->writeUInt16=uprv_writeSwapUInt16;
        ds->writeUInt32=uprv_writeSwapUInt32;
    }

    /* string comparison works on output charset strings */
    if(outCharset==U_ASCII_FAMILY) {
        ds->compareInvChars=uprv_compareInvAscii;
    } else {
        ds->compareInvChars=uprv_compareInvEbcdic;
    }

    /* arrays: swap only if input and output endianness differ */
    if(inIsBigEndian==outIsBigEndian) {
        ds->swapArray16=uprv_copyArray16;
        ds->swapArray32=uprv_copyArray32;
    } else {
        ds->swapArray16=uprv_swapArray16;
        ds->swapArray32=uprv_swapArray32;
    }

    /* invariant characters: copy or convert between charset families */
    if(inCharset==U_ASCII_FAMILY) {
        if(outCharset==U_ASCII_FAMILY) {
            ds->swapInvChars=uprv_copyAscii;
        } else {
            ds->swapInvChars=uprv_ebcdicFromAscii;
        }
    } else /* U_EBCDIC_FAMILY */ {
        if(outCharset==U_EBCDIC_FAMILY) {
            ds->swapInvChars=uprv_copyEbcdic;
        } else {
            ds->swapInvChars=uprv_asciiFromEbcdic;
        }
    }

    return ds;
}

/*
 * Open a UDataSwapper whose input properties (endianness and charset family)
 * are taken from the ICU data header at the start of the given data.
 *
 * data           ICU data starting with its data header
 * length         number of bytes at data; if negative, the length is not checked
 * outIsBigEndian output endianness
 * outCharset     output charset family, at most U_EBCDIC_FAMILY
 * pErrorCode     in/out ICU error code
 *
 * Returns the new swapper from udata_openSwapper(), or NULL on failure:
 * U_ILLEGAL_ARGUMENT_ERROR for bad arguments, including a length that is
 * too short for a DataHeader;
 * U_UNSUPPORTED_ERROR if the header does not look like ICU data or
 * its sizes are inconsistent.
 */
U_CAPI UDataSwapper * U_EXPORT2
udata_openSwapperForInputData(const void *data, int32_t length,
                              UBool outIsBigEndian, uint8_t outCharset,
                              UErrorCode *pErrorCode) {
    const DataHeader *pHeader;
    uint16_t headerSize, infoSize;
    UBool inIsBigEndian;
    /* same type as the charset parameters of udata_openSwapper() */
    uint8_t inCharset;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if( data==NULL ||
        (length>=0 && length<sizeof(DataHeader)) ||
        outCharset>U_EBCDIC_FAMILY
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* the minimum length was checked above; check the magic bytes and the UChar size */
    pHeader=(const DataHeader *)data;
    if( pHeader->dataHeader.magic1!=0xda ||
        pHeader->dataHeader.magic2!=0x27 ||
        pHeader->info.sizeofUChar!=2
    ) {
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return NULL;
    }

    inIsBigEndian=(UBool)pHeader->info.isBigEndian;
    inCharset=pHeader->info.charsetFamily;

    /* there is no swapper yet: read the sizes according to the input endianness */
    if(inIsBigEndian==U_IS_BIG_ENDIAN) {
        headerSize=pHeader->dataHeader.headerSize;
        infoSize=pHeader->info.size;
    } else {
        headerSize=uprv_readSwapUInt16(pHeader->dataHeader.headerSize);
        infoSize=uprv_readSwapUInt16(pHeader->info.size);
    }

    if( headerSize<sizeof(DataHeader) ||
        infoSize<sizeof(UDataInfo) ||
        headerSize<(sizeof(pHeader->dataHeader)+infoSize) ||
        (length>=0 && length<headerSize)
    ) {
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return NULL;
    }

    /* udata_openSwapper() rejects an inCharset greater than U_EBCDIC_FAMILY */
    return udata_openSwapper(inIsBigEndian, inCharset, outIsBigEndian, outCharset, pErrorCode);
}

/*
 * Close a UDataSwapper: release its memory with uprv_free().
 * udata_openSwapper() allocates swappers with uprv_malloc().
 */
U_CAPI void U_EXPORT2
udata_closeSwapper(UDataSwapper *ds) {
    uprv_free(ds);
}

--- NEW FILE: udataswp.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  udataswp.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun05
*   created by: Markus W. Scherer
*
*   Definitions for ICU data transformations for different platforms,
*   changing between big- and little-endian data and/or between
*   charset families (ASCII<->EBCDIC).
*/

#ifndef __UDATASWP_H__
#define __UDATASWP_H__

#include <stdarg.h>
#include "unicode/utypes.h"

/* forward declaration */

U_CDECL_BEGIN

/* declared early because the function types below take a UDataSwapper pointer */
struct UDataSwapper;
typedef struct UDataSwapper UDataSwapper;

/**
 * Function type for data transformation.
 * Transforms data, or just returns the length of the data if
 * the input length is -1.
 * Swap functions assume that their data pointers are aligned properly.
 *
 * Quick implementation outline:
 * (best to copy and adapt an existing swapper implementation)
 * check that the data looks like the expected format
 * if(length<0) {
 *   preflight:
 *   never dereference outData
 *   read inData and determine the data size
 *   assume that inData is long enough for this
 * } else {
 *   outData can be NULL if length==0
 *   inData==outData (in-place swapping) possible but not required!
 *   verify that length>=(actual size)
 *   if there is a chance that not every byte up to size is reached
 *     due to padding etc.:
 *   if(inData!=outData) {
 *     memcpy(outData, inData, actual size);
 *   }
 *   swap contents
 * }
 * return actual size
 *
 * Further implementation notes:
 * - read integers from inData before swapping them
 *   because in-place swapping can make them unreadable
 * - compareInvChars compares a local Unicode string with already-swapped
 *   output charset strings
 *
 * @param ds Pointer to UDataSwapper containing global data about the
 *           transformation and function pointers for handling primitive
 *           types.
 * @param inData Pointer to the input data to be transformed or examined.
 * @param length Length of the data, counting bytes. May be -1 for preflighting.
 *               If length>=0, then transform the data.
 *               If length==-1, then only determine the length of the data.
 *               The length cannot be determined from the data itself for all
 *               types of data (e.g., not for simple arrays of integers).
 * @param outData Pointer to the output data buffer.
 *                If length>=0 (transformation), then the output buffer must
 *                have a capacity of at least length.
 *                If length==-1, then outData will not be used and can be NULL.
 * @param pErrorCode ICU UErrorCode parameter, must not be NULL and must
 *                   fulfill U_SUCCESS on input.
 * @return The actual length of the data.
 *
 * @see UDataSwapper
 * @draft ICU 2.8
 */
typedef int32_t U_CALLCONV
UDataSwapFn(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode);

/**
 * Convert one uint16_t from input to platform endianness.
 * @param x The value as stored in the input data.
 * @return The value in platform endianness.
 * @draft ICU 2.8
 */
typedef uint16_t U_CALLCONV
UDataReadUInt16(uint16_t x);

/**
 * Convert one uint32_t from input to platform endianness.
 * @param x The value as stored in the input data.
 * @return The value in platform endianness.
 * @draft ICU 2.8
 */
typedef uint32_t U_CALLCONV
UDataReadUInt32(uint32_t x);

/**
 * Convert one uint16_t from platform to output endianness and store it.
 * (udata_openSwapper() selects the implementation according to the
 * output endianness.)
 * @param p Pointer to where the converted value is stored.
 * @param x The value in platform endianness.
 * @draft ICU 2.8
 */
typedef void U_CALLCONV
UDataWriteUInt16(uint16_t *p, uint16_t x);

/**
 * Convert one uint32_t from platform to output endianness and store it.
 * (udata_openSwapper() selects the implementation according to the
 * output endianness.)
 * @param p Pointer to where the converted value is stored.
 * @param x The value in platform endianness.
 * @draft ICU 2.8
 */
typedef void U_CALLCONV
UDataWriteUInt32(uint32_t *p, uint32_t x);

/**
 * Compare invariant-character strings, one in the output data and the
 * other one caller-provided in Unicode.
 * An output data string is compared because strings are usually swapped
 * before the rest of the data, to allow for sorting of string tables
 * according to the output charset.
 * You can use -1 for the length parameters of NUL-terminated strings as usual.
 * Returns Unicode code point order for invariant characters.
 *
 * @param ds The UDataSwapper; udata_openSwapper() sets its compareInvChars
 *           function according to the output charset family.
 * @param outString The string in the output data, in the output charset.
 * @param outLength The length of outString, or -1 if it is NUL-terminated.
 * @param localString The caller-provided Unicode string.
 * @param localLength The length of localString, or -1 if it is NUL-terminated.
 * @draft ICU 2.8
 */
typedef int32_t U_CALLCONV
UDataCompareInvChars(const UDataSwapper *ds,
                     const char *outString, int32_t outLength,
                     const UChar *localString, int32_t localLength);

/**
 * Function for message output when an error occurs during data swapping.
 * A format string and variable number of arguments are passed
 * like for vprintf().
 * udata_printError() calls this function through UDataSwapper.printError,
 * if that is not NULL, and passes UDataSwapper.printErrorContext as the context.
 *
 * @param context A function-specific context pointer.
 * @param fmt The format string.
 * @param args The arguments for format string inserts.
 *
 * @draft ICU 2.8
 */
typedef void U_CALLCONV
UDataPrintError(void *context, const char *fmt, va_list args);

/**
 * Describes one data swapping configuration: the endianness and charset
 * family of the input and of the output data, plus the function pointers
 * that read, write, compare and transform values for that combination.
 * Swap functions (UDataSwapFn) receive a pointer to this structure and use
 * its members instead of testing endianness/charset themselves.
 */
struct UDataSwapper {
    /** Input endianness. @draft ICU 2.8 */
    UBool inIsBigEndian;
    /** Input charset family. @see U_CHARSET_FAMILY @draft ICU 2.8 */
    uint8_t inCharset;
    /** Output endianness. @draft ICU 2.8 */
    UBool outIsBigEndian;
    /** Output charset family. @see U_CHARSET_FAMILY @draft ICU 2.8 */
    uint8_t outCharset;

    /* basic functions for reading data values */

    /** Convert one uint16_t from input to platform endianness. @draft ICU 2.8 */
    UDataReadUInt16 *readUInt16;
    /** Convert one uint32_t from input to platform endianness. @draft ICU 2.8 */
    UDataReadUInt32 *readUInt32;
    /** Compare an invariant-character output string with a local one. @draft ICU 2.8 */
    UDataCompareInvChars *compareInvChars;

    /* basic functions for writing data values */

    /** Convert one uint16_t from platform to input endianness. @draft ICU 2.8 */
    UDataWriteUInt16 *writeUInt16;
    /** Convert one uint32_t from platform to input endianness. @draft ICU 2.8 */
    UDataWriteUInt32 *writeUInt32;

    /* basic functions for data transformations */

    /** Transform an array of 16-bit integers. @draft ICU 2.8 */
    UDataSwapFn *swapArray16;
    /** Transform an array of 32-bit integers. @draft ICU 2.8 */
    UDataSwapFn *swapArray32;
    /** Transform an invariant-character string. @draft ICU 2.8 */
    UDataSwapFn *swapInvChars;

    /**
     * Function for message output when an error occurs during data swapping.
     * Can be NULL.
     * @draft ICU 2.8
     */
    UDataPrintError *printError;
    /** Context pointer for printError. @draft ICU 2.8 */
    void *printErrorContext;
};

U_CDECL_END

/**
 * Open a UDataSwapper for the specified input and output characteristics.
 *
 * @param inIsBigEndian  Endianness of the input data.
 * @param inCharset      Charset family of the input data. @see U_CHARSET_FAMILY
 * @param outIsBigEndian Endianness of the output data.
 * @param outCharset     Charset family of the output data. @see U_CHARSET_FAMILY
 * @param pErrorCode     ICU UErrorCode parameter.
 * @return The new UDataSwapper (presumably NULL on failure, and to be
 *         released with udata_closeSwapper() - confirm against the
 *         implementation).
 *
 * @see udata_closeSwapper
 */
U_CAPI UDataSwapper * U_EXPORT2
udata_openSwapper(UBool inIsBigEndian, uint8_t inCharset,
                  UBool outIsBigEndian, uint8_t outCharset,
                  UErrorCode *pErrorCode);

/**
 * Open a UDataSwapper for the given input data and the specified output
 * characteristics.
 * Values of -1 for any of the characteristics mean the local platform's
 * characteristics.
 * NOTE(review): outIsBigEndian is a UBool and outCharset a uint8_t in this
 * declaration; confirm how a value of -1 is expected to be passed.
 *
 * @param data           Pointer to the input data whose characteristics
 *                       determine the input side of the swapper.
 * @param length         Length of the input data.
 * @param outIsBigEndian Endianness of the output data.
 * @param outCharset     Charset family of the output data.
 * @param pErrorCode     ICU UErrorCode parameter.
 * @return The new UDataSwapper.
 *
 * @see udata_swap
 * @draft ICU 2.8
 */
U_CAPI UDataSwapper * U_EXPORT2
udata_openSwapperForInputData(const void *data, int32_t length,
                              UBool outIsBigEndian, uint8_t outCharset,
                              UErrorCode *pErrorCode);

/**
 * Close a UDataSwapper.
 *
 * @param ds The swapper to close (presumably one returned by
 *           udata_openSwapper() or udata_openSwapperForInputData() - confirm).
 *
 * @see udata_openSwapper
 */
U_CAPI void U_EXPORT2
udata_closeSwapper(UDataSwapper *ds);

/**
 * Read the beginning of an ICU data piece, recognize magic bytes,
 * swap the structure.
 * Set a U_UNSUPPORTED_ERROR if it does not look like an ICU data piece.
 * The parameter list is the same as for UDataSwapFn.
 *
 * @param ds         The swapper describing input and output characteristics.
 * @param inData     Pointer to the start of the input data piece.
 * @param length     Length of the input data.
 * @param outData    Pointer to the output buffer.
 * @param pErrorCode ICU UErrorCode parameter.
 * @return The size of the data header, in bytes.
 *
 * @see UDataSwapFn
 * @draft ICU 2.8
 */
U_CAPI int32_t U_EXPORT2
udata_swapDataHeader(const UDataSwapper *ds,
                     const void *inData, int32_t length, void *outData,
                     UErrorCode *pErrorCode);

/**
 * Convert one int16_t from input to platform endianness.
 * Signed counterpart of the UDataReadUInt16 function type.
 *
 * @param ds The swapper describing the input endianness.
 * @param x  The 16-bit value in input endianness.
 * @return The same value in platform endianness.
 *
 * @draft ICU 2.8
 */
U_CAPI int16_t U_EXPORT2
udata_readInt16(const UDataSwapper *ds, int16_t x);

/**
 * Convert one int32_t from input to platform endianness.
 * Signed counterpart of the UDataReadUInt32 function type.
 *
 * @param ds The swapper describing the input endianness.
 * @param x  The 32-bit value in input endianness.
 * @return The same value in platform endianness.
 *
 * @draft ICU 2.8
 */
U_CAPI int32_t U_EXPORT2
udata_readInt32(const UDataSwapper *ds, int32_t x);

/**
 * Swap a block of invariant, NUL-terminated strings, but not padding
 * bytes after the last string.
 * The parameter list and return type are the same as for UDataSwapFn.
 *
 * @param ds         The swapper describing input and output charsets.
 * @param inData     Pointer to the block of strings in the input data.
 * @param length     Length of the block.
 * @param outData    Pointer to the output buffer.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @see UDataSwapFn
 * @internal
 */
U_CAPI int32_t U_EXPORT2
udata_swapInvStringBlock(const UDataSwapper *ds,
                         const void *inData, int32_t length, void *outData,
                         UErrorCode *pErrorCode);

/**
 * Output an error message for a data swapping problem.
 * Takes a printf()-style format string followed by its arguments.
 * Presumably forwards to ds->printError with ds->printErrorContext when
 * that callback is not NULL - confirm against the implementation.
 *
 * @param ds  The swapper whose error output function is used.
 * @param fmt The format string.
 * @param ... The arguments for format string inserts.
 *
 * @see UDataPrintError
 */
U_CAPI void U_EXPORT2
udata_printError(const UDataSwapper *ds,
                 const char *fmt,
                 ...);

/* internal exports from putil.c -------------------------------------------- */

/* declared here to keep them out of the public putil.h */

/**
 * Swap invariant char * strings ASCII->EBCDIC.
 * Has the same parameter list and return type as UDataSwapFn.
 *
 * @param ds         The swapper in use.
 * @param inData     The ASCII input characters.
 * @param length     Length of the input.
 * @param outData    Receives the EBCDIC output characters.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @see UDataSwapFn
 * @internal
 */
U_CFUNC int32_t
uprv_ebcdicFromAscii(const UDataSwapper *ds,
                     const void *inData, int32_t length, void *outData,
                     UErrorCode *pErrorCode);

/**
 * Copy invariant ASCII char * strings and verify they are invariant.
 * Has the same parameter list and return type as UDataSwapFn.
 *
 * @param ds         The swapper in use.
 * @param inData     The ASCII input characters.
 * @param length     Length of the input.
 * @param outData    Receives the copied characters.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @see UDataSwapFn
 * @internal
 */
U_CFUNC int32_t
uprv_copyAscii(const UDataSwapper *ds,
               const void *inData, int32_t length, void *outData,
               UErrorCode *pErrorCode);

/**
 * Swap invariant char * strings EBCDIC->ASCII.
 * Has the same parameter list and return type as UDataSwapFn.
 *
 * @param ds         The swapper in use.
 * @param inData     The EBCDIC input characters.
 * @param length     Length of the input.
 * @param outData    Receives the ASCII output characters.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @see UDataSwapFn
 * @internal
 */
U_CFUNC int32_t
uprv_asciiFromEbcdic(const UDataSwapper *ds,
                     const void *inData, int32_t length, void *outData,
                     UErrorCode *pErrorCode);

/**
 * Copy invariant EBCDIC char * strings and verify they are invariant.
 * Has the same parameter list and return type as UDataSwapFn.
 *
 * @param ds         The swapper in use.
 * @param inData     The EBCDIC input characters.
 * @param length     Length of the input.
 * @param outData    Receives the copied characters.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @see UDataSwapFn
 * @internal
 */
U_CFUNC int32_t
uprv_copyEbcdic(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode);

/**
 * Compare ASCII invariant char * with Unicode invariant UChar *
 * Has the same parameter list and return type as UDataCompareInvChars.
 *
 * @param ds           The swapper in use.
 * @param outString    The ASCII string from the output data.
 * @param outLength    Length of outString.
 * @param localString  The Unicode string to compare with.
 * @param localLength  Length of localString.
 *
 * @see UDataCompareInvChars
 * @internal
 */
U_CFUNC int32_t
uprv_compareInvAscii(const UDataSwapper *ds,
                     const char *outString, int32_t outLength,
                     const UChar *localString, int32_t localLength);

/**
 * Compare EBCDIC invariant char * with Unicode invariant UChar *
 * Has the same parameter list and return type as UDataCompareInvChars.
 *
 * @param ds           The swapper in use.
 * @param outString    The EBCDIC string from the output data.
 * @param outLength    Length of outString.
 * @param localString  The Unicode string to compare with.
 * @param localLength  Length of localString.
 *
 * @see UDataCompareInvChars
 * @internal
 */
U_CFUNC int32_t
uprv_compareInvEbcdic(const UDataSwapper *ds,
                      const char *outString, int32_t outLength,
                      const UChar *localString, int32_t localLength);

/* material... -------------------------------------------------------------- */

#if 0

/* udata.h */

/**
 * Public API function in udata.c
 * (This declaration sits inside an "#if 0" block and is therefore not
 * compiled; it is kept as design material only.)
 *
 * Same as udata_openChoice() but automatically swaps the data.
 * isAcceptable, if not NULL, may accept data with endianness and charset family
 * different from the current platform's properties.
 * If the data is acceptable and the platform properties do not match, then
 * the swap function is called to swap an allocated version of the data.
 * Preflighting may or may not be performed depending on whether the size of
 * the loaded data item is known.
 *
 * @param path Same as for udata_openChoice() (presumably - confirm).
 * @param type Same as for udata_openChoice() (presumably - confirm).
 * @param name Same as for udata_openChoice() (presumably - confirm).
 * @param isAcceptable Same as for udata_openChoice(). May be NULL.
 * @param isAcceptableContext Context pointer for isAcceptable.
 * @param swap The swap function that is called when the platform
 *             properties do not match.
 * @param printError Error output function, see UDataPrintError.
 * @param printErrorContext Context pointer for printError.
 * @param pErrorCode ICU UErrorCode parameter.
 *
 * @draft ICU 2.8
 */
U_CAPI UDataMemory * U_EXPORT2
udata_openSwap(const char *path, const char *type, const char *name,
               UDataMemoryIsAcceptable *isAcceptable, void *isAcceptableContext,
               UDataSwapFn *swap,
               UDataPrintError *printError, void *printErrorContext,
               UErrorCode *pErrorCode);

#endif

#endif

--- NEW FILE: usprep.cpp ---
/*
 *******************************************************************************
 *
 *   Copyright (C) 2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  usprep.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003jul2
 *   created by: Ram Viswanadha
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/usprep.h"

#include "unicode/unorm.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uversion.h"
#include "umutex.h"
#include "cmemory.h"
#include "sprpimpl.h"
#include "ustr_imp.h"
#include "uhash.h"
#include "cstring.h"
#include "udataswp.h"

U_CDECL_BEGIN

/*
 * Static cache for already opened StringPrep profiles.
 * Created lazily by initCache(); usprep_getProfile() stores
 * UStringPrepKey* keys and UStringPrepProfile* values in it.
 */
static UHashtable *SHARED_DATA_HASHTABLE = NULL;

/*
 * Mutex used in this file around updates of SHARED_DATA_HASHTABLE and of
 * the profiles' reference counts; initialized by usprep_init().
 */
static UMTX usprepMutex = NULL;


/*
 * Acceptance callback passed to udata_openChoice() by loadData().
 * Accepts only data whose header is at least 20 bytes, matches this
 * platform's endianness and charset family, carries the data format
 * "SPRP", has format version 3 and was built with the same UTrie shift
 * values as this library.
 */
static UBool U_CALLCONV
isAcceptable(void * /* context */,
             const char * /* type */, 
             const char * /* name */,
             const UDataInfo *pInfo) {
    /* header size and platform properties */
    if(pInfo->size<20 ||
       pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }

    /* dataFormat="SPRP" */
    if(pInfo->dataFormat[0]!=0x53 ||
       pInfo->dataFormat[1]!=0x50 ||
       pInfo->dataFormat[2]!=0x52 ||
       pInfo->dataFormat[3]!=0x50) {
        return FALSE;
    }

    /* format version and the UTrie parameters recorded with it */
    if(pInfo->formatVersion[0]!=3 ||
       pInfo->formatVersion[2]!=UTRIE_SHIFT ||
       pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT) {
        return FALSE;
    }

    return TRUE;
}

/*
 * Folding-offset callback installed into the profile's UTrie by loadData().
 * The trie data word is used directly as the offset.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    const int32_t offset = (int32_t)data;
    return offset;
}

/*
 * Hash function for the profile cache: combines the character hashes of
 * the key's name and path as hash(name) + 37*hash(path).
 */
static int32_t U_EXPORT2 U_CALLCONV 
hashEntry(const UHashTok parm) {
    UStringPrepKey *entry = (UStringPrepKey *)parm.pointer;
    UHashTok tok;
    int32_t nameHash, pathHash;

    tok.pointer = entry->name;
    nameHash = uhash_hashChars(tok);

    tok.pointer = entry->path;
    pathHash = uhash_hashChars(tok);

    return nameHash+37*pathHash;
}

/*
 * Key comparator for the profile cache: two keys are equal when both
 * their names and their paths compare equal with uhash_compareChars().
 */
static UBool U_EXPORT2 U_CALLCONV 
compareEntries(const UHashTok p1, const UHashTok p2) {
    UStringPrepKey *lhs = (UStringPrepKey *)p1.pointer;
    UStringPrepKey *rhs = (UStringPrepKey *)p2.pointer;
    UHashTok left, right;
    UBool sameName, samePath;

    left.pointer  = lhs->name;
    right.pointer = rhs->name;
    sameName = uhash_compareChars(left, right);

    left.pointer  = lhs->path;
    right.pointer = rhs->path;
    samePath = uhash_compareChars(left, right);

    return ((UBool)(sameName & samePath));
}

U_CDECL_END

/*
 * Initializes the mutex used by this file (usprepMutex) via umtx_init().
 * usprep_open() calls this before it touches the profile cache.
 */
U_CFUNC void 
usprep_init() {
    umtx_init(&usprepMutex);
}

/**
 * Creates the shared profile cache if it does not exist yet.
 * The hashtable is opened outside the mutex; it is installed only if
 * SHARED_DATA_HASHTABLE is still NULL when the mutex is re-acquired,
 * otherwise the freshly opened table is closed again.
 * On a uhash_open() failure *status carries the error and nothing is installed.
 */
static void 
initCache(UErrorCode *status) {
    UHashtable *candidate;
    UBool needsCache;

    umtx_lock(&usprepMutex);
    needsCache = (SHARED_DATA_HASHTABLE ==  NULL);
    umtx_unlock(&usprepMutex);

    if(!needsCache) {
        return;
    }

    candidate = uhash_open(hashEntry, compareEntries, status);
    if (U_FAILURE(*status)) {
        return;
    }

    umtx_lock(&usprepMutex);
    if(SHARED_DATA_HASHTABLE == NULL) {
        /* the cache takes over the new table */
        SHARED_DATA_HASHTABLE = candidate;
        candidate = NULL;
    }
    umtx_unlock(&usprepMutex);

    /* still ours: the cache was created in the meantime, discard the spare table */
    if(candidate != NULL) {
        uhash_close(candidate);
    }
}

/*
 * Loads the StringPrep data item (path, name, type) into a profile.
 *
 * Opens the data with udata_openChoice()/isAcceptable(), unserializes the
 * UTrie that follows the _SPREP_INDEX_TOP int32_t indexes, and stores the
 * data memory, the indexes, the trie and the mapping table pointer in
 * *profile. If normalization is turned on in the data but the
 * normalization-corrections version recorded in the data is older than the
 * Unicode version of this library, the data is rejected with
 * U_INVALID_FORMAT_ERROR.
 *
 * On every failure path no data memory stays open: memory that this call
 * installed into profile->sprepData is closed again and the pointers into
 * it are reset.
 *
 * @param profile   The profile to fill in; profile->sprepData is expected
 *                  to be NULL for a profile that has no data yet.
 * @param path      Data path passed to udata_openChoice().
 * @param name      Data item name passed to udata_openChoice().
 * @param type      Data item type passed to udata_openChoice().
 * @param errorCode ICU error code; must not be NULL and must not indicate
 *                  a failure on input, otherwise FALSE is returned.
 * @return TRUE if the data was loaded, FALSE otherwise.
 */
static UBool U_CALLCONV
loadData(UStringPrepProfile* profile, 
         const char* path, 
         const char* name, 
         const char* type, 
         UErrorCode* errorCode) {
    /* load Unicode SPREP data from file */    
    UTrie _sprepTrie={ 0,0,0,0,0,0,0 };
    UDataMemory *dataMemory;
    const int32_t *p=NULL;
    const uint8_t *pb;
    UVersionInfo unicodeVersion;
    int32_t normVer, uniVer;
    UBool installedHere=FALSE;

    if(errorCode==NULL || U_FAILURE(*errorCode)) {
        return FALSE;
    }

    /* open the data outside the mutex block */
    //TODO: change the path
    dataMemory=udata_openChoice(path, type, name, isAcceptable, NULL, errorCode);
    if(U_FAILURE(*errorCode)) {
        return FALSE;
    }

    /* the data starts with the indexes, immediately followed by the serialized trie */
    p=(const int32_t *)udata_getMemory(dataMemory);
    pb=(const uint8_t *)(p+_SPREP_INDEX_TOP);
    utrie_unserialize(&_sprepTrie, pb, p[_SPREP_INDEX_TRIE_SIZE], errorCode);
    _sprepTrie.getFoldingOffset=getFoldingOffset;


    if(U_FAILURE(*errorCode)) {
        udata_close(dataMemory);
        return FALSE;
    }

    /* in the mutex block, set the data for this process */
    umtx_lock(&usprepMutex);
    if(profile->sprepData==NULL) {
        profile->sprepData=dataMemory;
        dataMemory=NULL;
        installedHere=TRUE;
        uprv_memcpy(&profile->indexes, p, sizeof(profile->indexes));
        uprv_memcpy(&profile->sprepTrie, &_sprepTrie, sizeof(UTrie));
    } else {
        p=(const int32_t *)udata_getMemory(profile->sprepData);
    }
    umtx_unlock(&usprepMutex);
    /* initialize some variables: the mapping table follows the trie */
    profile->mappingData=(uint16_t *)((uint8_t *)(p+_SPREP_INDEX_TOP)+profile->indexes[_SPREP_INDEX_TRIE_SIZE]);
    
    /* 
     * check the normalization corrections version and the current Unicode version 
     * supported by ICU 
     */
    u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
    normVer = profile->indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION];
    uniVer  = (unicodeVersion[0] << 24) + (unicodeVersion[1] << 16) + 
              (unicodeVersion[2] << 8 ) + (unicodeVersion[3]);

    if( normVer < uniVer &&
        ((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0) /* normalization turned on*/
      ){
        *errorCode = U_INVALID_FORMAT_ERROR;
        if(installedHere) {
            /*
             * The data memory was moved into the profile above, so the local
             * dataMemory is NULL; take the memory back out of the profile
             * and close it, otherwise it would stay open forever.
             */
            umtx_lock(&usprepMutex);
            dataMemory=profile->sprepData;
            profile->sprepData=NULL;
            profile->mappingData=NULL;
            umtx_unlock(&usprepMutex);
        }
        if(dataMemory!=NULL) {
            udata_close(dataMemory);
        }
        return FALSE;
    }
    profile->isDataLoaded = TRUE;

    /* if a different thread set it first, then close the extra data */
    if(dataMemory!=NULL) {
        udata_close(dataMemory); /* NULL if it was set correctly */
    }


    return profile->isDataLoaded;
}

/*
 * Returns the profile for (path, name) with its reference count
 * incremented, loading the data and adding the profile to the shared cache
 * if it is not cached yet.
 *
 * @param path   Data path of the profile; may be NULL.
 * @param name   Name of the profile; must not be NULL
 *               (U_ILLEGAL_ARGUMENT_ERROR otherwise).
 * @param status ICU error code; must not be NULL.
 * @return The profile, or NULL if *status indicates a failure. On failure
 *         everything allocated by this call is released again.
 */
static UStringPrepProfile* 
usprep_getProfile(const char* path, 
                  const char* name,
                  UErrorCode *status){

    UStringPrepProfile* profile = NULL;

    initCache(status);

    if(U_FAILURE(*status)){
        return NULL;
    }

    /* the name is measured and copied below, so it cannot be NULL */
    if(name == NULL){
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    UStringPrepKey stackKey;
    /* 
     * const is cast away to save malloc, strcpy and free calls 
     * we use the passed in pointers for fetching the data from the 
     * hash table which is safe
     */
    stackKey.name = (char*) name;
    stackKey.path = (char*) path;

    /* fetch the data from the cache */
    profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
    
    if(profile == NULL){
        UStringPrepKey* key   = (UStringPrepKey*) uprv_malloc(sizeof(UStringPrepKey));
        if(key == NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        /* else load the data and put the data in the cache */
        profile = (UStringPrepProfile*) uprv_malloc(sizeof(UStringPrepProfile));
        if(profile == NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            uprv_free(key);
            return NULL;
        }

        /* initialize the data struct members */
        uprv_memset(profile->indexes,0,sizeof(profile->indexes));
        profile->mappingData = NULL;
        profile->sprepData   = NULL;
        profile->refCount    = 0;
    
        /* initialize the key members; the cache owns private copies of both strings */
        key->name  = (char*) uprv_malloc(uprv_strlen(name)+1);
        if(key->name == NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            uprv_free(key);
            uprv_free(profile);
            return NULL;
        }

        uprv_strcpy(key->name, name);
        
        key->path=NULL;

        if(path != NULL){
            key->path      = (char*) uprv_malloc(uprv_strlen(path)+1);
            if(key->path == NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                /* key->path is NULL here; it is the name copy that must be released */
                uprv_free(key->name);
                uprv_free(key);
                uprv_free(profile);
                return NULL;
            }
            uprv_strcpy(key->path, path);
        }        

        /* load the data */
        if(!loadData(profile, path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
            /* nothing was put into the cache yet: release everything built above */
            if(profile->sprepData != NULL){
                udata_close(profile->sprepData);
                profile->sprepData = NULL;
            }
            if(key->path != NULL){
                uprv_free(key->path);
            }
            uprv_free(key->name);
            uprv_free(key);
            uprv_free(profile);
            return NULL;
        }
        
        /* get the options */
        profile->doNFKC            = (UBool)((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
        profile->checkBiDi         = (UBool)((profile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
        
        umtx_lock(&usprepMutex);
        /* add the data object to the cache */
        uhash_put(SHARED_DATA_HASHTABLE, key, profile, status);
        umtx_unlock(&usprepMutex);
    }
    umtx_lock(&usprepMutex);
    /* increment the refcount */
    profile->refCount++;
    umtx_unlock(&usprepMutex);

    return profile;
}

/*
 * Opens (or fetches from the cache) the StringPrep profile identified by
 * path and name. Returns NULL without doing anything if status is NULL or
 * already indicates a failure; otherwise initializes the mutex and returns
 * whatever usprep_getProfile() returns.
 */
U_CAPI UStringPrepProfile* U_EXPORT2
usprep_open(const char* path, 
            const char* name,
            UErrorCode* status){

    UStringPrepProfile* opened = NULL;

    if(status != NULL && !U_FAILURE(*status)){
        /* initialize the mutex */
        usprep_init();

        /* look up or create the profile */
        opened = usprep_getProfile(path,name,status);
    }
    return opened;
}

/*
 * Releases one reference to a profile: under the mutex, the reference
 * count is decremented unless it is already 0. The profile itself is not
 * freed here. A NULL profile is ignored.
 */
U_CAPI void U_EXPORT2
usprep_close(UStringPrepProfile* profile){
    if(profile!=NULL){
        umtx_lock(&usprepMutex);
        if(profile->refCount > 0){
            /* drop this caller's reference */
            --profile->refCount;
        }
        umtx_unlock(&usprepMutex);
    }
}

/*
 * Closes the data memory held by a profile (data->sprepData).
 * The profile structure itself is not freed here;
 * usprep_internal_flushCache() frees it after calling this function.
 */
static void 
usprep_unload(UStringPrepProfile* data){
    udata_close(data->sprepData);
}


/*
 * Removes profiles from the shared cache and frees them together with
 * their keys.
 *
 * @param noRefCount TRUE: remove every cached profile regardless of its
 *                   reference count; FALSE: remove only profiles whose
 *                   reference count is 0.
 * @return The number of profiles removed; 0 if the cache was never created.
 */
static int32_t 
usprep_internal_flushCache(UBool noRefCount){
    const UHashElement *elem;
    int32_t iterPos = -1;
    int32_t removed = 0;

    umtx_lock(&usprepMutex);

    /* nothing to do if the cache has not even been lazily created yet */
    if (SHARED_DATA_HASHTABLE != NULL) {
        /* walk over every element in the table */
        for (elem = uhash_nextElement(SHARED_DATA_HASHTABLE, &iterPos);
             elem != NULL;
             elem = uhash_nextElement(SHARED_DATA_HASHTABLE, &iterPos))
        {
            UStringPrepProfile *cached = (UStringPrepProfile *) elem->value.pointer;
            UStringPrepKey *cachedKey  = (UStringPrepKey *) elem->key.pointer;

            if (!(noRefCount== TRUE ||
                  (noRefCount== FALSE && cached->refCount == 0))) {
                /* still referenced and not a forced flush: keep it */
                continue;
            }

            removed++;
            uhash_removeElement(SHARED_DATA_HASHTABLE, elem);

            /* unload the data */
            usprep_unload(cached);

            /* release the key's string copies, then the structures themselves */
            if(cachedKey->name != NULL) {
                uprv_free(cachedKey->name);
                cachedKey->name=NULL;
            }
            if(cachedKey->path != NULL) {
                uprv_free(cachedKey->path);
                cachedKey->path=NULL;
            }
            uprv_free(cached);
            uprv_free(cachedKey);
        }
    }

    umtx_unlock(&usprepMutex);

    return removed;
}

/* Works just like ucnv_flushCache() 
static int32_t 
usprep_flushCache(){
    return usprep_internal_flushCache(FALSE);
}
*/

/*
 * Library cleanup for this file: flushes every profile from the cache
 * regardless of reference counts, closes the cache hashtable if it is
 * empty afterwards, and destroys the mutex.
 *
 * @return TRUE if the cache hashtable no longer exists.
 */
U_CFUNC UBool 
usprep_cleanup(void){
    UBool cacheGone;

    if (SHARED_DATA_HASHTABLE != NULL) {
        usprep_internal_flushCache(TRUE);
        if (SHARED_DATA_HASHTABLE != NULL && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
            uhash_close(SHARED_DATA_HASHTABLE);
            SHARED_DATA_HASHTABLE = NULL;
        }
    }

    /*
     * Don't worry about destroying the mutex even if the hash table still
     * exists.  The mutex will lazily re-init itself if needed.
     */
    umtx_destroy(&usprepMutex);

    cacheGone = (SHARED_DATA_HASHTABLE == NULL);
    return cacheGone;
}

/*
 * Fills in a UParseError for an error at index pos of rules.
 *
 * offset is set to pos (moved back by one if pos is at the very end of a
 * non-empty string), line is always 0, preContext receives up to
 * U_PARSE_CONTEXT_LEN-1 code units that precede pos, and postContext
 * receives the code units that follow the code point at pos, limited to
 * the window of U_PARSE_CONTEXT_LEN code units that starts at pos.
 * Both context strings are always NUL-terminated.
 *
 * @param rules      The text in which the error occurred.
 * @param pos        Index of the error in rules.
 * @param rulesLen   Length of rules.
 * @param parseError The structure to fill in; nothing is done if NULL.
 */
U_CFUNC void 
uprv_syntaxError(const UChar* rules, 
                 int32_t pos,
                 int32_t rulesLen,
                 UParseError* parseError){
    if(parseError == NULL){
        return;
    }
    if(pos == rulesLen && rulesLen >0){
        pos--;
    }
    parseError->offset = pos;
    parseError->line = 0 ; // we are not using line numbers 
    
    // for pre-context
    // At most U_PARSE_CONTEXT_LEN-1 code units are copied so that the
    // terminating NUL still fits into the context buffer; with "<=" here a
    // pos of exactly U_PARSE_CONTEXT_LEN wrote one element past the buffer.
    int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    int32_t stop  = pos;
    
    u_memcpy(parseError->preContext,rules+start,stop-start);
    //null terminate the buffer
    parseError->preContext[stop-start] = 0;
    
    //for post-context
    start = pos;
    if(start<rulesLen) {
        // skip the code point at pos itself
        U16_FWD_1(rules, start, rulesLen);
    }

    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN)) : 
                                                            rulesLen;
    if(start < stop){
        u_memcpy(parseError->postContext,rules+start,stop-start);
        //null terminate the buffer
        parseError->postContext[stop-start]= 0;
    }else{
        // nothing follows the error position: return an empty string
        // rather than whatever the caller's structure contained before
        parseError->postContext[0] = 0;
    }
    
}


/*
 * Decodes one 16-bit trie result word.
 *
 * @param trieWord The value read from the profile's trie.
 * @param value    Set only for USPREP_MAP and USPREP_DELETE results: the
 *                 mapping index or the delta (0 for USPREP_DELETE).
 * @param isIndex  Set only for USPREP_MAP and USPREP_DELETE results: TRUE
 *                 if value is an index, FALSE if it is a delta.
 * @return USPREP_TYPE_LIMIT for the trie's initial value 0, the type
 *         encoded as (trieWord - _SPREP_TYPE_THRESHOLD) for words at or
 *         above the threshold, otherwise USPREP_DELETE or USPREP_MAP.
 */
static inline UStringPrepType
getValues(uint16_t trieWord, int16_t& value, UBool& isIndex){

    if(trieWord == 0){
        /* 
         * Initial value stored in the mapping table 
         * just return USPREP_TYPE_LIMIT .. so that
         * the source codepoint is copied to the destination
         */
        return USPREP_TYPE_LIMIT;
    }

    if(trieWord >= _SPREP_TYPE_THRESHOLD){
        /* the word directly encodes a type */
        return (UStringPrepType) (trieWord - _SPREP_TYPE_THRESHOLD);
    }

    /* everything below the threshold is a mapping of some kind */
    if((trieWord>>2) == _SPREP_MAX_INDEX_VALUE){
        /* the maximum index value marks a code point that is deleted */
        isIndex = FALSE;
        value = 0;
        return USPREP_DELETE;
    }

    /* ascertain if the value is index or delta */
    if(trieWord & 0x02){
        isIndex = TRUE;
        value = trieWord  >> 2; //mask off the lower 2 bits and shift
    }else{
        isIndex = FALSE;
        value = (int16_t)trieWord;
        value =  (value >> 2);
    }
    return USPREP_MAP;
}



/*
 * Map step of StringPrep: copies src to dest, replacing every code point
 * that has a mapping in the profile and dropping code points that are
 * marked for deletion.
 *
 * destIndex keeps counting even when dest is full, so the return value is
 * the length needed for the complete result (preflighting);
 * usprep_prepare() relies on this to allocate a larger buffer after a
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * @param profile      The profile that supplies trie, indexes and mapping data.
 * @param src          The input text.
 * @param srcLength    Length of src; the caller has already resolved -1.
 * @param dest         The output buffer.
 * @param destCapacity Capacity of dest in UChars.
 * @param options      USPREP_ALLOW_UNASSIGNED permits unassigned code points.
 * @param parseError   Receives the error position for an unassigned code point.
 * @param status       ICU error code; set to U_STRINGPREP_UNASSIGNED_ERROR
 *                     when an unassigned code point is found and not allowed.
 * @return The result of u_terminateUChars() for the mapped length,
 *         or 0 after an unassigned-code-point error.
 */
static int32_t 
usprep_map(  const UStringPrepProfile* profile, 
             const UChar* src, int32_t srcLength, 
             UChar* dest, int32_t destCapacity,
             int32_t options,
             UParseError* parseError,
             UErrorCode* status ){
    
    uint16_t result;
    int32_t destIndex=0;
    int32_t srcIndex;
    UBool allowUnassigned = (UBool) ((options & USPREP_ALLOW_UNASSIGNED)>0);
    UStringPrepType type;
    int16_t value;
    UBool isIndex;
    const int32_t* indexes = profile->indexes;

    // no error checking the caller check for error and arguments
    // no string length check the caller finds out the string length

    for(srcIndex=0;srcIndex<srcLength;){
        UChar32 ch;

        // read the next code point and advance srcIndex past it
        U16_NEXT(src,srcIndex,srcLength,ch);
        
        result=0;

        // look up the code point's data word in the profile's trie
        UTRIE_GET16(&profile->sprepTrie,ch,result);
        
        type = getValues(result, value, isIndex);

        // check if the source codepoint is unassigned
        if(type == USPREP_UNASSIGNED && allowUnassigned == FALSE){

            // srcIndex is already past ch, so step back to its first code unit
            uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
            *status = U_STRINGPREP_UNASSIGNED_ERROR;
            return 0;
            
        }else if(type == USPREP_MAP){
            
            int32_t index, length;

            if(isIndex){
                // value is an index into mappingData; the index ranges
                // recorded in indexes[] imply mapping lengths 1, 2 and 3,
                // any other index points at an explicit length followed
                // by the mapping itself
                index = value;
                if(index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
                         index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
                    length = 1;
                }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
                         index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
                    length = 2;
                }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
                         index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
                    length = 3;
                }else{
                    length = profile->mappingData[index++];
         
                }

                /* copy mapping to destination */
                for(int32_t i=0; i< length; i++){
                    if(destIndex < destCapacity  ){
                        dest[destIndex] = profile->mappingData[index+i];
                    }
                    destIndex++; /* for pre-flighting */
                }  
                continue;
            }else{
                // subtract the delta to arrive at the code point
                ch -= value;
            }

        }else if(type==USPREP_DELETE){
             // just consume the codepoint and continue
            continue;
        }
        //copy the code point into destination
        if(ch <= 0xFFFF){
            if(destIndex < destCapacity ){
                dest[destIndex] = (UChar)ch;
            }
            destIndex++;
        }else{
            // supplementary code point: written as a surrogate pair,
            // and only if both code units fit
            if(destIndex+1 < destCapacity ){
                dest[destIndex]   = U16_LEAD(ch);
                dest[destIndex+1] = U16_TRAIL(ch);
            }
            destIndex +=2;
        }
       
    }
        
    return u_terminateUChars(dest, destCapacity, destIndex, status);
}


/*
 * Normalize step of StringPrep: normalizes src into dest by calling
 * unorm_normalize() with the UNORM_NFKC mode and the UNORM_UNICODE_3_2
 * option, and returns that function's result unchanged.
 */
static int32_t 
usprep_normalize(   const UChar* src, int32_t srcLength, 
                    UChar* dest, int32_t destCapacity,
                    UErrorCode* status ){

    return unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,status);
}


 /*
   1) Map -- For each character in the input, check if it has a mapping
      and, if so, replace it with its mapping.  

   2) Normalize -- Possibly normalize the result of step 1 using Unicode
      normalization. 

   3) Prohibit -- Check for any characters that are not allowed in the
      output.  If any are found, return an error.  

   4) Check bidi -- Possibly check for right-to-left characters, and if
      any are found, make sure that the whole string satisfies the
      requirements for bidirectional strings.  If the string does not
      satisfy the requirements for bidirectional strings, return an
      error.  
      [Unicode3.2] defines several bidirectional categories; each character
       has one bidirectional category assigned to it.  For the purposes of
       the requirements below, an "RandALCat character" is a character that
       has Unicode bidirectional categories "R" or "AL"; an "LCat character"
       is a character that has Unicode bidirectional category "L".  Note
       that there are many characters which fall in neither of the above
       definitions; Latin digits (<U+0030> through <U+0039>) are examples of
       this because they have bidirectional category "EN".

       In any profile that specifies bidirectional character handling, all
       three of the following requirements MUST be met:

       1) The characters in section 5.8 MUST be prohibited.

       2) If a string contains any RandALCat character, the string MUST NOT
          contain any LCat character.

       3) If a string contains any RandALCat character, a RandALCat
          character MUST be the first character of the string, and a
          RandALCat character MUST be the last character of the string.
*/

#define MAX_STACK_BUFFER_SIZE 300


/*
 * Prepares src according to the profile: map, optionally normalize (NFKC),
 * check for prohibited code points and optionally check the bidi
 * requirements, then copy the result to dest.
 *
 * @param profile      The profile to apply; must not be NULL.
 * @param src          The input text; must not be NULL.
 * @param srcLength    Length of src, or -1 if src is NUL-terminated.
 * @param dest         The output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity Capacity of dest in UChars.
 * @param options      Passed to the map step (see USPREP_ALLOW_UNASSIGNED).
 * @param parseError   Receives the position of an unassigned, prohibited or
 *                     bidi-violating code point.
 * @param status       ICU error code; must not be NULL. Set to
 *                     U_ILLEGAL_ARGUMENT_ERROR, U_MEMORY_ALLOCATION_ERROR,
 *                     U_STRINGPREP_PROHIBITED_ERROR,
 *                     U_STRINGPREP_CHECK_BIDI_ERROR, or an error from the
 *                     map/normalize steps.
 * @return 0 for an invalid status or invalid arguments, otherwise the
 *         result of u_terminateUChars() for the prepared length.
 */
U_CAPI int32_t U_EXPORT2
usprep_prepare(   const UStringPrepProfile* profile,
                  const UChar* src, int32_t srcLength, 
                  UChar* dest, int32_t destCapacity,
                  int32_t options,
                  UParseError* parseError,
                  UErrorCode* status ){

    // check error status
    if(status == NULL || U_FAILURE(*status)){
        return 0;
    }
    
    //check arguments
    if(profile==NULL || src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // All locals are declared before the first "goto CLEANUP" so that no
    // jump crosses a declaration.
    UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE];
    UChar *b1 = b1Stack, *b2 = b2Stack;
    int32_t b1Len, b2Len=0,
            b1Capacity = MAX_STACK_BUFFER_SIZE , 
            b2Capacity = MAX_STACK_BUFFER_SIZE;
    uint16_t result;
    int32_t b2Index = 0;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;
    int32_t rtlPos =-1, ltrPos =-1;
    UChar32 ch;
    UStringPrepType type;
    int16_t value;
    UBool isIndex;

    //get the string length
    if(srcLength == -1){
        srcLength = u_strlen(src);
    }
    // map
    b1Len = usprep_map(profile, src, srcLength, b1, b1Capacity, options, parseError, status);

    if(*status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
        if(b1==NULL){
            *status = U_MEMORY_ALLOCATION_ERROR;
            goto CLEANUP;
        }

        *status = U_ZERO_ERROR; // reset error
        
        b1Len = usprep_map(profile, src, srcLength, b1, b1Len, options, parseError, status);
        
    }

    // normalize
    if(profile->doNFKC == TRUE){
        b2Len = usprep_normalize(b1,b1Len, b2,b2Capacity,status);
        
        if(*status == U_BUFFER_OVERFLOW_ERROR){
            // redo processing of string
            /* we do not have enough room so grow the buffer*/
            b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
            if(b2==NULL){
                *status = U_MEMORY_ALLOCATION_ERROR;
                goto CLEANUP;
            }

            *status = U_ZERO_ERROR; // reset error
        
            b2Len = usprep_normalize(b1,b1Len, b2,b2Len,status);
        
        }

    }else{
        // no normalization: the mapped text is the text to check;
        // b2 is only an alias of b1 from here on
        b2 = b1;
        b2Len = b1Len;
    }
    

    if(U_FAILURE(*status)){
        goto CLEANUP;
    }

    // Prohibit and checkBiDi in one pass
    for(b2Index=0; b2Index<b2Len;){
        
        ch = 0;

        U16_NEXT(b2, b2Index, b2Len, ch);

        UTRIE_GET16(&profile->sprepTrie,ch,result);
        
        type = getValues(result, value, isIndex);

        if( type == USPREP_PROHIBITED || 
            ((result < _SPREP_TYPE_THRESHOLD) && (result & 0x01) /* first bit says it the code point is prohibited*/)
           ){
            *status = U_STRINGPREP_PROHIBITED_ERROR;
            // the index and the length refer to b2, so b2 is the text to report
            uprv_syntaxError(b2, b2Index-U16_LENGTH(ch), b2Len, parseError);
            goto CLEANUP;
        }

        direction = u_charDirection(ch);
        if(firstCharDir == U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
            ltrPos = b2Index-1;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
            rtlPos = b2Index-1;
        }
    }           
    if(profile->checkBiDi == TRUE){
        // satisfy 2
        if( leftToRight == TRUE && rightToLeft == TRUE){
            *status = U_STRINGPREP_CHECK_BIDI_ERROR;
            uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
            goto CLEANUP;
        }

        //satisfy 3
        // after the loop, direction holds the direction of the last character
        if( rightToLeft == TRUE && 
            !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
              (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
           ){
            *status = U_STRINGPREP_CHECK_BIDI_ERROR;
            uprv_syntaxError(b2, rtlPos, b2Len, parseError);
            // leave through CLEANUP like every other error so that
            // heap-allocated buffers are released
            goto CLEANUP;
        }
    }
    if(b2Len <= destCapacity){
        uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
    }

CLEANUP:
    // Release b2 first, while b1 still holds its original value: when
    // normalization is off b2 aliases b1, and the buffer must be freed
    // exactly once (below, through b1).
    if(b2!=b1 && b2!=b1Stack && b2!=b2Stack){
        uprv_free(b2);
    }
    b2=NULL;

    if(b1!=b1Stack){
        uprv_free(b1);
    }
    b1=NULL;

    return u_terminateUChars(dest, destCapacity, b2Len, status);
}


/* data swapping ------------------------------------------------------------ */

/*
 * Swaps StringPrep (.spp) data between endianness/charset families.
 * Has the parameter list of a UDataSwapFn.
 *
 * Layout handled here, after the data header: 16 int32_t indexes, then the
 * serialized UTrie (indexes[_SPREP_INDEX_TRIE_SIZE] bytes), then the
 * uint16_t mapping table (indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] bytes).
 *
 * @param ds         The swapper describing input and output properties.
 * @param inData     The input data, starting with the data header.
 * @param length     Length of the input data; with a negative length only
 *                   the total size is computed and nothing after the
 *                   header is swapped.
 * @param outData    The output buffer.
 * @param pErrorCode ICU error code; set to U_UNSUPPORTED_ERROR for
 *                   unrecognized data and to U_INDEX_OUTOFBOUNDS_ERROR if
 *                   length is too small.
 * @return The size of the header plus the StringPrep data, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
usprep_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    const int32_t *inIndexes;
    int32_t indexes[16];

    int32_t i, offset, count, size;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    /* the UDataInfo is read at byte offset 4 of the data header */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x53 &&   /* dataFormat="SPRP" */
        pInfo->dataFormat[1]==0x50 &&
        pInfo->dataFormat[2]==0x52 &&
        pInfo->dataFormat[3]==0x50 &&
        pInfo->formatVersion[0]==3
    )) {
        udata_printError(ds, "usprep_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as StringPrep .spp data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /* the StringPrep data proper starts right after the header */
    inBytes=(const uint8_t *)inData+headerSize;
    outBytes=(uint8_t *)outData+headerSize;

    inIndexes=(const int32_t *)inBytes;

    if(length>=0) {
        /* from here on, length counts only the bytes after the header */
        length-=headerSize;
        if(length<16*4) {
            udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for StringPrep .spp data\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    /* read the first 16 indexes (ICU 2.8/format version 3: _SPREP_INDEX_TOP==16, might grow) */
    for(i=0; i<16; ++i) {
        indexes[i]=udata_readInt32(ds, inIndexes[i]);
    }

    /* calculate the total length of the data */
    size=
        16*4+ /* size of indexes[] */
        indexes[_SPREP_INDEX_TRIE_SIZE]+
        indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];

    /* a negative length means that only the size is requested */
    if(length>=0) {
        if(length<size) {
            udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for all of StringPrep .spp data\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* copy the data for inaccessible bytes */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        /* offset walks through the three parts in storage order */
        offset=0;

        /* swap the int32_t indexes[] */
        count=16*4;
        ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
        offset+=count;

        /* swap the UTrie */
        count=indexes[_SPREP_INDEX_TRIE_SIZE];
        utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
        offset+=count;

        /* swap the uint16_t mappingTable[] */
        count=indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
        ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
        offset+=count;
    }

    return headerSize+size;
}

#endif /* #if !UCONFIG_NO_IDNA */

--- NEW FILE: ustack.cpp ---
/*
**********************************************************************
*   Copyright (C) 2003-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "uvector.h"

U_NAMESPACE_BEGIN

// Instantiates the RTTI implementation macro for UStack.  The macro body is
// defined elsewhere; presumably it emits the class-ID definitions for the
// class -- confirm against the macro's declaration.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UStack)

U_NAMESPACE_END

--- NEW FILE: utrace.c ---
/*
*******************************************************************************
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  utrace.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*/

#define   UTRACE_IMPL
#include "unicode/utrace.h"
#include "utracimp.h"
#include "cstring.h"
#include "uassert.h"
#include "ucln_cmn.h"


/* Registered trace callbacks and the context value that is passed back to
 * each of them.  All start out NULL; they are installed by
 * utrace_setFunctions() and reset to NULL by utrace_cleanup(). */
static UTraceEntry     *pTraceEntryFunc = NULL;
static UTraceExit      *pTraceExitFunc  = NULL;
static UTraceData      *pTraceDataFunc  = NULL;
static const void      *gTraceContext   = NULL;

/* Current trace level.  Written by utrace_setLevel() and utrace_cleanup(),
 * read by utrace_getLevel(). */
U_EXPORT int32_t
utrace_level;

/* Notify the registered entry callback that the traced function identified
 * by fnNumber has been entered.  A no-op when no entry callback is set. */
U_CAPI void U_EXPORT2
utrace_entry(int32_t fnNumber) {
    if (pTraceEntryFunc == NULL) {
        return;
    }
    (*pTraceEntryFunc)(gTraceContext, fnNumber);
}


/* Format strings handed to the exit callback together with the variable
 * arguments of utrace_exit().  The conversion characters must line up with
 * the order of those arguments: the return value (if any) comes first,
 * followed by the status (if any). */
static const char gExitFmt[]             = "Returns.";
static const char gExitFmtValue[]        = "Returns %d.";
static const char gExitFmtStatus[]       = "Returns.  Status = %d.";
static const char gExitFmtValueStatus[]  = "Returns %d.  Status = %d.";
/* Pointer return value followed by an integer status.  The specifiers were
 * previously swapped ("%d" for the pointer, "%p" for the status).
 * NOTE(review): assumes callers pass (pointer, status) in that order, like
 * the value+status case -- confirm against the UTRACE_EXIT_* macros. */
static const char gExitFmtPtrStatus[]    = "Returns %p.  Status = %d.";

/*
 * Report the exit from a traced function to the registered exit callback.
 *
 * fnNumber    identifies the function being exited.
 * returnType  a combination of UTRACE_EXITV_* flags describing which values
 *             follow in the variable argument list; it selects the format
 *             string given to the callback.
 * ...         the return value and/or status described by returnType.
 *
 * Does nothing when no exit callback is registered.  An unrecognized
 * returnType triggers U_ASSERT and falls back to the plain "Returns."
 * format.
 */
U_CAPI void U_EXPORT2
utrace_exit(int32_t fnNumber, int32_t returnType, ...) {
    if (pTraceExitFunc != NULL) {
        va_list     args;
        const char *fmt;

        switch (returnType) {
        case 0:
            fmt = gExitFmt;
            break;
        case UTRACE_EXITV_I32:
            fmt = gExitFmtValue;
            break;
        case UTRACE_EXITV_STATUS:
            fmt = gExitFmtStatus;
            break;
        case UTRACE_EXITV_I32 | UTRACE_EXITV_STATUS:
            fmt = gExitFmtValueStatus;
            break;
        case UTRACE_EXITV_PTR | UTRACE_EXITV_STATUS:
            fmt = gExitFmtPtrStatus;
            break;
        default:
            U_ASSERT(FALSE);
            fmt = gExitFmt;
            break;
        }

        va_start(args, returnType);
        (*pTraceExitFunc)(gTraceContext, fnNumber, fmt, args);
        va_end(args);
    }
}
 

 
/* Forward a trace data record (function number, level, format string and its
 * variable arguments) to the registered data callback.  A no-op when no data
 * callback is set. */
U_CAPI void U_EXPORT2
utrace_data(int32_t fnNumber, int32_t level, const char *fmt, ...) {
    va_list varArgs;

    if (pTraceDataFunc == NULL) {
        return;
    }
    va_start(varArgs, fmt);
    (*pTraceDataFunc)(gTraceContext, fnNumber, level, fmt, varArgs);
    va_end(varArgs);
}


/*
 * Append one char to the output, inserting line-start indentation first
 * when needed.
 *
 * Bytes are stored only while the index is below capacity, but the index
 * keeps advancing past capacity so that callers can preflight the required
 * buffer size.  A null char is stored (space permitting) without advancing
 * the index, so that any following output overwrites it.
 *
 * Indentation (indent spaces) is emitted in three situations:
 *   1. The output index is 0 (start of the first line).
 *   2. The previous char in the buffer is '\n' and c is neither '\n' nor
 *      null (start of a subsequent line).
 *   3. c is '\n' and the buffer capacity is already exceeded.  While
 *      preflighting there is nowhere to remember that the preceding char
 *      was '\n', so the indent is charged as soon as the newline is seen.
 *      This may overestimate the buffer size needed; no harm done.
 */
static void outputChar(char c, char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) {
    int32_t pos = *outIx;
    int32_t needIndent;
    int32_t pad;

    if (pos == 0) {
        needIndent = 1;                                         /* case 1 */
    } else if (c == '\n') {
        needIndent = (pos >= capacity);                         /* case 3 */
    } else if (c == 0) {
        needIndent = 0;
    } else {
        needIndent = (pos < capacity && outBuf[pos-1] == '\n'); /* case 2 */
    }

    if (needIndent) {
        for (pad = indent; pad > 0; --pad) {
            if (pos < capacity) {
                outBuf[pos] = ' ';
            }
            pos++;
        }
    }

    if (pos < capacity) {
        outBuf[pos] = c;
    }
    if (c != 0) {
        pos++;
    }
    *outIx = pos;
}

/* Emit the low charsToOutput hex digits of val, most significant digit
 * first, in lower case.  No indentation is applied (indent is passed as 0). */
static void outputHexBytes(int64_t val, int32_t charsToOutput,
                           char *outBuf, int32_t *outIx, int32_t capacity) {
    static const char gHexChars[] = "0123456789abcdef";
    int32_t bitOffset = (charsToOutput-1)*4;

    while (bitOffset >= 0) {
        int32_t nibble = (int32_t)((val >> bitOffset) & 0xf);
        outputChar(gHexChars[nibble], outBuf, outIx, capacity, 0);
        bitOffset -= 4;
    }
}

/* Write a pointer value as hex digits, two per byte, most significant byte
 * first.  The bytes are read straight from the pointer's in-memory
 * representation, so this works for any pointer size; a run-time probe of a
 * 16-bit constant decides which end of the representation to start from. */
static void outputPtrBytes(void *val, char *outBuf, int32_t *outIx, int32_t capacity) {
    static const int16_t endianTestVal = (int16_t)0xabcd;
    const int32_t  byteCount = (int32_t)sizeof(void *);
    const char    *bytes     = (const char *)&val;  /* raw bytes of the pointer value */
    int32_t        idx       = 0;                   /* byte to output next            */
    int32_t        step      = 1;                   /* +1 big endian, -1 little endian */
    int32_t        done;

    if (*(const uint8_t *)&endianTestVal == (uint8_t)0xcd) {
        /* Little endian: the most significant byte is the last one in memory. */
        idx  = byteCount - 1;
        step = -1;
    }

    for (done = 0; done < byteCount; done++) {
        outputHexBytes(bytes[idx], 2, outBuf, outIx, capacity);
        idx += step;
    }
}

/* Copy the null terminated string s to the output, terminating null
 * included (outputChar stores a null without advancing the index).
 * A NULL s is rendered as the text "*NULL*". */
static void outputString(const char *s, char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) {
    const char *cursor = (s != NULL) ? s : "*NULL*";

    for (;;) {
        char ch = *cursor++;
        outputChar(ch, outBuf, outIx, capacity, indent);
        if (ch == 0) {
            break;
        }
    }
}
        


/* Write a UChar string as a sequence of 4-digit hex code units, each one
 * followed by a space.  len is the number of code units, or -1 for a null
 * terminated string, in which case the terminating 0000 unit is written
 * too.  A NULL s is rendered the same way outputString() renders NULL. */
static void outputUString(const UChar *s, int32_t len, 
                          char *outBuf, int32_t *outIx, int32_t capacity, int32_t indent) {
    int32_t pos;
    UChar   unit;

    if (s==NULL) {
        outputString(NULL, outBuf, outIx, capacity, indent);
        return;
    }

    for (pos=0; len==-1 || pos<len; pos++) {
        unit = s[pos];
        outputHexBytes(unit, 4, outBuf, outIx, capacity);
        outputChar(' ', outBuf, outIx, capacity, indent);
        if (unit==0 && len==-1) {
            /* Reached the terminator of a null terminated string. */
            break;
        }
    }
}
        
/*
 * Format trace data into outBuf under control of the format string fmt.
 *
 * Recognized %-sequences, each consuming argument(s) from args:
 *   %c   one 8 bit char, copied as-is
 *   %s   char * string, null terminated (NULL prints as *NULL*)
 *   %S   UChar * string followed by an int32_t length (-1: null terminated),
 *        printed as hex code units
 *   %b   8 bit int,  2 hex digits
 *   %h   16 bit int, 4 hex digits
 *   %d   32 bit int, 8 hex digits
 *   %l   64 bit int, 16 hex digits
 *   %p   pointer, in hex
 *   %vX  vector of X (X one of b h d l p c s S): a pointer to the elements
 *        followed by an int32_t length (-1: terminated by a zero/NULL
 *        element); the length is appended in brackets after the elements
 * Any other character after '%' is copied literally (so "%%" yields '%'),
 * and a lone '%' at the end of fmt is output as '%'.
 *
 * outBuf    receives the output; never written at or beyond capacity.
 * capacity  size of outBuf in chars.
 * indent    number of spaces inserted at the start of every output line.
 * Returns the output length including the terminating null.  The count keeps
 * growing past capacity, so the result can be used to size a buffer.
 */
U_CAPI int32_t U_EXPORT2
utrace_vformat(char *outBuf, int32_t capacity, int32_t indent, const char *fmt, va_list args) {
    int32_t   outIx  = 0;
    int32_t   fmtIx  = 0;
    char      fmtC;
    char      c;
    int32_t   intArg;
    int64_t   longArg = 0;
    char      *ptrArg;

    /*   Loop runs once for each character in the format string.
     */
    for (;;) {
        fmtC = fmt[fmtIx++];
        if (fmtC != '%') {
            /* Literal character, not part of a %sequence.  Just copy it to the output. */
            outputChar(fmtC, outBuf, &outIx, capacity, indent);
            if (fmtC == 0) {
                /* We hit the null that terminates the format string.
                 * This is the normal (and only) exit from the loop that
                 * interprets the format
                 */
                break;
            }
            continue;
        }

        /* We encountered a '%'.  Pick up the following format char */
        fmtC = fmt[fmtIx++];

        switch (fmtC) {
        case 'c':
            /* single 8 bit char   */
            c = (char)va_arg(args, int32_t);
            outputChar(c, outBuf, &outIx, capacity, indent);
            break;

        case 's':
            /* char * string, null terminated.  */
            ptrArg = va_arg(args, char *);
            outputString((const char *)ptrArg, outBuf, &outIx, capacity, indent);
            break;

        case 'S':
            /* UChar * string, with length, len==-1 for null terminated. */
            ptrArg = va_arg(args, void *);             /* Ptr    */
            intArg =(int32_t)va_arg(args, int32_t);    /* Length */
            outputUString((const unsigned short *)ptrArg, intArg, outBuf, &outIx, capacity, indent);
            break;

        case 'b':
            /*  8 bit int  */
            intArg = va_arg(args, int);
            outputHexBytes(intArg, 2, outBuf, &outIx, capacity);
            break;

        case 'h':
            /*  16 bit int  */
            intArg = va_arg(args, int);
            outputHexBytes(intArg, 4, outBuf, &outIx, capacity);
            break;

        case 'd':
            /*  32 bit int  */
            intArg = va_arg(args, int);
            outputHexBytes(intArg, 8, outBuf, &outIx, capacity);
            break;

        case 'l':
            /*  64 bit long  */
            longArg = va_arg(args, int64_t);
            outputHexBytes(longArg, 16, outBuf, &outIx, capacity);
            break;
            
        case 'p':
            /*  Pointers.   */
            ptrArg = va_arg(args, void *);
            outputPtrBytes(ptrArg, outBuf, &outIx, capacity);
            break;

        case 0:
            /* Single '%' at end of fmt string.  Output as literal '%'.   
             * Back up index into format string so that the terminating null will be
             * re-fetched in the outer loop, causing it to terminate.
             */
            outputChar('%', outBuf, &outIx, capacity, indent);
            fmtIx--;
            break;

        case 'v':
            {
                /* Vector of values, e.g. %vh */
                char     vectorType;
                int32_t  vectorLen;
                const char   *i8Ptr;
                int16_t  *i16Ptr;
                int32_t  *i32Ptr;
                int64_t  *i64Ptr;
                void     **ptrPtr;
                int32_t   charsToOutput = 0;
                int32_t   i;
                
                vectorType = fmt[fmtIx];    /* b, h, d, l, p, etc. */
                if (vectorType != 0) {
                    fmtIx++;
                }
                /* The same base address viewed at every supported element
                 * width; only the view matching vectorType is advanced. */
                i8Ptr = (const char *)va_arg(args, void*);
                i16Ptr = (int16_t *)i8Ptr;
                i32Ptr = (int32_t *)i8Ptr;
                i64Ptr = (int64_t *)i8Ptr;
                ptrPtr = (void **)i8Ptr;
                vectorLen =(int32_t)va_arg(args, int32_t);
                if (ptrPtr == NULL) {
                    outputString("*NULL* ", outBuf, &outIx, capacity, indent);
                } else {
                    for (i=0; i<vectorLen || vectorLen==-1; i++) { 
                        switch (vectorType) {
                        case 'b':
                            charsToOutput = 2;
                            longArg = *i8Ptr++;
                            break;
                        case 'h':
                            charsToOutput = 4;
                            longArg = *i16Ptr++;
                            break;
                        case 'd':
                            charsToOutput = 8;
                            longArg = *i32Ptr++;
                            break;
                        case 'l':
                            charsToOutput = 16;
                            longArg = *i64Ptr++;
                            break;
                        case 'p':
                            charsToOutput = 0;
                            outputPtrBytes(*ptrPtr, outBuf, &outIx, capacity);
                            longArg = *ptrPtr==NULL? 0: 1;    /* test for null terminated array. */
                            ptrPtr++;
                            break;
                        case 'c':
                            charsToOutput = 0;
                            outputChar(*i8Ptr, outBuf, &outIx, capacity, indent);
                            longArg = *i8Ptr;    /* for test for null terminated array. */
                            i8Ptr++;
                            break;
                        case 's':
                            charsToOutput = 0;
                            outputString(*ptrPtr, outBuf, &outIx, capacity, indent);
                            outputChar('\n', outBuf, &outIx, capacity, indent);
                            longArg = *ptrPtr==NULL? 0: 1;   /* for test for null term. array. */
                            ptrPtr++;
                            break;

                        case 'S':
                            charsToOutput = 0;
                            outputUString((const unsigned short *)*ptrPtr, -1, outBuf, &outIx, capacity, indent);
                            outputChar('\n', outBuf, &outIx, capacity, indent);
                            longArg = *ptrPtr==NULL? 0: 1;   /* for test for null term. array. */
                            ptrPtr++;
                            break;

                        default:
                            /* Unrecognized element type: nothing can be read
                             * or output.  Clear longArg so that a stale value
                             * left over from earlier formatting cannot keep a
                             * terminator-delimited (vectorLen == -1) loop
                             * spinning forever. */
                            charsToOutput = 0;
                            longArg = 0;
                            break;
                        }
                        if (charsToOutput > 0) {
                            outputHexBytes(longArg, charsToOutput, outBuf, &outIx, capacity);
                            outputChar(' ', outBuf, &outIx, capacity, indent);
                        }
                        if (vectorLen == -1 && longArg == 0) {
                            break;
                        }
                    }
                }
                outputChar('[', outBuf, &outIx, capacity, indent);
                outputHexBytes(vectorLen, 8, outBuf, &outIx, capacity);
                outputChar(']', outBuf, &outIx, capacity, indent);
            }
            break;


        default:
            /* %. in format string, where . is some character not in the set
             *    of recognized format chars.  Just output it as if % wasn't there.
             *    (Covers "%%" outputting a single '%')
             */
             outputChar(fmtC, outBuf, &outIx, capacity, indent);
        }
    }
    outputChar(0, outBuf, &outIx, capacity, indent);  /* Make sure that output is null terminated  */
    return outIx + 1;     /* outIx + 1 because outIx does not increment when outputting final null. */
}




/* Variable-argument front end for utrace_vformat(): packages the arguments
 * following fmt into a va_list, formats them into outBuf, and returns
 * whatever utrace_vformat() returns. */
U_CAPI int32_t U_EXPORT2
utrace_format(char *outBuf, int32_t capacity,
                int32_t indent, const char *fmt,  ...) {
    va_list varArgs;
    int32_t formattedLen;

    va_start(varArgs, fmt);
    formattedLen = utrace_vformat(outBuf, capacity, indent, fmt, varArgs);
    va_end(varArgs);

    return formattedLen;
}


/* Install the trace callbacks (entry, exit, data) together with the context
 * value that is passed back to each of them.  Any of the callbacks may be
 * NULL, which disables that kind of notification. */
U_CAPI void U_EXPORT2
utrace_setFunctions(const void *context,
                    UTraceEntry *e, UTraceExit *x, UTraceData *d) {
    gTraceContext   = context;
    pTraceEntryFunc = e;
    pTraceExitFunc  = x;
    pTraceDataFunc  = d;
}


/* Retrieve the currently installed trace callbacks and context through the
 * four output pointers.  Every output pointer is written unconditionally,
 * so none of them may be NULL. */
U_CAPI void U_EXPORT2
utrace_getFunctions(const void **context,
                    UTraceEntry **e, UTraceExit **x, UTraceData **d) {
    *context = gTraceContext;
    *e       = pTraceEntryFunc;
    *x       = pTraceExitFunc;
    *d       = pTraceDataFunc;
}

/* Set the trace level, clamping the requested value so that it is raised to
 * UTRACE_OFF when below it and lowered to UTRACE_VERBOSE when above it. */
U_CAPI void U_EXPORT2
utrace_setLevel(int32_t level) {
    int32_t clamped = (level < UTRACE_OFF) ? UTRACE_OFF : level;

    utrace_level = (clamped > UTRACE_VERBOSE) ? UTRACE_VERBOSE : clamped;
}

/* Return the current trace level, as last stored by utrace_setLevel() or
 * reset by utrace_cleanup().
 * The parameter list is spelled (void): in C an empty () does not declare a
 * prototype, so calls with stray arguments would go undiagnosed. */
U_CAPI int32_t U_EXPORT2
utrace_getLevel(void) {
    return utrace_level;
}


/* Reset all tracing state to its initial condition: no callbacks, no
 * context, and the trace level set to UTRACE_OFF.  Always returns TRUE.
 * The parameter list is spelled (void): in C an empty () does not declare a
 * prototype. */
U_CFUNC UBool 
utrace_cleanup(void) {
    pTraceEntryFunc = NULL;
    pTraceExitFunc  = NULL;
    pTraceDataFunc  = NULL;
    utrace_level    = UTRACE_OFF;
    gTraceContext   = NULL;
    return TRUE;
}


/* Name tables used by utrace_functionName().  Each table ends with a NULL
 * entry.  Presumably the entries must stay in the same order as the
 * corresponding trace function numbers declared in the utrace header --
 * confirm against that enum when adding names. */

/* General functions; indexed directly by fnNumber. */
static const char * const
trFnName[] = {
    "u_init",
    "u_cleanup",
    NULL
};


/* Converter functions; indexed by fnNumber - UTRACE_CONVERSION_START. */
static const char * const
trConvNames[] = {
    "ucnv_open",
    "ucnv_openPackage",
    "ucnv_openAlgorithmic",
    "ucnv_clone",
    "ucnv_close",
    "ucnv_flushCache",
    "ucnv_load",
    "ucnv_unload",
    NULL
};

    
/* Collation functions; indexed by fnNumber - UTRACE_COLLATION_START. */
static const char * const
trCollNames[] = {
    "ucol_open",
    "ucol_close",
    "ucol_strcoll",
    "ucol_getSortKey",
    "ucol_getLocale",
    "ucol_nextSortKeyPart",
    "ucol_strcollIter",
    NULL
};

                
/* Map a trace function number to the name of the traced function.
 * The number is matched against the general, conversion and collation
 * ranges in turn; a number outside all three yields a fixed "bogus"
 * placeholder string instead of NULL. */
U_CAPI const char * U_EXPORT2
utrace_functionName(int32_t fnNumber) {
    if (fnNumber >= UTRACE_FUNCTION_START && fnNumber < UTRACE_FUNCTION_LIMIT) {
        return trFnName[fnNumber];
    }
    if (fnNumber >= UTRACE_CONVERSION_START && fnNumber < UTRACE_CONVERSION_LIMIT) {
        return trConvNames[fnNumber - UTRACE_CONVERSION_START];
    }
    if (fnNumber >= UTRACE_COLLATION_START && fnNumber < UTRACE_COLLATION_LIMIT) {
        return trCollNames[fnNumber - UTRACE_COLLATION_START];
    }
    return "[BOGUS Trace Function Number]";
}


Index: Makefile.in
===================================================================
RCS file: /cvs/core/icu-sword/source/common/Makefile.in,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- Makefile.in	10 Sep 2003 02:42:02 -0000	1.4
+++ Makefile.in	6 Apr 2004 10:07:57 -0000	1.5
@@ -4,7 +4,7 @@
 #   Corporation and others.  All Rights Reserved.
 #
 #******************************************************************************
-## Makefile.in for ICU - icu.so
+## Makefile.in for ICU - icuuc.so
 ## Stephen F. Booth
 
 ## Source directory information
@@ -27,13 +27,17 @@
 TARGET_STUBNAME=uc
 
 ifneq ($(ENABLE_STATIC),)
-TARGET = $(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).a
+TARGET = $(LIBSICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(A)
 endif
 
 ifneq ($(ENABLE_SHARED),)
 SO_TARGET = $(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(SO)
 ALL_SO_TARGETS = $(SO_TARGET) $(MIDDLE_SO_TARGET) $(FINAL_SO_TARGET)
 
+ifeq ($(ENABLE_SO_VERSION_DATA),1)
+SO_VERSION_DATA = common.res
+endif
+
 ifeq ($(OS390BATCH),1)
 BATCH_TARGET = $(BATCH_COMMON_TARGET)
 BATCH_LIBS = $(BATCH_LIBICUDT) -lm
@@ -47,29 +51,29 @@
 DYNAMICCFLAGS = $(SHAREDLIBCFLAGS)
 DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
 
-CPPFLAGS += -I. -I$(srcdir) $(LIBCPPFLAGS)
+CPPFLAGS += -I. -I$(srcdir) -I$(srcdir)/../i18n $(LIBCPPFLAGS)
 DEFS += -DU_COMMON_IMPLEMENTATION
 
 # $(LIBICUDT) is either stub data or the real DLL common data.
 LIBS = $(LIBICUDT) $(DEFAULT_LIBS)
 
-OBJECTS = putil.o uobject.o locmap.o mutex.o umutex.o \
-udata.o ucmndata.o udatamem.o umapfile.o filestrm.o \
-uresbund.o uresdata.o resbund.o cwchar.o uloc.o locid.o uhash.o uhash_us.o \
-ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_io.o ucnvlat1.o \
-ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o \
-ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvscsu.o \
-ucnvbocu.o ucnvisci.o \
+OBJECTS = putil.o uobject.o cmemory.o umutex.o \
+udata.o ucmndata.o udatamem.o udataswp.o umapfile.o ucol_swp.o \
+uresbund.o uresdata.o resbund.o ucat.o locmap.o uloc.o locid.o \
+uhash.o uhash_us.o \
+ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_ext.o ucnv_io.o ucnvlat1.o \
+ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
+ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o \
 unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \
 normlzr.o unorm.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
 uchar.o uprops.o propname.o ubidi.o ubidiwrt.o ubidiln.o ushape.o unames.o \
-ucln_cmn.o uscript.o usc_impl.o umemstrm.o ucmp8.o uvector.o uvectr32.o digitlst.o \
+ucln_cmn.o uscript.o usc_impl.o uvector.o ustack.o uvectr32.o ucmp8.o \
+uarrsort.o utrie.o uset.o uniset.o ruleiter.o caniter.o unifilt.o unifunct.o usetiter.o \
 brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \
 rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
-utrie.o uset.o cmemory.o caniter.o \
-unifilt.o unifunct.o uniset.o usetiter.o util.o uenum.o \
-icuserv.o iculserv.o icunotif.o ustrenum.o \
-uidna.o strprep.o nameprep.o punycode.o ucat.o
+icuserv.o iculserv.o icunotif.o uenum.o ustrenum.o \
+uidna.o usprep.o punycode.o \
+cwchar.o filestrm.o umemstrm.o util.o parsepos.o utrace.o locbased.o
 
 STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
 
@@ -133,7 +137,7 @@
 
 clean-local:
 	test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
-	$(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS)
+	$(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS) $(SO_VERSION_DATA)
 
 distclean-local: clean-local
 	$(RMV) Makefile icucfg.h unicode/platform.h
@@ -154,7 +158,7 @@
 endif
 
 ifneq ($(ENABLE_SHARED),)
-$(FINAL_SO_TARGET): $(OBJECTS)
+$(FINAL_SO_TARGET): $(OBJECTS) $(SO_VERSION_DATA)
 	$(SHLIB.cc) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS)
 
 ifeq ($(OS390BATCH),1)

Index: brkiter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/brkiter.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- brkiter.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ brkiter.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-* Copyright (C) 1997-2001, International Business Machines Corporation and    *
+* Copyright (C) 1997-2003, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
@@ -29,6 +29,7 @@
 #include "cstring.h"
 #include "mutex.h"
 #include "iculserv.h"
+#include "locbased.h"
 
 // *****************************************************************************
 // class BreakIterator
@@ -47,7 +48,7 @@
 BreakIterator*
 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
 {
-  return createInstance(key, UBRK_WORD, status);
+    return createInstance(key, UBRK_WORD, status);
 }
 
 BreakIterator*
@@ -80,15 +81,17 @@
     else {
         result = new RuleBasedBreakIterator(file, status);
     }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        if (result != NULL) {
+            delete result;
+        }
+        return NULL;
+    }
     if (result == NULL) {
         udata_close(file);
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        delete result;
-        result = NULL;
-    }
-
+    
     return result;
 }
 
@@ -98,7 +101,7 @@
 BreakIterator*
 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
 {
-  return createInstance(key, UBRK_LINE, status);
+    return createInstance(key, UBRK_LINE, status);
 }
 
 BreakIterator*
@@ -131,14 +134,16 @@
     else {
         result = new RuleBasedBreakIterator(file, status);
     }
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        if (result != NULL) {
+            delete result;
+        }
+        return NULL;
+    }
     if (result == NULL) {
         udata_close(file);
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        delete result;
-        result = NULL;
-    }
     return result;
 }
 
@@ -148,7 +153,7 @@
 BreakIterator*
 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
 {
-  return createInstance(key, UBRK_CHARACTER, status);
+    return createInstance(key, UBRK_CHARACTER, status);
 }
 
 BreakIterator*
@@ -169,14 +174,17 @@
     // The UDataMemory is adopted by the break iterator.
 
     result = new RuleBasedBreakIterator(file, status);
+    
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        if (result != NULL) {
+            delete result;
+        }
+        return NULL;
+    }
     if (result == NULL) {
         udata_close(file);
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        delete result;
-        result = NULL;
-    }
     return result;
 }
 
@@ -186,7 +194,7 @@
 BreakIterator*
 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
 {
-  return createInstance(key, UBRK_SENTENCE, status);
+    return createInstance(key, UBRK_SENTENCE, status);
 }
 
 BreakIterator*
@@ -207,14 +215,16 @@
     // The UDataMemory is adopted by the break iterator.
 
     result = new RuleBasedBreakIterator(file, status);
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        if (result != NULL) {
+            delete result;
+        }
+        return NULL;
+    }
     if (result == NULL) {
         udata_close(file);
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        delete result;
-        result = NULL;
-    }
 
     return result;
 }
@@ -225,7 +235,7 @@
 BreakIterator*
 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
 {
-  return createInstance(key, UBRK_TITLE, status);
+    return createInstance(key, UBRK_TITLE, status);
 }
 
 BreakIterator*
@@ -246,14 +256,16 @@
     // The UDataMemory is adopted by the break iterator.
 
     result = new RuleBasedBreakIterator(file, status);
+    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
+        if (result != NULL) {
+            delete result;
+        }
+        return NULL;
+    }
     if (result == NULL) {
         udata_close(file);
         status = U_MEMORY_ALLOCATION_ERROR;
     }
-    if (U_FAILURE(status)) {   // Sometimes redundant check, but simple.
-        delete result;
-        result = NULL;
-    }
 
     return result;
 }
@@ -295,6 +307,7 @@
 BreakIterator::BreakIterator()
 {
     fBufferClone = FALSE;
+    *validLocale = *actualLocale = 0;
 }
 
 BreakIterator::~BreakIterator()
@@ -313,37 +326,37 @@
 
 class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
 protected:
-  virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const {
-    return BreakIterator::makeInstance(loc, kind, status);
-  }
+    virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const {
+        return BreakIterator::makeInstance(loc, kind, status);
+    }
 };
 
 // -------------------------------------
 
 class ICUBreakIteratorService : public ICULocaleService {
 public:
-  ICUBreakIteratorService()
-    : ICULocaleService("Break Iterator")
-  {
-    UErrorCode status = U_ZERO_ERROR;
-    registerFactory(new ICUBreakIteratorFactory(), status);
-  }
-
-  virtual UObject* cloneInstance(UObject* instance) const {
-	  return ((BreakIterator*)instance)->clone();
-  }
-
-  virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* actualID, UErrorCode& status) const {
-	LocaleKey& lkey = (LocaleKey&)key;
-	int32_t kind = lkey.kind();
-	Locale loc;
-	lkey.currentLocale(loc);
-	return BreakIterator::makeInstance(loc, kind, status);
-  }
-
-  virtual UBool isDefault() const {
-	return countFactories() == 1;
-  }
+    ICUBreakIteratorService()
+        : ICULocaleService("Break Iterator")
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        registerFactory(new ICUBreakIteratorFactory(), status);
+    }
+    
+    virtual UObject* cloneInstance(UObject* instance) const {
+        return ((BreakIterator*)instance)->clone();
+    }
+    
+    virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const {
+        LocaleKey& lkey = (LocaleKey&)key;
+        int32_t kind = lkey.kind();
+        Locale loc;
+        lkey.currentLocale(loc);
+        return BreakIterator::makeInstance(loc, kind, status);
+    }
+    
+    virtual UBool isDefault() const {
+        return countFactories() == 1;
+    }
 };
 
 // -------------------------------------
@@ -374,8 +387,8 @@
 static UBool
 hasService(void) 
 {
-  Mutex mutex;
-  return gService != NULL;
+    Mutex mutex;
+    return gService != NULL;
 }
 
 // -------------------------------------
@@ -387,8 +400,25 @@
         return NULL;
     }
     
+    u_init(&status);
     if (hasService()) {
-        return (BreakIterator*)gService->get(loc, kind, status);
+        Locale actualLoc;
+        BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
+        // TODO: The way the service code works in ICU 2.8 is that if
+        // there is a real registered break iterator, the actualLoc
+        // will be populated, but if the handleDefault path is taken
+        // (because nothing is registered that can handle the
+        // requested locale) then the actualLoc comes back empty.  In
+        // that case, the returned object already has its actual/valid
+        // locale data populated (by makeInstance, which is what
+        // handleDefault calls), so we don't touch it.  YES, A COMMENT
+        // THIS LONG is a sign of bad code -- so the action item is to
+        // revisit this in ICU 3.0 and clean it up/fix it/remove it.
+        if (*actualLoc.getName() != 0) {
+            U_LOCALE_BASED(locBased, *result);
+            locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
+        }
+        return result;
     } else {
         return makeInstance(loc, kind, status);
     }
@@ -399,7 +429,7 @@
 URegistryKey
 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) 
 {
-  return getService()->registerInstance(toAdopt, locale, kind, status);
+    return getService()->registerInstance(toAdopt, locale, kind, status);
 }
 
 // -------------------------------------
@@ -429,16 +459,56 @@
 BreakIterator* 
 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
 {
+
+	if (U_FAILURE(status)) {
+		return NULL;
+	}
+
+    BreakIterator *result = NULL;
     switch (kind) {
-    case UBRK_CHARACTER: return BreakIterator::makeCharacterInstance(loc, status);
-    case UBRK_WORD: return BreakIterator::makeWordInstance(loc, status);
-    case UBRK_LINE: return BreakIterator::makeLineInstance(loc, status);
-    case UBRK_SENTENCE: return BreakIterator::makeSentenceInstance(loc, status);
-    case UBRK_TITLE: return BreakIterator::makeTitleInstance(loc, status);
+    case UBRK_CHARACTER: 
+        result = BreakIterator::makeCharacterInstance(loc, status);
+        break;
+    case UBRK_WORD:
+        result = BreakIterator::makeWordInstance(loc, status);
+        break;
+    case UBRK_LINE:
+        result = BreakIterator::makeLineInstance(loc, status);
+        break;
+    case UBRK_SENTENCE:
+        result = BreakIterator::makeSentenceInstance(loc, status);
+        break;
+    case UBRK_TITLE:
+        result = BreakIterator::makeTitleInstance(loc, status);
+        break;
     default:
-      status = U_ILLEGAL_ARGUMENT_ERROR;
-      return NULL;
+        status = U_ILLEGAL_ARGUMENT_ERROR;
     }
+
+	if (U_FAILURE(status)) {
+		return NULL;
+	}
+
+    // this is more of a placeholder. All the break iterators have the same actual locale: root
+    // except the Thai one
+    ResourceBundle res(NULL, loc, status);
+    U_LOCALE_BASED(locBased, *result);
+    locBased.setLocaleIDs(res.getLocale(ULOC_VALID_LOCALE, status).getName(),
+                          (uprv_strcmp(loc.getLanguage(), "th") == 0) ?
+                          "th" : "root");
+    return result;
+}
+
+Locale 
+BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
+    U_LOCALE_BASED(locBased, *this);
+    return locBased.getLocale(type, status);
+}
+
+const char *
+BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
+    U_LOCALE_BASED(locBased, *this);
+    return locBased.getLocaleID(type, status);
 }
 
 U_NAMESPACE_END
@@ -449,11 +519,11 @@
  * Release all static memory held by breakiterator.  
  */
 U_CFUNC UBool breakiterator_cleanup(void) {
-  if (gService) {
-    delete gService;
-    gService = NULL;
-  }
-  return TRUE;
+    if (gService) {
+        delete gService;
+        gService = NULL;
+    }
+    return TRUE;
 }
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

Index: caniter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/caniter.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- caniter.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ caniter.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -80,7 +80,7 @@
 
 // TODO: add boilerplate methods.
 
-const char CanonicalIterator::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
 
 /**
  *@param source string to get results for

Index: chariter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/chariter.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- chariter.cpp	10 Sep 2003 02:42:02 -0000	1.3
+++ chariter.cpp	6 Apr 2004 10:07:58 -0000	1.4
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2002, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@@ -8,6 +8,9 @@
 #include "unicode/chariter.h"
 
 U_NAMESPACE_BEGIN
+
+ForwardCharacterIterator::~ForwardCharacterIterator() {}
+
 
 CharacterIterator::CharacterIterator()
 : textLength(0), pos(0), begin(0), end(0) {

Index: charstr.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/charstr.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- charstr.h	10 Sep 2003 02:42:02 -0000	1.3
+++ charstr.h	6 Apr 2004 10:07:58 -0000	1.4
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2001-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
@@ -25,7 +25,11 @@
 
 class U_COMMON_API CharString : public UMemory {
 public:
-    inline CharString(const UnicodeString& str);
+    // Constructor
+    //     @param  str    The unicode string to be converted to char *
+    //     @param  codepage   The char * code page.  ""   for invariant conversion.
+    //                                               NULL for default code page.
+    inline CharString(const UnicodeString& str, const char *codepage = "");
     inline ~CharString();
     inline operator const char*() const { return ptr; }
 
@@ -37,14 +41,15 @@
     CharString &operator=(const CharString &other); // forbid copying of this class
 };
 
-inline CharString::CharString(const UnicodeString& str) {
-    // Invariant converter should create str.length() chars
-    if (str.length() >= (int32_t)sizeof(buf)) {
-        ptr = (char *)uprv_malloc(str.length() + 8);
-    } else {
-        ptr = buf;
+inline CharString::CharString(const UnicodeString& str, const char *codepage) {
+    int32_t    len;
+    ptr = buf;
+    len = str.extract(0, 0x7FFFFFFF, buf ,sizeof(buf)-1, codepage);
+    buf[sizeof(buf)-1] = 0;  // extract does not add null if it thinks there is no space for it.
+    if (len >= (int32_t)(sizeof(buf)-1)) {
+        ptr = (char *)uprv_malloc(len+1);
+        str.extract(0, 0x7FFFFFFF, ptr, len+1, codepage);
     }
-    str.extract(0, 0x7FFFFFFF, ptr, "");
 }
 
 inline CharString::~CharString() {

Index: cmemory.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/cmemory.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- cmemory.c	10 Sep 2003 02:42:02 -0000	1.1
+++ cmemory.c	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2002, International Business Machines
+*   Copyright (C) 2002-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -19,14 +19,30 @@
 ******************************************************************************
 */
 #include "cmemory.h"
+#include "unicode/uclean.h"
 
 /* uprv_malloc(0) returns a pointer to this read-only data. */                
 static const int32_t zeroMem[] = {0, 0, 0, 0, 0, 0};
 
+/* Function Pointers for user-supplied heap functions  */
+static const void     *pContext;
+static UMemAllocFn    *pAlloc;
+static UMemReallocFn  *pRealloc;
+static UMemFreeFn     *pFree;
+
+/* Flag indicating whether any heap allocations have happened.
+ *   Used to prevent changing out the heap functions after allocations have been made */
+static UBool   gHeapInUse;
+
 U_CAPI void * U_EXPORT2
 uprv_malloc(size_t s) {
     if (s > 0) {
-        return malloc(s);
+        gHeapInUse = TRUE;
+        if (pAlloc) {
+            return (*pAlloc)(pContext, s);
+        } else {
+            return malloc(s);
+        }
     } else {
         return (void *)zeroMem;
     }
@@ -37,17 +53,71 @@
     if (buffer == zeroMem) {
         return uprv_malloc(size);
     } else if (size == 0) {
-        free(buffer);
+        if (pFree) {
+            (*pFree)(pContext, buffer);
+        } else {
+            free(buffer);
+        }
         return (void *)zeroMem;
     } else {
-        return realloc(buffer, size);
+        gHeapInUse = TRUE;
+        if (pRealloc) {
+            return (*pRealloc)(pContext, buffer, size);
+        } else {
+            return realloc(buffer, size);
+        }
     }
 }
 
 U_CAPI void U_EXPORT2
 uprv_free(void *buffer) {
     if (buffer != zeroMem) {
-        free(buffer);
+        if (pFree) {
+            (*pFree)(pContext, buffer);
+        } else {
+            free(buffer);
+        }
+    }
+}
+
+U_CAPI void U_EXPORT2
+u_setMemoryFunctions(const void *context, UMemAllocFn *a, UMemReallocFn *r, UMemFreeFn *f,  UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return;
+    }
+    if (a==NULL || r==NULL || f==NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if (gHeapInUse) {
+        *status = U_INVALID_STATE_ERROR;
+        return;
     }
+    pContext  = context;
+    pAlloc    = a;
+    pRealloc  = r;
+    pFree     = f;
+}
+
+
+U_CFUNC UBool cmemory_cleanup(void) {
+    pContext   = NULL;
+    pAlloc     = NULL;
+    pRealloc   = NULL;
+    pFree      = NULL;
+    gHeapInUse = FALSE;
+    return TRUE;
+}
+
+
+/*
+ *   cmemory_inUse
+ *       Return TRUE if ICU has allocated any memory since the last cleanup.
+ *       Used by u_setMutexFunctions() and similar to verify that ICU has not
+ *               been used, that it is in a pristine initial state.
+ */
+U_CFUNC UBool cmemory_inUse(void) {
+    return gHeapInUse;
 }
 

Index: cmemory.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/cmemory.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- cmemory.h	10 Sep 2003 02:42:02 -0000	1.3
+++ cmemory.h	6 Apr 2004 10:07:58 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -64,5 +64,20 @@
  * in order to get the next aligned address
  */
 #define U_ALIGNMENT_OFFSET_UP(ptr) (sizeof(UAlignedMemory) - U_ALIGNMENT_OFFSET(ptr))
+
+/**
+  *  Indicate whether the ICU allocation functions have been used.
+  *  This is used to determine whether ICU is in an initial, unused state.
+  */
+U_CFUNC UBool 
+cmemory_inUse(void);
+
+/**
+  *  Heap clean up function, called from u_cleanup()
+  *    Clears any user heap functions from u_setMemoryFunctions()
+  *    Does NOT deallocate any remaining allocated memory.
+  */
+U_CFUNC UBool 
+cmemory_cleanup(void);
 
 #endif

Index: common.dsp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/common.dsp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- common.dsp	10 Sep 2003 02:42:02 -0000	1.4
+++ common.dsp	6 Apr 2004 10:07:58 -0000	1.5
@@ -55,7 +55,7 @@
 # ADD BSC32 /nologo
 LINK32=link.exe
 # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386
-# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /machine:I386 /out:"..\..\bin\icuuc26.dll" /implib:"..\..\lib\icuuc.lib" /libpath:"..\..\lib"
+# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /machine:I386 /out:"..\..\bin\icuuc28.dll" /implib:"..\..\lib\icuuc.lib" /libpath:"..\..\lib"
 # SUBTRACT LINK32 /pdb:none /debug
 
 !ELSEIF  "$(CFG)" == "common - Win32 Debug"
@@ -82,7 +82,7 @@
 # ADD BSC32 /nologo
 LINK32=link.exe
 # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /pdbtype:sept
-# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /debug /machine:I386 /out:"..\..\bin\icuuc26d.dll" /implib:"..\..\lib\icuucd.lib" /pdbtype:sept /libpath:"..\..\lib"
+# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /debug /machine:I386 /out:"..\..\bin\icuuc28d.dll" /implib:"..\..\lib\icuucd.lib" /pdbtype:sept /libpath:"..\..\lib"
 # SUBTRACT LINK32 /pdb:none
 
 !ELSEIF  "$(CFG)" == "common - Win64 Release"
@@ -109,7 +109,7 @@
 # ADD BSC32 /nologo
 LINK32=link.exe
 # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:IX86 /machine:IA64
-# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /machine:IX86 /out:"..\..\bin\icuuc26.dll" /implib:"..\..\lib\icuuc.lib" /libpath:"..\..\lib" /machine:IA64
+# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /machine:IX86 /out:"..\..\bin\icuuc28.dll" /implib:"..\..\lib\icuuc.lib" /libpath:"..\..\lib" /machine:IA64
 # SUBTRACT LINK32 /debug
 
 !ELSEIF  "$(CFG)" == "common - Win64 Debug"
@@ -136,7 +136,7 @@
 # ADD BSC32 /nologo
 LINK32=link.exe
 # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:IX86 /pdbtype:sept /machine:IA64
-# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /incremental:no /debug /machine:IX86 /out:"..\..\bin\icuuc26d.dll" /implib:"..\..\lib\icuucd.lib" /pdbtype:sept /libpath:"..\..\lib" /machine:IA64
+# ADD LINK32 icudata.lib kernel32.lib user32.lib advapi32.lib shell32.lib /nologo /base:"0x4a800000" /dll /incremental:no /debug /machine:IX86 /out:"..\..\bin\icuuc28d.dll" /implib:"..\..\lib\icuucd.lib" /pdbtype:sept /libpath:"..\..\lib" /machine:IA64
 
 !ENDIF 
 
@@ -544,6 +544,19 @@
 
 # End Source File
 # End Group
+# Begin Group "collation"
+
+# PROP Default_Filter ""
+# Begin Source File
+
+SOURCE=.\ucol_swp.c
+# ADD CPP /I "..\i18n"
+# End Source File
+# Begin Source File
+
+SOURCE=.\ucol_swp.h
+# End Source File
+# End Group
 # Begin Group "collections"
 
 # PROP Default_Filter ""
@@ -600,6 +613,14 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\uarrsort.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\uarrsort.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\ucmp8.c
 # End Source File
 # Begin Source File
@@ -675,6 +696,10 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\ustack.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\ustrenum.cpp
 # End Source File
 # Begin Source File
@@ -735,10 +760,6 @@
 # End Source File
 # Begin Source File
 
-SOURCE=.\mutex.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\mutex.h
 # End Source File
 # Begin Source File
@@ -1004,6 +1025,61 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\utrace.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\unicode\utrace.h
+
+!IF  "$(CFG)" == "common - Win32 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\utrace.h
+
+"..\..\include\unicode\utrace.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\utrace.h
+
+"..\..\include\unicode\utrace.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\utrace.h
+
+"..\..\include\unicode\utrace.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\utrace.h
+
+"..\..\include\unicode\utrace.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\utracimp.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\utypes.h
 
 !IF  "$(CFG)" == "common - Win32 Release"
@@ -1275,6 +1351,14 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\ucnv_ext.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\ucnv_ext.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\ucnv_imp.h
 # End Source File
 # Begin Source File
@@ -1474,6 +1558,14 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\udataswp.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\udataswp.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\umapfile.c
 # ADD CPP /Ze
 # End Source File
@@ -1538,14 +1630,6 @@
 # PROP Default_Filter ""
 # Begin Source File
 
-SOURCE=.\digitlst.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\digitlst.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\unicode\parseerr.h
 
 !IF  "$(CFG)" == "common - Win32 Release"
@@ -1593,6 +1677,10 @@
 # End Source File
 # Begin Source File
 
+SOURCE=.\parsepos.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\parsepos.h
 
 !IF  "$(CFG)" == "common - Win32 Release"
@@ -1706,11 +1794,82 @@
 SOURCE=.\util.h
 # End Source File
 # End Group
+# Begin Group "idna"
+
+# PROP Default_Filter "*.c,*.h"
+# Begin Source File
+
+SOURCE=.\punycode.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\punycode.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\uidna.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\unicode\uidna.h
+
+!IF  "$(CFG)" == "common - Win32 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\uidna.h
+
+"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\uidna.h
+
+"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\uidna.h
+
+"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\uidna.h
+
+"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# End Group
 # Begin Group "locales & resources"
 
 # PROP Default_Filter ""
 # Begin Source File
 
+SOURCE=.\locbased.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\locbased.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\locid.cpp
 # End Source File
 # Begin Source File
@@ -2159,7 +2318,58 @@
 # End Source File
 # Begin Source File
 
-SOURCE=.\symtable.h
+SOURCE=.\ruleiter.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\ruleiter.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\unicode\symtable.h
+
+!IF  "$(CFG)" == "common - Win32 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\symtable.h
+
+"..\..\include\unicode\symtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\symtable.h
+
+"..\..\include\unicode\symtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\symtable.h
+
+"..\..\include\unicode\symtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\symtable.h
+
+"..\..\include\unicode\symtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
 # End Source File
 # Begin Source File
 
@@ -2622,6 +2832,65 @@
 SOURCE=.\icuserv.h
 # End Source File
 # End Group
+# Begin Group "sprep"
+
+# PROP Default_Filter ""
+# Begin Source File
+
+SOURCE=.\sprpimpl.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\usprep.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=.\unicode\usprep.h
+
+!IF  "$(CFG)" == "common - Win32 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\usprep.h
+
+"..\..\include\unicode\usprep.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\usprep.h
+
+"..\..\include\unicode\usprep.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Release"
+
+# Begin Custom Build
+InputPath=.\unicode\usprep.h
+
+"..\..\include\unicode\usprep.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "common - Win64 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\usprep.h
+
+"..\..\include\unicode\usprep.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy    $(InputPath)    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# End Group
 # Begin Group "strings"
 
 # PROP Default_Filter ""
@@ -3292,65 +3561,6 @@
 	copy    $(InputPath)    ..\..\include\unicode
 
 # End Custom Build
-
-!ENDIF 
-
-# End Source File
-# End Group
-# Begin Group "idna"
-
-# PROP Default_Filter "*.c,*.h"
-# Begin Source File
-
-SOURCE=.\nameprep.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\nameprep.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\punycode.c
-# End Source File
-# Begin Source File
-
-SOURCE=.\punycode.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\sprpimpl.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\strprep.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\strprep.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\uidna.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\unicode\uidna.h
-
-!IF  "$(CFG)" == "common - Win32 Release"
-
-!ELSEIF  "$(CFG)" == "common - Win32 Debug"
-
-# Begin Custom Build
-InputPath=.\unicode\uidna.h
-
-"..\..\include\unicode\uidna.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
-	copy    $(InputPath)    ..\..\include\unicode
-
-# End Custom Build
-
-!ELSEIF  "$(CFG)" == "common - Win64 Release"
-
-!ELSEIF  "$(CFG)" == "common - Win64 Debug"
 
 !ENDIF 
 

Index: common.rc
===================================================================
RCS file: /cvs/core/icu-sword/source/common/common.rc,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- common.rc	10 Sep 2003 02:42:02 -0000	1.3
+++ common.rc	6 Apr 2004 10:07:58 -0000	1.4
@@ -1,25 +1,50 @@
-//Do not edit with Microsoft Developer Studio because it will modify this
-//script in the wrong way.
+// Do not edit with Microsoft Developer Studio Resource Editor.
+//   It will permanently substitute version numbers that are intended to be
+//   picked up by the pre-processor during each build.
 // Copyright (c) 2001-2003 International Business Machines
 // Corporation and others. All Rights Reserved.
 //
+#include "msvcres.h"
+
 #define APSTUDIO_READONLY_SYMBOLS
 /////////////////////////////////////////////////////////////////////////////
 //
 // Generated from the TEXTINCLUDE 2 resource.
 //
-#include "winresrc.h"
-
+#include <winresrc.h>
 /////////////////////////////////////////////////////////////////////////////
 #undef APSTUDIO_READONLY_SYMBOLS
 
 /////////////////////////////////////////////////////////////////////////////
-//
+// 
 
 LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL
 #pragma code_page(1252)
 
-#include "unicode\uversion.h"
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE 
+BEGIN
+    "msvcres.h\0"
+END
+
+2 TEXTINCLUDE 
+BEGIN
+    "#include <winresrc.h>\0"
+END
+
+3 TEXTINCLUDE 
+BEGIN
+    "\r\n"
+    "\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
 
 /////////////////////////////////////////////////////////////////////////////
 //
@@ -55,7 +80,6 @@
 #else
             VALUE "OriginalFilename", "icuuc" U_ICU_VERSION_SHORT ".dll\0")
 #endif
-
             VALUE "PrivateBuild", "\0"
             VALUE "ProductName", "International Components for Unicode\0"
             VALUE "ProductVersion", CommaVersionString(U_ICU_VERSION_MAJOR_NUM, U_ICU_VERSION_MINOR_NUM, U_ICU_VERSION_PATCHLEVEL_NUM, 0)
@@ -69,4 +93,16 @@
 END
 
 /////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
 

Index: common.vcproj
===================================================================
RCS file: /cvs/core/icu-sword/source/common/common.vcproj,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- common.vcproj	10 Sep 2003 02:42:02 -0000	1.1
+++ common.vcproj	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,8 +1,9 @@
-<?xml version="1.0" encoding = "Windows-1252"?>
+<?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
-	Version="7.00"
+	Version="7.10"
 	Name="common"
+	ProjectGUID="{F0C3266C-B49D-4097-BDBD-EEB592017672}"
 	SccProjectName=""
 	SccLocalPath="">
 	<Platforms>
[...1520 lines suppressed...]
-			<File
-				RelativePath=".\strprep.h">
-			</File>
-			<File
-				RelativePath=".\uidna.cpp">
-			</File>
-			<File
-				RelativePath=".\unicode\uidna.h">
 				<FileConfiguration
 					Name="Debug|Win32">
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="copy    $(InputPath)    ..\..\include\unicode
+						CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode
 "
-						Outputs="..\..\include\unicode\uidna.h"/>
+						Outputs="..\..\include\unicode\$(InputFileName)"/>
 				</FileConfiguration>
 			</File>
 		</Filter>

Index: cstring.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/cstring.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- cstring.c	10 Sep 2003 02:42:02 -0000	1.4
+++ cstring.c	6 Apr 2004 10:07:58 -0000	1.5
@@ -25,6 +25,7 @@
 #include "unicode/utypes.h"
 #include "cmemory.h"
 #include "cstring.h"
+#include "uassert.h"
 
 /*
  * We hardcode case conversion for invariant characters to match our expectation
@@ -58,6 +59,17 @@
     return c;
 }
 
+
+#if 0
+/*
+ * Commented out because cstring.h defines uprv_tolower() to be
+ * the same as either uprv_asciitolower() or uprv_ebcdictolower()
+ * to reduce the amount of code to cover with tests.
+ *
+ * Note that this uprv_tolower() definition is likely to work for most
+ * charset families, not just ASCII and EBCDIC, because its #else branch
+ * is written generically.
+ */
 U_CAPI char U_EXPORT2
 uprv_tolower(char c) {
 #if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
@@ -71,6 +83,26 @@
 #endif
     return c;
 }
+#endif
+
+U_CAPI char U_EXPORT2
+uprv_asciitolower(char c) {
+    if(0x41<=c && c<=0x5a) {
+        c=(char)(c+0x20);
+    }
+    return c;
+}
+
+U_CAPI char U_EXPORT2
+uprv_ebcdictolower(char c) {
+    if( (0xc1<=(uint8_t)c && (uint8_t)c<=0xc9) ||
+        (0xd1<=(uint8_t)c && (uint8_t)c<=0xd9) ||
+        (0xe2<=(uint8_t)c && (uint8_t)c<=0xe9)
+    ) {
+        c=(char)(c-0x40);
+    }
+    return c;
+}
 
 
 U_CAPI char* U_EXPORT2
@@ -104,44 +136,76 @@
 /*
  * Takes a int32_t and fills in  a char* string with that number "radix"-based.
- * Does not handle negative values (makes an empty string for them).
- * Writes at most 11 chars ("2147483647" plus NUL).
- * Returns the length of the string.
+ * Signed only for radix 10 (leading '-'); otherwise formats the unsigned bits.
+ * Writes at most 33 chars (32 binary digits plus NUL when radix is 2).
+ * Returns the length of the string (not including the NUL).
  */
 U_CAPI int32_t U_EXPORT2
-T_CString_integerToString(char* buffer, int32_t i, int32_t radix)
+T_CString_integerToString(char* buffer, int32_t v, int32_t radix)
 {
-  int32_t length;
-  int32_t num;
-  int8_t digit;
-  char temp;
-
-  if(i<0) {
-    *buffer = 0;
-    return 0;
-  }
-
-  length = 0;
-  while (i>=radix)
-    {
-      num = i/radix;
-      digit = (int8_t)(i - num*radix);
-      buffer[length++] = (char)(T_CString_itosOffset(digit));
-      i = num;
+    char      tbuf[33];   /* worst case: 32 binary digits (radix 2) plus NUL */
+    int32_t   tbx;        /* index of the most recently written digit in tbuf */
+    uint8_t   digit;
+    int32_t   length = 0;
+    uint32_t  uval;
+
+    U_ASSERT(radix>=2 && radix<=16);
+    uval = (uint32_t) v;
+    if(v<0 && radix == 10) {
+        /* Only in base 10 do we consider numbers to be signed. */
+        uval = (uint32_t)0 - uval;   /* unsigned negate: well defined even for INT32_MIN */
+        buffer[length++] = '-';
     }
-
-  buffer[length] = (char)(T_CString_itosOffset(i));
-  buffer[++length] = '\0';
+
+    tbx = (int32_t)sizeof(tbuf)-1;
+    tbuf[tbx] = 0;   /* We are generating the digits backwards.  Null term the end. */
+    do {
+        digit = (uint8_t)(uval % (uint32_t)radix);
+        tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
+        uval  = uval / (uint32_t)radix;
+    } while (uval != 0);
+
+    /* copy converted number (after any '-' already stored) into user buffer */
+    uprv_strcpy(buffer+length, tbuf+tbx);
+    length += (int32_t)sizeof(tbuf) - tbx - 1;
+    return length;
+}
 
 
-  /* Reverses the string, swap digits at buffer[0]..buffer[num] */
-  num = length - 1;
-  for (i = 0; i < num; ++i, --num) {
-    temp = buffer[num];
-    buffer[num] = buffer[i];
-    buffer[i] = temp;
-  }
 
-  return length;
+/*
+ * Fills buffer with the "radix"-based text of an int64_t; signed only for radix 10.
+ * Writes at most 65 chars (64 binary digits plus NUL); at most 21 for radix 10.
+ * Returns the length of the string, not including the terminating NUL.
+ */
+U_CAPI int32_t U_EXPORT2
+T_CString_int64ToString(char* buffer, int64_t v, uint32_t radix)
+{
+    char      tbuf[65];   /* worst case: 64 binary digits (radix 2) plus NUL */
+    int32_t   tbx;        /* index of the most recently written digit in tbuf */
+    uint8_t   digit;
+    int32_t   length = 0;
+    uint64_t  uval;
+
+    U_ASSERT(radix>=2 && radix<=16);
+    uval = (uint64_t) v;
+    if(v<0 && radix == 10) {
+        /* Only in base 10 do we consider numbers to be signed. */
+        uval = (uint64_t)0 - uval;   /* unsigned negate: well defined even for INT64_MIN */
+        buffer[length++] = '-';
+    }
+
+    tbx = (int32_t)sizeof(tbuf)-1;
+    tbuf[tbx] = 0;   /* We are generating the digits backwards.  Null term the end. */
+    do {
+        digit = (uint8_t)(uval % radix);
+        tbuf[--tbx] = (char)(T_CString_itosOffset(digit));
+        uval  = uval / radix;
+    } while (uval != 0);
+
+    /* copy converted number (after any '-' already stored) into user buffer */
+    uprv_strcpy(buffer+length, tbuf+tbx);
+    length += (int32_t)sizeof(tbuf) - tbx - 1;
+    return length;
 }
 
 
@@ -149,7 +213,7 @@
 T_CString_stringToInteger(const char *integerString, int32_t radix)
 {
     char *end;
-    return strtoul(integerString, &end, radix);
+    return uprv_strtoul(integerString, &end, radix);
 
 }
     
@@ -235,7 +299,7 @@
 
 U_CAPI char* U_EXPORT2
 uprv_strdup(const char *src) {
-    size_t len = strlen(src) + 1;
+    size_t len = uprv_strlen(src) + 1;
     char *dup = (char *) uprv_malloc(len);
 
     if (dup) {

Index: cstring.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/cstring.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- cstring.h	10 Sep 2003 02:42:02 -0000	1.6
+++ cstring.h	6 Apr 2004 10:07:58 -0000	1.7
@@ -45,8 +45,21 @@
 U_CAPI char U_EXPORT2
 uprv_toupper(char c);
 
+
 U_CAPI char U_EXPORT2
-uprv_tolower(char c);
+uprv_asciitolower(char c);
+
+U_CAPI char U_EXPORT2
+uprv_ebcdictolower(char c);
+
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define uprv_tolower uprv_asciitolower
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+#   define uprv_tolower uprv_ebcdictolower
+#else
+#   error U_CHARSET_FAMILY is not valid
+#endif
+
 
 #define uprv_strtoul(str, end, base) U_STANDARD_CPP_NAMESPACE strtoul(str, end, base)
 #define uprv_strtol(str, end, base) U_STANDARD_CPP_NAMESPACE strtol(str, end, base)
@@ -81,6 +94,9 @@
 
 U_CAPI int32_t U_EXPORT2
 T_CString_integerToString(char *buffer, int32_t n, int32_t radix);
+
+U_CAPI int32_t U_EXPORT2
+T_CString_int64ToString(char *buffer, int64_t n, uint32_t radix);
 
 U_CAPI int32_t U_EXPORT2
 T_CString_stringToInteger(const char *integerString, int32_t radix);

Index: dbbi.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/dbbi.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- dbbi.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ dbbi.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2001 IBM Corp. All rights reserved.
+*   Copyright (C) 1999-2003 IBM Corp. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *   12/1/99    rgillam     Complete port from Java.
@@ -20,14 +20,14 @@
 
 U_NAMESPACE_BEGIN
 
-const char DictionaryBasedBreakIterator::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator)
 
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 // constructors
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 
 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
 RuleBasedBreakIterator() {
@@ -43,17 +43,18 @@
     init();
     if (U_FAILURE(status)) {return;};
     fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
+    if (U_FAILURE(status)) {
+        if (fTables != NULL) {
+            fTables->removeReference();
+            fTables = NULL;
+        }
+        return;
+    }
     /* test for NULL */
     if(fTables == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
-    
-    if (U_FAILURE(status)) {
-        fTables->removeReference();
-        fTables = NULL;
-        return;
-    }
 }
 
 
@@ -70,11 +71,11 @@
 
 
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //   Destructor
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
 {
     uprv_free(cachedBreakPositions);
@@ -82,12 +83,12 @@
     if (fTables != NULL) {fTables->removeReference();};
 }
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //   Assignment operator.     Sets this iterator to have the same behavior,
 //                            and iterate over the same text, as the one passed in.
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 DictionaryBasedBreakIterator&
 DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
     if (this == &that) {
@@ -103,12 +104,12 @@
     return *this;
 }
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //   Clone()    Returns a newly-constructed RuleBasedBreakIterator with the same
 //              behavior, and iterating over the same text, as this one.
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 BreakIterator*
 DictionaryBasedBreakIterator::clone() const {
     return new DictionaryBasedBreakIterator(*this);
@@ -292,11 +293,11 @@
 
 
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //    init()    Common initialization routine, for use by constructors, etc.
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 void DictionaryBasedBreakIterator::init() {
     cachedBreakPositions    = NULL;
     fTables                 = NULL;
@@ -306,11 +307,11 @@
 }
 
 
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //
 //    BufferClone
 //
-//-------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
                                    int32_t &bufferSize,
                                    UErrorCode &status)
@@ -396,6 +397,9 @@
         c = fText->next();
     }
 
+    if (U_FAILURE(status)) {
+        return; // UStack below overwrites the status error codes
+    }
     
     // initialize.  We maintain two stacks: currentBreakPositions contains
     // the list of break positions that will be returned if we successfully
@@ -412,7 +416,9 @@
     // further, this saves us from having to follow each possible path
     // through the text all the way to the error (hopefully avoiding many
     // future recursive calls as well).
-    UStack currentBreakPositions(status);
+    // there can be only one kind of error in UStack and UVector, so we'll 
+    // just let the error fall through
+    UStack currentBreakPositions(status); 
     UStack possibleBreakPositions(status);
     UVector wrongBreakPositions(status);
 
@@ -445,6 +451,9 @@
         // the possible-break-positions stack
         if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
             possibleBreakPositions.push(fText->getIndex(), status);
+            if (U_FAILURE(status)) {
+                return;
+            }
         }
 
         // look up the new state to transition to in the dictionary
@@ -456,6 +465,9 @@
         // of the loop.
         if (state == -1) {
             currentBreakPositions.push(fText->getIndex(), status);
+            if (U_FAILURE(status)) {
+                return;
+            }
             break;
         }
 
@@ -501,6 +513,9 @@
                     currentBreakPositions.removeAllElements();
                     for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
                         currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
+                        if (U_FAILURE(status)) {
+                            return;
+                        }
                     }
                     bestBreakPositions.removeAllElements();
                     if (farthestEndPoint < endPos) {
@@ -515,9 +530,15 @@
                             || currentBreakPositions.peeki() != fText->getIndex())
                             && fText->getIndex() != startPos) {
                         currentBreakPositions.push(fText->getIndex(), status);
+                        if (U_FAILURE(status)) {
+                            return;
+                        }
                     }
                     fText->next();
                     currentBreakPositions.push(fText->getIndex(), status);
+                    if (U_FAILURE(status)) {
+                        return;
+                    }
                 }
             }
 
@@ -561,6 +582,9 @@
         currentBreakPositions.popi();
     }
     currentBreakPositions.push(endPos, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
 
     // create a regular array to hold the break positions and copy
     // the break positions from the stack to the array (in addition,

Index: iculserv.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/iculserv.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- iculserv.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ iculserv.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -86,7 +86,7 @@
 }
 
 const Hashtable*
-LocaleUtility::getAvailableLocaleNames(const UnicodeString& bundleID)
+LocaleUtility::getAvailableLocaleNames(const UnicodeString& /* bundleID */)
 {
     // have to ignore bundleID for the moment, since we don't have easy C++ api.
     // assume it's the default bundle
@@ -293,7 +293,7 @@
 }
 #endif
 
-const char LocaleKey::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(LocaleKey)
 
 /*
  ******************************************************************
@@ -346,7 +346,7 @@
 
         const UHashElement* elem = NULL;
         int32_t pos = 0;
-        while (elem = supported->nextElement(pos)) {
+        while ((elem = supported->nextElement(pos)) != NULL) {
             const UnicodeString& id = *((const UnicodeString*)elem->key.pointer);
             if (!visible) {
                 result.remove(id);
@@ -375,7 +375,10 @@
 }
 
 UObject*
-LocaleKeyFactory::handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const {
+LocaleKeyFactory::handleCreate(const Locale& /* loc */, 
+			       int32_t /* kind */, 
+			       const ICUService* /* service */, 
+			       UErrorCode& /* status */) const {
     return NULL;
 }
 
@@ -386,7 +389,7 @@
 }
 
 const Hashtable*
-LocaleKeyFactory::getSupportedIDs(UErrorCode& status) const {
+LocaleKeyFactory::getSupportedIDs(UErrorCode& /* status */) const {
     return NULL;
 }
 
@@ -409,7 +412,7 @@
 }
 #endif
 
-const char LocaleKeyFactory::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(LocaleKeyFactory)
 
 /*
  ******************************************************************
@@ -461,7 +464,7 @@
 }
 
 UBool
-SimpleLocaleKeyFactory::isSupportedID(const UnicodeString& id, UErrorCode& status) const
+SimpleLocaleKeyFactory::isSupportedID(const UnicodeString& id, UErrorCode& /* status */) const
 {
     return id == _id;
 }
@@ -497,7 +500,7 @@
 }
 #endif
 
-const char SimpleLocaleKeyFactory::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SimpleLocaleKeyFactory)
 
 /*
  ******************************************************************
@@ -525,7 +528,7 @@
 }
 
 UObject*
-ICUResourceBundleFactory::handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const
+ICUResourceBundleFactory::handleCreate(const Locale& loc, int32_t /* kind */, const ICUService* /* service */, UErrorCode& status) const
 {
     if (U_SUCCESS(status)) {
         return new ResourceBundle(_bundleName, loc, status);
@@ -549,7 +552,7 @@
 }
 #endif
 
-const char ICUResourceBundleFactory::fgClassID = '\0';
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ICUResourceBundleFactory)
 
 /*
  ******************************************************************
@@ -691,21 +694,37 @@
     int32_t _timestamp;
     UVector _ids;
     int32_t _pos;
-    void* _bufp;
-    int32_t _buflen;
 
 private:
-    ServiceEnumeration(const ICULocaleService* service, UErrorCode status)
+    ServiceEnumeration(const ICULocaleService* service, UErrorCode &status)
         : _service(service)
         , _timestamp(service->getTimestamp())
         , _ids(uhash_deleteUnicodeString, NULL, status)
         , _pos(0)
-        , _bufp(NULL)
-        , _buflen(0)
     {
         _service->getVisibleIDs(_ids, status);
     }
 
+    ServiceEnumeration(const ServiceEnumeration &other, UErrorCode &status)
+        : _service(other._service)
+        , _timestamp(other._timestamp)
+        , _ids(uhash_deleteUnicodeString, NULL, status)
+        , _pos(0)
+    {
+        if(U_SUCCESS(status)) {
+            int32_t i, length;
+
+            length = other._ids.size();
+            for(i = 0; i < length; ++i) {
+                _ids.addElement(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
+            }
+
+            if(U_SUCCESS(status)) {
+                _pos = other._pos;
+            }
+        }
+    }
+
 public:
     static ServiceEnumeration* create(const ICULocaleService* service) {
         UErrorCode status = U_ZERO_ERROR;
@@ -717,55 +736,20 @@
         return NULL;
     }
 
-    virtual ~ServiceEnumeration() {
-        uprv_free(_bufp);
-    }
-
-    virtual int32_t count(UErrorCode& status) const {
-        return upToDate(status) ? _ids.size() : 0;
-    }
+    virtual ~ServiceEnumeration() {}
 
-    const char* next(int32_t* resultLength, UErrorCode& status) {
-        const UnicodeString* us = snext(status);
-        if (us) {
-            while (TRUE) {
-                int32_t newlen = us->extract((char*)_bufp, _buflen / sizeof(char), NULL, status);
-                if (status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR) {
-                    resizeBuffer((newlen + 1) * sizeof(char));
-                    status = U_ZERO_ERROR;
-                } else if (U_SUCCESS(status)) {
-                    ((char*)_bufp)[newlen] = 0;
-                    if (resultLength) {
-                        resultLength[0] = newlen;
-                    }
-                    return (const char*)_bufp;
-                } else {
-                    break;
-                }
-            }
+    virtual StringEnumeration *clone() const {
+        UErrorCode status = U_ZERO_ERROR;
+        ServiceEnumeration *cl = new ServiceEnumeration(*this, status);
+        if(U_FAILURE(status)) {
+            delete cl;
+            cl = NULL;
         }
-        return NULL;
+        return cl;
     }
 
-    const UChar* unext(int32_t* resultLength, UErrorCode& status) {
-        const UnicodeString* us = snext(status);
-        if (us) {
-            while (TRUE) {
-                int32_t newlen = us->extract((UChar*)_bufp, _buflen / sizeof(UChar), status);
-                if (status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR) {
-                    resizeBuffer((newlen + 1) * sizeof(UChar));
-                } else if (U_SUCCESS(status)) {
-                    ((UChar*)_bufp)[newlen] = 0;
-                    if (resultLength) {
-                        resultLength[0] = newlen;
-                    }
-                    return (const UChar*)_bufp;
-                } else {
-                    break;
-                }
-            }
-        }
-        return NULL;
+    virtual int32_t count(UErrorCode& status) const {
+        return upToDate(status) ? _ids.size() : 0;
     }
 
     const UnicodeString* snext(UErrorCode& status) {
@@ -775,15 +759,6 @@
         return NULL;
     }
 
-    void resizeBuffer(int32_t newlen) {
-        if (_bufp) {
-            _bufp = uprv_realloc(_bufp, newlen);
-        } else {
-            _bufp = uprv_malloc(newlen);
-        }
-        _buflen = newlen;
-    }
-
     UBool upToDate(UErrorCode& status) const {
         if (U_SUCCESS(status)) {
             if (_timestamp == _service->getTimestamp()) {
@@ -806,13 +781,11 @@
     }
 
 public:
-    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }
-    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }
-private:
-    static const char fgClassID;
+    static UClassID getStaticClassID(void);
+    virtual UClassID getDynamicClassID(void) const;
 };
 
-const char ServiceEnumeration::fgClassID = '\0';
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ServiceEnumeration)
 
 StringEnumeration*
 ICULocaleService::getAvailableLocales(void) const

Index: iculserv.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/iculserv.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- iculserv.h	10 Sep 2003 02:42:02 -0000	1.1
+++ iculserv.h	6 Apr 2004 10:07:58 -0000	1.2
@@ -25,11 +25,8 @@
 
 #else
 
-#include "unicode/uobject.h"
 #include "unicode/unistr.h"
-#include "unicode/chariter.h"
 #include "unicode/locid.h"
-#include "unicode/ubrk.h"
 #include "unicode/strenum.h"
 
 #include "hash.h"
@@ -157,13 +154,9 @@
     /**
      * UObject boilerplate.
      */
-    static inline UClassID getStaticClassID() { 
-        return (UClassID)&fgClassID;
-    }
+    static UClassID getStaticClassID();
 
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    virtual UClassID getDynamicClassID() const;
 
 #ifdef SERVICE_DEBUG
  public:
@@ -171,8 +164,6 @@
     virtual UnicodeString& debugClass(UnicodeString& result) const;
 #endif
 
- private:
-    static const char fgClassID;
 };
 
 /*
@@ -274,17 +265,13 @@
      */
     virtual const Hashtable* getSupportedIDs(UErrorCode& status) const;
 
- public:
+public:
     /**
      * UObject boilerplate.
      */
-    static inline UClassID getStaticClassID() { 
-        return (UClassID)&fgClassID;
-    }
+    static UClassID getStaticClassID();
 
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    virtual UClassID getDynamicClassID() const;
 
 #ifdef SERVICE_DEBUG
  public:
@@ -292,8 +279,6 @@
     virtual UnicodeString& debugClass(UnicodeString& result) const;
 #endif
 
- private:
-    static const char fgClassID;
 };
 
 /*
@@ -344,17 +329,13 @@
  	virtual UBool isSupportedID(const UnicodeString& id, UErrorCode& status) const;
 
 
- public:
+public:
     /**
      * UObject boilerplate.
      */
-    static inline UClassID getStaticClassID() { 
-        return (UClassID)&fgClassID;
-    }
+    static UClassID getStaticClassID();
 
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    virtual UClassID getDynamicClassID() const;
 
 #ifdef SERVICE_DEBUG
  public:
@@ -362,8 +343,6 @@
     virtual UnicodeString& debugClass(UnicodeString& result) const;
 #endif
 
- private:
-    static const char fgClassID;
 };
 
 /*
@@ -406,17 +385,13 @@
      */
     virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const;
 
- public:
+public:
     /**
      * UObject boilerplate.
      */
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    static UClassID getStaticClassID();
+    virtual UClassID getDynamicClassID() const;
 
-    static UClassID getStaticClassID() { 
-        return (UClassID)&fgClassID;
-    }
 
 #ifdef SERVICE_DEBUG
  public:
@@ -424,8 +399,6 @@
     virtual UnicodeString& debugClass(UnicodeString& result) const;
 #endif
 
- private:
-    static const char fgClassID;
 };
 
 /*

Index: icunotif.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/icunotif.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- icunotif.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ icunotif.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,6 +1,6 @@
 /**
  *******************************************************************************
- * Copyright (C) 2001-2002, International Business Machines Corporation and    *
+ * Copyright (C) 2001-2003, International Business Machines Corporation and    *
  * others. All Rights Reserved.                                                *
  *******************************************************************************
  */
@@ -10,11 +10,30 @@
 #if !UCONFIG_NO_SERVICE
 
 #include "icunotif.h"
+#if DEBUG
 #include <stdio.h>
+#endif
 
 U_NAMESPACE_BEGIN
 
-const char EventListener::fgClassID = '\0';
+EventListener::~EventListener() {}
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(EventListener)
+
+ICUNotifier::ICUNotifier(void) 
+    : notifyLock(0), listeners(NULL) 
+{
+    umtx_init(&notifyLock);
+}
+
+ICUNotifier::~ICUNotifier(void) {
+    {
+        Mutex lmx(&notifyLock);
+        delete listeners;
+        listeners = NULL;
+    }
+    umtx_destroy(&notifyLock);
+}
+
 
 void 
 ICUNotifier::addListener(const EventListener* l, UErrorCode& status) 
@@ -90,7 +109,7 @@
   }
 }
 
-U_NAMESPACE_END;
+U_NAMESPACE_END
 
 /* UCONFIG_NO_SERVICE */
 #endif

Index: icunotif.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/icunotif.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- icunotif.h	10 Sep 2003 02:42:02 -0000	1.1
+++ icunotif.h	6 Apr 2004 10:07:58 -0000	1.2
@@ -32,19 +32,16 @@
 U_NAMESPACE_BEGIN
 
 class U_COMMON_API EventListener : public UObject {
- public: 
-  virtual ~EventListener() {}
+public: 
+    virtual ~EventListener();
 
- public:
-    static inline UClassID getStaticClassID() { 
-        return (UClassID)&fgClassID;
-    }
+public:
+    static UClassID getStaticClassID();
 
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    virtual UClassID getDynamicClassID() const;
 
- public:
+public:
+#ifdef SERVICE_DEBUG
     virtual UnicodeString& debug(UnicodeString& result) const {
       return debugClass(result);
     }
@@ -52,9 +49,7 @@
     virtual UnicodeString& debugClass(UnicodeString& result) const {
       return result.append("Key");
     }
-
- private:
-    static const char fgClassID;
+#endif
 };
 
 /**
@@ -75,60 +70,49 @@
  */
 
 class U_COMMON_API ICUNotifier : public UMemory  {
- private: UMTX notifyLock;
- private: UVector* listeners;
-
- public: 
- ICUNotifier(void) 
-   : notifyLock(0), listeners(NULL) 
-   {
-     umtx_init(&notifyLock);
-   }
-
- virtual ~ICUNotifier(void) {
-   {
-     Mutex lmx(&notifyLock);
-     delete listeners;
-     listeners = NULL;
-   }
-   umtx_destroy(&notifyLock);
- }
-
- /**
-  * Add a listener to be notified when notifyChanged is called.
-  * The listener must not be null. AcceptsListener must return
-  * true for the listener.  Attempts to concurrently
-  * register the identical listener more than once will be
-  * silently ignored.  
-  */
- virtual void addListener(const EventListener* l, UErrorCode& status);
-
- /**
-  * Stop notifying this listener.  The listener must
-  * not be null.  Attemps to remove a listener that is
-  * not registered will be silently ignored.
-  */
- virtual void removeListener(const EventListener* l, UErrorCode& status);
-
- /**
-  * ICU doesn't spawn its own threads.  All listeners are notified in
-  * the thread of the caller.  Misbehaved listeners can therefore
-  * indefinitely block the calling thread.  Callers should beware of
-  * deadlock situations.  
-  */
- virtual void notifyChanged(void);
-
- protected: 
- /**
-  * Subclasses implement this to return TRUE if the listener is
-  * of the appropriate type.
-  */
- virtual UBool acceptsListener(const EventListener& l) const = 0;
-
- /**
-  * Subclasses implement this to notify the listener.
-  */
- virtual void notifyListener(EventListener& l) const = 0;
+private: UMTX notifyLock;
+private: UVector* listeners;
+         
+public: 
+    ICUNotifier(void);
+    
+    virtual ~ICUNotifier(void);
+    
+    /**
+     * Add a listener to be notified when notifyChanged is called.
+     * The listener must not be null. AcceptsListener must return
+     * true for the listener.  Attempts to concurrently
+     * register the identical listener more than once will be
+     * silently ignored.  
+     */
+    virtual void addListener(const EventListener* l, UErrorCode& status);
+    
+    /**
+     * Stop notifying this listener.  The listener must
+     * not be null.  Attemps to remove a listener that is
+     * not registered will be silently ignored.
+     */
+    virtual void removeListener(const EventListener* l, UErrorCode& status);
+    
+    /**
+     * ICU doesn't spawn its own threads.  All listeners are notified in
+     * the thread of the caller.  Misbehaved listeners can therefore
+     * indefinitely block the calling thread.  Callers should beware of
+     * deadlock situations.  
+     */
+    virtual void notifyChanged(void);
+    
+protected: 
+    /**
+     * Subclasses implement this to return TRUE if the listener is
+     * of the appropriate type.
+     */
+    virtual UBool acceptsListener(const EventListener& l) const = 0;
+    
+    /**
+     * Subclasses implement this to notify the listener.
+     */
+    virtual void notifyListener(EventListener& l) const = 0;
 };
 
 U_NAMESPACE_END

Index: icuserv.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/icuserv.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- icuserv.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ icuserv.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -18,235 +18,6 @@
 
 U_NAMESPACE_BEGIN
 
-// A reference counted wrapper for an object.  Creation and access is
-// through RefHandle.
-
-#ifdef SERVICE_REFCOUNT
-
-#include "unicode/strenum.h"
-
-/*
- ******************************************************************
- */
-
-class RefCounted {
-private:
-  int32_t _count;
-  UObject* _obj;
-
-  friend class RefHandle;
-
-  RefCounted(UObject* objectToAdopt) : _count(1), _obj(objectToAdopt) {}
-  ~RefCounted() { delete _obj; }
-  void ref() { umtx_atomic_inc(&_count); }
-  void unref() { if (umtx_atomic_dec(&_count) == 0) { delete this; }}
-};
-
-/*
- ******************************************************************
- */
-
-// Reference counted handle for an object
-class RefHandle {
-  RefCounted* _ref;
-  
-public:
-  RefHandle() : _ref(NULL) {}
-  RefHandle(UObject* obj) : _ref(new RefCounted(obj)) {}
-  RefHandle(const RefHandle& rhs) : _ref(NULL) { operator=(rhs); }
-  ~RefHandle() { if (_ref) _ref->unref(); }
-  RefHandle& operator=(const RefHandle& rhs) {
-    if (rhs._ref) rhs._ref->ref();
-    if (_ref) _ref->unref();
-    _ref = rhs._ref;
-  }
-  const UObject* get() const { return _ref ? _ref->_obj : NULL; }
-};
-
-/*
- ******************************************************************
- */
-
-// Generic enumeration class with fail-fast behavior.
-
-class MapEnumeration : public UObject, public StringEnumeration
-{
-    private:
-    UChar* _buffer;
-    int _buflen;
-
-    protected:
-    const ICUService* _service;
-    uint32_t _timestamp;
-    RefHandle _table;
-    ICUServiceKey* _filter;
-    int32_t _position;
-    int32_t _count;
-
-    protected:
-    MapEnumeration(ICUService* service, int32_t timestamp, RefHandle& table, ICUServiceKey* filter = NULL)
-        : _buffer(NULL)
-        , _buflen(0)
-        , _service(service)
-        , _timestamp(timestamp)
-        , _table(table)
-        , _filter(filter)
-        , _position(0)
-        , _count(((const Hashtable*)table.get())->count())
-    {
-    }
-   
-    virtual ~MapEnumeration()
-    {
-        delete _filter;
-    }
-
-    int32_t count(UErrorCode& status) const
-    {
-        return U_SUCCESS(status) ? _count : 0;
-    }
-
-  const char* next(UErrorCode& status) {
-    const UnicodeString* us = snext(status);
-    if (us) {
-      int newlen;
-      for (newlen = us->extract((char*)_buffer, _buflen / sizeof(char), NULL, status);
-           status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR;)
-        {
-          resizeBuffer((newlen + 1) * sizeof(char));
-          status = U_ZERO_ERROR;
-        }
-      
-      if (U_SUCCESS(status)) {
-        ((char*)_buffer)[newlen] = 0;
-        return (const char*)_buffer;
-      }
-    }
-    return NULL;
-  }
-
-  const UChar* unext(UErrorCode& status) {
-    const UnicodeString* us = snext(status);
-    if (us) {
-      int newlen;
-      for (newlen = us->extract((UChar*)_buffer, _buflen / sizeof(UChar), NULL, status);
-           status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR;)
-        {
-          resizeBuffer((newlen + 1) * sizeof(UChar));
-          status = U_ZERO_ERROR;
-        }
-      
-      if (U_SUCCESS(status)) {
-        ((UChar*)_buffer)[newlen] = 0;
-        return (const UChar*)_buffer;
-      }
-    }
-    return NULL;
-  }
-
-    const UnicodeString* snext(UErrorCode& status) 
-    {
-        if (U_SUCCESS(status)) {
-            if (_timestamp != _service->_timestamp) {
-                status = U_ENUM_OUT_OF_SYNCH_ERROR;
-            } else {
-                return internalNext((Hashtable*)_table.get());
-            }
-        }
-        return NULL;
-    }
-
-    void reset(UErrorCode& status)
-    {
-        if (U_SUCCESS(status)) {
-            service->reset(this);
-        }
-    }
-
-    protected:
-    virtual const UnicodeString* internalNext(Hashtable* table) = 0;
-
-    private:
-    void reset(RefHandle& table, int32_t timestamp)
-    {
-        _table = table;
-        _timestamp = timestamp;
-        _position = 0;
-        _count = ((const Hashtable*)table.get())->count();
-    }
-
-    friend class ICUService;
-};
-
-/*
- ******************************************************************
- */
-
-// An enumeration over the visible ids in a service.  The ids
-// are in the hashtable, which is refcounted, so it will not
-// disappear as long as the enumeration exists even if the
-// service itself unrefs it.  For "fail-fast" behavior the
-// enumeration checks the timestamp of the service, but this
-// is not a guarantee that the result the client receives will
-// still be valid once the function returns.
-
-class IDEnumeration : public MapEnumeration {
-public:
-  IDEnumeration(ICUService* service, int32_t timestamp, RefHandle& table, ICUServiceKey* filter = NULL)
-    : MapEnumeration(service, timestamp, table, filter)
-  {
-  }
-
-protected:
-  const UnicodeString* internalNext(Hashtable* table) {
-    while (TRUE) {
-      const UnicodeString* elem = (const UnicodeString*)(table->nextElement(_position).key.pointer);
-      if (elem == NULL ||
-          _filter == NULL ||
-          _filter->isFallbackOf(*elem)) {
-        return elem;
-      }
-    }
-    return NULL;
-  }
-};
-
-/*
- ******************************************************************
- */
-
-class DisplayEnumeration : public MapEnumeration {
-private:
-  Locale _locale;
-  UnicodeString _cache;
-
-public:
-  DisplayEnumeration(ICUService* service, int32_t timestamp, RefHandle& table, Locale& locale, ICUServiceKey* filter = NULL)
-    : MapEnumeration(service, timestamp, table, filter), _locale(locale)
-  {
-  }
-
-protected:
-  const UnicodeString* internalNext(Hashtable* table) {
-    while (TRUE) {
-      UHashElement* elem = table->nextElement(_position);
-      if (elem == NULL) {
-        return NULL;
-      }
-      const UnicodeString* id = (const UnicodeString*)elem->key.pointer;
-      const ICUServiceFactory* factory = (const ICUServiceFactory*)elem->value.pointer;
-      if (_filter == NULL || _filter->isFallbackOf(*id)) {
-        factory->getDisplayName(*id, cache, locale);
-        return &cache;
-      }
-    }
-    return NULL;
-  }
-};
-
-/* SERVICE_REFCOUNT */
-#endif
-
 /*
  ******************************************************************
  */
@@ -343,7 +114,7 @@
 }
 #endif
 
-const char ICUServiceKey::fgClassID = '\0';
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ICUServiceKey)
 
 /*
  ******************************************************************
@@ -382,7 +153,7 @@
 }
 
 UnicodeString& 
-SimpleFactory::getDisplayName(const UnicodeString& id, const Locale& locale, UnicodeString& result) const 
+SimpleFactory::getDisplayName(const UnicodeString& id, const Locale& /* locale */, UnicodeString& result) const 
 {
   if (_visible && _id == id) {
     result = _id;
@@ -411,13 +182,13 @@
 }
 #endif
 
-const char SimpleFactory::fgClassID = '\0';
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SimpleFactory)
 
 /*
  ******************************************************************
  */
 
-const char ServiceListener::fgClassID = '\0';
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ServiceListener)
 
 /*
  ******************************************************************
@@ -480,8 +251,8 @@
 };
 
 // UObjectDeleter for serviceCache
-
-U_CAPI void U_EXPORT2
+U_CDECL_BEGIN
+static void U_CALLCONV
 cacheDeleter(void* obj) {
   U_NAMESPACE_USE
     ((CacheEntry*)obj)->unref();
@@ -490,11 +261,12 @@
 /**
  * Deleter for UObjects
  */
-U_CAPI void U_EXPORT2
+static void U_CALLCONV
 deleteUObject(void *obj) {
   U_NAMESPACE_USE
     delete (UObject*) obj;
 }
+U_CDECL_END
 
 /*
  ******************************************************************
@@ -547,7 +319,7 @@
 }
 
 U_CAPI void U_EXPORT2
-deleteStringPair(void *obj) {
+userv_deleteStringPair(void *obj) {
   U_NAMESPACE_USE
     delete (StringPair*) obj;
 }
@@ -721,7 +493,6 @@
       // going to update the cache at all.
       putInCache = TRUE;
 
-      int32_t n = 0;
       int32_t index = startIndex;
       while (index < limit) {
         ICUServiceFactory* f = (ICUServiceFactory*)factories->elementAt(index++);
@@ -819,7 +590,7 @@
 }
 
 UObject* 
-ICUService::handleDefault(const ICUServiceKey& key, UnicodeString* actualIDReturn, UErrorCode& status) const 
+ICUService::handleDefault(const ICUServiceKey& /* key */, UnicodeString* /* actualIDReturn */, UErrorCode& /* status */) const 
 {
   return NULL;
 }
@@ -972,7 +743,7 @@
 
         int32_t pos = 0;
         const UHashElement* entry = NULL;
-        while (entry = m->nextElement(pos)) {
+        while ((entry = m->nextElement(pos)) != NULL) {
           const UnicodeString* id = (const UnicodeString*)entry->key.pointer;
           ICUServiceFactory* f = (ICUServiceFactory*)entry->value.pointer;
           UnicodeString dname;
@@ -996,7 +767,7 @@
   ICUServiceKey* matchKey = createKey(matchID, status);
   int32_t pos = 0;
   const UHashElement *entry = NULL;
-  while (entry = dnCache->cache.nextElement(pos)) {
+  while ((entry = dnCache->cache.nextElement(pos)) != NULL) {
     const UnicodeString* id = (const UnicodeString*)entry->value.pointer;
     if (matchKey != NULL && !matchKey->isFallbackOf(*id)) {
       continue;

Index: icuserv.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/icuserv.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- icuserv.h	10 Sep 2003 02:42:02 -0000	1.1
+++ icuserv.h	6 Apr 2004 10:07:58 -0000	1.2
@@ -24,11 +24,8 @@
 
 #else
 
-#include "unicode/uobject.h"
 #include "unicode/unistr.h"
-#include "unicode/chariter.h"
 #include "unicode/locid.h"
-#include "unicode/ubrk.h"
 
 #include "hash.h"
 #include "uvector.h"
@@ -42,7 +39,6 @@
 class ICUServiceFactory;
 class SimpleFactory;
 class ServiceListener;
-class ICUServiceEnumeration;
 class ICUService;
 
 class DNCache;
@@ -183,20 +179,16 @@
   */
   static UnicodeString& parseSuffix(UnicodeString& result);
 
- public:
+public:
   /**
    * UObject RTTI boilerplate.
    */
-  static inline UClassID getStaticClassID() { 
-    return (UClassID)&fgClassID;
-  }
+  static UClassID getStaticClassID();
 
   /**
    * UObject RTTI boilerplate.
    */
-  virtual UClassID getDynamicClassID() const {
-    return getStaticClassID();
-  }
+  virtual UClassID getDynamicClassID() const;
 
 #ifdef SERVICE_DEBUG
  public:
@@ -204,8 +196,6 @@
   virtual UnicodeString& debugClass(UnicodeString& result) const;
 #endif
 
- private:
-    static const char fgClassID;
 };
 
  /*******************************************************************
@@ -347,16 +337,12 @@
  /**
   * UObject RTTI boilerplate.
   */
-  static inline UClassID getStaticClassID() { 
-	  return (UClassID)&fgClassID;
-  }
+  static UClassID getStaticClassID();
 
  /**
   * UObject RTTI boilerplate.
   */
-  virtual UClassID getDynamicClassID() const {
-	  return getStaticClassID();
-  }
+  virtual UClassID getDynamicClassID() const;
 
 #ifdef SERVICE_DEBUG
  public:
@@ -364,8 +350,6 @@
   virtual UnicodeString& debugClass(UnicodeString& toAppendTo) const;
 #endif
 
- private:
-  static const char fgClassID;
 };
 
 /*
@@ -394,19 +378,13 @@
     /**
      * UObject RTTI boilerplate.
      */
-    static inline UClassID getStaticClassID() {
-        return (UClassID)&fgClassID;
-    }
+    static UClassID getStaticClassID();
     
     /**
      * UObject RTTI boilerplate.
      */
-    virtual UClassID getDynamicClassID() const {
-        return getStaticClassID();
-    }
+    virtual UClassID getDynamicClassID() const;
     
-private:
-    static const char fgClassID;
 };
 
 /*
@@ -455,7 +433,7 @@
  * Deleter for StringPairs
  */
 U_CAPI void U_EXPORT2
-deleteStringPair(void *obj);
+userv_deleteStringPair(void *obj);
 
 /**
  * Opaque type returned by registerInstance and registerFactory.

Index: locid.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/locid.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- locid.cpp	10 Sep 2003 02:42:02 -0000	1.4
+++ locid.cpp	6 Apr 2004 10:07:58 -0000	1.5
@@ -11,9 +11,9 @@
 * Modification History:
 *
 *   Date        Name        Description
-*   02/11/97    aliu        Changed gLocPath to fgDataDirectory and added 
+*   02/11/97    aliu        Changed gLocPath to fgDataDirectory and added
 *                           methods to get and set it.
-*   04/02/97    aliu        Made operator!= inline; fixed return value 
+*   04/02/97    aliu        Made operator!= inline; fixed return value
 *                           of getName().
 *   04/15/97    aliu        Cleanup for AIX/Win32.
 *   04/24/97    aliu        Numerous changes per code review.
@@ -39,6 +39,8 @@
 #include "uhash.h"
 #include "ucln_cmn.h"
 
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 static Locale*  availableLocaleList = NULL;
 static int32_t  availableLocaleListCount;
 typedef enum ELocalePos {
@@ -67,12 +69,27 @@
     eMAX_LOCALES
 } ELocalePos;
 
-/* Use void * to make it properly aligned */
-/* Add 1 for rounding */
-// static void *gByteLocaleCache[(eMAX_LOCALES + 1) * sizeof(Locale) / sizeof(void*)];
+U_CFUNC int32_t locale_getKeywords(const char *localeID,
+            char prev,
+            char *keywords, int32_t keywordCapacity,
+            char *values, int32_t valuesCapacity, int32_t *valLen,
+            UBool valuesToo,
+            UErrorCode *status);
+
+static Locale        *gLocaleCache         = NULL;
+static const Locale  *gDefaultLocale       = NULL;
+static UHashtable    *gDefaultLocalesHashT = NULL;
+
+U_CDECL_BEGIN
+//
+// Deleter function for Locales owned by the default Locale hash table.
+//
+static void U_CALLCONV
+deleteLocale(void *obj) {   // installed with uhash_setValueDeleter() on the default-locale hash table
+    delete (Locale *) obj;  // the untyped value is cast back to Locale and destroyed
+}
+U_CDECL_END
 
-static Locale *gLocaleCache   = NULL;
-static Locale *gDefaultLocale = NULL;
 
 UBool
 locale_cleanup(void)
@@ -89,40 +106,114 @@
         delete [] gLocaleCache;
         gLocaleCache = NULL;
     }
-    if (gDefaultLocale) {
-        delete gDefaultLocale;
-        gDefaultLocale = NULL;
+
+    if (gDefaultLocalesHashT) {
+        uhash_close(gDefaultLocalesHashT);   // Automatically deletes all elements, using deleter func.
+        gDefaultLocalesHashT = NULL;
     }
+    gDefaultLocale = NULL;
+
     return TRUE;
 }
 
 U_NAMESPACE_BEGIN
-const char Locale::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)
 
+//
+//  locale_set_default_internal.
+//
 void locale_set_default_internal(const char *id)
 {
     U_NAMESPACE_USE
-    Locale tempLocale(Locale::eBOGUS);
+    UErrorCode   status = U_ZERO_ERROR;
 
-    if (id == NULL) 
-    {
+    // If given a NULL string for the locale id, grab the default
+    //   name from the system.
+    //   (Different from most other locale APIs, where a null name means use
+    //    the current ICU default locale.)
+    if (id == NULL) {
         umtx_lock(NULL);
         id = uprv_getDefaultLocaleID();
         umtx_unlock(NULL);
     }
 
-    tempLocale.init(id);   // Note:  we do not want to hold the mutex through init(),
-                           //        which is a relatively large, complex function.
-                           //        Hence, the use of a temporary locale.
-    const Locale *defLocale = &Locale::getDefault();
-    
+    // put the locale id into a canonical form,
+    //   in preparation for looking up this locale in the hash table of
+    //   already-created locale objects.
+    //
+    status = U_ZERO_ERROR;
+    char localeNameBuf[512];
+
+    uloc_getName(id, localeNameBuf, sizeof(localeNameBuf)-1, &status);
+    localeNameBuf[sizeof(localeNameBuf)-1] = 0;  // Force null termination in event of
+                                                 //   a long name filling the buffer.
+                                                 //   (long names are truncated.)
+
+    // Lazy creation of the hash table itself, if needed.
+    //
     umtx_lock(NULL);
-    Locale *ncDefLocale = (Locale *)defLocale;
-    *ncDefLocale = tempLocale;
+    UBool hashTableNeedsInit = (gDefaultLocalesHashT == NULL);
     umtx_unlock(NULL);
+    if (hashTableNeedsInit) {
+        status = U_ZERO_ERROR;
+        UHashtable *tHashTable = uhash_open(uhash_hashChars, uhash_compareChars, &status);
+        if (U_FAILURE(status)) {
+            return;
+        }
+        uhash_setValueDeleter(tHashTable, deleteLocale);
+        umtx_lock(NULL);
+        if (gDefaultLocalesHashT == NULL) {
+            gDefaultLocalesHashT = tHashTable;
+            umtx_unlock(NULL);
+        } else {
+            umtx_unlock(NULL);
+            uhash_close(tHashTable);
+        }
+    }
+
+    // Hash table lookup, key is the locale full name
+    umtx_lock(NULL);
+    Locale *newDefault = (Locale *)uhash_get(gDefaultLocalesHashT, localeNameBuf);
+    if (newDefault != NULL) {
+        // We have the requested locale in the hash table already.
+        // Just set it as default.  Inside the mutex lock, for those troublesome processors.
+        gDefaultLocale = newDefault;
+        umtx_unlock(NULL);
+    } else {
+        umtx_unlock(NULL);
+        // We haven't seen this locale id before.
+        // Create a new Locale object for it.
+        newDefault = new Locale(Locale::eBOGUS);
+        if (newDefault == NULL) {
+            // No way to report errors from here.
+            return;
+        }
+        newDefault->init(localeNameBuf);
+
+        // Add newly created Locale to the hash table of default Locales
+        const char *key = newDefault->getName();
+        U_ASSERT(uprv_strcmp(key, localeNameBuf) == 0);
+        umtx_lock(NULL);
+        const Locale *hashTableVal = (const Locale *)uhash_get(gDefaultLocalesHashT, key);
+        if (hashTableVal == NULL) {
+            uhash_put(gDefaultLocalesHashT, (void *)key, newDefault, &status);
+            gDefaultLocale = newDefault;
+            umtx_unlock(NULL);
+            // ignore errors from hash table insert.  (Couldn't do anything anyway)
+            // We can still set the default Locale,
+            //  it just won't be cached, and will eventually leak.
+        } else {
+            // Some other thread raced us through here, and got the new Locale
+            //   into the hash table before us.  Use that one.
+            gDefaultLocale = hashTableVal;  // Assignment to gDefaultLocale must happen inside mutex
+            umtx_unlock(NULL);
+            delete newDefault;
+        }
+    }
 }
 U_NAMESPACE_END
 
+
 /* sfb 07/21/99 */
 U_CFUNC void
 locale_set_default(const char *id)
@@ -149,32 +240,42 @@
 #define SEP_CHAR '_'
 
 Locale::~Locale()
-{   
+{
     /*if fullName is on the heap, we free it*/
-    if (fullName != fullNameBuffer) 
+    if (fullName != fullNameBuffer)
     {
         uprv_free(fullName);
         fullName = NULL;
     }
+    if (baseName && baseName != baseNameBuffer) {
+        uprv_free(baseName);
+        baseName = NULL;
+    }
 }
 
 Locale::Locale()
-    : UObject(), fullName(fullNameBuffer)
+    : UObject(), fullName(fullNameBuffer), baseName(NULL)
 {
     init(NULL);
 }
 
-Locale::Locale(Locale::ELocaleType t) 
-    : UObject(), fullName(fullNameBuffer)
+/*
+ * Internal constructor to allow construction of a locale object with
+ *   NO side effects.   (Default constructor tries to get
+ *   the default locale.)
+ */
+Locale::Locale(Locale::ELocaleType)
+    : UObject(), fullName(fullNameBuffer), baseName(NULL)
 {
     setToBogus();
 }
 
 
-Locale::Locale( const   char * newLanguage, 
-                const   char * newCountry, 
-                const   char * newVariant) 
-    : UObject(), fullName(fullNameBuffer)
+Locale::Locale( const   char * newLanguage,
+                const   char * newCountry,
+                const   char * newVariant,
+                const   char * newKeywords)
+    : UObject(), fullName(fullNameBuffer), baseName(NULL)
 {
     if( (newLanguage==NULL) && (newCountry == NULL) && (newVariant == NULL) )
     {
@@ -189,6 +290,7 @@
         int32_t lsize = 0;
         int32_t csize = 0;
         int32_t vsize = 0;
+        int32_t ksize = 0;
         char    *p;
 
         // Calculate the size of the resulting string.
@@ -215,7 +317,7 @@
             {
                 newVariant++;
             }
-            
+
             // remove trailing _'s
             vsize = (int32_t)uprv_strlen(newVariant);
             while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) )
@@ -232,13 +334,20 @@
         // Separator rules:
         if ( vsize > 0 )
         {
-            size += 2;  // at least: __v 
+            size += 2;  // at least: __v
         }
         else if ( csize > 0 )
         {
-            size += 1;  // at least: _v 
+            size += 1;  // at least: _v
+        }
+
+        if ( newKeywords != NULL) 
+        {
+            ksize = (int32_t)uprv_strlen(newKeywords);
+            size += ksize + 1;
         }
 
+
         //  NOW we have the full locale string..
 
         /*if the whole string is longer than our internal limit, we need
@@ -269,7 +378,7 @@
         }
 
         if ( csize != 0 )
-        { 
+        {
             uprv_strcpy(p, newCountry);
             p += csize;
         }
@@ -278,11 +387,18 @@
         {
             *p++ = SEP_CHAR; // at least: __v
 
-            uprv_strncpy(p, newVariant, vsize);  // Must use strncpy because 
+            uprv_strncpy(p, newVariant, vsize);  // Must use strncpy because
             p += vsize;                          // of trimming (above).
             *p = 0; // terminate
         }
 
+        if ( ksize != 0)
+        {
+            *p++ = '@';
+            uprv_strcpy(p, newKeywords);
+            p += ksize;
+        }
+
         // Parse it, because for example 'language' might really be a complete
         // string.
         init(togo);
@@ -294,7 +410,7 @@
 }
 
 Locale::Locale(const Locale &other)
-    : UObject(other), fullName(fullNameBuffer)
+    : UObject(other), fullName(fullNameBuffer), baseName(NULL)
 {
     *this = other;
 }
@@ -316,16 +432,29 @@
         fullName = fullNameBuffer;
     }
 
+    if(baseName && baseName != baseNameBuffer) {
+        uprv_free(baseName);
+        baseName = NULL;
+    }
+
     /* Allocate the full name if necessary */
     if(other.fullName != other.fullNameBuffer) {
         fullName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(other.fullName)+1));
     }
-
     /* Copy the full name */
     uprv_strcpy(fullName, other.fullName);
 
+    if(other.baseName) {
+        if(other.baseName != other.baseNameBuffer) {
+            baseName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(other.fullName)+1));
+        }
+        uprv_strcpy(baseName, other.baseName);
+    }
+
+
     /* Copy the language and country fields */
     uprv_strcpy(language, other.language);
+    uprv_strcpy(script, other.script);
     uprv_strcpy(country, other.country);
 
     /* The variantBegin is an offset into fullName, just copy it */
@@ -334,6 +463,11 @@
     return *this;
 }
 
+Locale *
+Locale::clone() const {
+    return new Locale(*this);   // copy-constructs a new heap-allocated Locale from this one
+}
+
 UBool
 Locale::operator==( const   Locale& other) const
 {
@@ -350,11 +484,20 @@
         fullName = fullNameBuffer;
     }
 
+    if(baseName && baseName != baseNameBuffer) {
+        uprv_free(baseName);
+        baseName = NULL;
+    }
+
     // not a loop:
     // just an easy way to have a common error-exit
     // without goto and without another function
     do {
-        char *separator, *prev;
+        char *separator;
+        char *field[5] = {0};
+        int32_t fieldLen[5] = {0};
+        int32_t fieldIdx;
+        int32_t variantField;
         int32_t length;
         UErrorCode err;
 
@@ -363,10 +506,13 @@
             return *this = getDefault();
         }
 
+        /* preset all fields to empty */
+        language[0] = script[0] = country[0] = 0;
+
         // "canonicalize" the locale ID to ICU/Java format
         err = U_ZERO_ERROR;
         length = uloc_getName(localeID, fullName, sizeof(fullNameBuffer), &err);
-        if(U_FAILURE(err) || err == U_STRING_NOT_TERMINATED_WARNING) {
+        if(err == U_BUFFER_OVERFLOW_ERROR || length >= (int32_t)sizeof(fullNameBuffer)) {
             /*Go to heap for the fullName if necessary*/
             fullName = (char *)uprv_malloc(sizeof(char)*(length + 1));
             if(fullName == 0) {
@@ -381,50 +527,55 @@
             break;
         }
 
-        /* preset all fields to empty */
-        language[0] = country[0] = 0;
-        variantBegin = (int32_t)uprv_strlen(fullName);
+        variantBegin = length;
 
         /* after uloc_getName() we know that only '_' are separators */
-        separator = uprv_strchr(fullName, SEP_CHAR);
-        if(separator != 0) {
-            /* there is a country field */
-            length = (int32_t)(separator - fullName);
-            if(length > 0) {
-                if(length >= (int32_t)sizeof(language)) {
-                    break; // error: language code too long
-                }
-                uprv_memcpy(language, fullName, length);
-            }
-            language[length] = 0;
+        separator = field[0] = fullName;
+        fieldIdx = 1;
+        while ((separator = uprv_strchr(field[fieldIdx-1], SEP_CHAR)) && fieldIdx < (int32_t)(sizeof(field)/sizeof(field[0]))-1) {
+            field[fieldIdx] = separator + 1;
+            fieldLen[fieldIdx-1] = separator - field[fieldIdx-1];
+            fieldIdx++;
+        }
+        if((separator = uprv_strchr(field[fieldIdx-1], '@'))) {
+            fieldLen[fieldIdx-1] = separator - field[fieldIdx-1];
+        } else {
+            fieldLen[fieldIdx-1] = length - (int32_t)(field[fieldIdx-1] - fullName);
+        }
 
-            prev = separator + 1;
-            separator = uprv_strchr(prev, SEP_CHAR);
-            if(separator != 0) {
-                /* there is a variant field */
-                length = (int32_t)(separator - prev);
-                if(length > 0) {
-                    if(length >= (int32_t)sizeof(country)) {
-                        break; // error: country code too long
-                    }
-                    uprv_memcpy(country, prev, length);
-                }
-                country[length] = 0;
+        if (fieldLen[0] >= (int32_t)(sizeof(language))
+            || (fieldLen[1] == 4 && fieldLen[2] >= (int32_t)(sizeof(country)))
+            || (fieldLen[1] != 4 && fieldLen[1] >= (int32_t)(sizeof(country))))
+        {
+            break; // error: one of the fields is too long
+        }
 
-                variantBegin = (int32_t)((separator + 1) - fullName);
-            } else {
-                /* variantBegin==strlen(fullName), length==strlen(language)==prev-1-fullName */
-                if((variantBegin - length - 1) >= (int32_t)sizeof(country)) {
-                    break; // error: country code too long
-                }
-                uprv_strcpy(country, prev);
-            }
-        } else {
-            /* variantBegin==strlen(fullName) */
-            if(variantBegin >= (int32_t)sizeof(language)) {
-                break; // error: language code too long
+        variantField = 0;
+        if (fieldLen[0] > 0) {
+            /* We have a language */
+            uprv_memcpy(language, fullName, fieldLen[0]);
+            language[fieldLen[0]] = 0;
+        }
+        if (fieldLen[1] == 4) {
+            /* We have at least a script */
+            uprv_memcpy(script, field[1], fieldLen[1]);
+            script[fieldLen[1]] = 0;
+            variantField = 3;
+            if (fieldLen[2] > 0) {
+                /* We have a country */
+                uprv_memcpy(country, field[2], fieldLen[2]);
+                country[fieldLen[2]] = 0;
             }
-            uprv_strcpy(language, fullName);
+        }
+        else if (fieldLen[1] > 0) {
+            /* We have a country and no script */
+            uprv_memcpy(country, field[1], fieldLen[1]);
+            country[fieldLen[1]] = 0;
+            variantField = 2;
+        }
+        if (variantField > 0 && fieldLen[variantField] > 0) {
+            /* We have a variant */
+            variantBegin = (int32_t)(field[variantField] - fullName);
         }
 
         // successful end of init()
@@ -438,74 +589,65 @@
 }
 
 int32_t
-Locale::hashCode() const 
+Locale::hashCode() const
 {
     UHashTok hashKey;
     hashKey.pointer = fullName;
     return uhash_hashChars(hashKey);
 }
 
-void 
+void
 Locale::setToBogus() {
-  /* Free our current storage */
-  if(fullName != fullNameBuffer) {
-      uprv_free(fullName);
-      fullName = fullNameBuffer;
-  }
-  *fullNameBuffer = 0;
-  *language = 0;
-  *country = 0;
-  fIsBogus = TRUE;
+    /* Free our current storage */
+    if(fullName != fullNameBuffer) {
+        uprv_free(fullName);
+        fullName = fullNameBuffer;
+    }
+    *fullNameBuffer = 0;
+    *language = 0;
+    *script = 0;
+    *country = 0;
+    fIsBogus = TRUE;
 }
 
 const Locale&
-Locale::getDefault() 
+Locale::getDefault()
 {
     umtx_lock(NULL);
     UBool needInit = (gDefaultLocale == NULL);
     umtx_unlock(NULL);
     if (needInit) {
-        Locale *tLocale = new Locale(Locale::eBOGUS);
-        if (tLocale != NULL) {
-            const char *cLocale;
-
-            umtx_lock(NULL);
-            /* uprv_getDefaultLocaleID is not thread safe, so we surround it with a mutex */
-            cLocale = uprv_getDefaultLocaleID();
-            umtx_unlock(NULL);
-
-            tLocale->init(cLocale);
-            umtx_lock(NULL);
-            if (gDefaultLocale == NULL) {
-                gDefaultLocale = tLocale;
-                tLocale = NULL;
-            }
-            umtx_unlock(NULL);
-            delete tLocale;
-        }
+        umtx_lock(NULL);
+        /* uprv_getDefaultLocaleID is not thread safe, so we surround it with a mutex */
+        const char *cLocale = uprv_getDefaultLocaleID();
+        umtx_unlock(NULL);
+        locale_set_default_internal(cLocale);
     }
     return *gDefaultLocale;
 }
 
-void 
-Locale::setDefault( const   Locale&     newLocale, 
-                            UErrorCode&  status) 
+
+
+void
+Locale::setDefault( const   Locale&     newLocale,
+                            UErrorCode&  status)
 {
-    if (U_FAILURE(status))
+    if (U_FAILURE(status)) {
         return;
-    
-    const Locale *defLocale = &Locale::getDefault();
-    umtx_lock(NULL);
-    Locale *ncDefLocale = (Locale *)defLocale;
-    *ncDefLocale = newLocale;
-    umtx_unlock(NULL);
+    }
+
+    /* Set the default from the full name string of the supplied locale.
+     * This is a convenient way to access the default locale caching mechanisms.
+     */
+    const char *localeID = newLocale.getName();
+    locale_set_default_internal(localeID);
 }
 
 Locale
 Locale::createFromName (const char *name)
 {
     if (name) {
-        Locale l;
+        Locale l("");
         l.init(name);
         return l;
     }
@@ -535,13 +677,13 @@
  * in an incorrect format, 0 is returned.  The LocaleID is for use in
  * Windows (it is an LCID), but is available on all platforms.
  */
-uint32_t 
+uint32_t
 Locale::getLCID() const
 {
     return uloc_getLCID(fullName);
 }
 
-UnicodeString& 
+UnicodeString&
 Locale::getDisplayLanguage(UnicodeString& dispLang) const
 {
     return this->getDisplayLanguage(getDefault(), dispLang);
@@ -591,7 +733,51 @@
     return result;
 }
 
-UnicodeString& 
+UnicodeString&
+Locale::getDisplayScript(UnicodeString& dispScript) const
+{
+    return this->getDisplayScript(getDefault(), dispScript);   // delegates to the two-argument overload, passing getDefault() as the display locale
+}
+
+UnicodeString&
+Locale::getDisplayScript(const Locale &displayLocale,
+                          UnicodeString &result) const {   // stores the output of uloc_getDisplayScript() in result; result is emptied on failure
+    UChar *buffer;
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t length;
+
+    buffer=result.getBuffer(ULOC_FULLNAME_CAPACITY);   // first attempt: request a writable buffer of ULOC_FULLNAME_CAPACITY units
+    if(buffer==0) {
+        result.truncate(0);   // no buffer available: return an empty string
+        return result;
+    }
+
+    length=uloc_getDisplayScript(fullName, displayLocale.fullName,
+                                  buffer, result.getCapacity(),
+                                  &errorCode);
+    result.releaseBuffer(length);
+
+    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {   // output did not fit: retry once with a buffer sized from the returned length
+        buffer=result.getBuffer(length);
+        if(buffer==0) {
+            result.truncate(0);
+            return result;
+        }
+        errorCode=U_ZERO_ERROR;   // clear the overflow error before calling again
+        length=uloc_getDisplayScript(fullName, displayLocale.fullName,
+                                      buffer, result.getCapacity(),
+                                      &errorCode);
+        result.releaseBuffer(length);
+    }
+
+    if(U_FAILURE(errorCode)) {
+        result.truncate(0);   // any remaining failure yields an empty result
+    }
+
+    return result;
+}
+
+UnicodeString&
 Locale::getDisplayCountry(UnicodeString& dispCntry) const
 {
     return this->getDisplayCountry(getDefault(), dispCntry);
@@ -635,7 +821,7 @@
     return result;
 }
 
-UnicodeString& 
+UnicodeString&
 Locale::getDisplayVariant(UnicodeString& dispVar) const
 {
     return this->getDisplayVariant(getDefault(), dispVar);
@@ -679,7 +865,7 @@
     return result;
 }
 
-UnicodeString& 
+UnicodeString&
 Locale::getDisplayName( UnicodeString& name ) const
 {
     return this->getDisplayName(getDefault(), name);
@@ -723,13 +909,13 @@
     return result;
 }
 const Locale*
-Locale::getAvailableLocales(int32_t& count) 
+Locale::getAvailableLocales(int32_t& count)
 {
     // for now, there is a hardcoded list, so just walk through that list and set it up.
     umtx_lock(NULL);
     UBool needInit = availableLocaleList == 0;
     umtx_unlock(NULL);
-    
+
     if (needInit) {
         int32_t locCount = uloc_countAvailable();
         Locale *newLocaleList = 0;
@@ -739,13 +925,13 @@
         if (newLocaleList == NULL) {
             return NULL;
         }
-      
+
         count = locCount;
-      
+
         while(--locCount >= 0) {
             newLocaleList[locCount].setFromPOSIXID(uloc_getAvailable(locCount));
         }
-      
+
         umtx_lock(NULL);
         if(availableLocaleList == 0) {
             availableLocaleListCount = count;
@@ -769,7 +955,7 @@
     return uloc_getISOLanguages();
 }
 
-// Set the locale's data based on a posix id. 
+// Set the locale's data based on a posix id.
 void Locale::setFromPOSIXID(const char *posixID)
 {
     init(posixID);
@@ -925,7 +1111,7 @@
     umtx_lock(NULL);
     UBool needInit = (gLocaleCache == NULL);
     umtx_unlock(NULL);
-    
+
     if (needInit) {
         Locale *tLocaleCache = new Locale[eMAX_LOCALES];
         if (tLocaleCache == NULL) {
@@ -949,7 +1135,7 @@
         tLocaleCache[eUS]            = Locale("en", "US");
         tLocaleCache[eCANADA]        = Locale("en", "CA");
         tLocaleCache[eCANADA_FRENCH] = Locale("fr", "CA");
-        
+
         umtx_lock(NULL);
         if (gLocaleCache == NULL) {
             gLocaleCache = tLocaleCache;
@@ -962,6 +1148,137 @@
     }
     return gLocaleCache;
 }
+
+class KeywordEnumeration : public StringEnumeration {  // enumerates a list of NUL-terminated keywords that ends with an empty string
+private:
+    char *keywords;     // heap copy of the list, or &fgClassID (used as a static empty string) when the enumeration is empty
+    char *current;      // position of the keyword that next() will return
+    int32_t length;     // number of bytes copied into keywords, not counting the extra terminator
+    UnicodeString currUSKey;
+    static const char fgClassID;/* Warning this is used beyond the typical RTTI usage. */
+
+public:
+    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }
+    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }
+    // Copies keywordLen bytes of keys and starts iterating at currentIndex; on any error status is set and the object stays empty.
+    KeywordEnumeration(const char *keys, int32_t keywordLen, int32_t currentIndex, UErrorCode &status)
+        : keywords((char *)&fgClassID), current((char *)&fgClassID), length(0) {
+        if(U_SUCCESS(status) && keywordLen != 0) {
+            if(keys == NULL || keywordLen < 0) {
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+            } else {
+                char *copy = (char *)uprv_malloc(keywordLen+1);  // allocate into a temporary so a failure cannot leave keywords NULL
+                if (copy == NULL) {
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                } else {
+                    uprv_memcpy(copy, keys, keywordLen);
+                    copy[keywordLen] = 0;   // extra terminator marks the end of the list
+                    keywords = copy;
+                    current = keywords + currentIndex;
+                    length = keywordLen;
+                }
+            }
+        }
+    }
+
+    ~KeywordEnumeration() {
+        if(keywords != (char *)&fgClassID) { uprv_free(keywords); }  // the static sentinel was never allocated and must not be freed
+    }
+
+    virtual StringEnumeration * clone() const   // new enumeration over the same keywords, at the same position
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        return new KeywordEnumeration(keywords, length, (int32_t)(current - keywords), status);
+    }
+
+    int32_t count(UErrorCode &/*status*/) const {   // number of keywords in the whole list, independent of the current position
+        char *kw = keywords;
+        int32_t result = 0;
+        while(*kw) {
+            result++;
+            kw += uprv_strlen(kw)+1;   // skip this keyword and its terminator
+        }
+        return result;
+    }
+
+    const char* next(int32_t* resultLength, UErrorCode& status) {   // returns the next keyword, or NULL at the end or on a failing status
+        const char* result;
+        int32_t len;
+        if(U_SUCCESS(status) && *current != 0) {
+            result = current;
+            len = (int32_t)uprv_strlen(current);
+            current += len+1;
+            if(resultLength != NULL) {
+                *resultLength = len;
+            }
+        } else {
+            if(resultLength != NULL) {
+                *resultLength = 0;
+            }
+            result = NULL;
+        }
+        return result;
+    }
+
+    const UnicodeString* snext(UErrorCode& status) {   // like next(), but passes the keyword through setChars()
+        int32_t resultLength = 0;
+        const char *s = next(&resultLength, status);
+        return setChars(s, resultLength, status);
+    }
+
+    void reset(UErrorCode& /*status*/) {   // restart iteration at the first keyword
+        current = keywords;
+    }
+};
+
+const char KeywordEnumeration::fgClassID = '\0';
+
+StringEnumeration *
+Locale::createKeywords(UErrorCode &status) const
+{
+    char keywords[256];   // receives the keyword list written by locale_getKeywords()
+    int32_t keywordCapacity = 256;
+    StringEnumeration *result = NULL;   // stays NULL when fullName has no '@', when no keywords are found, or on a format error
+
+    const char* variantStart = uprv_strchr(fullName, '@');
+    const char* assignment = uprv_strchr(fullName, '=');   // searched from the start of fullName, not from the '@'
+    if(variantStart) {
+        if(assignment) {
+            int32_t keyLen = locale_getKeywords(variantStart+1, '@', keywords, keywordCapacity, NULL, 0, NULL, FALSE, &status);
+            if(keyLen) {
+                result = new KeywordEnumeration(keywords, keyLen, 0, status);   // NOTE(review): the result of new is not checked for NULL
+            }
+        } else {
+            status = U_INVALID_FORMAT_ERROR;   // an '@' with no '=' anywhere in the name is reported as a format error
+        }
+    }
+    return result;
+}
+
+int32_t
+Locale::getKeywordValue(const char* keywordName, char *buffer, int32_t bufLen, UErrorCode &status) const
+{
+    return uloc_getKeywordValue(fullName, keywordName, buffer, bufLen, &status);   // thin wrapper: forwards this locale's fullName plus all arguments and returns the callee's result
+}
+
+const char *
+Locale::getBaseName() const
+{
+    // lazy init (semantically const: only the cached baseName is written); returns NULL, uncached, if the heap copy cannot be allocated
+    UErrorCode status = U_ZERO_ERROR, retryStatus = U_ZERO_ERROR;
+    if(baseName == 0) {
+        ((Locale *)this)->baseName = ((Locale *)this)->baseNameBuffer;   // first try the inline buffer
+        int32_t baseNameSize = uloc_getBaseName(fullName, baseName, ULOC_FULLNAME_CAPACITY, &status);
+        if(baseNameSize >= ULOC_FULLNAME_CAPACITY) {
+            ((Locale *)this)->baseName = (char *)uprv_malloc(sizeof(char) * (baseNameSize + 1));
+            if(baseName == 0) { return baseName; }   // out of memory: leave the cache empty so a later call can retry
+            uloc_getBaseName(fullName, baseName, baseNameSize+1, &retryStatus);   // fresh status: the first call presumably left an overflow error that would make this retry a no-op
+        }
+        baseName[baseNameSize] = 0;
+    }
+    return baseName;
+}
+
 
 //eof
 U_NAMESPACE_END

Index: locmap.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/locmap.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- locmap.c	10 Sep 2003 02:42:02 -0000	1.4
+++ locmap.c	6 Apr 2004 10:07:58 -0000	1.5
@@ -27,6 +27,7 @@
  * 08/23/01     george      Convert to C
  */
 
+#include "unicode/uloc.h"
 #include "locmap.h"
 #include "cstring.h"
 
@@ -70,7 +71,6 @@
  { "in", "iso-8859-1" },  /* Indonesian */
  { "is", "iso-8859-1" },  /* Icelandic */
  { "it", "iso-8859-1" },  /* Italian  */
- { "iw", "iso-8859-8" },  /* hebrew old ISO name */
  { "ja", "Shift_JIS"  },  /* Japanese [was: ja_JP ] */
  { "ji", "iso-8859-8" },  /* Yiddish */
  { "kl", "iso-8859-4" },  /* Greenlandic */
@@ -134,23 +134,15 @@
     return NULL;
 }
 
-#ifdef WIN32
-
 /*
  * Note:
- * This code is used only internally by putil.c/uprv_getDefaultLocaleID().
  * The mapping from Win32 locale ID numbers to POSIX locale strings should
- * be the faster one. It is more important to get the LCID to ICU locale
- * mapping correct than to get a correct ICU locale to LCID mapping.
- *
- * In order to test this code, please use the lcid test program.
+ * be the faster one.
  *
- * The LCID values come from winnt.h
+ * Many LCID values come from winnt.h
+ * Some also come from http://www.microsoft.com/globaldev/reference/lcid-all.mspx
  */
 
-#include "unicode/uloc.h"
-
-
 /*
 ////////////////////////////////////////////////
 //
@@ -171,25 +163,6 @@
     const struct ILcidPosixElement* const regionMaps;
 } ILcidPosixMap;
 
-static const char* posixID(const ILcidPosixMap *this_0, uint32_t fromHostID);
-
-/**
- * Searches for a Windows LCID
- *
- * @param posixid the Posix style locale id.
- * @param status gets set to U_ILLEGAL_ARGUMENT_ERROR when the Posix ID has
- *               no equivalent Windows LCID.
- * @return the LCID
- */
-static uint32_t hostID(const ILcidPosixMap *this_0, const char* fromPosixID, UErrorCode* status);
-
-/**
- * Do not call this function. It is called by hostID.
- * The function is not private because this struct must stay as a C struct,
- * and this is an internal class.
- */
-static int32_t idCmp(const char* id1, const char* id2);
-
 
 /*
 /////////////////////////////////////////////////
@@ -252,19 +225,41 @@
     {0x2401, "ar_YE"}
 };
 
-ILCID_POSIX_ELEMENT_ARRAY(0x044d, as, as_IN)    /*Todo: Data does not exist*/
+ILCID_POSIX_ELEMENT_ARRAY(0x044d, as, as_IN)
+ILCID_POSIX_ELEMENT_ARRAY(0x045e, am, am_ET)
 
 static const ILcidPosixElement az[] = {
     {0x2c,   "az"},
     {0x082c, "az_AZ"},  /* Cyrillic based */
-    {0x042c, "az_AZ_LATN"} /* Latin based */
+    {0x042c, "az_Latn_AZ"} /* Latin based */
 };
 
 ILCID_POSIX_ELEMENT_ARRAY(0x0423, be, be_BY)
 ILCID_POSIX_ELEMENT_ARRAY(0x0402, bg, bg_BG)
-ILCID_POSIX_ELEMENT_ARRAY(0x0445, bn, bn_IN)
+
+static const ILcidPosixElement bn[] = {
+    {0x45,   "bn"},
+    {0x0845, "bn_BD"},
+    {0x0445, "bn_IN"}
+};
+
+
+static const ILcidPosixElement bo[] = {
+    {0x51,   "bo"},
+    {0x0851, "bo_BT"},
+    {0x0451, "bo_CN"}
+};
+
 ILCID_POSIX_ELEMENT_ARRAY(0x0403, ca, ca_ES)
-ILCID_POSIX_ELEMENT_ARRAY(0x0405, cs, cs_CZ)
+ILCID_POSIX_ELEMENT_ARRAY(0x045c, chr,chr_US)
+
+/* Declared as cs_CZ to get around compiler errors on z/OS, which defines cs as a function */
+static const ILcidPosixElement cs_CZ[] = {
+    {0x05,   "cs"},
+    {0x0405, "cs_CZ"},
+};
+
+ILCID_POSIX_ELEMENT_ARRAY(0x0452, cy, cy_GB)
 ILCID_POSIX_ELEMENT_ARRAY(0x0406, da, da_DK)
 
 static const ILcidPosixElement de[] = {
@@ -295,7 +290,12 @@
     {0x007f, "en_US_POSIX"}, /* duplicate for roundtripping */
     {0x2409, "en_VI"},  /* Virgin Islands AKA Caribbean Islands (en_CB). */
     {0x1c09, "en_ZA"},
-    {0x3009, "en_ZW"}
+    {0x3009, "en_ZW"},
+    {0x0409, "en_AS"},  /* Alias for en_US. Leave last. */
+    {0x0409, "en_GU"},  /* Alias for en_US. Leave last. */
+    {0x0409, "en_MH"},  /* Alias for en_US. Leave last. */
+    {0x0409, "en_MP"},  /* Alias for en_US. Leave last. */
+    {0x0409, "en_UM"}   /* Alias for en_US. Leave last. */
 };
 
 static const ILcidPosixElement en_US_POSIX[] = {
@@ -336,20 +336,43 @@
     {0x0c,   "fr"},
     {0x080c, "fr_BE"},
     {0x0c0c, "fr_CA"},
+    {0x240c, "fr_CD"},
     {0x100c, "fr_CH"},
+    {0x300c, "fr_CI"},
+    {0x2c0c, "fr_CM"},
     {0x040c, "fr_FR"},
+    {0x3c0c, "fr_HT"},
     {0x140c, "fr_LU"},
-    {0x180c, "fr_MC"}
+    {0x380c, "fr_MA"},
+    {0x180c, "fr_MC"},
+    {0x340c, "fr_ML"},
+    {0x200c, "fr_RE"},
+    {0x280c, "fr_SN"}
+};
+
+ILCID_POSIX_ELEMENT_ARRAY(0x0462, fy, fy_NL)
+
+/* This LCID is really two different locales.*/
+static const ILcidPosixElement ga[] = {
+    {0x3c,   "ga"},
+    {0x3c,   "gd"},
+    {0x083c, "ga_IE"},  /* Gaelic (Ireland) */
+    {0x043c, "gd_GB"}   /* Gaelic (Scotland) */
 };
 
 ILCID_POSIX_ELEMENT_ARRAY(0x0456, gl, gl_ES)
 ILCID_POSIX_ELEMENT_ARRAY(0x0447, gu, gu_IN)
+ILCID_POSIX_ELEMENT_ARRAY(0x0474, gn, gn_PY)
+ILCID_POSIX_ELEMENT_ARRAY(0x0468, ha, ha_NG)
+ILCID_POSIX_ELEMENT_ARRAY(0x0475, haw,haw_US)
 ILCID_POSIX_ELEMENT_ARRAY(0x040d, he, he_IL)
 ILCID_POSIX_ELEMENT_ARRAY(0x0439, hi, hi_IN)
 
-/* This LCID is really three different locales.*/
+/* This LCID is really four different locales.*/
 static const ILcidPosixElement hr[] = {
     {0x1a,   "hr"},
+    {0x141a, "bs_BA"},  /* Bosnian, Bosnia and Herzegovina */
+    {0x141a, "bs"},     /* Bosnian */
     {0x041a, "hr_HR"},  /* Croatian*/
     {0x081a, "sh_YU"},  /* Serbo-Croatian*/
     {0x081a, "sh"},     /* It's 0x1a or 0x081a, pick one to make the test program happy. */
@@ -360,6 +383,7 @@
 ILCID_POSIX_ELEMENT_ARRAY(0x040e, hu, hu_HU)
 ILCID_POSIX_ELEMENT_ARRAY(0x042b, hy, hy_AM)
 ILCID_POSIX_ELEMENT_ARRAY(0x0421, id, id_ID)
+ILCID_POSIX_ELEMENT_ARRAY(0x0470, ig, ig_NG)
 ILCID_POSIX_ELEMENT_ARRAY(0x040f, is, is_IS)
 
 static const ILcidPosixElement it[] = {
@@ -368,10 +392,12 @@
     {0x0410, "it_IT"}
 };
 
+ILCID_POSIX_ELEMENT_ARRAY(0x045d, iu, iu_CA)    /* TODO: verify country */
 ILCID_POSIX_ELEMENT_ARRAY(0x040d, iw, iw_IL)    /*Left in for compatibility*/
 ILCID_POSIX_ELEMENT_ARRAY(0x0411, ja, ja_JP)
 ILCID_POSIX_ELEMENT_ARRAY(0x0437, ka, ka_GE)
 ILCID_POSIX_ELEMENT_ARRAY(0x043f, kk, kk_KZ)
+ILCID_POSIX_ELEMENT_ARRAY(0x0453, km, km_KH)
 ILCID_POSIX_ELEMENT_ARRAY(0x044b, kn, kn_IN)
 
 static const ILcidPosixElement ko[] = {
@@ -381,15 +407,25 @@
 };
 
 ILCID_POSIX_ELEMENT_ARRAY(0x0457, kok, kok_IN)
-ILCID_POSIX_ELEMENT_ARRAY(0x0460, ks,  ks_IN)   /*Todo: Data does not exist*/
-ILCID_POSIX_ELEMENT_ARRAY(0x0440, ky,  ky_KG)   /* Kyrgyz is spoken in Kyrgyzstan */
-ILCID_POSIX_ELEMENT_ARRAY(0x0427, lt,  lt_LT)
-ILCID_POSIX_ELEMENT_ARRAY(0x0426, lv,  lv_LV)
-ILCID_POSIX_ELEMENT_ARRAY(0x042f, mk,  mk_MK)
-ILCID_POSIX_ELEMENT_ARRAY(0x044c, ml,  ml_IN)   /*Todo: Data does not exist*/
-ILCID_POSIX_ELEMENT_ARRAY(0x0450, mn,  mn_MN)
-ILCID_POSIX_ELEMENT_ARRAY(0x0458, mni, mni_IN)  /*Todo: Data does not exist*/
-ILCID_POSIX_ELEMENT_ARRAY(0x044e, mr,  mr_IN)
+ILCID_POSIX_ELEMENT_ARRAY(0x0471, kr,  kr_NG)
+
+static const ILcidPosixElement ks[] = {         /* We could add PK and CN too */
+    {0x60,   "ks"},
+    {0x0860, "ks_IN"},              /* Documentation doesn't mention script */
+    {0x0460, "ks_Arab_IN"}
+};
+
+ILCID_POSIX_ELEMENT_ARRAY(0x0440, ky, ky_KG)   /* Kyrgyz is spoken in Kyrgyzstan */
+ILCID_POSIX_ELEMENT_ARRAY(0x0476, la, la_IT)   /* TODO: Verify the country */
+ILCID_POSIX_ELEMENT_ARRAY(0x0454, lo, lo_LA)
+ILCID_POSIX_ELEMENT_ARRAY(0x0427, lt, lt_LT)
+ILCID_POSIX_ELEMENT_ARRAY(0x0426, lv, lv_LV)
+ILCID_POSIX_ELEMENT_ARRAY(0x0481, mi, mi_NZ)
+ILCID_POSIX_ELEMENT_ARRAY(0x042f, mk, mk_MK)
+ILCID_POSIX_ELEMENT_ARRAY(0x044c, ml, ml_IN)
+ILCID_POSIX_ELEMENT_ARRAY(0x0450, mn, mn_MN)
+ILCID_POSIX_ELEMENT_ARRAY(0x0458, mni,mni_IN)
+ILCID_POSIX_ELEMENT_ARRAY(0x044e, mr, mr_IN)
 
 static const ILcidPosixElement ms[] = {
     {0x3e,   "ms"},
@@ -400,7 +436,7 @@
 /* The MSJDK documentation says this is maltese, but it's not supported.*/
 ILCID_POSIX_ELEMENT_ARRAY(0x043a, mt, mt_MT)
 
-static const ILcidPosixElement ne[] = {         /*Todo: Data does not exist*/
+static const ILcidPosixElement ne[] = {
     {0x61,   "ne"},
     {0x0861, "ne_IN"},   /* India*/
     {0x0461, "ne_NP"}    /* Nepal*/
@@ -415,19 +451,31 @@
 /* The "no" locale split into nb and nn.  By default in ICU, "no" is nb.*/
 static const ILcidPosixElement no[] = {
     {0x14,   "nb"},     /* really nb */
-    {0x0414, "nb_NO"},  /* really nb_NO */
-    {0x0814, "nn_NO"},  /* really nn_NO */
-    {0x0814, "nn"}      /* It's 0x14 or 0x814, pick one to make the test program happy. */
+    {0x0414, "nb_NO"},  /* really nb_NO. Keep first in the 414 list. */
+    {0x0414, "no"},     /* really nb_NO */
+    {0x0414, "no_NO"},  /* really nb_NO */
+    {0x0814, "nn_NO"},  /* really nn_NO. Keep first in the 814 list.  */
+    {0x0814, "nn"},     /* It's 0x14 or 0x814, pick one to make the test program happy. */
+    {0x0814, "no_NO_NY"}/* really nn_NO */
 };
 
+ILCID_POSIX_ELEMENT_ARRAY(0x046c, nso,nso_ZA)   /* TODO: Verify the country */
+ILCID_POSIX_ELEMENT_ARRAY(0x0472, om, om_ET)    /* TODO: Verify the country */
+
 /* Declared as or_IN to get around compiler errors*/
 static const ILcidPosixElement or_IN[] = {
     {0x48,   "or"},
     {0x0448, "or_IN"},
 };
 
-ILCID_POSIX_ELEMENT_ARRAY(0x0446, pa, pa_IN)
+static const ILcidPosixElement pa[] = {
+    {0x46,   "pa"},
+    {0x0446, "pa_IN"},
+    {0x0846, "pa_PK"}
+};
+
 ILCID_POSIX_ELEMENT_ARRAY(0x0415, pl, pl_PL)
+ILCID_POSIX_ELEMENT_ARRAY(0x0463, ps, ps_AF)
 
 static const ILcidPosixElement pt[] = {
     {0x16,   "pt"},
@@ -435,6 +483,13 @@
     {0x0816, "pt_PT"}
 };
 
+static const ILcidPosixElement qu[] = {
+    {0x6B,   "qu"},
+    {0x046B, "qu_BO"},
+    {0x086B, "qu_EC"},
+    {0x0C6B, "qu_PE"}
+};
+
 ILCID_POSIX_ELEMENT_ARRAY(0x0418, ro, ro_RO)
 
 static const ILcidPosixElement root[] = {
@@ -443,9 +498,17 @@
 
 ILCID_POSIX_ELEMENT_ARRAY(0x0419, ru, ru_RU)
 ILCID_POSIX_ELEMENT_ARRAY(0x044f, sa, sa_IN)
-ILCID_POSIX_ELEMENT_ARRAY(0x0459, sd, sd_IN)    /*Todo: Data does not exist*/
+
+static const ILcidPosixElement sd[] = {
+    {0x59,   "sd"},
+    {0x0459, "sd_IN"},
+    {0x0859, "sd_PK"}
+};
+
+ILCID_POSIX_ELEMENT_ARRAY(0x045b, si, si_LK)
 ILCID_POSIX_ELEMENT_ARRAY(0x041b, sk, sk_SK)
 ILCID_POSIX_ELEMENT_ARRAY(0x0424, sl, sl_SI)
+ILCID_POSIX_ELEMENT_ARRAY(0x0477, so, so_ET)    /* TODO: Verify the country */
 ILCID_POSIX_ELEMENT_ARRAY(0x041c, sq, sq_AL)
 
 static const ILcidPosixElement sv[] = {
@@ -459,8 +522,19 @@
 ILCID_POSIX_ELEMENT_ARRAY(0x0449, ta, ta_IN)
 ILCID_POSIX_ELEMENT_ARRAY(0x044a, te, te_IN)
 ILCID_POSIX_ELEMENT_ARRAY(0x041e, th, th_TH)
+
+static const ILcidPosixElement ti[] = {
+    {0x73,   "ti"},
+    {0x0873, "ti_ER"},
+    {0x0473, "ti_ET"}
+};
+
+ILCID_POSIX_ELEMENT_ARRAY(0x0442, tk, tk_TM)
+ILCID_POSIX_ELEMENT_ARRAY(0x0464, tl, tl_PH)
+ILCID_POSIX_ELEMENT_ARRAY(0x0432, tn, tn_BW)
 ILCID_POSIX_ELEMENT_ARRAY(0x041f, tr, tr_TR)
 ILCID_POSIX_ELEMENT_ARRAY(0x0444, tt, tt_RU)
+ILCID_POSIX_ELEMENT_ARRAY(0x0480, ug, ug_CN)
 ILCID_POSIX_ELEMENT_ARRAY(0x0422, uk, uk_UA)
 
 static const ILcidPosixElement ur[] = {
@@ -472,36 +546,66 @@
 static const ILcidPosixElement uz[] = {
     {0x43,   "uz"},
     {0x0843, "uz_UZ"},  /* Cyrillic based */
-    {0x0443, "uz_UZ_LATN"} /* Latin based */
+    {0x0443, "uz_Latn_UZ"} /* Latin based */
 };
 
+ILCID_POSIX_ELEMENT_ARRAY(0x0433, ve, ve_ZA)    /* TODO: Verify the country */
 ILCID_POSIX_ELEMENT_ARRAY(0x042a, vi, vi_VN)
+ILCID_POSIX_ELEMENT_ARRAY(0x0434, xh, xh_ZA)    /* TODO: Verify the country */
+ILCID_POSIX_ELEMENT_ARRAY(0x046a, yo, yo_NG)    /* TODO: Verify the country */
 
+/* TODO: Make the locales with the script the primary locale once the names are implemented in the resources. */
 static const ILcidPosixElement zh[] = {
     {0x04,   "zh"},
     {0x0804, "zh_CN"},
+    {0x0804, "zh_Hans_CN"},
     {0x0c04, "zh_HK"},
+    {0x0c04, "zh_Hant_HK"},
     {0x1404, "zh_MO"},
+    {0x1404, "zh_Hant_MO"},
     {0x1004, "zh_SG"},
+    {0x1004, "zh_Hans_SG"},
     {0x0404, "zh_TW"},
-    {0x30404,"zh_TW"},
-    {0x20404,"zh_TW_STROKE"}
+    {0x0404, "zh_Hant_TW"},
+    {0x30404,"zh_TW"},          /* Bopomofo order */
+    {0x30404,"zh_Hant_TW"},
+    {0x20404,"zh_TW@collation=STROKE"},
+    {0x20404,"zh_TW_STROKE"},   /* remove? */
+    {0x20404,"zh_Hant_TW_STROKE"}
 };
 
+ILCID_POSIX_ELEMENT_ARRAY(0x0435, zu, zu_ZA)    /* TODO: Verify the country */
+
 /* This must be static and grouped by LCID. */
+
+/* non-existent ISO-639 codes */
+/*
+0x466   Edo
+0x467   Fulfulde - Nigeria
+0x43b   Sami (Lappish)
+0x42e   Sorbian (iso639 = dsb, hsb, wen)
+0x430   Sutu
+0x45f   Tamazight (Arabic script)
+0x85f   Tamazight (Latin script)
+0x478   Yi
+*/
 static const ILcidPosixMap gPosixIDmap[] = {
     ILCID_POSIX_MAP(af),    /*  af  Afrikaans                 0x36 */
+    ILCID_POSIX_MAP(am),    /*  am  Amharic                   0x5e */
     ILCID_POSIX_MAP(ar),    /*  ar  Arabic                    0x01 */
     ILCID_POSIX_MAP(as),    /*  as  Assamese                  0x4d */
     ILCID_POSIX_MAP(az),    /*  az  Azerbaijani               0x2c */
-    ILCID_POSIX_MAP(be),    /*  be  Byelorussian              0x23 */
+    ILCID_POSIX_MAP(be),    /*  be  Belarusian                0x23 */
     ILCID_POSIX_MAP(bg),    /*  bg  Bulgarian                 0x02 */
     ILCID_POSIX_MAP(bn),    /*  bn  Bengali; Bangla           0x45 */
+    ILCID_POSIX_MAP(bo),    /*  bo  Tibetan                   0x51 */
     ILCID_POSIX_MAP(ca),    /*  ca  Catalan                   0x03 */
-    ILCID_POSIX_MAP(cs),    /*  cs  Czech                     0x05 */
+    ILCID_POSIX_MAP(chr),   /*  chr Cherokee                  0x5c */
+    ILCID_POSIX_MAP(cs_CZ), /*  cs  Czech                     0x05 */
+    ILCID_POSIX_MAP(cy),    /*  cy  Welsh                     0x52 */
     ILCID_POSIX_MAP(da),    /*  da  Danish                    0x06 */
     ILCID_POSIX_MAP(de),    /*  de  German                    0x07 */
-    ILCID_POSIX_MAP(dv),    /*  dv Divehi                     0x65 */
+    ILCID_POSIX_MAP(dv),    /*  dv  Divehi                    0x65 */
     ILCID_POSIX_MAP(el),    /*  el  Greek                     0x08 */
     ILCID_POSIX_MAP(en),    /*  en  English                   0x09 */
     ILCID_POSIX_MAP(en_US_POSIX), /*    invariant             0x7f */
@@ -512,28 +616,39 @@
     ILCID_POSIX_MAP(fi),    /*  fi  Finnish                   0x0b */
     ILCID_POSIX_MAP(fo),    /*  fo  Faroese                   0x38 */
     ILCID_POSIX_MAP(fr),    /*  fr  French                    0x0c */
+    ILCID_POSIX_MAP(fy),    /*  fy  Frisian                   0x62 */
+    ILCID_POSIX_MAP(ga),    /*  *   Gaelic (Ireland,Scotland) 0x3c */
     ILCID_POSIX_MAP(gl),    /*  gl  Galician                  0x56 */
+    ILCID_POSIX_MAP(gn),    /*  gn  Guarani                   0x74 */
     ILCID_POSIX_MAP(gu),    /*  gu  Gujarati                  0x47 */
+    ILCID_POSIX_MAP(ha),    /*  ha  Hausa                     0x68 */
+    ILCID_POSIX_MAP(haw),   /*  haw Hawaiian                  0x75 */
     ILCID_POSIX_MAP(he),    /*  he  Hebrew (formerly iw)      0x0d */
     ILCID_POSIX_MAP(hi),    /*  hi  Hindi                     0x39 */
-    ILCID_POSIX_MAP(hr),    /*  hr  Croatian                  0x1a */
+    ILCID_POSIX_MAP(hr),    /*  *   Croatian and others       0x1a */
     ILCID_POSIX_MAP(hu),    /*  hu  Hungarian                 0x0e */
     ILCID_POSIX_MAP(hy),    /*  hy  Armenian                  0x2b */
     ILCID_POSIX_MAP(id),    /*  id  Indonesian (formerly in)  0x21 */
-/*        ILCID_POSIX_MAP(in),    //  in  Indonesian                0x21 */
+    ILCID_POSIX_MAP(ig),    /*  ig  Igbo                      0x70 */
     ILCID_POSIX_MAP(is),    /*  is  Icelandic                 0x0f */
     ILCID_POSIX_MAP(it),    /*  it  Italian                   0x10 */
+    ILCID_POSIX_MAP(iu),    /*  iu  Inuktitut                 0x5d */
     ILCID_POSIX_MAP(iw),    /*  iw  Hebrew                    0x0d */
     ILCID_POSIX_MAP(ja),    /*  ja  Japanese                  0x11 */
     ILCID_POSIX_MAP(ka),    /*  ka  Georgian                  0x37 */
     ILCID_POSIX_MAP(kk),    /*  kk  Kazakh                    0x3f */
+    ILCID_POSIX_MAP(km),    /*  km  Khmer                     0x53 */
     ILCID_POSIX_MAP(kn),    /*  kn  Kannada                   0x4b */
-    ILCID_POSIX_MAP(ky),    /*  ky  Kyrgyz                    0x40 */
     ILCID_POSIX_MAP(ko),    /*  ko  Korean                    0x12 */
     ILCID_POSIX_MAP(kok),   /*  kok Konkani                   0x57 */
+    ILCID_POSIX_MAP(kr),    /*  kr  Kanuri                    0x71 */
     ILCID_POSIX_MAP(ks),    /*  ks  Kashmiri                  0x60 */
+    ILCID_POSIX_MAP(ky),    /*  ky  Kyrgyz                    0x40 */
+    ILCID_POSIX_MAP(la),    /*  la  Latin                     0x76 */
+    ILCID_POSIX_MAP(lo),    /*  lo  Lao                       0x54 */
     ILCID_POSIX_MAP(lt),    /*  lt  Lithuanian                0x27 */
     ILCID_POSIX_MAP(lv),    /*  lv  Latvian, Lettish          0x26 */
+    ILCID_POSIX_MAP(mi),    /*  mi  Maori                     0x81 */
     ILCID_POSIX_MAP(mk),    /*  mk  Macedonian                0x2f */
     ILCID_POSIX_MAP(ml),    /*  ml  Malayalam                 0x4c */
     ILCID_POSIX_MAP(mn),    /*  mn  Mongolian                 0x50 */
@@ -541,42 +656,62 @@
     ILCID_POSIX_MAP(mr),    /*  mr  Marathi                   0x4e */
     ILCID_POSIX_MAP(ms),    /*  ms  Malay                     0x3e */
     ILCID_POSIX_MAP(mt),    /*  mt  Maltese                   0x3a */
-/*        ILCID_POSIX_MAP(nb),    //  no  Norwegian                 0x14 */
+/*    ILCID_POSIX_MAP(nb),    //  no  Norwegian                 0x14 */
     ILCID_POSIX_MAP(ne),    /*  ne  Nepali                    0x61 */
     ILCID_POSIX_MAP(nl),    /*  nl  Dutch                     0x13 */
-/*        ILCID_POSIX_MAP(nn),    //  no  Norwegian                 0x14 */
-    ILCID_POSIX_MAP(no),    /*  nb/nn Norwegian (formerly no) 0x14 */
+/*    ILCID_POSIX_MAP(nn),    //  no  Norwegian                 0x14 */
+    ILCID_POSIX_MAP(no),    /*  *   Norwegian                 0x14 */
+    ILCID_POSIX_MAP(nso),   /*  nso Sotho, Northern (Sepedi dialect) 0x6c */
+    ILCID_POSIX_MAP(om),    /*  om  Oromo                     0x72 */
     ILCID_POSIX_MAP(or_IN), /*  or  Oriya                     0x48 */
     ILCID_POSIX_MAP(pa),    /*  pa  Punjabi                   0x46 */
     ILCID_POSIX_MAP(pl),    /*  pl  Polish                    0x15 */
+    ILCID_POSIX_MAP(ps),    /*  ps  Pashto                    0x63 */
     ILCID_POSIX_MAP(pt),    /*  pt  Portuguese                0x16 */
+    ILCID_POSIX_MAP(qu),    /*  qu  Quechua (correct spelling)0x6B */
     ILCID_POSIX_MAP(ro),    /*  ro  Romanian                  0x18 */
     ILCID_POSIX_MAP(root),  /*  root                          0x00 */
     ILCID_POSIX_MAP(ru),    /*  ru  Russian                   0x19 */
     ILCID_POSIX_MAP(sa),    /*  sa  Sanskrit                  0x4f */
     ILCID_POSIX_MAP(sd),    /*  sd  Sindhi                    0x59 */
-/*        ILCID_POSIX_MAP(sh),    //  sh  Serbo-Croatian            0x1a */
+/*    ILCID_POSIX_MAP(sh),    //  sh  Serbo-Croatian            0x1a */
+    ILCID_POSIX_MAP(si),    /*  si  Sinhalese                 0x5b */
     ILCID_POSIX_MAP(sk),    /*  sk  Slovak                    0x1b */
     ILCID_POSIX_MAP(sl),    /*  sl  Slovenian                 0x24 */
+    ILCID_POSIX_MAP(so),    /*  so  Somali                    0x77 */
     ILCID_POSIX_MAP(sq),    /*  sq  Albanian                  0x1c */
-/*        ILCID_POSIX_MAP(sr),    //  sr  Serbian                   0x1a */
+/*    ILCID_POSIX_MAP(sr),    //  sr  Serbian                   0x1a */
     ILCID_POSIX_MAP(sv),    /*  sv  Swedish                   0x1d */
     ILCID_POSIX_MAP(sw),    /*  sw  Swahili                   0x41 */
     ILCID_POSIX_MAP(syr),   /*  syr Syriac                    0x5A */
     ILCID_POSIX_MAP(ta),    /*  ta  Tamil                     0x49 */
     ILCID_POSIX_MAP(te),    /*  te  Telugu                    0x4a */
     ILCID_POSIX_MAP(th),    /*  th  Thai                      0x1e */
+    ILCID_POSIX_MAP(ti),    /*  ti  Tigrigna                  0x73 */
+    ILCID_POSIX_MAP(tk),    /*  tk  Turkmen                   0x42 */
+    ILCID_POSIX_MAP(tl),    /*  tl  Tagalog (Filipino)        0x64 */
+    ILCID_POSIX_MAP(tn),    /*  tn  Tswana                    0x32 */
     ILCID_POSIX_MAP(tr),    /*  tr  Turkish                   0x1f */
     ILCID_POSIX_MAP(tt),    /*  tt  Tatar                     0x44 */
+    ILCID_POSIX_MAP(ug),    /*  ug  Uighur                    0x80 */
     ILCID_POSIX_MAP(uk),    /*  uk  Ukrainian                 0x22 */
     ILCID_POSIX_MAP(ur),    /*  ur  Urdu                      0x20 */
     ILCID_POSIX_MAP(uz),    /*  uz  Uzbek                     0x43 */
+    ILCID_POSIX_MAP(ve),    /*  ve  Venda                     0x33 */
     ILCID_POSIX_MAP(vi),    /*  vi  Vietnamese                0x2a */
+    ILCID_POSIX_MAP(xh),    /*  xh  Xhosa                     0x34 */
+    ILCID_POSIX_MAP(yo),    /*  yo  Yoruba                    0x6a */
     ILCID_POSIX_MAP(zh),    /*  zh  Chinese                   0x04 */
+    ILCID_POSIX_MAP(zu),    /*  zu  Zulu                      0x35 */
 };
 
 static const uint32_t gLocaleCount = sizeof(gPosixIDmap)/sizeof(ILcidPosixMap);
 
+/**
+ * Do not call this function. It is called by getHostID.
+ * The function is not private because this struct must stay as a C struct,
+ * and this is an internal class.
+ */
 static int32_t
 idCmp(const char* id1, const char* id2)
 {
@@ -598,7 +733,7 @@
  * @return the LCID
  */
 static uint32_t
-hostID(const ILcidPosixMap *this_0, const char* posixID, UErrorCode* status)
+getHostID(const ILcidPosixMap *this_0, const char* posixID, UErrorCode* status)
 {
     int32_t bestIdx = 0;
     int32_t bestIdxDiff = 0;
@@ -627,7 +762,7 @@
 }
 
 static const char*
-posixID(const ILcidPosixMap *this_0, uint32_t hostID)
+getPosixID(const ILcidPosixMap *this_0, uint32_t hostID)
 {
     uint32_t i;
     for (i = 0; i <= this_0->numRegions; i++)
@@ -661,13 +796,13 @@
     {
         if (langID == gPosixIDmap[index].regionMaps->hostID)
         {
-            return posixID(&gPosixIDmap[index], hostid);
+            return getPosixID(&gPosixIDmap[index], hostid);
         }
     }
 
     /* no match found */
     *status = U_ILLEGAL_ARGUMENT_ERROR;
-    return "??_??";
+    return NULL;
 }
 
 /*
@@ -683,8 +818,9 @@
 {
 
     uint32_t   low    = 0;
-    uint32_t   high   = gLocaleCount - 1;
+    uint32_t   high   = gLocaleCount;
     uint32_t   mid    = high;
+    uint32_t   oldmid = 0;
     int32_t    compVal;
     char       langID[ULOC_FULLNAME_CAPACITY];
 
@@ -704,19 +840,25 @@
     }
 
     /*Binary search for the map entry for normal cases */
-    /* When mid == 0, it's not found */
-    while (low <= high && mid != 0) {
 
-        mid = (low + high + 1) / 2;    /* +1 is to round properly */
+    while (high > low)  /*binary search*/{
 
-        compVal = uprv_strcmp(langID, gPosixIDmap[mid].regionMaps->posixID);
+        mid = (high+low) >> 1; /*Finds median*/
 
-        if (compVal < 0)
-            high = mid - 1;
-        else if (compVal > 0)
-            low = mid + 1;
-        else  /* found match! */
-            return hostID(&gPosixIDmap[mid], posixID, status);
+        if (mid == oldmid) 
+            break;
+
+        compVal = uprv_strcmp(langID, gPosixIDmap[mid].regionMaps->posixID);
+        if (compVal < 0){
+            high = mid;
+        }
+        else if (compVal > 0){
+            low = mid;
+        }
+        else /*we found it*/{
+            return getHostID(&gPosixIDmap[mid], posixID, status);
+        }
+        oldmid = mid;
     }
 
     /*
@@ -725,7 +867,7 @@
      */
     for (idx = 0; idx < gLocaleCount; idx++ ) {
         myStatus = U_ZERO_ERROR;
-        value = hostID(&gPosixIDmap[idx], posixID, &myStatus);
+        value = getHostID(&gPosixIDmap[idx], posixID, &myStatus);
         if (myStatus == U_ZERO_ERROR) {
             return value;
         }
@@ -743,6 +885,4 @@
     *status = U_ILLEGAL_ARGUMENT_ERROR;
     return 0;   /* return international (root) */
 }
-
-#endif
 

Index: locmap.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/locmap.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- locmap.h	10 Sep 2003 02:42:02 -0000	1.4
+++ locmap.h	6 Apr 2004 10:07:58 -0000	1.5
@@ -34,12 +34,10 @@
 U_CAPI const char* U_EXPORT2 
 uprv_defaultCodePageForLocale(const char *locale);
 
-#ifdef WIN32
 #define LANGUAGE_LCID(hostID) (uint16_t)(0x03FF & hostID)
 
 U_CAPI const char *uprv_convertToPosix(uint32_t hostid, UErrorCode* status);
 
 U_CAPI uint32_t uprv_convertToLCID(const char* posixID, UErrorCode* status);
-#endif /* WIN32 */
 
 #endif /* LOCMAP_H */

Index: mutex.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/mutex.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- mutex.h	10 Sep 2003 02:42:02 -0000	1.3
+++ mutex.h	6 Apr 2004 10:07:58 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -27,7 +27,7 @@
 U_NAMESPACE_BEGIN
 
 //----------------------------------------------------------------------------
-// Code within this library which accesses protected data
+// Code within this library that accesses shared static or global data
 // should instantiate a Mutex object while doing so. You should make your own 
 // private mutex where possible.
 
@@ -45,7 +45,7 @@
 //
 // void Function(int arg1, int arg2)
 // {
-//    static Object* foo; // Shared read-write object
+//    static Object* foo;     // Shared read-write object
 //    Mutex mutex(&myMutex);  // or no args for the global lock
 //    foo->Method();
 //    // When 'mutex' goes out of scope and gets destroyed here, the lock is released

Index: normlzr.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/normlzr.cpp,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- normlzr.cpp	10 Sep 2003 02:42:02 -0000	1.5
+++ normlzr.cpp	6 Apr 2004 10:07:58 -0000	1.6
@@ -21,7 +21,7 @@
 
 U_NAMESPACE_BEGIN
 
-const char Normalizer::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
 
 //-------------------------------------------------------------------------
 // Constructors and other boilerplate

Index: propname.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/propname.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- propname.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ propname.cpp	6 Apr 2004 10:07:58 -0000	1.2
@@ -12,6 +12,9 @@
 #include "unicode/uchar.h"
 #include "unicode/udata.h"
 #include "umutex.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uarrsort.h"
 
 U_NAMESPACE_BEGIN
 
@@ -193,7 +196,441 @@
 u_getPropertyValueEnum(UProperty property,
                        const char* alias) {
     return load() ? PNAME->getPropertyValueEnum(property, alias)
-                  : UCHAR_INVALID_CODE;
+                  : (int32_t)UCHAR_INVALID_CODE;
+}
+
+/* data swapping ------------------------------------------------------------ */
+
+/*
+ * Sub-structure-swappers use the temp array (which is as large as the
+ * actual data) for intermediate storage,
+ * as well as to indicate if a particular structure has been swapped already.
+ * The temp array is initially reset to all 0.
+ * pos is the byte offset of the sub-structure in the inBytes/outBytes/temp arrays.
+ */
+
+int32_t
+EnumToOffset::swap(const UDataSwapper *ds,
+                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+                   uint8_t *temp, int32_t pos,
+                   UErrorCode *pErrorCode) {
+    const EnumToOffset *inMap;
+    EnumToOffset *outMap, *tempMap;
+    int32_t size;
+
+    tempMap=(EnumToOffset *)(temp+pos);
+    if(tempMap->enumStart!=0 || tempMap->enumLimit!=0) {
+        /* this map was swapped already */
+        size=tempMap->getSize();
+        return size;
+    }
+
+    inMap=(const EnumToOffset *)(inBytes+pos);
+    outMap=(EnumToOffset *)(outBytes+pos);
+
+    tempMap->enumStart=udata_readInt32(ds, inMap->enumStart);
+    tempMap->enumLimit=udata_readInt32(ds, inMap->enumLimit);
+    size=tempMap->getSize();
+
+    if(length>=0) {
+        if(length<(pos+size)) {
+            if(length<(int32_t)sizeof(PropertyAliases)) {
+                udata_printError(ds, "upname_swap(EnumToOffset): too few bytes (%d after header)\n"
+                                     "    for pnames.icu EnumToOffset{%d..%d} at %d\n",
+                                 length, tempMap->enumStart, tempMap->enumLimit, pos);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+        }
+
+        /* swap enumStart and enumLimit */
+        ds->swapArray32(ds, inMap, 2*sizeof(EnumValue), outMap, pErrorCode);
+
+        /* swap _offsetArray[] */
+        ds->swapArray16(ds, inMap->getOffsetArray(), (tempMap->enumLimit-tempMap->enumStart)*sizeof(Offset),
+                           outMap->getOffsetArray(), pErrorCode);
+    }
+
+    return size;
+}
+
+int32_t
+NonContiguousEnumToOffset::swap(const UDataSwapper *ds,
+                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+                   uint8_t *temp, int32_t pos,
+                   UErrorCode *pErrorCode) {
+    const NonContiguousEnumToOffset *inMap;
+    NonContiguousEnumToOffset *outMap, *tempMap;
+    int32_t size;
+
+    tempMap=(NonContiguousEnumToOffset *)(temp+pos);
+    if(tempMap->count!=0) {
+        /* this map was swapped already */
+        size=tempMap->getSize();
+        return size;
+    }
+
+    inMap=(const NonContiguousEnumToOffset *)(inBytes+pos);
+    outMap=(NonContiguousEnumToOffset *)(outBytes+pos);
+
+    tempMap->count=udata_readInt32(ds, inMap->count);
+    size=tempMap->getSize();
+
+    if(length>=0) {
+        if(length<(pos+size)) {
+            if(length<(int32_t)sizeof(PropertyAliases)) {
+                udata_printError(ds, "upname_swap(NonContiguousEnumToOffset): too few bytes (%d after header)\n"
+                                     "    for pnames.icu NonContiguousEnumToOffset[%d] at %d\n",
+                                 length, tempMap->count, pos);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+        }
+
+        /* swap count and _enumArray[] */
+        length=(1+tempMap->count)*sizeof(EnumValue);
+        ds->swapArray32(ds, inMap, length,
+                           outMap, pErrorCode);
+
+        /* swap _offsetArray[] */
+        pos+=length;
+        ds->swapArray16(ds, inBytes+pos, tempMap->count*sizeof(Offset),
+                           outBytes+pos, pErrorCode);
+    }
+
+    return size;
+}
+
+struct NameAndIndex {
+    Offset name, index;
+};
+
+U_CDECL_BEGIN
+typedef int32_t U_CALLCONV PropNameCompareFn(const char *name1, const char *name2);
+
+struct CompareContext {
+    const char *chars;
+    PropNameCompareFn *propCompare;
+};
+
+static int32_t U_CALLCONV
+upname_compareRows(const void *context, const void *left, const void *right) {
+    CompareContext *cmp=(CompareContext *)context;
+    return cmp->propCompare(cmp->chars+((const NameAndIndex *)left)->name,
+                            cmp->chars+((const NameAndIndex *)right)->name);
+}
+U_CDECL_END
+
+int32_t
+NameToEnum::swap(const UDataSwapper *ds,
+                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+                   uint8_t *temp, int32_t pos,
+                   UErrorCode *pErrorCode) {
+    const NameToEnum *inMap;
+    NameToEnum *outMap, *tempMap;
+
+    const EnumValue *inEnumArray;
+    EnumValue *outEnumArray;
+
+    const Offset *inNameArray;
+    Offset *outNameArray;
+
+    NameAndIndex *sortArray;
+    CompareContext cmp;
+
+    int32_t i, size, oldIndex;
+
+    tempMap=(NameToEnum *)(temp+pos);
+    if(tempMap->count!=0) {
+        /* this map was swapped already */
+        size=tempMap->getSize();
+        return size;
+    }
+
+    inMap=(const NameToEnum *)(inBytes+pos);
+    outMap=(NameToEnum *)(outBytes+pos);
+
+    tempMap->count=udata_readInt32(ds, inMap->count);
+    size=tempMap->getSize();
+
+    if(length>=0) {
+        if(length<(pos+size)) {
+            if(length<(int32_t)sizeof(PropertyAliases)) {
+                udata_printError(ds, "upname_swap(NameToEnum): too few bytes (%d after header)\n"
+                                     "    for pnames.icu NameToEnum[%d] at %d\n",
+                                 length, tempMap->count, pos);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+        }
+
+        /* swap count */
+        ds->swapArray32(ds, inMap, 4, outMap, pErrorCode);
+
+        inEnumArray=inMap->getEnumArray();
+        outEnumArray=outMap->getEnumArray();
+
+        inNameArray=(const Offset *)(inEnumArray+tempMap->count);
+        outNameArray=(Offset *)(outEnumArray+tempMap->count);
+
+        if(ds->inCharset==ds->outCharset) {
+            /* no need to sort, just swap the enum/name arrays */
+            ds->swapArray32(ds, inEnumArray, tempMap->count*4, outEnumArray, pErrorCode);
+            ds->swapArray16(ds, inNameArray, tempMap->count*2, outNameArray, pErrorCode);
+            return size;
+        }
+
+        /*
+         * The name and enum arrays are sorted by names and must be resorted
+         * if inCharset!=outCharset.
+         * We use the corresponding part of the temp array to sort an array
+         * of pairs of name offsets and sorting indexes.
+         * Then the sorting indexes are used to permutate-swap the name and enum arrays.
+         *
+         * The outBytes must already contain the swapped strings.
+         */
+        sortArray=(NameAndIndex *)tempMap->getEnumArray();
+        for(i=0; i<tempMap->count; ++i) {
+            sortArray[i].name=udata_readInt16(ds, inNameArray[i]);
+            sortArray[i].index=(Offset)i;
+        }
+
+        /*
+         * use a stable sort to avoid shuffling of equal strings,
+         * which makes testing harder
+         */
+        cmp.chars=(const char *)outBytes;
+        cmp.propCompare=
+            ds->outCharset==U_ASCII_FAMILY ?
+                uprv_compareASCIIPropertyNames :
+                uprv_compareEBCDICPropertyNames;
+        uprv_sortArray(sortArray, tempMap->count, sizeof(NameAndIndex),
+                       upname_compareRows, &cmp,
+                       TRUE, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "upname_swap(NameToEnum).uprv_sortArray(%d items) failed - %s\n",
+                             tempMap->count, u_errorName(*pErrorCode));
+            return 0;
+        }
+
+        /* copy/swap/permutate _enumArray[] and _nameArray[] */
+        if(inEnumArray!=outEnumArray) {
+            for(i=0; i<tempMap->count; ++i) {
+                oldIndex=sortArray[i].index;
+                ds->swapArray32(ds, inEnumArray+oldIndex, 4, outEnumArray+i, pErrorCode);
+                ds->swapArray16(ds, inNameArray+oldIndex, 2, outNameArray+i, pErrorCode);
+            }
+        } else {
+            /*
+             * in-place swapping: need to permutate into a temporary array
+             * and then copy back to not destroy the data
+             */
+            EnumValue *tempEnumArray;
+            Offset *oldIndexes;
+
+            /* write name offsets directly from sortArray */
+            for(i=0; i<tempMap->count; ++i) {
+                ds->writeUInt16((uint16_t *)outNameArray+i, (uint16_t)sortArray[i].name);
+            }
+
+            /*
+             * compress the oldIndexes into a separate array to make space for tempEnumArray
+             * the tempMap _nameArray becomes oldIndexes[], getting the index
+             *   values from the 2D sortArray[],
+             * while sortArray=tempMap _enumArray[] becomes tempEnumArray[]
+             * this saves us allocating more memory
+             *
+             * it works because sizeof(NameAndIndex)<=sizeof(EnumValue)
+             * and because the nameArray[] can be used for oldIndexes[]
+             */
+            tempEnumArray=(EnumValue *)sortArray;
+            oldIndexes=(Offset *)(sortArray+tempMap->count);
+
+            /* copy sortArray[].index values into oldIndexes[] */
+            for(i=0; i<tempMap->count; ++i) {
+                oldIndexes[i]=sortArray[i].index;
+            }
+
+            /* permutate inEnumArray[] into tempEnumArray[] */
+            for(i=0; i<tempMap->count; ++i) {
+                ds->swapArray32(ds, inEnumArray+oldIndexes[i], 4, tempEnumArray+i, pErrorCode);
+            }
+
+            /* copy tempEnumArray[] to outEnumArray[] */
+            uprv_memcpy(outEnumArray, tempEnumArray, tempMap->count*4);
+        }
+    }
+
+    return size;
+}
+/* Swap the body of pnames.icu (the data after the udata header); returns total_size, or 0 after a size/allocation failure with *pErrorCode set. */
+int32_t
+PropertyAliases::swap(const UDataSwapper *ds,
+                      const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+                      UErrorCode *pErrorCode) {
+    const PropertyAliases *inAliases;
+    PropertyAliases *outAliases;
+    PropertyAliases aliases; /* local copy of the input header, filled via ds->readUInt16() below */
+
+    const ValueMap *inValueMaps;
+    ValueMap *outValueMaps;
+    ValueMap valueMap; /* the three offsets of the ValueMap currently visited, read via udata_readInt16() */
+
+    uint8_t *temp; /* zero-filled scratch of total_size bytes, handed to the per-map swap() functions */
+
+    int32_t i;
+
+    inAliases=(const PropertyAliases *)inBytes;
+    outAliases=(PropertyAliases *)outBytes;
+
+    /* read the input PropertyAliases header into aliases - all of its fields are 16-bit values */
+    for(i=0; i<(int32_t)sizeof(PropertyAliases)/2; ++i) {
+        ((uint16_t *)&aliases)[i]=ds->readUInt16(((const uint16_t *)inBytes)[i]);
+    }
+
+    if(length>=0) { /* length<0: nothing below runs, only total_size is returned */
+        if(length<aliases.total_size) {
+            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for all of pnames.icu\n",
+                             length);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        /* not in-place: copy everything first so that bytes which no swap call below touches still reach outBytes */
+        if(inBytes!=outBytes) {
+            uprv_memcpy(outBytes, inBytes, aliases.total_size);
+        }
+
+        /* swap the PropertyAliases header fields; the code below keeps using the local copy in aliases */
+        ds->swapArray16(ds, inAliases, sizeof(PropertyAliases), outAliases, pErrorCode);
+
+        /* swap the name groups: 16-bit units in [nameGroupPool_offset, stringPool_offset) */
+        ds->swapArray16(ds, inBytes+aliases.nameGroupPool_offset,
+                                aliases.stringPool_offset-aliases.nameGroupPool_offset,
+                           outBytes+aliases.nameGroupPool_offset, pErrorCode);
+
+        /* swap the strings: the rest of the data, [stringPool_offset, total_size) */
+        udata_swapInvStringBlock(ds, inBytes+aliases.stringPool_offset,
+                                        aliases.total_size-aliases.stringPool_offset,
+                                    outBytes+aliases.stringPool_offset, pErrorCode);
+
+        /*
+         * Allocate uint8_t temp[total_size] and zero it.
+         * Each subclass-specific swap() function gets temp and the offset of its map.
+         * NameToEnum::swap() keeps the map's count at the same offset in temp;
+         *   a non-zero count there marks a map that was swapped already, so a map
+         *   that is reached more than once is swapped (and re-sorted) only once.
+         * NOTE(review): the EnumToOffset variants presumably use temp the same way - confirm.
+         * Order: the three top-level maps, then the maps referenced by each ValueMap
+         *   (name->enum maps are re-sorted by name if inCharset!=outCharset),
+         *   and finally the ValueMap array itself.
+         */
+        temp=(uint8_t *)uprv_malloc(aliases.total_size);
+        if(temp==NULL) {
+            udata_printError(ds, "upname_swap(): unable to allocate temp memory (%d bytes)\n",
+                             aliases.total_size);
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return 0;
+        }
+        uprv_memset(temp, 0, aliases.total_size);
+
+        /* swap properties->name groups map (here and below the returned size is not used) */
+        NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
+                                        temp, aliases.enumToName_offset, pErrorCode);
+
+        /* swap name->properties map */
+        NameToEnum::swap(ds, inBytes, length, outBytes,
+                         temp, aliases.nameToEnum_offset, pErrorCode);
+
+        /* swap properties->value maps map */
+        NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
+                                        temp, aliases.enumToValue_offset, pErrorCode);
+
+        /* enumerate all ValueMaps and swap the maps that their offsets point to */
+        inValueMaps=(const ValueMap *)(inBytes+aliases.valueMap_offset);
+        outValueMaps=(ValueMap *)(outBytes+aliases.valueMap_offset);
+
+        for(i=0; i<aliases.valueMap_count; ++i) {
+            valueMap.enumToName_offset=udata_readInt16(ds, inValueMaps[i].enumToName_offset);
+            valueMap.ncEnumToName_offset=udata_readInt16(ds, inValueMaps[i].ncEnumToName_offset);
+            valueMap.nameToEnum_offset=udata_readInt16(ds, inValueMaps[i].nameToEnum_offset);
+
+            if(valueMap.enumToName_offset!=0) { /* propname.h: exactly one of the two enum=>name offsets is nonzero */
+                EnumToOffset::swap(ds, inBytes, length, outBytes,
+                                   temp, valueMap.enumToName_offset,
+                                   pErrorCode);
+            } else if(valueMap.ncEnumToName_offset!=0) {
+                NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
+                                                temp, valueMap.ncEnumToName_offset,
+                                                pErrorCode);
+            }
+            if(valueMap.nameToEnum_offset!=0) { /* a zero offset means that there is no name=>enum map to swap */
+                NameToEnum::swap(ds, inBytes, length, outBytes,
+                                 temp, valueMap.nameToEnum_offset,
+                                 pErrorCode);
+            }
+        }
+
+        /* swap the ValueMaps array itself - only now, the loop above read its offsets from inValueMaps (==outValueMaps when in-place) */
+        ds->swapArray16(ds, inValueMaps, aliases.valueMap_count*sizeof(ValueMap),
+                           outValueMaps, pErrorCode);
+
+        /* name groups and strings were swapped above, before the maps */
+
+        /* release temp - there is no return statement between its allocation check and this point */
+        uprv_free(temp);
+    }
+
+    return aliases.total_size;
+}
+/* Swap a complete pnames.icu image (udata header plus body); returns 0 if the header, format or minimum-size check fails. */
+U_CAPI int32_t U_EXPORT2
+upname_swap(const UDataSwapper *ds,
+            const void *inData, int32_t length, void *outData,
+            UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+
+    const uint8_t *inBytes;
+    uint8_t *outBytes;
+
+    /* udata_swapDataHeader checks the arguments; its result is used as the offset of the body */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version; NOTE(review): +4 presumably skips the header fields before UDataInfo - confirm */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x70 &&   /* dataFormat="pnam" */
+        pInfo->dataFormat[1]==0x6e &&
+        pInfo->dataFormat[2]==0x61 &&
+        pInfo->dataFormat[3]==0x6d &&
+        pInfo->formatVersion[0]==1
+    )) {
+        udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    inBytes=(const uint8_t *)inData+headerSize;
+    outBytes=(uint8_t *)outData+headerSize;
+
+    if(length>=0) { /* length<0: no size check here, length is passed on unchanged */
+        length-=headerSize; /* from here on length counts only the bytes after the header */
+        if(length<(int32_t)sizeof(PropertyAliases)) {
+            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
+                             length);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+    }
+
+    return headerSize+PropertyAliases::swap(ds, inBytes, length, outBytes, pErrorCode); /* if the body swap fails (returns 0) this yields headerSize, not 0 - check *pErrorCode */
 }
 
 //eof

Index: propname.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/propname.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- propname.h	10 Sep 2003 02:42:02 -0000	1.1
+++ propname.h	6 Apr 2004 10:07:58 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (c) 2002, International Business Machines
+* Copyright (c) 2002-2003, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Alan Liu
@@ -13,27 +13,25 @@
 
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
+#include "udataswp.h"
 #include "uprops.h"
 
-class Builder;
-
-U_NAMESPACE_BEGIN
-
-// This header defines the in-memory layout of the property names data
-// structure representing the UCD data files PropertyAliases.txt and
-// PropertyValueAliases.txt.  It is used by:
-//   propname.cpp - reads data
-//   genpname     - creates data
+/*
+ * This header defines the in-memory layout of the property names data
+ * structure representing the UCD data files PropertyAliases.txt and
+ * PropertyValueAliases.txt.  It is used by:
+ *   propname.cpp - reads data
+ *   genpname     - creates data
+ */
 
-//----------------------------------------------------------------------
-// UDataMemory structure and signatures
+/* UDataMemory structure and signatures ------------------------------------- */
 
 #define PNAME_DATA_NAME "pnames"
 #define PNAME_DATA_TYPE "icu"
 
-// Fields in UDataInfo:
+/* Fields in UDataInfo: */
 
-// PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler
+/* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */
 #define PNAME_SIG_0 ((uint8_t)0x70) /* p */
 #define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
 #define PNAME_SIG_2 ((uint8_t)0x61) /* a */
@@ -42,13 +40,29 @@
 #define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
 
 /**
+ * Swap pnames.icu. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+upname_swap(const UDataSwapper *ds,
+            const void *inData, int32_t length, void *outData,
+            UErrorCode *pErrorCode);
+
+
+#ifdef XP_CPLUSPLUS
+
+class Builder;
+
+U_NAMESPACE_BEGIN
+
+/**
  * An offset from the start of the pnames data to a contained entity.
  * This must be a signed value, since negative offsets are used as an
  * end-of-list marker.  Offsets to actual objects are non-zero.  A
  * zero offset indicates an absent entry; this corresponds to aliases
  * marked "n/a" in the original Unicode data files.
  */
-typedef int16_t Offset; // must be signed
+typedef int16_t Offset; /*  must be signed */
 
 #define MAX_OFFSET 0x7FFF
 
@@ -63,8 +77,8 @@
  */
 typedef int32_t EnumValue;
 
-//----------------------------------------------------------------------
-// ValueMap
+/* ---------------------------------------------------------------------- */
+/*  ValueMap */
 
 /**
  * For any top-level property that has named values (binary and
@@ -82,18 +96,18 @@
  */
 struct ValueMap {
 
-    // -- begin pnames data --
-    // Enum=>name EnumToOffset / NonContiguousEnumToOffset objects.
-    // Exactly one of these will be nonzero.
+    /*  -- begin pnames data -- */
+    /*  Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */
+    /*  Exactly one of these will be nonzero. */
     Offset enumToName_offset;
     Offset ncEnumToName_offset;
 
-    Offset nameToEnum_offset; // Name=>enum data
-    // -- end pnames data --
+    Offset nameToEnum_offset; /*  Name=>enum data */
+    /*  -- end pnames data -- */
 };
 
-//----------------------------------------------------------------------
-// PropertyAliases class
+/* ---------------------------------------------------------------------- */
+/*  PropertyAliases class */
 
 /**
  * A class encapsulating access to the memory-mapped data representing
@@ -106,29 +120,29 @@
  */
 class PropertyAliases {
 
-    // -- begin pnames data --
-    // Enum=>name EnumToOffset object for binary and enumerated
-    // properties
+    /*  -- begin pnames data -- */
+    /*  Enum=>name EnumToOffset object for binary and enumerated */
+    /*  properties */
     Offset enumToName_offset;
 
-    // Name=>enum data for binary & enumerated properties
+    /*  Name=>enum data for binary & enumerated properties */
     Offset nameToEnum_offset;
 
-    // Enum=>offset EnumToOffset object mapping enumerated properties
-    // to ValueMap objects
+    /*  Enum=>offset EnumToOffset object mapping enumerated properties */
+    /*  to ValueMap objects */
     Offset enumToValue_offset;
 
-    // The following are needed by external readers of this data.
-    // We don't use them ourselves.
-    int16_t total_size; // size in bytes excluding the udata header
-    Offset valueMap_offset; // offset to start of array
-    int16_t valueMap_count; // number of entries
-    Offset nameGroupPool_offset; // offset to start of array
-    int16_t nameGroupPool_count; // number of entries (not groups)
-    Offset stringPool_offset; // offset to start of pool
-    int16_t stringPool_count; // number of strings (not size in bytes)
+    /*  The following are needed by external readers of this data. */
+    /*  We don't use them ourselves. */
+    int16_t total_size; /*  size in bytes excluding the udata header */
+    Offset valueMap_offset; /*  offset to start of array */
+    int16_t valueMap_count; /*  number of entries */
+    Offset nameGroupPool_offset; /*  offset to start of array */
+    int16_t nameGroupPool_count; /*  number of entries (not groups) */
+    Offset stringPool_offset; /*  offset to start of pool */
+    int16_t stringPool_count; /*  number of strings (not size in bytes) */
 
-    // -- end pnames data --
+    /*  -- end pnames data -- */
 
     friend class ::Builder;
 
@@ -157,10 +171,15 @@
     
     inline EnumValue getPropertyValueEnum(EnumValue prop,
                                           const char* alias) const;
+
+    static int32_t
+    swap(const UDataSwapper *ds,
+         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+         UErrorCode *pErrorCode);
 };
 
-//----------------------------------------------------------------------
-// EnumToOffset
+/* ---------------------------------------------------------------------- */
+/*  EnumToOffset */
 
 /**
  * A generic map from enum values to Offsets.  The enum values must be
@@ -169,11 +188,11 @@
  */
 class EnumToOffset {
 
-    // -- begin pnames data --
+    /*  -- begin pnames data -- */
     EnumValue enumStart;
     EnumValue enumLimit;
-    Offset _offsetArray; // [array of enumLimit-enumStart]
-    // -- end pnames data --
+    Offset _offsetArray; /*  [array of enumLimit-enumStart] */
+    /*  -- end pnames data -- */
 
     friend class ::Builder;
 
@@ -189,20 +208,30 @@
         return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1);
     }
 
+    int32_t getSize() {
+        return getSize(enumLimit - enumStart);
+    }
+
  public:
 
     Offset getOffset(EnumValue enumProbe) const {
         if (enumProbe < enumStart ||
             enumProbe >= enumLimit) {
-            return 0; // not found
+            return 0; /*  not found */
         }
         const Offset* p = getOffsetArray();
         return p[enumProbe - enumStart];
     }
+
+    static int32_t
+    swap(const UDataSwapper *ds,
+         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+         uint8_t *temp, int32_t pos,
+         UErrorCode *pErrorCode);
 };
 
-//----------------------------------------------------------------------
-// NonContiguousEnumToOffset
+/* ---------------------------------------------------------------------- */
+/*  NonContiguousEnumToOffset */
 
 /**
  * A generic map from enum values to Offsets.  The enum values may be
@@ -211,11 +240,11 @@
  */
 class NonContiguousEnumToOffset {
 
-    // -- begin pnames data --
+    /*  -- begin pnames data -- */
     int32_t count;
-    EnumValue _enumArray; // [array of count]
-    // Offset _offsetArray; // [array of count] after enumValue[count-1]
-    // -- end pnames data --
+    EnumValue _enumArray; /*  [array of count] */
+    /*  Offset _offsetArray; // [array of count] after enumValue[count-1] */
+    /*  -- end pnames data -- */
 
     friend class ::Builder;
 
@@ -239,35 +268,45 @@
         return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n;
     }
 
+    int32_t getSize() {
+        return getSize(count);
+    }
+
  public:
 
     Offset getOffset(EnumValue enumProbe) const {
         const EnumValue* e = getEnumArray();
         const Offset* p = getOffsetArray();
-        // linear search; binary later if warranted
-        // (binary is not faster for short lists)
+        /*  linear search; binary later if warranted */
+        /*  (binary is not faster for short lists) */
         for (int32_t i=0; i<count; ++i) {
             if (e[i] < enumProbe) continue;
             if (e[i] > enumProbe) break;
             return p[i];
         }
-        return 0; // not found
+        return 0; /*  not found */
     }
+
+    static int32_t
+    swap(const UDataSwapper *ds,
+         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+         uint8_t *temp, int32_t pos,
+         UErrorCode *pErrorCode);
 };
 
-//----------------------------------------------------------------------
-// NameToEnum
+/* ---------------------------------------------------------------------- */
+/*  NameToEnum */
 
 /**
  * A map from names to enum values.
  */
 class NameToEnum {
 
-    // -- begin pnames data --
-    int32_t count;       // number of entries
-    EnumValue _enumArray; // [array of count] EnumValues
-    // Offset _nameArray; // [array of count] offsets to names
-    // -- end pnames data --
+    /*  -- begin pnames data -- */
+    int32_t count;       /*  number of entries */
+    EnumValue _enumArray; /*  [array of count] EnumValues */
+    /*  Offset _nameArray; // [array of count] offsets to names */
+    /*  -- end pnames data -- */
 
     friend class ::Builder;
 
@@ -291,6 +330,10 @@
         return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n;
     }
 
+    int32_t getSize() {
+        return getSize(count);
+    }
+
  public:
   
     EnumValue getEnum(const char* alias, const PropertyAliases& data) const {
@@ -298,8 +341,8 @@
         const Offset* n = getNameArray();
         const EnumValue* e = getEnumArray();
 
-        // linear search; binary later if warranted
-        // (binary is not faster for short lists)
+        /*  linear search; binary later if warranted */
+        /*  (binary is not faster for short lists) */
         for (int32_t i=0; i<count; ++i) {
             const char* name = (const char*) data.getPointer(n[i]);
             int32_t c = uprv_comparePropertyNames(alias, name);
@@ -310,6 +353,12 @@
         
         return UCHAR_INVALID_CODE;
     }
+
+    static int32_t
+    swap(const UDataSwapper *ds,
+         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
+         uint8_t *temp, int32_t pos,
+         UErrorCode *pErrorCode);
 };
 
 /*----------------------------------------------------------------------
@@ -348,7 +397,20 @@
  *  nameToEnum_offset (>2)
  *  enumToValue_offset (>3)
  *  (alignment padding build in to header)
- * 
+ *
+ * The header also contains the following, used by "external readers"
+ * like ICU4J and icuswap.
+ *
+ *  // The following are needed by external readers of this data.
+ *  // We don't use them ourselves.
+ *  int16_t total_size; // size in bytes excluding the udata header
+ *  Offset valueMap_offset; // offset to start of array
+ *  int16_t valueMap_count; // number of entries
+ *  Offset nameGroupPool_offset; // offset to start of array
+ *  int16_t nameGroupPool_count; // number of entries (not groups)
+ *  Offset stringPool_offset; // offset to start of pool
+ *  int16_t stringPool_count; // number of strings (not size in bytes)
+ *
  * 0: # NonContiguousEnumToOffset obj for props => name groups
  *  count
  *  enumArray [x count]
@@ -410,6 +472,6 @@
  */
 U_NAMESPACE_END
 
-#endif
+#endif /* C++ */
 
-//eof
+#endif

Index: putil.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/putil.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- putil.c	10 Sep 2003 02:42:02 -0000	1.6
+++ putil.c	6 Apr 2004 10:07:58 -0000	1.7
@@ -40,10 +40,6 @@
 #    include<sys/types.h>
 #endif
 
-#ifdef __QNXNTO__
-#include <sys/neutrino.h>
-#endif
-
 #ifndef PTX
 
 /* Define _XOPEN_SOURCE for Solaris and friends. */
@@ -65,11 +61,14 @@
 /* include ICU headers */
 #include "unicode/utypes.h"
 #include "unicode/putil.h"
+#include "unicode/ustring.h"
+#include "uassert.h"
 #include "umutex.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "locmap.h"
 #include "ucln_cmn.h"
+#include "udataswp.h"
 
 /* Include standard headers. */
 #include <stdio.h>
@@ -108,7 +107,7 @@
 #   include <TextUtils.h>
 #elif defined(OS390)
 #include "unicode/ucnv.h"   /* Needed for UCNV_SWAP_LFNL_OPTION_STRING */
-#elif defined(AIX)
+#elif defined(U_AIX)
 /*
 #   include <sys/ldr.h>
 */
@@ -117,10 +116,15 @@
 #   include <dlfcn.h>
 #   include <link.h>
 */
-#elif defined(HPUX)
+#elif defined(U_HPUX)
 /*
 #   include <dl.h>
 */
+#elif defined(U_DARWIN)
+#include <sys/file.h>
+#include <sys/param.h>
+#elif defined(U_QNX)
+#include <sys/neutrino.h>
 #endif
 
 /* Define the extension for data files, again... */
@@ -145,8 +149,8 @@
 
 #if USE_64BIT_DOUBLE_OPTIMIZATION
 /* gcc 3.2 has an optimization bug */
-static const int64_t gNan64 = 0x7FF8000000000000L;
-static const int64_t gInf64 = 0x7FF0000000000000L;
+static const int64_t gNan64 = 0x7FF8000000000000LL;
+static const int64_t gInf64 = 0x7FF0000000000000LL;
 static const double * const fgNan = (const double *)(&gNan64);
 static const double * const fgInf = (const double *)(&gInf64);
 #else
@@ -228,14 +232,14 @@
     time_t t, t1, t2;
     struct tm tmrec;
 
-    memset( &tmrec, 0, sizeof(tmrec) );
+    uprv_memset( &tmrec, 0, sizeof(tmrec) );
     tmrec.tm_year = 70;
     tmrec.tm_mon = 0;
     tmrec.tm_mday = 1;
     t1 = mktime(&tmrec);    /* seconds of 1/1/1970*/
 
     time(&t);
-    memcpy( &tmrec, gmtime(&t), sizeof(tmrec) );
+    uprv_memcpy( &tmrec, gmtime(&t), sizeof(tmrec) );
     t2 = mktime(&tmrec);    /* seconds of current GMT*/
     return t2 - t1;         /* GMT (or UTC) in seconds since 1970*/
 #else
@@ -262,7 +266,7 @@
 #if USE_64BIT_DOUBLE_OPTIMIZATION
     /* gcc 3.2 has an optimization bug */
     /* Infinity is 0x7FF0000000000000U. Anything greater than that is a NaN */
-    return (UBool)(((*((int64_t *)&number)) & INT64_MAX) > gInf64);
+    return (UBool)(((*((int64_t *)&number)) & U_INT64_MAX) > gInf64);
 
 #else
     /* This should work in theory, but it doesn't, so we resort to the more*/
@@ -312,7 +316,7 @@
 #if IEEE_754
 #if USE_64BIT_DOUBLE_OPTIMIZATION
     /* gcc 3.2 has an optimization bug */
-    return (UBool)(((*((int64_t *)&number)) & INT64_MAX) == gInf64);
+    return (UBool)(((*((int64_t *)&number)) & U_INT64_MAX) == gInf64);
 #else
 
     /* We know the top bit is the sign bit, so we mask that off in a copy of */
@@ -975,12 +979,12 @@
     /* Obtain TIME_ZONE_INFORMATION from the API, and then convert it
        to TZI.  We could also interrogate the registry directly; we do
        this below if needed. */
-    memset(&apiTZI, 0, sizeof(apiTZI));
+    uprv_memset(&apiTZI, 0, sizeof(apiTZI));
     GetTimeZoneInformation(&apiTZI);
     tziKey.Bias = apiTZI.Bias;
-    memcpy((char *)&tziKey.StandardDate, (char*)&apiTZI.StandardDate,
+    uprv_memcpy((char *)&tziKey.StandardDate, (char*)&apiTZI.StandardDate,
            sizeof(apiTZI.StandardDate));
-    memcpy((char *)&tziKey.DaylightDate, (char*)&apiTZI.DaylightDate,
+    uprv_memcpy((char *)&tziKey.DaylightDate, (char*)&apiTZI.DaylightDate,
            sizeof(apiTZI.DaylightDate));
 
     /* For each zone that can be identified by Offset+Rules, see if we
@@ -1012,7 +1016,7 @@
                these unreliable fields. */
             tziKey.StandardBias = tziReg.StandardBias;
             tziKey.DaylightBias = tziReg.DaylightBias;
-            if (memcmp((char *)&tziKey, (char*)&tziReg,
+            if (uprv_memcmp((char *)&tziKey, (char*)&tziReg,
                        sizeof(tziKey)) == 0) {
                 if (firstMatch < 0) {
                     firstMatch = j;
@@ -1066,7 +1070,7 @@
                 RegCloseKey(hkey);
                 if (result == ERROR_SUCCESS &&
                     stdRegNameSize == stdNameSize &&
-                    memcmp(stdName, stdRegName, stdNameSize) == 0) {
+                    uprv_memcmp(stdName, stdRegName, stdNameSize) == 0) {
                     firstMatch = j; /* record the match */
                     break;
                 }
@@ -1098,18 +1102,18 @@
     char* name;
     int i;
 
-    strcpy(subKeyName, TZ_REGKEY[(winType == WIN_9X_ME_TYPE) ? 0 : 1]);
+    uprv_strcpy(subKeyName, TZ_REGKEY[(winType == WIN_9X_ME_TYPE) ? 0 : 1]);
     name = &subKeyName[strlen(subKeyName)];
-    strcat(subKeyName, winid);
+    uprv_strcat(subKeyName, winid);
     if (winType != WIN_9X_ME_TYPE) {
         /* Don't modify "Mexico Standard Time 2", which does not occur
            on WIN_9X_ME_TYPE.  Also, if the type is WIN_NT_TYPE, then
            in practice this means the GMT key is not followed by
            " Standard Time", so don't append in that case. */
-        int isMexico2 = (winid[strlen(winid)- 1] == '2');
+        int isMexico2 = (winid[uprv_strlen(winid)- 1] == '2');
         if (!isMexico2 &&
-            !(winType == WIN_NT_TYPE && strcmp(winid, "GMT") == 0)) {
-            strcat(subKeyName, STANDARD_TIME_REGKEY);
+            !(winType == WIN_NT_TYPE && uprv_strcmp(winid, "GMT") == 0)) {
+            uprv_strcat(subKeyName, STANDARD_TIME_REGKEY);
         }
     }
     result = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
@@ -1122,11 +1126,11 @@
         /* If the primary lookup fails, try to remap the Windows zone
            ID, according to the remapping table. */
         for (i=0; ZONE_REMAP[i].winid; ++i) {
-            if (strcmp(winid, ZONE_REMAP[i].winid) == 0) {
-                strcpy(name, ZONE_REMAP[i].altwinid + 1);
+            if (uprv_strcmp(winid, ZONE_REMAP[i].winid) == 0) {
+                uprv_strcpy(name, ZONE_REMAP[i].altwinid + 1);
                 if (*(ZONE_REMAP[i].altwinid) == '+' &&
                     winType != WIN_9X_ME_TYPE) {
-                    strcat(subKeyName, STANDARD_TIME_REGKEY);                
+                    uprv_strcat(subKeyName, STANDARD_TIME_REGKEY);                
                 }
                 result = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
                                       subKeyName,
@@ -1168,10 +1172,10 @@
     int32_t tdiff = 0;
 
     time(&t);
-    memcpy( &tmrec, localtime(&t), sizeof(tmrec) );
+    uprv_memcpy( &tmrec, localtime(&t), sizeof(tmrec) );
     dst_checked = (tmrec.tm_isdst != 0); /* daylight savings time is checked*/
     t1 = mktime(&tmrec);                 /* local time in seconds*/
-    memcpy( &tmrec, gmtime(&t), sizeof(tmrec) );
+    uprv_memcpy( &tmrec, gmtime(&t), sizeof(tmrec) );
     t2 = mktime(&tmrec);                 /* GMT (or UTC) in seconds*/
     tdiff = t2 - t1;
     /* imitate NT behaviour, which returns same timezone offset to GMT for
@@ -1182,15 +1186,23 @@
 #endif
 }
 
-/* Note that U_TZNAME does *not* have to be tzname, but if it does,
+/* Note that U_TZNAME does *not* have to be tzname, but if it is,
    some platforms need to have it declared here. */ 
 
-#if defined(IRIX) || defined(U_DARWIN) /* For SGI/MacOSX.  */
+#if defined(U_IRIX) || defined(U_DARWIN) /* For SGI or Mac OS X.  */
 extern char *tzname[]; /* RS6000 and others reject char **tzname.  */
 #elif defined(U_CYGWIN)
 extern U_IMPORT char *_tzname[2]; 
 #endif
 
+#if defined(U_DARWIN)	/* For Mac OS X */
+#define TZZONELINK	"/etc/localtime"
+#define TZZONEINFO	"/usr/share/zoneinfo/"
+static char *gTimeZoneBuffer = NULL; /* Heap allocated */
+#endif
+
+#include <stdio.h>
+
 U_CAPI char* U_EXPORT2
 uprv_tzname(int n)
 {
@@ -1201,6 +1213,41 @@
     }
 #endif
 
+#if defined(U_DARWIN)
+    int ret;
+
+    char *tzenv;
+
+    tzenv = getenv("TZFILE");
+    if (tzenv != NULL) {
+    	return tzenv;
+    }
+
+#if 0
+    /* TZ is often set to "PST8PDT" or similar, so we cannot use it. Alan */
+    tzenv = getenv("TZ");
+    if (tzenv != NULL) {
+    	return tzenv;
+    }
+#endif
+    
+    /* Caller must handle threading issues */
+    if (gTimeZoneBuffer == NULL) {
+    	gTimeZoneBuffer = (char *) uprv_malloc(MAXPATHLEN + 2);
+
+        ret = readlink(TZZONELINK, gTimeZoneBuffer, MAXPATHLEN + 2);
+        if (0 < ret) {
+            gTimeZoneBuffer[ret] = '\0';
+            if (uprv_strncmp(gTimeZoneBuffer, TZZONEINFO, sizeof(TZZONEINFO) - 1) == 0) {
+                return (gTimeZoneBuffer += sizeof(TZZONEINFO) - 1);
+            }
+        }
+
+        uprv_free(gTimeZoneBuffer);
+        gTimeZoneBuffer = NULL;
+    }
+#endif
+
 #ifdef U_TZNAME
     return U_TZNAME[n];
 #else
@@ -1238,12 +1285,12 @@
 U_CAPI void U_EXPORT2
 u_setDataDirectory(const char *directory) {
     char *newDataDir;
-    int length;
+    int32_t length;
 
     if(directory==NULL) {
         directory = "";
     }
-    length=uprv_strlen(directory);
+    length=(int32_t)uprv_strlen(directory);
     newDataDir = (char *)uprv_malloc(length + 2);
     uprv_strcpy(newDataDir, directory);
 
@@ -1648,7 +1695,7 @@
     int32_t lang = MAC_LC_INIT_NUMBER;
     /* = GetScriptManagerVariable(smScriptLang);*/
     int32_t date_region = MAC_LC_INIT_NUMBER;
-    char* posixID = 0;
+    const char* posixID = 0;
     int32_t count = sizeof(mac_lc_recs) / sizeof(mac_lc_rec);
     int32_t i;
     Intl1Hndl ih;
@@ -1848,7 +1895,7 @@
     {
         uprv_memset(codesetName, 0, sizeof(codesetName));
     }
-    localeName = setlocale(LC_CTYPE, "");
+    localeName = setlocale(LC_CTYPE, NULL);
     if (localeName != NULL && (name = (uprv_strchr(localeName, (int)'.'))) != NULL)
     {
         /* strip the locale name and look at the suffix only */
@@ -1898,97 +1945,57 @@
 #endif
 }
 
-#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
-#ifdef OS390
-/*
- * These maps for ASCII to/from EBCDIC are from
- * "UTF-EBCDIC - EBCDIC-Friendly Unicode (or UCS) Transformation Format"
- * at http://www.unicode.org/unicode/reports/tr16/
- * (which should reflect codepage 1047)
- * but modified to explicitly exclude the variant
- * control and graphical characters that are in ASCII-based
- * codepages at 0x80 and above.
- * Also, unlike in Version 6.0 of the UTR on UTF-EBCDIC,
- * the Line Feed mapping varies according to the environment.
- *
- * These tables do not establish a converter or a codepage.
- */
-
-    /* on S/390 Open Edition, ASCII 0xa (LF) maps to 0x15 and ISO-8 0x85 maps to 0x25 */
-#   define E_LF 0x15
-#   define A_15 0x0a
-#   define A_25 0x00
-
-#   if 0
-        /* the CDRA variation of 1047 is not currently used - see tables in #else below */
-        /* in standard EBCDIC (CDRA), ASCII 0xa (LF) maps to 0x25 and ISO-8 0x85 maps to 0x15 */
-#       define E_LF 0x25
-#       define A_15 0x00
-#       define A_25 0x0a
-#   endif
-
-static const uint8_t asciiFromEbcdic[256]={
-    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7F, 0x00, 0x00, 0x00, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
-    0x10, 0x11, 0x12, 0x13, 0x00, A_15, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1C, 0x1D, 0x1E, 0x1F,
-    0x00, 0x00, 0x00, 0x00, 0x00, A_25, 0x17, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
-    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1A,
-    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
-    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
-    0x2D, 0x2F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
-    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00,
-    0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x5C, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-static const uint8_t ebcdicFromAscii[256]={
-    0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, E_LF, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
-    0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
-    0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
-    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
-    0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
-    0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
-    0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
-    0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
+/* invariant-character handling --------------------------------------------- */
 
-#else
 /*
- * These maps for ASCII to/from EBCDIC were generated
- * using the ICU converter for codepage 37 on 2000-may-22.
- * They explicitly exclude the variant
+ * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
+ * appropriately for most EBCDIC codepages.
+ *
+ * They currently also map most other ASCII graphic characters,
+ * appropriately for codepages 37 and 1047.
+ * Exceptions: The characters for []^ have different codes in 37 & 1047.
+ * Both versions are mapped to ASCII.
+ *
+ *    ASCII 37 1047
+ * [     5B BA   AD
+ * ]     5D BB   BD
+ * ^     5E B0   5F
+ *
+ * There are no mappings for variant characters from Unicode to EBCDIC.
+ *
+ * Currently, C0 control codes are also included in these maps.
+ * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
+ * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
+ * but there is no mapping for ASCII LF back to EBCDIC.
+ *
+ *    ASCII EBCDIC S/390-OE
+ * LF    0A     25       15
+ * NEL   85     15       25
+ *
+ * The maps below explicitly exclude the variant
  * control and graphical characters that are in ASCII-based
  * codepages at 0x80 and above.
+ * "No mapping" is expressed by mapping to a 00 byte.
  *
  * These tables do not establish a converter or a codepage.
  */
 
 static const uint8_t asciiFromEbcdic[256]={
     0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
+    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
     0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
+
     0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
-    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x00,
+    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
     0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
+
     0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
+    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
+
     0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -1996,57 +2003,449 @@
 };
 
 static const uint8_t ebcdicFromAscii[256]={
-    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x25, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
     0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f,
-    0x40, 0x5a, 0x7f, 0x7b, 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
+    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61,
     0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f,
-    0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
-    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xba, 0xe0, 0xbb, 0xb0, 0x6d,
-    0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
-    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07,
+
+    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
+    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d,
+    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07,
+
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-#endif
+/*
+ * Bit sets indicating which characters of the ASCII repertoire
+ * (by ASCII/Unicode code) are "invariant".
+ * See utypes.h for more details.
+ *
+ * All characters of the ASCII repertoire are considered invariant, except
+ * for LF (0a, cleared in the first word of the table below) and the following:
+ * 21  '!' <exclamation mark>
+ * 23  '#' <number sign>
+ * 24  '$' <dollar sign>
+ *
+ * 40  '@' <commercial at>
+ *
+ * 5b  '[' <left bracket>
+ * 5c  '\' <backslash>
+ * 5d  ']' <right bracket>
+ * 5e  '^' <circumflex>
+ *
+ * 60  '`' <grave accent>
+ *
+ * 7b  '{' <left brace>
+ * 7c  '|' <vertical line>
+ * 7d  '}' <right brace>
+ * 7e  '~' <tilde>
+ */
+static const uint32_t invariantChars[4]={
+    0xfffffbff, /* 00..1f but not 0a */
+    0xffffffe5, /* 20..3f but not 21 23 24 */
+    0x87fffffe, /* 40..5f but not 40 5b..5e */
+    0x87fffffe  /* 60..7f but not 60 7b..7e */
+};
 
-#endif
+/*
+ * test unsigned types (or values known to be non-negative) for invariant characters,
+ * tests ASCII-family character values
+ */
+#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)
+
+/* test signed types for invariant characters, adds test for positive values */
+#define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))
 
 U_CAPI void U_EXPORT2
 u_charsToUChars(const char *cs, UChar *us, int32_t length) {
+    UChar u;
+    uint8_t c;
+    UBool onlyInvariantChars;
+
+    /*
+     * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
+     * For EBCDIC systems, this works for characters with codes from
+     * codepages 37 and 1047 or compatible.
+     */
+    onlyInvariantChars=TRUE;
     while(length>0) {
+        c=(uint8_t)(*cs++);
 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
-        *us++=(UChar)(uint8_t)(*cs++);
+        u=(UChar)c;
 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
-        *us++=(UChar)asciiFromEbcdic[(uint8_t)(*cs++)];
+        u=(UChar)asciiFromEbcdic[c];
 #else
 #   error U_CHARSET_FAMILY is not valid
 #endif
+        if(u==0 && c!=0) {
+            onlyInvariantChars=FALSE;
+        }
+        *us++=u;
         --length;
     }
+    U_ASSERT(onlyInvariantChars); /* only invariant chars? */
 }
 
 U_CAPI void U_EXPORT2
 u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
+    UChar u;
+    UBool onlyInvariantChars;
+
+    onlyInvariantChars=TRUE;
     while(length>0) {
+        u=*us++;
+        if(!UCHAR_IS_INVARIANT(u)) {
+            onlyInvariantChars=FALSE;
+            u=0;
+        }
 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
-        *cs++=(char)(*us++);
+        *cs++=(char)u;
 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
-        *cs++=(char)ebcdicFromAscii[(uint8_t)(*us++)];
+        *cs++=(char)ebcdicFromAscii[u];
 #else
 #   error U_CHARSET_FAMILY is not valid
 #endif
         --length;
     }
+    U_ASSERT(onlyInvariantChars); /* only invariant chars? */
 }
 
-/* end of platform-specific implementation */
+U_CAPI UBool U_EXPORT2
+uprv_isInvariantString(const char *s, int32_t length) {
+    uint8_t c;
+
+    for(;;) {
+        if(length<0) {
+            /* NUL-terminated */
+            c=(uint8_t)*s++;
+            if(c==0) {
+                break;
+            }
+        } else {
+            /* count length */
+            if(length==0) {
+                break;
+            }
+            --length;
+            c=(uint8_t)*s++;
+            if(c==0) {
+                continue; /* NUL is invariant */
+            }
+        }
+        /* c!=0 now, one branch below checks c==0 for variant characters */
+
+        /*
+         * no assertions here because these functions are legitimately called
+         * for strings with variant characters
+         */
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+        if(!UCHAR_IS_INVARIANT(c)) {
+            return FALSE; /* found a variant char */
+        }
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+        c=asciiFromEbcdic[c];
+        if(c==0 || !UCHAR_IS_INVARIANT(c)) {
+            return FALSE; /* found a variant char */
+        }
+#else
+#   error U_CHARSET_FAMILY is not valid
+#endif
+    }
+    return TRUE;
+}
+
+U_CAPI UBool U_EXPORT2
+uprv_isInvariantUString(const UChar *s, int32_t length) {
+    UChar c;
+
+    for(;;) {
+        if(length<0) {
+            /* NUL-terminated */
+            c=*s++;
+            if(c==0) {
+                break;
+            }
+        } else {
+            /* count length */
+            if(length==0) {
+                break;
+            }
+            --length;
+            c=*s++;
+        }
+
+        /*
+         * no assertions here because these functions are legitimately called
+         * for strings with variant characters
+         */
+        if(!UCHAR_IS_INVARIANT(c)) {
+            return FALSE; /* found a variant char */
+        }
+    }
+    return TRUE;
+}
+
+/* UDataSwapFn implementations used in udataswp.c ------- */
+
+/* convert ASCII to EBCDIC and verify that all characters are invariant */
+U_CFUNC int32_t
+uprv_ebcdicFromAscii(const UDataSwapper *ds,
+                     const void *inData, int32_t length, void *outData,
+                     UErrorCode *pErrorCode) {
+    const uint8_t *s;
+    uint8_t *t;
+    uint8_t c;
+
+    int32_t count;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /* setup and swapping */
+    s=(const uint8_t *)inData;
+    t=(uint8_t *)outData;
+    count=length;
+    while(count>0) {
+        c=*s++;
+        if(!UCHAR_IS_INVARIANT(c)) {
+            udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
+                             length, length-count);
+            *pErrorCode=U_INVALID_CHAR_FOUND;
+            return 0;
+        }
+        *t++=ebcdicFromAscii[c];
+        --count;
+    }
+
+    return length;
+}
+
+/* Checks that inData holds only invariant ASCII bytes, then copies it unchanged (no conversion) to outData; returns length, or 0 with *pErrorCode set on failure */
+U_CFUNC int32_t
+uprv_copyAscii(const UDataSwapper *ds,
+               const void *inData, int32_t length, void *outData,
+               UErrorCode *pErrorCode) {
+    const uint8_t *s;
+    uint8_t c;
+
+    int32_t count;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0; /* do nothing if the caller already carries an error */
+    }
+    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /* setup and checking: the whole input is validated before anything is written */
+    s=(const uint8_t *)inData;
+    count=length;
+    while(count>0) {
+        c=*s++;
+        if(!UCHAR_IS_INVARIANT(c)) {
+            udata_printError(ds, "uprv_copyAscii() string[%d] contains a variant character in position %d\n",
+                             length, length-count); /* length-count is the index of the offending byte */
+            *pErrorCode=U_INVALID_CHAR_FOUND;
+            return 0;
+        }
+        --count;
+    }
+
+    if(length>0 && inData!=outData) { /* an in-place call needs no copy */
+        uprv_memcpy(outData, inData, length);
+    }
+
+    return length;
+}
+
+/* convert EBCDIC to ASCII and verify that all characters are invariant */
+U_CFUNC int32_t
+uprv_asciiFromEbcdic(const UDataSwapper *ds,
+                     const void *inData, int32_t length, void *outData,
+                     UErrorCode *pErrorCode) {
+    const uint8_t *s;
+    uint8_t *t;
+    uint8_t c;
+
+    int32_t count;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(ds==NULL || inData==NULL || length<0 ||  (length>0 && outData==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /* setup and swapping */
+    s=(const uint8_t *)inData;
+    t=(uint8_t *)outData;
+    count=length;
+    while(count>0) {
+        c=*s++;
+        if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
+            udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
+                             length, length-count);
+            *pErrorCode=U_INVALID_CHAR_FOUND;
+            return 0;
+        }
+        *t++=c;
+        --count;
+    }
+
+    return length;
+}
+
+/* Checks that inData holds only EBCDIC bytes that map to invariant characters, then copies it unchanged (no conversion) to outData; returns length, or 0 with *pErrorCode set on failure */
+U_CFUNC int32_t
+uprv_copyEbcdic(const UDataSwapper *ds,
+                const void *inData, int32_t length, void *outData,
+                UErrorCode *pErrorCode) {
+    const uint8_t *s;
+    uint8_t c;
+
+    int32_t count;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0; /* do nothing if the caller already carries an error */
+    }
+    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /* setup and checking: the whole input is validated before anything is written */
+    s=(const uint8_t *)inData;
+    count=length;
+    while(count>0) {
+        c=*s++;
+        if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) { /* NUL passes; a 00 table entry means "no mapping" */
+            udata_printError(ds, "uprv_copyEbcdic() string[%d] contains a variant character in position %d\n",
+                             length, length-count); /* length-count is the index of the offending byte */
+            *pErrorCode=U_INVALID_CHAR_FOUND;
+            return 0;
+        }
+        --count;
+    }
+
+    if(length>0 && inData!=outData) { /* an in-place call needs no copy */
+        uprv_memcpy(outData, inData, length);
+    }
+
+    return length;
+}
+
+/* compare an ASCII (first function) or EBCDIC (second function) char string with a UChar string; variant characters compare less than others and unlike each other; returns <0, 0 or >0, and 0 for invalid arguments */
+U_CFUNC int32_t
+uprv_compareInvAscii(const UDataSwapper *ds,
+                     const char *outString, int32_t outLength,
+                     const UChar *localString, int32_t localLength) {
+    int32_t minLength;
+    UChar32 c1, c2;
+    uint8_t c;
+
+    if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
+        return 0; /* invalid arguments compare as equal; ds is not used here */
+    }
+
+    if(outLength<0) { /* a negative length means NUL-terminated */
+        outLength=(int32_t)uprv_strlen(outString);
+    }
+    if(localLength<0) {
+        localLength=u_strlen(localString);
+    }
+
+    minLength= outLength<localLength ? outLength : localLength;
+
+    while(minLength>0) {
+        c=(uint8_t)*outString++;
+        if(UCHAR_IS_INVARIANT(c)) {
+            c1=c;
+        } else {
+            c1=-1; /* variant out character: below every invariant character */
+        }
+
+        c2=*localString++;
+        if(!UCHAR_IS_INVARIANT(c2)) {
+            c2=-2; /* variant local character: below everything, and unlike a variant out character (-1) */
+        }
+
+        if((c1-=c2)!=0) {
+            return c1; /* difference of the first unequal pair */
+        }
+
+        --minLength;
+    }
+
+    /* strings start with same prefix, compare lengths */
+    return outLength-localLength;
+}
+
+U_CFUNC int32_t
+uprv_compareInvEbcdic(const UDataSwapper *ds,
+                      const char *outString, int32_t outLength,
+                      const UChar *localString, int32_t localLength) {
+    int32_t minLength;
+    UChar32 c1, c2;
+    uint8_t c;
+
+    if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
+        return 0; /* invalid arguments compare as equal; ds is not used here */
+    }
+
+    if(outLength<0) { /* a negative length means NUL-terminated */
+        outLength=(int32_t)uprv_strlen(outString);
+    }
+    if(localLength<0) {
+        localLength=u_strlen(localString);
+    }
+
+    minLength= outLength<localLength ? outLength : localLength;
+
+    while(minLength>0) {
+        c=(uint8_t)*outString++;
+        if(c==0) {
+            c1=0; /* NUL is checked first because a 00 table entry otherwise means "no mapping" */
+        } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
+            /* c1 is set to the ASCII code of the invariant character */
+        } else {
+            c1=-1; /* variant out character: below every invariant character */
+        }
+
+        c2=*localString++;
+        if(!UCHAR_IS_INVARIANT(c2)) {
+            c2=-2; /* variant local character: below everything, and unlike a variant out character (-1) */
+        }
+
+        if((c1-=c2)!=0) {
+            return c1; /* difference of the first unequal pair */
+        }
+
+        --minLength;
+    }
+
+    /* strings start with same prefix, compare lengths */
+    return outLength-localLength;
+}
+
+/* end of platform-specific implementation -------------- */
+
+/* version handling --------------------------------------------------------- */
 
 U_CAPI void U_EXPORT2
 u_versionFromString(UVersionInfo versionArray, const char *versionString) {
@@ -2216,7 +2615,8 @@
     "U_STATE_TOO_OLD_ERROR",
     "U_TOO_MANY_ALIASES_ERROR",
     "U_ENUM_OUT_OF_SYNC_ERROR",
-    "U_INVARIANT_CONVERSION_ERROR"
+    "U_INVARIANT_CONVERSION_ERROR",
+    "U_INVALID_STATE_ERROR"
 };
 static const char * const
 _uFmtErrorName[U_FMT_PARSE_ERROR_LIMIT - U_FMT_PARSE_ERROR_START] = {
@@ -2248,7 +2648,8 @@
     "U_BRK_NEW_LINE_IN_QUOTED_STRING",
     "U_BRK_UNDEFINED_VARIABLE",
     "U_BRK_INIT_ERROR",
-    "U_BRK_RULE_EMPTY_SET"
+    "U_BRK_RULE_EMPTY_SET",
+    "U_BRK_UNRECOGNIZED_OPTION"
 };
 
 static const char * const
@@ -2273,8 +2674,8 @@
 static const char * const
 _uIDNAErrorName[U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START] = {
       "U_IDNA_ERROR_START",
-      "U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR",
-      "U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR",
+      "U_IDNA_PROHIBITED_ERROR",
+      "U_IDNA_UNASSIGNED_ERROR",
       "U_IDNA_CHECK_BIDI_ERROR",
       "U_IDNA_STD3_ASCII_RULES_ERROR",
       "U_IDNA_ACE_PREFIX_ERROR",

Index: rbbi.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbi.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbi.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbi.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,14 +1,14 @@
-//
-//  file:  rbbi.c    Contains the implementation of the rule based break iterator
-//                   runtime engine and the API implementation for
-//                   class RuleBasedBreakIterator
-//
 /*
 ***************************************************************************
 *   Copyright (C) 1999-2003 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
+//
+//  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
+//                   runtime engine and the API implementation for
+//                   class RuleBasedBreakIterator
+//
 
 #include "unicode/utypes.h"
 
@@ -17,6 +17,7 @@
 #include "unicode/rbbi.h"
 #include "unicode/schriter.h"
 #include "unicode/udata.h"
+#include "unicode/uclean.h"
 #include "rbbidata.h"
 #include "rbbirb.h"
 #include "cmemory.h"
@@ -30,11 +31,8 @@
 static const int16_t START_STATE = 1;     // The state number of the starting state
 static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
 
-/**
- * Class ID.  (value is irrelevant; address is important)
- */
-const char
-RuleBasedBreakIterator::fgClassID = 0;
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
 
 
 //=======================================================================
@@ -48,8 +46,8 @@
 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
 {
     init();
+    fData = new RBBIDataWrapper(data, status); // status checked in constructor
     if (U_FAILURE(status)) {return;}
-    fData = new RBBIDataWrapper(data, status);
     if(fData == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
@@ -65,8 +63,8 @@
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
 {
     init();
+    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
     if (U_FAILURE(status)) {return;}
-    fData = new RBBIDataWrapper(udm, status);
     if(fData == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
@@ -84,6 +82,7 @@
                                                 UParseError          &parseError,
                                                 UErrorCode           &status)
 {
+    u_init(&status);      // Just in case ICU is not yet initialized
     init();
     if (U_FAILURE(status)) {return;}
     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
@@ -176,7 +175,6 @@
 
     fText                = NULL;
     fData                = NULL;
-    fCharMappings        = NULL;
     fLastBreakTag        = 0;
     fLastBreakTagValid   = TRUE;
     fDictionaryCharCount = 0;
@@ -406,12 +404,19 @@
         return BreakIterator::DONE;
     }
 
+    if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
+        return handlePrevious(fData->fReverseTable);
+    }
+
+    // old rule syntax
     // set things up.  handlePrevious() will back us up to some valid
     // break position before the current position (we back our internal
     // iterator up one step to prevent handlePrevious() from returning
     // the current position), but not necessarily the last one before
     // where we started
+
     int32_t start = current();
+
     fText->previous32();
     int32_t lastResult    = handlePrevious();
     int32_t result        = lastResult;
@@ -421,6 +426,7 @@
     // iterate forward from the known break position until we pass our
     // starting point.  The last break position before the starting
     // point is our return value
+
     for (;;) {
         result         = handleNext();
         if (result == BreakIterator::DONE || result >= start) {
@@ -434,10 +440,9 @@
     // fLastBreakTag wants to have the value for section of text preceding
     // the result position that we are to return (in lastResult.)  If
     // the backwards rules overshot and the above loop had to do two or more
-    //  handleNext()s to move up to the desired return position, we will have a valid
-    //  tag value.  But, if handlePrevious() took us to exactly the correct result positon,
-    //  we wont have a tag value for that position, which is only set by handleNext().
-
+    // handleNext()s to move up to the desired return position, we will have a valid
+    // tag value. But, if handlePrevious() took us to exactly the correct result position,
+    // we won't have a tag value for that position, which is only set by handleNext().
 
     // set the current iteration position to be the last break position
     // before where we started, and then return that value
@@ -447,9 +452,6 @@
     return lastResult;
 }
 
-
-
-
 /**
  * Sets the iterator to refer to the first boundary position following
  * the specified position.
@@ -477,10 +479,48 @@
     // otherwise, set our internal iteration position (temporarily)
     // to the position passed in.  If this is the _beginning_ position,
     // then we can just use next() to get our return value
-    fText->setIndex(offset);
-    if (offset == fText->startIndex())
-        return handleNext();
+    
+    int32_t result = 0;
 
+    if (fData->fSafeRevTable != NULL) {
+        // new rule syntax
+        /// todo synwee 
+        fText->setIndex(offset);
+        // move forward one codepoint to prepare for moving back to a
+        // safe point.
+        // this handles offset being in the middle of a supplementary character
+        fText->next32();
+        // handlePrevious will move most of the time to < 1 boundary away
+        handlePrevious(fData->fSafeRevTable);
+        int32_t result = next();
+        while (result <= offset) {
+            result = next();
+        }
+        return result;
+    }
+    if (fData->fSafeFwdTable != NULL) {
+        // backup plan if reverse safe table is not available
+        fText->setIndex(offset);
+        fText->previous32();
+        // handle next will give result >= offset
+        handleNext(fData->fSafeFwdTable);
+        // previous() will usually land 0 or 1 boundaries away from offset,
+        // but not always, so we have to keep stepping back until we reach
+        // a boundary at or before offset
+        int32_t oldresult = previous();
+        while (oldresult > offset) {
+            int32_t result = previous();
+            if (result <= offset) {
+                return oldresult;
+            }
+            oldresult = result;
+        }
+        int32_t result = next();
+        if (result <= offset) {
+            return next();
+        }
+        return result;
+    }
     // otherwise, we have to sync up first.  Use handlePrevious() to back
     // us up to a known break position before the specified position (if
     // we can determine that the specified position is a break position,
@@ -488,8 +528,14 @@
     // position at or before our starting position.  Advance forward
     // from here until we've passed the starting position.  The position
     // we stop on will be the first break position after the specified one.
+    // old rule syntax
+
+    fText->setIndex(offset);
+    if (offset == fText->startIndex()) {
+        return handleNext();
+    }
+    result = previous();
 
-    int32_t result = previous();
     while (result != BreakIterator::DONE && result <= offset) {
         result = next();
     }
@@ -518,6 +564,48 @@
     // if we start by updating the current iteration position to the
     // position specified by the caller, we can just use previous()
     // to carry out this operation
+
+    if (fData->fSafeFwdTable != NULL) {
+        /// todo synwee
+        // new rule syntax
+        fText->setIndex(offset);
+        // move backwards one codepoint to prepare for moving forwards to a
+        // safe point.
+        // this handles offset being in the middle of a supplementary character
+        fText->previous32();
+        handleNext(fData->fSafeFwdTable);
+        int32_t result = previous();
+        while (result >= offset) {
+            result = previous();
+        }
+        return result;
+    }
+    if (fData->fSafeRevTable != NULL) {
+        // backup plan if forward safe table is not available
+        fText->setIndex(offset);
+        fText->next32();
+        // handle previous will give result <= offset
+        handlePrevious(fData->fSafeRevTable);
+
+        // next() will usually land 0 or 1 boundaries away from offset,
+        // but not always, so we have to keep stepping forward until we reach
+        // a boundary at or after offset
+        int32_t oldresult = next();
+        while (oldresult < offset) {
+            int32_t result = next();
+            if (result >= offset) {
+                return oldresult;
+            }
+            oldresult = result;
+        }
+        int32_t result = previous();
+        if (result >= offset) {
+            return previous();
+        }
+        return result;
+    }
+
+    // old rule syntax
     fText->setIndex(offset);
     return previous();
 }
@@ -536,6 +624,11 @@
         return TRUE;
     }
 
+    if (offset == fText->endIndex()) {
+        last();       // For side effects on current position, tag values.
+        return TRUE;
+    }
+
     // out-of-range indexes are never boundary positions
     if (offset < fText->startIndex()) {
         first();       // For side effects on current position, tag values.
@@ -573,10 +666,14 @@
 //     vectors through here.  This method initializes the state machine to state 1
 //     and advances through the text character by character until we reach the end
 //     of the text or the state machine transitions to state 0.  We update our return
-//     value every time the state machine passes through a possible end state.
+//     value every time the state machine passes through an accepting state.
 //
 //-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handleNext(void) {
+int32_t RuleBasedBreakIterator::handleNext() {
+    return handleNext(fData->fForwardTable);
+}
+
+int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
     if (fTrace) {
         RBBIDebugPrintf("Handle Next   pos   char  state category  \n");
     }
@@ -585,17 +682,13 @@
     fLastBreakTagValid = TRUE;
 
     // if we're already at the end of the text, return DONE.
-    if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex()) {
+    if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
         fLastBreakTag = 0;
         return BreakIterator::DONE;
     }
 
-    // no matter what, we always advance at least one character forward
-    int32_t temp = fText->getIndex();
-    fText->next32();
-    int32_t result = fText->getIndex();
-    fText->setIndex(temp);
-
+    int32_t initialPosition = fText->getIndex();
+    int32_t result          = initialPosition;
     int32_t lookaheadResult = 0;
 
     // Initialize the state machine.  Begin in state 1
@@ -609,7 +702,7 @@
     fLastBreakTag = 0;
 
     row = (RBBIStateTableRow *)    // Point to starting row of state table.
-        (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
+        (statetable->fTableData + (statetable->fRowLen * state));
 
     // Character Category fetch for starting character.
     //    See comments on character category code within loop, below.
@@ -622,15 +715,31 @@
     // loop until we reach the end of the text or transition to state 0
     for (;;) {
         if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
-            // Note: CharacterIterator::DONE is 0xffff, which is also a legal
-            //       character value.  Check for DONE first, because it's quicker,
-            //       but also need to check fText->hasNext() to be certain.
+            // Reached end of input string.
+            //    Note: CharacterIterator::DONE is 0xffff, which is also a legal
+            //          character value.  Check for DONE first, because it's quicker,
+            //          but also need to check fText->hasNext() to be certain.
+
+            if (lookaheadResult > result) {
+                // We ran off the end of the string with a pending look-ahead match.
+                // Treat this as if the look-ahead condition had been met, and return
+                //  the match at the / position from the look-ahead rule.
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
+                lookaheadStatus = 0;
+            } else if (result == initialPosition) {
+                // Ran off end, no match found.
+                // move forward one
+                fText->setIndex(initialPosition);
+                fText->next32();
+                fText->getIndex();
+            }
             break;
         }
         // look up the current character's character category, which tells us
         // which column in the state table to look at.
         // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
-        //        not the size of the character going in.
+        //        not the size of the character going in, which is a UChar32.
         //
         UTRIE_GET16(&fData->fTrie, c, category);
 
@@ -658,78 +767,74 @@
         // look up a state transition in the state table
         state = row->fNextState[category];
         row = (RBBIStateTableRow *)
-            (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
+            (statetable->fTableData + (statetable->fRowLen * state));
 
         // Get the next character.  Doing it here positions the iterator
         //    to the correct position for recording matches in the code that
         //    follows.
-        //  TODO:  16 bit next, and a 16 bit TRIE lookup, with escape code
-        //         for non-BMP chars, would be faster.
         c = fText->next32();
 
-        if (row->fAccepting == 0 && row->fLookAhead == 0) {
-            // No match, nothing of interest happening, common case.
-            goto continueOn;
-        }
-
         if (row->fAccepting == -1) {
-            // Match found, common case, no lookahead involved.
-            //    (It's possible that some lookahead rule matched here also,
-            //     but since there's an unconditional match, we'll favor that.)
-            result          = fText->getIndex();
-            lookaheadStatus = 0;           // clear out any pending look-ahead matches.
+            // Match found, common case, could have lookahead so we move on to check it
+            result = fText->getIndex();
+            /// added
             fLastBreakTag   = row->fTag;   // Remember the break status (tag) value.
-            goto continueOn;
-        }
-
-        if (row->fAccepting == 0 && row->fLookAhead != 0) {
-            // Lookahead match point.  Remember it, but only if no other rule has
-            //                         unconitionally matched up to this point.
-            // TODO:  handle case where there's a pending match from a different rule -
-            //        where lookaheadStatus != 0  && lookaheadStatus != row->fLookAhead.
-            int32_t  r = fText->getIndex();
-            if (r > result) {
-                lookaheadResult = r;
-                lookaheadStatus = row->fLookAhead;
-                lookaheadTag   = row->fTag;
-            }
-            goto continueOn;
         }
 
-        if (row->fAccepting != 0 && row->fLookAhead != 0) {
-            // Lookahead match is completed.  Set the result accordingly, but only
-            //   if no other rule has matched further in the mean time.
-            if (lookaheadResult > result) {
-                U_ASSERT(row->fAccepting == lookaheadStatus);   // TODO:  handle this case
-                //    of overlapping lookahead matches.
+        if (row->fLookAhead != 0) {
+            if (lookaheadStatus != 0 
+                && row->fAccepting == lookaheadStatus) { 
+                // Lookahead match is completed.  Set the result accordingly, but only
+                // if no other rule has matched further in the mean time.
                 result          = lookaheadResult;
                 fLastBreakTag   = lookaheadTag;
                 lookaheadStatus = 0;
+                /// i think we have to back up to read the lookahead character again
+                /// fText->setIndex(lookaheadResult);
+                /// TODO: this is a simple hack since reverse rules only have simple
+                /// lookahead rules that we can definitely break out from.
+                /// we need to make the lookahead rules not chain eventually.
+                /// return result;
+                /// this is going to be the longest match again
+                goto continueOn;
             }
+
+            int32_t  r = fText->getIndex();
+            lookaheadResult = r;
+            lookaheadStatus = row->fLookAhead;
+            lookaheadTag    = row->fTag;
             goto continueOn;
         }
 
+
+        if (row->fAccepting == 0) {
+            // No match, nothing of interest happening, common case.
+            goto continueOn;
+        }
+
+        lookaheadStatus = 0;           // clear out any pending look-ahead matches.
+
 continueOn:
         if (state == STOP_STATE) {
+            // This is the normal exit from the lookup state machine.
+            // We have advanced through the string until it is certain that no
+            //   longer match is possible, no matter what characters follow.
             break;
-        }
-
-        // c = fText->next32();
+        } 
     }
 
-    // if we've run off the end of the text, and the very last character took us into
-    // a lookahead state, advance the break position to the lookahead position
-    // (the theory here is that if there are no characters at all after the lookahead
-    // position, that always matches the lookahead criteria)
-    //   TODO:  is this really the right behavior?
-    if (c == CharacterIterator::DONE &&
-        fText->hasNext()==FALSE &&
-        lookaheadResult == fText->endIndex()) {
-            result          = lookaheadResult;
-            fLastBreakTag   = lookaheadTag;
-    }
+    // The state machine is done.  Check whether it found a match...
 
+    // If the iterator failed to advance in the match engine, force it ahead by one.
+    //   (This really indicates a defect in the break rules.  They should always match
+    //    at least one character.)
+    if (result == initialPosition) {
+        result = fText->setIndex(initialPosition);
+        fText ->next32();
+        result = fText->getIndex();
+    }
 
+    // Leave the iterator at our result position.
     fText->setIndex(result);
     if (fTrace) {
         RBBIDebugPrintf("result = %d\n\n", result);
@@ -737,18 +842,6 @@
     return result;
 }
 
-//-----------------------------------------------------------------------------------
-//
-//  handlePrevious()
-//
-//      This method backs the iterator back up to a "safe position" in the text.
-//      This is a position that we know, without any context, must be a break position.
-//      The various calling methods then iterate forward from this safe position to
-//      the appropriate position to return.
-//
-//      The logic of this function is very similar to handleNext(), above.
-//
-//-----------------------------------------------------------------------------------
 int32_t RuleBasedBreakIterator::handlePrevious(void) {
     if (fText == NULL || fData == NULL) {
         return 0;
@@ -873,6 +966,166 @@
 }
 
 
+//-----------------------------------------------------------------------------------
+//
+//  handlePrevious(statetable)
+//
+//      This method backs the iterator up to a "safe position" in the text by
+//      running the given state table over the text in the backwards direction.
+//      The safe position is one that, without any further context, is known to be
+//      not more than 2 breaks away; occasionally it is less than one break away.
+//      The various calling methods then iterate forward from this safe position to
+//      the appropriate position to return.
+//      Returns 0 if there is no text or no state table; otherwise leaves the text
+//      iterator at the result position and returns that position.
+//      The logic of this function is very similar to handleNext(), above.
+//-----------------------------------------------------------------------------------
+int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
+    if (fText == NULL || statetable == NULL) {
+        return 0;
+    }
+    // break tag is no longer valid after icu switched to exact backwards
+    // positioning.
+    fLastBreakTagValid = FALSE;
+    if (statetable == NULL) {    // NOTE(review): unreachable; a NULL table already returned 0 above
+        return fText->setToStart();
+    }
+
+    int32_t            state              = START_STATE;
+    int32_t            category;
+    int32_t            lastCategory       = 0;
+    UBool              hasPassedStartText = !fText->hasPrevious(); 
+    UChar32            c                  = fText->previous32();
+    // index of the character just stepped over by previous32()
+    int32_t            result             = fText->getIndex(); 
+    int32_t            lookaheadStatus    = 0;
+    int32_t            lookaheadResult    = 0;
+    int32_t            lookaheadTag       = 0;
+    UBool              lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
+
+    RBBIStateTableRow *row;
+
+    row = (RBBIStateTableRow *)
+        (statetable->fTableData + (state * statetable->fRowLen));
+    UTRIE_GET16(&fData->fTrie, c, category);
+    if ((category & 0x4000) != 0)  {
+        fDictionaryCharCount++;
+        category &= ~0x4000;
+    }
+
+    if (fTrace) {
+        RBBIDebugPrintf("Handle Prev   pos   char  state category  \n");
+    }
+
+    // loop until we reach the beginning of the text or transition to state 0
+    for (;;) {
+        // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
+        if (hasPassedStartText) { 
+            // if we have already considered the start of the text
+            if (row->fLookAhead != 0 && lookaheadResult == 0) {
+                result = 0;
+            }
+            break;
+        }
+
+        // save the last character's category and look up the current
+        // character's category
+        lastCategory = category;
+        UTRIE_GET16(&fData->fTrie, c, category);
+
+        // Check the dictionary bit in the character's category.
+        //    Counter is only used by dictionary based iterators.
+        //
+        if ((category & 0x4000) != 0)  {
+            fDictionaryCharCount++;
+            category &= ~0x4000;
+        }
+
+        if (fTrace) {
+            RBBIDebugPrintf("             %4d   ", fText->getIndex());
+            if (0x20<=c && c<0x7f) {
+                RBBIDebugPrintf("\"%c\"  ", c);
+            } else {
+                RBBIDebugPrintf("%5x  ", c);
+            }
+            RBBIDebugPrintf("%3d  %3d\n", state, category);
+        }
+
+        // look up a state transition in the backwards state table
+        state = row->fNextState[category];
+        row = (RBBIStateTableRow *)
+            (statetable->fTableData + (state * statetable->fRowLen));
+    
+        if (row->fAccepting == -1) {
+            // Match found, common case, could have lookahead so we move on to check it
+            result = fText->getIndex();
+            /// added
+            fLastBreakTag   = row->fTag;   // Remember the break status (tag) value.
+        }
+
+        if (row->fLookAhead != 0) {
+            if (lookaheadStatus != 0 
+                && row->fAccepting == lookaheadStatus) { 
+                // Lookahead match is completed: this row's accepting value equals the
+                // pending lookahead status.  Use the saved lookahead position and tag.
+                result          = lookaheadResult;
+                fLastBreakTag   = lookaheadTag;
+                lookaheadStatus = 0;
+                /// i think we have to back up to read the lookahead character again
+                /// fText->setIndex(lookaheadResult);
+                /// TODO: this is a simple hack since reverse rules only have simple
+                /// lookahead rules that we can definitely break out from.
+                /// we need to make the lookahead rules not chain eventually.
+                /// return result;
+                /// this is going to be the longest match again
+
+                /// syn wee todo hard coded for line breaks stuff
+                /// needs to provide a tag in rules to ensure a stop.
+
+                if (lookAheadHardBreak) {
+                    fText->setIndex(result);
+                    return result;
+                }
+                category = lastCategory;
+                fText->setIndex(result);
+              
+                goto continueOn;
+            }
+
+            int32_t  r = fText->getIndex();
+            lookaheadResult = r;
+            lookaheadStatus = row->fLookAhead;
+            lookaheadTag    = row->fTag;
+            goto continueOn;
+        }
+
+        // not lookahead
+        if (row->fAccepting == 0) {
+            // No match, nothing of interest happening, common case.
+            goto continueOn;
+        }
+
+        lookaheadStatus = 0;     // clear out any pending look-ahead matches.
+
+continueOn:
+        if (state == STOP_STATE) { 
+            break;
+        }
+
+        // then advance one character backwards
+        hasPassedStartText = !fText->hasPrevious(); 
+        c = fText->previous32();
+    }
+
+    // Note:  the result position isn't what is returned to the user by previous(),
+    //        but where the implementation of previous() turns around and
+    //        starts iterating forward again.
+    fText->setIndex(result);
+
+    return result;
+}
+
+
 void
 RuleBasedBreakIterator::reset()
 {
@@ -908,7 +1161,11 @@
             int32_t pa = current();
             nonConstThis->previous();
             int32_t pb = nonConstThis->next();
-            U_ASSERT(pa == pb);
+            if (pa != pb) {
+                // note: the if (pa != pb) test is here only to eliminate warnings for
+                //       unused local variables on gcc.  Logically, it isn't needed.
+                U_ASSERT(pa == pb);
+            }
         }
     }
     return nonConstThis->fLastBreakTag;

Index: rbbicst.pl
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbicst.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbicst.pl	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbicst.pl	6 Apr 2004 10:07:59 -0000	1.2
@@ -288,7 +288,7 @@
     print "//    It is generated by the Perl script \"rbbicst.pl\" from\n";
     print "//    the rule parser state definitions file \"rbbirpt.txt\".\n";
     print "//\n";
-    print "//   Copyright (C) 2002 International Business Machines Corporation \n";
+    print "//   Copyright (C) 2002-2003 International Business Machines Corporation \n";
     print "//   and others. All rights reserved.  \n";
     print "//\n";
     print "//---------------------------------------------------------------------------------\n";

Index: rbbidata.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbidata.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbidata.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbidata.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -51,7 +51,9 @@
 
 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
     const RBBIDataHeader *d = (const RBBIDataHeader *)
-        ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
+        // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
+        // taking into consideration the padding added in by udata_write
+        ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
     init(d, status);
     fUDataMem = udm;
 }
@@ -73,11 +75,21 @@
     }
 
     fUDataMem     = NULL;
-    fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
     fReverseTable = NULL;
+    fSafeFwdTable = NULL;
+    fSafeRevTable = NULL;
+    if (data->fFTableLen != 0) {
+        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
+    }
     if (data->fRTableLen != 0) {
         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
     }
+    if (data->fSFTableLen != 0) {
+        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
+    }
+    if (data->fSRTableLen != 0) {
+        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
+    }
 
 
     utrie_unserialize(&fTrie,
@@ -183,45 +195,219 @@
 //  print   -  debugging function to dump the runtime data tables.
 //
 //-----------------------------------------------------------------------------
-void  RBBIDataWrapper::printData() {
 #ifdef RBBI_DEBUG
-    uint32_t c, s;
+void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
+    uint32_t   c;
+    uint32_t   s;
 
-    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
-    RBBIDebugPrintf("   Version = %d\n", fHeader->fVersion);
-    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
-    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
+    RBBIDebugPrintf("   %s\n", heading);
 
-    RBBIDebugPrintf("   Forward State Transition Table\n");
     RBBIDebugPrintf("State |  Acc  LA   Tag");
     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
-    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {RBBIDebugPrintf("----");}
+    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
+        RBBIDebugPrintf("----");
+    }
     RBBIDebugPrintf("\n");
 
-    for (s=0; s<fForwardTable->fNumStates; s++) {
+    if (table == NULL) {
+        RBBIDebugPrintf("         N U L L   T A B L E\n\n");
+        return;
+    }
+    for (s=0; s<table->fNumStates; s++) {
         RBBIStateTableRow *row = (RBBIStateTableRow *)
-                                  (fForwardTable->fTableData + (fForwardTable->fRowLen * s));
+                                  (table->fTableData + (table->fRowLen * s));
         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag);
         for (c=0; c<fHeader->fCatCount; c++)  {
             RBBIDebugPrintf("%3d ", row->fNextState[c]);
         }
         RBBIDebugPrintf("\n");
     }
+    RBBIDebugPrintf("\n");
+}
+#endif
+
+
+#ifdef RBBI_DEBUG
+void  RBBIDataWrapper::printData() {
+    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
+    RBBIDebugPrintf("   Version = %d\n", fHeader->fVersion);
+    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
+    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
+
+    printTable("Forward State Transition Table", fForwardTable);
+    printTable("Reverse State Transition Table", fReverseTable);
+    printTable("Safe Forward State Transition Table", fSafeFwdTable);
+    printTable("Safe Reverse State Transition Table", fSafeRevTable);
 
     RBBIDebugPrintf("\nOrignal Rules source:\n");
-    c = 0;
-    for (;;) {
-        if (fRuleSource[c] == 0)
-            break;
+    for (int32_t c=0; fRuleSource[c] != 0; c++) {
         RBBIDebugPrintf("%c", fRuleSource[c]);
-        c++;
     }
     RBBIDebugPrintf("\n\n");
-#endif
 }
+#endif
 
+U_NAMESPACE_END
 
+//-----------------------------------------------------------------------------
+//  ubrk_swap   -  byte swap and char encoding swap of RBBI data.
+//                 Returns the total size (ICU data header + RBBI data), or 0 on
+//                 error.  A negative length preflights: only the size is returned.
+//-----------------------------------------------------------------------------
+
+U_CAPI int32_t U_EXPORT2
+ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
+           UErrorCode *status) {
+
+    if (status == NULL || U_FAILURE(*status)) {
+        return 0;
+    }
+
+    //
+    //  Check that the data header is for break data.
+    //    (Header contents are defined in genbrk.cpp)
+    //
+    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
+    if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
+           pInfo->dataFormat[1]==0x72 &&
+           pInfo->dataFormat[2]==0x6b &&
+           pInfo->dataFormat[3]==0x20 &&
+           pInfo->formatVersion[0]==3  )) {
+        udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *status=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    //
+    // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
+    //                         RBBIDataHeader).  This swap also conveniently gets us
+    //                         the size of the ICU d.h., which lets us locate the start
+    //                         of the RBBI specific data.
+    //
+    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
+    // NOTE(review): *status is not re-checked here before headerSize is used - confirm intended.
+
+    //
+    // Get the RBBI Data Header, and check that it appears to be OK.
+    //
+    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
+    RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
+    if (ds->readUInt32(rbbiDH->fMagic)   != 0xb1a0 ||
+        ds->readUInt32(rbbiDH->fVersion) != 1      ||
+        ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader)) 
+    {
+        udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
+        *status=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    //
+    // Preflight operation?  Just return the size
+    //
+    int32_t totalSize = headerSize + ds->readUInt32(rbbiDH->fLength);
+    if (length < 0) {
+        return totalSize;
+    }
+
+    //
+    // Check that length passed in is consistent with length from RBBI data header.
+    //
+    if (length > 0) {
+        length -= headerSize;
+        if ((uint32_t)length < ds->readUInt32(rbbiDH->fLength)) {
+            udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
+                             length);
+            *status=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+    }
+
+
+    //
+    // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
+    //                 we need to reference the header to locate the data, and an
+    //                 inplace swap of the header leaves it unusable.
+    //
+    uint8_t *outBytes = (uint8_t *)outData + headerSize;
+    int32_t   tableStartOffset;
+    int32_t   tableLength;
+
+    //
+    // If not swapping in place, zero out the output buffer before starting.
+    //    Individual tables and other data items within are aligned to 8 byte boundaries
+    //    when originally created.  Any unused space between items needs to be zero.
+    //
+    if (inBytes != outBytes) {
+       uprv_memset(outBytes, 0, length);
+    }
+
+    //
+    // Each state table begins with several 32 bit fields.  Calculate the size
+    //   in bytes of these.
+    //
+    RBBIStateTable *stp = NULL;
+    int32_t         topSize = (char *)stp->fTableData - (char *)stp;  // NOTE(review): offset taken through a NULL pointer; offsetof() would be cleaner
+
+    // Forward state table.  
+    tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
+    tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
+
+	if (tableLength > 0) {
+		ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
+							outBytes+tableStartOffset, status);
+		ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
+							outBytes+tableStartOffset+topSize, status);
+	}
+    
+    // Reverse state table.  Same layout as forward table, above.
+    tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
+    tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
+
+	if (tableLength > 0) {
+		ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
+							outBytes+tableStartOffset, status);
+		ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
+							outBytes+tableStartOffset+topSize, status);
+	}
+
+    // Safe Forward state table.  Same layout as forward table, above.
+    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
+    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
+
+	if (tableLength > 0) {
+		ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
+							outBytes+tableStartOffset, status);
+		ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
+							outBytes+tableStartOffset+topSize, status);
+	}
+
+    // Safe Reverse state table.  Same layout as forward table, above.
+    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
+    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
+
+	if (tableLength > 0) {
+		ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 
+							outBytes+tableStartOffset, status);
+		ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
+							outBytes+tableStartOffset+topSize, status);
+	}
+
+    // Trie table for character categories
+    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
+                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
+
+    // Source Rules Text.  It's UChar data
+    ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
+                        outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
+
+    // And, last, the header.  All 32 bit values.
+    ds->swapArray32(ds, inBytes,  sizeof(RBBIDataHeader), outBytes, status);
+
+    return totalSize;
+}
 
-U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

Index: rbbidata.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbidata.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbidata.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbidata.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,102 +1,130 @@
-//  file:  rbbidata.h
-//
-//**********************************************************************
-//   Copyright (C) 1999 IBM Corp. All rights reserved.
-//**********************************************************************
-//
-//   RBBI data formats  Includes
-//
-//                          Structs that describes the format of the Binary RBBI data,
-//                          as it is stored in ICU's data file.
-//
-//      RBBIDataWrapper  -  Instances of this class sit between the
-//                          raw data structs and the RulesBasedBreakIterator objects
-//                          that are created by applications.  The wrapper class
-//                          provides reference counting for the underlying data,
-//                          and direct pointers to data that would not otherwise
-//                          be accessible without ugly pointer arithmetic.  The
-//                          wrapper does not attempt to provide any higher level
-//                          abstractions for the data itself.
-//
-//                          There will be only one instance of RBBIDataWrapper for any
-//                          set of RBBI run time data being shared by instances
-//                          (clones) of RulesBasedBreakIterator.
-//
+/*
+*******************************************************************************
+*
+*   Copyright (C) 1999-2003, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  rbbidata.h
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   RBBI data formats  Includes
+*
+*                          Structs that describe the format of the Binary RBBI data,
+*                          as it is stored in ICU's data file.
+*
+*      RBBIDataWrapper  -  Instances of this class sit between the
+*                          raw data structs and the RulesBasedBreakIterator objects
+*                          that are created by applications.  The wrapper class
+*                          provides reference counting for the underlying data,
+*                          and direct pointers to data that would not otherwise
+*                          be accessible without ugly pointer arithmetic.  The
+*                          wrapper does not attempt to provide any higher level
+*                          abstractions for the data itself.
+*
+*                          There will be only one instance of RBBIDataWrapper for any
+*                          set of RBBI run time data being shared by instances
+*                          (clones) of RulesBasedBreakIterator.
+*/
 
 #ifndef __RBBIDATA_H__
 #define __RBBIDATA_H__
 
 #include "unicode/utypes.h"
+#include "unicode/udata.h"
+#include "udataswp.h"
+
+/**
+ * Swap RBBI data. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ubrk_swap(const UDataSwapper *ds,
+          const void *inData, int32_t length, void *outData,
+          UErrorCode *pErrorCode);
+
+#ifdef XP_CPLUSPLUS
+
 #include "unicode/uobject.h"
 #include "unicode/unistr.h"
-#include "unicode/udata.h"
 #include "utrie.h"
 
-
 U_NAMESPACE_BEGIN
 
-//
-//  The following structs map exactly onto the raw data from ICU common data file.
-//
+/*  
+ *   The following structs map exactly onto the raw data from ICU common data file. 
+ */
 struct RBBIDataHeader {
-    uint32_t         fMagic;       // == 0xbla0
-    uint32_t         fVersion;     // == 1
-    uint32_t         fLength;      // Total length in bytes of this RBBI Data,
-                                   //     including all sections, not just the header.
-    uint32_t         fCatCount;    // Number of character categories.
+    uint32_t         fMagic;       /*  == 0xb1a0 */
+    uint32_t         fVersion;     /*  == 1 */
+    uint32_t         fLength;      /*  Total length in bytes of this RBBI Data, */
+                                   /*      including all sections, not just the header. */
+    uint32_t         fCatCount;    /*  Number of character categories. */
 
-    //
-    // Offsets and sizes of each of the subsections within the RBBI data.
-    // All offsets are bytes from the start of the RBBIDataHeader.
-    // All sizes are in bytes.
-    //
-    uint32_t         fFTable;      // forward state transition table.
+    /*  */
+    /*  Offsets and sizes of each of the subsections within the RBBI data. */
+    /*  All offsets are bytes from the start of the RBBIDataHeader. */
+    /*  All sizes are in bytes. */
+    /*  */
+    uint32_t         fFTable;      /*  forward state transition table. */
     uint32_t         fFTableLen;
-    uint32_t         fRTable;      // Offset to the reverse state transition table.
+    uint32_t         fRTable;      /*  Offset to the reverse state transition table. */
     uint32_t         fRTableLen;
-    uint32_t         fTrie;        // Offset to Trie data for character categories
+    uint32_t         fSFTable;     /*  safe point forward transition table */
+    uint32_t         fSFTableLen;
+    uint32_t         fSRTable;     /*  safe point reverse transition table */
+    uint32_t         fSRTableLen;
+    uint32_t         fTrie;        /*  Offset to Trie data for character categories */
     uint32_t         fTrieLen;
-    uint32_t         fRuleSource;  // Offset to the source for for the break
-    uint32_t         fRuleSourceLen;  //   rules.  Stored UChar *.
+    uint32_t         fRuleSource;  /*  Offset to the source for the break */
+    uint32_t         fRuleSourceLen;  /*    rules.  Stored UChar *. */
 
-    uint32_t         fReserved[8]; // Reserved for expansion
+    uint32_t         fReserved[8]; /*  Reserved for expansion */
 
 };
 
 
 
 struct  RBBIStateTableRow {
-    int16_t          fAccepting;    // Non-zero if this row is for an accepting state.
-                                    // Value is the {nnn} value to return to calling
-                                    //    application.
-    int16_t          fLookAhead;    // Non-zero if this row is for a state that
-                                    //   corresponds to a '/' in the rule source.
-                                    //   Value is the same as the fAccepting
-                                    //     value for the rule (which will appear
-                                    //     in a different state.
-    int16_t          fTag;          // Non-zero if this row covers a {tagged} position
-                                    //    from a rule.  value is the tag number.
+    int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state. */
+                                    /*  Value is the {nnn} value to return to calling */
+                                    /*     application. */
+    int16_t          fLookAhead;    /*  Non-zero if this row is for a state that */
+                                    /*    corresponds to a '/' in the rule source. */
+                                    /*    Value is the same as the fAccepting */
+                                    /*      value for the rule (which will appear */
+                                    /*      in a different state). */
+    int16_t          fTag;          /*  Non-zero if this row covers a {tagged} position */
+                                    /*     from a rule.  value is the tag number. */
     int16_t          fReserved;
-    uint16_t         fNextState[2]; // Next State, indexed by char category.
-                                    //   Array Size is fNumCols from the
-                                    //   state table header.
-                                    //   CAUTION:  see RBBITableBuilder::getTableSize()
-                                    //             before changing anything here.
+    uint16_t         fNextState[2]; /*  Next State, indexed by char category. */
+                                    /*    Array Size is fNumCols from the */
+                                    /*    state table header. */
+                                    /*    CAUTION:  see RBBITableBuilder::getTableSize() */
+                                    /*              before changing anything here. */
 };
 
 
 struct RBBIStateTable {
-    uint32_t         fNumStates;    // Number of states.
-    uint32_t         fRowLen;       // Length of a state table row, in bytes.
-    char             fTableData[4]; // First RBBIStateTableRow begins here.
-                                    //   (making it char[] simplifies ugly address
-                                    //    arithmetic for indexing variable length rows.)
+    uint32_t         fNumStates;    /*  Number of states.                                 */
+    uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
+    uint32_t         fFlags;        /*  Option Flags for this state table                 */
+    uint32_t         fReserved;     /*  reserved                                          */
+    char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
+                                    /*    (making it char[] simplifies ugly address       */
+                                    /*     arithmetic for indexing variable length rows.) */
 };
 
+typedef enum {
+    RBBI_LOOKAHEAD_HARD_BREAK = 1
+} RBBIStateTableFlags;
 
-//
-//  The reference counting wrapper class
-//
+
+/*  */
+/*   The reference counting wrapper class */
+/*  */
 class RBBIDataWrapper : public UMemory {
 public:
     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
@@ -109,14 +137,22 @@
     UBool                 operator ==(const RBBIDataWrapper &other) const;
     int32_t               hashCode();
     const UnicodeString  &getRuleSourceString();
+#ifdef RBBI_DEBUG
     void                  printData();
+    void                  printTable(const char *heading, const RBBIStateTable *table);
+#else
+    #define printData()
+    #define printTable(heading, table)
+#endif
 
-    //
-    //  Pointers to items within the data
-    //
+    /*  */
+    /*   Pointers to items within the data */
+    /*  */
     const RBBIDataHeader     *fHeader;
     const RBBIStateTable     *fForwardTable;
     const RBBIStateTable     *fReverseTable;
+    const RBBIStateTable     *fSafeFwdTable;
+    const RBBIStateTable     *fSafeRevTable;
     const UChar              *fRuleSource;
 
     UTrie               fTrie;
@@ -126,11 +162,14 @@
     UDataMemory        *fUDataMem;
     UnicodeString       fRuleString;
 
-    RBBIDataWrapper(const RBBIDataWrapper &other); // forbid copying of this class
-    RBBIDataWrapper &operator=(const RBBIDataWrapper &other); // forbid copying of this class
+    RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class */
+    RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class */
 };
 
+
+
 U_NAMESPACE_END
 
-#endif
+#endif /* C++ */
 
+#endif

Index: rbbinode.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbinode.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbinode.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbinode.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -263,8 +263,8 @@
 //    print.         Print out a single node, for debugging.
 //
 //-------------------------------------------------------------------------
-void RBBINode::print() {
 #ifdef RBBI_DEBUG
+void RBBINode::printNode() {
     static const char * const nodeTypeNames[] = {
                 "setRef",
                 "uset",
@@ -284,21 +284,23 @@
                 "opLParen"
     };
 
-    RBBIDebugPrintf("%10p  %12s  %10p  %10p  %10p      %4d     %6d   %d ",
-        (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
-        fSerialNum, fFirstPos, fVal);
-    if (fType == varRef) {
-        printUnicodeString(fText);
+    if (this==NULL) {
+        RBBIDebugPrintf("%10p", (void *)this);
+    } else {
+        RBBIDebugPrintf("%10p  %12s  %10p  %10p  %10p      %4d     %6d   %d ",
+            (void *)this, nodeTypeNames[fType], (void *)fParent, (void *)fLeftChild, (void *)fRightChild,
+            fSerialNum, fFirstPos, fVal);
+        if (fType == varRef) {
+            RBBI_DEBUG_printUnicodeString(fText);
+        }
     }
     RBBIDebugPrintf("\n");
-#endif
 }
+#endif
 
 
 #ifdef RBBI_DEBUG
-void RBBINode::printUnicodeString(const UnicodeString &, int) {}
-#else
-void RBBINode::printUnicodeString(const UnicodeString &s, int minWidth)
+U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth)
 {
     int i;
     for (i=0; i<s.length(); i++) {
@@ -318,24 +320,24 @@
 //
 //-------------------------------------------------------------------------
 #ifdef RBBI_DEBUG
-void RBBINode::printTree(UBool, UBool) {}
-#else
-void RBBINode::printTree(UBool printHeading, UBool doVars) {
+void RBBINode::printTree(UBool printHeading) {
     if (printHeading) {
         RBBIDebugPrintf( "-------------------------------------------------------------------\n"
                          "    Address       type         Parent   LeftChild  RightChild    serial  position value\n"
               );
     }
-    this->print();
-    // Only dump the definition under a variable reference if asked to.
-    // Unconditinally dump children of all other node types.
-    if (fType != varRef || doVars) {
-        if (fLeftChild != NULL) {
-            fLeftChild->printTree(FALSE);
-        }
-
-        if (fRightChild != NULL) {
-            fRightChild->printTree(FALSE);
+    this->printNode();
+    if (this != NULL) {
+        // Do not dump the definition under a variable reference.
+        // Unconditionally dump children of all other node types.
+        if (fType != varRef) {
+            if (fLeftChild != NULL) {
+                fLeftChild->printTree(FALSE);
+            }
+            
+            if (fRightChild != NULL) {
+                fRightChild->printTree(FALSE);
+            }
         }
     }
 }

Index: rbbinode.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbinode.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbinode.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbinode.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,6 +1,6 @@
 /********************************************************************
  * COPYRIGHT:
- * Copyright (c) 2001-2002, International Business Machines Corporation and
+ * Copyright (c) 2001-2003, International Business Machines Corporation and
  * others. All Rights Reserved.
  ********************************************************************/
 
@@ -93,9 +93,16 @@
         void         flattenSets();
         void         findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
 
-        void        print();
-        void        printTree(UBool withHeading=TRUE, UBool doVars=FALSE);
-        static void printUnicodeString(const UnicodeString &s, int minWidth=0);
+#ifdef RBBI_DEBUG
+        void        printNode();
+        void        printTree(UBool withHeading);
+#else
+        // Do-nothing macros for non-debug builds.  Can't make empty defines for member
+        //   functions - they won't compile at the call sites.
+        int         fakeField;
+        #define printNode() fakeField=0;
+        #define printTree(withHeading) fakeField=0;
+#endif
 
     private:
         RBBINode &operator = (const RBBINode &other); // No defs.
@@ -104,6 +111,12 @@
         int           fSerialNum;           //  Debugging aids.
         static int    gLastSerial;
 };
+
+#ifdef RBBI_DEBUG
+U_CFUNC void 
+RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth=0);
+#endif
+
 U_NAMESPACE_END
 
 #endif

Index: rbbirb.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbirb.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbirb.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbirb.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -47,7 +47,7 @@
                                        UErrorCode      &status)
  : fRules(rules)
 {
-    fStatus     = &status;
+    fStatus = &status; // status is checked below
     fParseError = &parseErr;
     fDebugEnv   = NULL;
 #ifdef RBBI_DEBUG
@@ -57,11 +57,28 @@
 
     fForwardTree        = NULL;
     fReverseTree        = NULL;
+    fSafeFwdTree        = NULL;
+    fSafeRevTree        = NULL;
+    fDefaultTree        = &fForwardTree;
     fForwardTables      = NULL;
     fReverseTables      = NULL;
-    fUSetNodes          = new UVector(status);
+    fSafeFwdTables      = NULL;
+    fSafeRevTables      = NULL;
+    fChainRules         = FALSE;
+    fLBCMNoChain        = FALSE;
+    fLookAheadHardBreak = FALSE;
+
+    UErrorCode oldstatus = status;   
+
+    fUSetNodes          = new UVector(status); // because status gets overwritten here
     fScanner            = new RBBIRuleScanner(this);
     fSetBuilder         = new RBBISetBuilder(this);
+    if (U_FAILURE(oldstatus)) {
+        status = oldstatus;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
@@ -89,8 +106,13 @@
     delete fSetBuilder;
     delete fForwardTables;
     delete fReverseTables;
+    delete fSafeFwdTables;
+    delete fSafeRevTables;
+
     delete fForwardTree;
     delete fReverseTree;
+    delete fSafeFwdTree;
+    delete fSafeRevTree;
     delete fScanner;
 }
 
@@ -123,11 +145,13 @@
     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
     int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
+    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
+    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
     int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
 
     int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
-                                + trieSize + rulesSize;
+                                + safeFwdTableSize + safeRevTableSize + trieSize + rulesSize;
     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
     if (data == NULL) {
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
@@ -143,17 +167,24 @@
 
     data->fFTable        = headerSize;
     data->fFTableLen     = forwardTableSize;
-    data->fRTable        = data->fFTable + forwardTableSize;
+    data->fRTable        = data->fFTable  + forwardTableSize;
     data->fRTableLen     = reverseTableSize;
-    data->fTrie          = data->fRTable + reverseTableSize;
+    data->fSFTable       = data->fRTable  + reverseTableSize;
+    data->fSFTableLen    = safeFwdTableSize;
+    data->fSRTable       = data->fSFTable + safeFwdTableSize;
+    data->fSRTableLen    = safeRevTableSize;
+
+    data->fTrie          = data->fSRTable + safeRevTableSize;
     data->fTrieLen       = fSetBuilder->getTrieSize();
-    data->fRuleSource    = data->fTrie   + trieSize;
+    data->fRuleSource    = data->fTrie    + trieSize;
     data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
 
     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
 
     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
     fReverseTables->exportTable((uint8_t *)data + data->fRTable);
+    fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
+    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
     strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
 
@@ -176,9 +207,7 @@
                                     UParseError      &parseError,
                                     UErrorCode       &status)
 {
-    if (U_FAILURE(status)) {
-        return NULL;
-    }
+    // status checked below
 
     //
     // Read the input rules, generate a parse tree, symbol table,
@@ -186,7 +215,7 @@
     //
     RBBIRuleBuilder  builder(rules, parseError, status);
     builder.fScanner->parse();
-    if (U_FAILURE(status)) {
+    if (U_FAILURE(status)) { // status checked here because the build below doesn't
         return NULL;
     }
 
@@ -204,24 +233,29 @@
     //
     builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
     builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
-    if(builder.fForwardTables == NULL || builder.fReverseTables == NULL) {
+    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
+    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
+    if (U_SUCCESS(status)
+        && (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
+            builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)) 
+    {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
 
     builder.fForwardTables->build();
     builder.fReverseTables->build();
+    builder.fSafeFwdTables->build();
+    builder.fSafeRevTables->build();
     if (U_FAILURE(status)) {
         return NULL;
     }
 
-
     //
     //   Package up the compiled data into a memory image
     //      in the run-time format.
     //
-    RBBIDataHeader   *data;
-    data = builder.flattenData();
+    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
 
 
     //
@@ -233,16 +267,14 @@
     //  Create a break iterator from the compiled rules.
     //     (Identical to creation from stored pre-compiled rules)
     //
+    // status is checked after init in construction.
     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
-    /* test for NULL */
-    if(This == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
-    }
-
     if (U_FAILURE(status)) {
         delete This;
         This = NULL;
+    } 
+    else if(This == NULL) { // test for NULL
+        status = U_MEMORY_ALLOCATION_ERROR;
     }
     return This;
 }

Index: rbbirb.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbirb.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbirb.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbirb.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,10 +1,11 @@
 //
 //  rbbirb.h
 //
-//  Copyright (C) 2002, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2003, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
-//  This file contains declarations for several from the Rule Based Break Iterator rule builder.
+//  This file contains declarations for several classes from the
+//    Rule Based Break Iterator rule builder.
 //
 
 
@@ -18,7 +19,7 @@
 #include "unicode/parseerr.h"
 #include "uhash.h"
 #include "uvector.h"
-#include "symtable.h"     // For UnicodeSet parsing, is the interface that
+#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                           //    looks up references to $variables within a set.
 
 
@@ -78,7 +79,14 @@
     virtual RBBINode *lookupNode(const UnicodeString &key) const;
     virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
 
-    virtual void      print() const;
+#ifdef RBBI_DEBUG
+    virtual void      rbbiSymtablePrint() const;
+#else
+    // A do-nothing macro for non-debug builds.  The macro can't expand to nothing
+    //  or the call sites won't compile.
+    int  fFakeField;
+    #define rbbiSymtablePrint() fFakeField=0; 
+#endif
 
 private:
     RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
@@ -121,12 +129,29 @@
     RBBIRuleScanner               *fScanner;         // The scanner.
     RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
     RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
+    RBBINode                      *fSafeFwdTree;
+    RBBINode                      *fSafeRevTree;
+
+    RBBINode                      **fDefaultTree;    // For rules not qualified with a !
+                                                     //   the tree to which they belong.
+
+    UBool                         fChainRules;       // True for chained Unicode TR style rules.
+                                                     // False for traditional regexp rules.
+
+    UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
+                                                     //   chars with LineBreak property == CM.
+
+    UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
+                                                     // immediate break, no continuing for the
+                                                     // longest match.
 
     RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
     UVector                       *fUSetNodes;       // Vector of all uset nodes.
 
     RBBITableBuilder              *fForwardTables;   // State transition tables
     RBBITableBuilder              *fReverseTables;
+    RBBITableBuilder              *fSafeFwdTables;
+    RBBITableBuilder              *fSafeRevTables;
 
     RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
                                                      // data tables..

Index: rbbirpt.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbirpt.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbirpt.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbirpt.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -6,7 +6,7 @@
 //    It is generated by the Perl script "rbbicst.pl" from
 //    the rule parser state definitions file "rbbirpt.txt".
 //
-//   Copyright (C) 2002 International Business Machines Corporation 
+//   Copyright (C) 2002-2003 International Business Machines Corporation 
 //   and others. All rights reserved.  
 //
 //---------------------------------------------------------------------------------
@@ -18,14 +18,15 @@
 // Character classes for RBBI rule scanning.
 //
     static const uint8_t kRuleSet_digit_char = 128;
-    static const uint8_t kRuleSet_rule_char = 129;
-    static const uint8_t kRuleSet_white_space = 130;
-    static const uint8_t kRuleSet_name_char = 131;
-    static const uint8_t kRuleSet_name_start_char = 132;
+    static const uint8_t kRuleSet_white_space = 129;
+    static const uint8_t kRuleSet_rule_char = 130;
+    static const uint8_t kRuleSet_name_start_char = 131;
+    static const uint8_t kRuleSet_name_char = 132;
 
 
 enum RBBI_RuleParseAction {
     doExprOrOperator,
+    doOptionEnd,
     doRuleErrorAssignExpr,
     doTagValue,
     doEndAssign,
@@ -51,6 +52,7 @@
     doEndOfRule,
     doUnaryOpPlus,
     doExprStart,
+    doOptionStart,
     doExprCatOperator,
     doReverseDir,
     doCheckVarDef,
@@ -73,92 +75,101 @@
 
 static const struct RBBIRuleTableEl gRuleParseStateTable[] = {
     {doNOP, 0, 0, 0, TRUE}
-    , {doExprStart, 254, 12, 8, FALSE}     //  1      start
-    , {doNOP, 130, 1,0,  TRUE}     //  2 
-    , {doExprStart, 36 /* $ */, 71, 81, FALSE}     //  3 
-    , {doReverseDir, 33 /* ! */, 11,0,  TRUE}     //  4 
+    , {doExprStart, 254, 21, 8, FALSE}     //  1      start
+    , {doNOP, 129, 1,0,  TRUE}     //  2 
+    , {doExprStart, 36 /* $ */, 80, 90, FALSE}     //  3 
+    , {doNOP, 33 /* ! */, 11,0,  TRUE}     //  4 
     , {doNOP, 59 /* ; */, 1,0,  TRUE}     //  5 
     , {doNOP, 252, 0,0,  FALSE}     //  6 
-    , {doExprStart, 255, 12, 8, FALSE}     //  7 
+    , {doExprStart, 255, 21, 8, FALSE}     //  7 
     , {doEndOfRule, 59 /* ; */, 1,0,  TRUE}     //  8      break-rule-end
-    , {doNOP, 130, 8,0,  TRUE}     //  9 
-    , {doRuleError, 255, 86,0,  FALSE}     //  10 
-    , {doExprStart, 255, 12, 8, FALSE}     //  11      reverse-rule
-    , {doRuleChar, 254, 21,0,  TRUE}     //  12      term
-    , {doNOP, 130, 12,0,  TRUE}     //  13 
-    , {doRuleChar, 129, 21,0,  TRUE}     //  14 
-    , {doNOP, 91 /* [ */, 77, 21, FALSE}     //  15 
-    , {doLParen, 40 /* ( */, 12, 21, TRUE}     //  16 
-    , {doNOP, 36 /* $ */, 71, 20, FALSE}     //  17 
-    , {doDotAny, 46 /* . */, 21,0,  TRUE}     //  18 
-    , {doRuleError, 255, 86,0,  FALSE}     //  19 
-    , {doCheckVarDef, 255, 21,0,  FALSE}     //  20      term-var-ref
-    , {doNOP, 130, 21,0,  TRUE}     //  21      expr-mod
-    , {doUnaryOpStar, 42 /* * */, 26,0,  TRUE}     //  22 
-    , {doUnaryOpPlus, 43 /* + */, 26,0,  TRUE}     //  23 
-    , {doUnaryOpQuestion, 63 /* ? */, 26,0,  TRUE}     //  24 
-    , {doNOP, 255, 26,0,  FALSE}     //  25 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  26      expr-cont
-    , {doNOP, 130, 26,0,  TRUE}     //  27 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  28 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  29 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  30 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  31 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  32 
-    , {doExprCatOperator, 47 /* / */, 38,0,  FALSE}     //  33 
-    , {doExprCatOperator, 123 /* { */, 50,0,  TRUE}     //  34 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  35 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  36 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  37 
-    , {doSlash, 47 /* / */, 40,0,  TRUE}     //  38      look-ahead
-    , {doNOP, 255, 86,0,  FALSE}     //  39 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  40      expr-cont-no-slash
-    , {doNOP, 130, 26,0,  TRUE}     //  41 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  42 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  43 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  44 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  45 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  46 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  47 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  48 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  49 
-    , {doNOP, 130, 50,0,  TRUE}     //  50      tag-open
-    , {doStartTagValue, 128, 53,0,  FALSE}     //  51 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  52 
-    , {doNOP, 130, 57,0,  TRUE}     //  53      tag-value
-    , {doNOP, 125 /* } */, 57,0,  FALSE}     //  54 
-    , {doTagDigit, 128, 53,0,  TRUE}     //  55 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  56 
-    , {doNOP, 130, 57,0,  TRUE}     //  57      tag-close
-    , {doTagValue, 125 /* } */, 60,0,  TRUE}     //  58 
-    , {doTagExpectedError, 255, 86,0,  FALSE}     //  59 
-    , {doExprCatOperator, 254, 12,0,  FALSE}     //  60      expr-cont-no-tag
-    , {doNOP, 130, 60,0,  TRUE}     //  61 
-    , {doExprCatOperator, 129, 12,0,  FALSE}     //  62 
-    , {doExprCatOperator, 91 /* [ */, 12,0,  FALSE}     //  63 
-    , {doExprCatOperator, 40 /* ( */, 12,0,  FALSE}     //  64 
-    , {doExprCatOperator, 36 /* $ */, 12,0,  FALSE}     //  65 
-    , {doExprCatOperator, 46 /* . */, 12,0,  FALSE}     //  66 
-    , {doExprCatOperator, 47 /* / */, 38,0,  FALSE}     //  67 
-    , {doExprOrOperator, 124 /* | */, 12,0,  TRUE}     //  68 
-    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  69 
-    , {doExprFinished, 255, 255,0,  FALSE}     //  70 
-    , {doStartVariableName, 36 /* $ */, 73,0,  TRUE}     //  71      scan-var-name
-    , {doNOP, 255, 86,0,  FALSE}     //  72 
-    , {doNOP, 132, 75,0,  TRUE}     //  73      scan-var-start
-    , {doVariableNameExpectedErr, 255, 86,0,  FALSE}     //  74 
-    , {doNOP, 131, 75,0,  TRUE}     //  75      scan-var-body
-    , {doEndVariableName, 255, 255,0,  FALSE}     //  76 
-    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  77      scan-unicode-set
-    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  78 
-    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  79 
-    , {doNOP, 255, 86,0,  FALSE}     //  80 
-    , {doNOP, 130, 81,0,  TRUE}     //  81      assign-or-rule
-    , {doStartAssign, 61 /* = */, 12, 84, TRUE}     //  82 
-    , {doNOP, 255, 20, 8, FALSE}     //  83 
-    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  84      assign-end
-    , {doRuleErrorAssignExpr, 255, 86,0,  FALSE}     //  85 
-    , {doExit, 255, 86,0,  TRUE}     //  86      errorDeath
+    , {doNOP, 129, 8,0,  TRUE}     //  9 
+    , {doRuleError, 255, 95,0,  FALSE}     //  10 
+    , {doNOP, 33 /* ! */, 13,0,  TRUE}     //  11      rev-option
+    , {doReverseDir, 255, 20, 8, FALSE}     //  12 
+    , {doOptionStart, 131, 15,0,  TRUE}     //  13      option-scan1
+    , {doRuleError, 255, 95,0,  FALSE}     //  14 
+    , {doNOP, 132, 15,0,  TRUE}     //  15      option-scan2
+    , {doOptionEnd, 255, 17,0,  FALSE}     //  16 
+    , {doNOP, 59 /* ; */, 1,0,  TRUE}     //  17      option-scan3
+    , {doNOP, 129, 17,0,  TRUE}     //  18 
+    , {doRuleError, 255, 95,0,  FALSE}     //  19 
+    , {doExprStart, 255, 21, 8, FALSE}     //  20      reverse-rule
+    , {doRuleChar, 254, 30,0,  TRUE}     //  21      term
+    , {doNOP, 129, 21,0,  TRUE}     //  22 
+    , {doRuleChar, 130, 30,0,  TRUE}     //  23 
+    , {doNOP, 91 /* [ */, 86, 30, FALSE}     //  24 
+    , {doLParen, 40 /* ( */, 21, 30, TRUE}     //  25 
+    , {doNOP, 36 /* $ */, 80, 29, FALSE}     //  26 
+    , {doDotAny, 46 /* . */, 30,0,  TRUE}     //  27 
+    , {doRuleError, 255, 95,0,  FALSE}     //  28 
+    , {doCheckVarDef, 255, 30,0,  FALSE}     //  29      term-var-ref
+    , {doNOP, 129, 30,0,  TRUE}     //  30      expr-mod
+    , {doUnaryOpStar, 42 /* * */, 35,0,  TRUE}     //  31 
+    , {doUnaryOpPlus, 43 /* + */, 35,0,  TRUE}     //  32 
+    , {doUnaryOpQuestion, 63 /* ? */, 35,0,  TRUE}     //  33 
+    , {doNOP, 255, 35,0,  FALSE}     //  34 
+    , {doExprCatOperator, 254, 21,0,  FALSE}     //  35      expr-cont
+    , {doNOP, 129, 35,0,  TRUE}     //  36 
+    , {doExprCatOperator, 130, 21,0,  FALSE}     //  37 
+    , {doExprCatOperator, 91 /* [ */, 21,0,  FALSE}     //  38 
+    , {doExprCatOperator, 40 /* ( */, 21,0,  FALSE}     //  39 
+    , {doExprCatOperator, 36 /* $ */, 21,0,  FALSE}     //  40 
+    , {doExprCatOperator, 46 /* . */, 21,0,  FALSE}     //  41 
+    , {doExprCatOperator, 47 /* / */, 47,0,  FALSE}     //  42 
+    , {doExprCatOperator, 123 /* { */, 59,0,  TRUE}     //  43 
+    , {doExprOrOperator, 124 /* | */, 21,0,  TRUE}     //  44 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  45 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  46 
+    , {doSlash, 47 /* / */, 49,0,  TRUE}     //  47      look-ahead
+    , {doNOP, 255, 95,0,  FALSE}     //  48 
+    , {doExprCatOperator, 254, 21,0,  FALSE}     //  49      expr-cont-no-slash
+    , {doNOP, 129, 35,0,  TRUE}     //  50 
+    , {doExprCatOperator, 130, 21,0,  FALSE}     //  51 
+    , {doExprCatOperator, 91 /* [ */, 21,0,  FALSE}     //  52 
+    , {doExprCatOperator, 40 /* ( */, 21,0,  FALSE}     //  53 
+    , {doExprCatOperator, 36 /* $ */, 21,0,  FALSE}     //  54 
+    , {doExprCatOperator, 46 /* . */, 21,0,  FALSE}     //  55 
+    , {doExprOrOperator, 124 /* | */, 21,0,  TRUE}     //  56 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  57 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  58 
+    , {doNOP, 129, 59,0,  TRUE}     //  59      tag-open
+    , {doStartTagValue, 128, 62,0,  FALSE}     //  60 
+    , {doTagExpectedError, 255, 95,0,  FALSE}     //  61 
+    , {doNOP, 129, 66,0,  TRUE}     //  62      tag-value
+    , {doNOP, 125 /* } */, 66,0,  FALSE}     //  63 
+    , {doTagDigit, 128, 62,0,  TRUE}     //  64 
+    , {doTagExpectedError, 255, 95,0,  FALSE}     //  65 
+    , {doNOP, 129, 66,0,  TRUE}     //  66      tag-close
+    , {doTagValue, 125 /* } */, 69,0,  TRUE}     //  67 
+    , {doTagExpectedError, 255, 95,0,  FALSE}     //  68 
+    , {doExprCatOperator, 254, 21,0,  FALSE}     //  69      expr-cont-no-tag
+    , {doNOP, 129, 69,0,  TRUE}     //  70 
+    , {doExprCatOperator, 130, 21,0,  FALSE}     //  71 
+    , {doExprCatOperator, 91 /* [ */, 21,0,  FALSE}     //  72 
+    , {doExprCatOperator, 40 /* ( */, 21,0,  FALSE}     //  73 
+    , {doExprCatOperator, 36 /* $ */, 21,0,  FALSE}     //  74 
+    , {doExprCatOperator, 46 /* . */, 21,0,  FALSE}     //  75 
+    , {doExprCatOperator, 47 /* / */, 47,0,  FALSE}     //  76 
+    , {doExprOrOperator, 124 /* | */, 21,0,  TRUE}     //  77 
+    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  78 
+    , {doExprFinished, 255, 255,0,  FALSE}     //  79 
+    , {doStartVariableName, 36 /* $ */, 82,0,  TRUE}     //  80      scan-var-name
+    , {doNOP, 255, 95,0,  FALSE}     //  81 
+    , {doNOP, 131, 84,0,  TRUE}     //  82      scan-var-start
+    , {doVariableNameExpectedErr, 255, 95,0,  FALSE}     //  83 
+    , {doNOP, 132, 84,0,  TRUE}     //  84      scan-var-body
+    , {doEndVariableName, 255, 255,0,  FALSE}     //  85 
+    , {doScanUnicodeSet, 91 /* [ */, 255,0,  TRUE}     //  86      scan-unicode-set
+    , {doScanUnicodeSet, 112 /* p */, 255,0,  TRUE}     //  87 
+    , {doScanUnicodeSet, 80 /* P */, 255,0,  TRUE}     //  88 
+    , {doNOP, 255, 95,0,  FALSE}     //  89 
+    , {doNOP, 129, 90,0,  TRUE}     //  90      assign-or-rule
+    , {doStartAssign, 61 /* = */, 21, 93, TRUE}     //  91 
+    , {doNOP, 255, 29, 8, FALSE}     //  92 
+    , {doEndAssign, 59 /* ; */, 1,0,  TRUE}     //  93      assign-end
+    , {doRuleErrorAssignExpr, 255, 95,0,  FALSE}     //  94 
+    , {doExit, 255, 95,0,  TRUE}     //  95      errorDeath
  };
 static const char * const RBBIRuleStateNames[] = {    0,
      "start",
@@ -169,6 +180,15 @@
     0,
     0,
      "break-rule-end",
+    0,
+    0,
+     "rev-option",
+    0,
+     "option-scan1",
+    0,
+     "option-scan2",
+    0,
+     "option-scan3",
     0,
     0,
      "reverse-rule",

Index: rbbirpt.txt
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbirpt.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbirpt.txt	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbirpt.txt	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,7 +1,7 @@
 
 #*****************************************************************************
 #
-#   Copyright (C) 2002, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
 #   All Rights Reserved.
 #
 #*****************************************************************************
@@ -58,7 +58,7 @@
     escaped                term                  ^break-rule-end    doExprStart                       
     white_space          n start                     
     '$'                    scan-var-name         ^assign-or-rule    doExprStart
-    '!'                  n reverse-rule                             doReverseDir
+    '!'                  n rev-option                             
     ';'                  n start                                                  # ignore empty rules.
     eof                    exit              
     default                term                  ^break-rule-end    doExprStart
@@ -73,9 +73,27 @@
      
 
 #
-#   Reverse Rule    We've just scanned a '!', indicating a reverse direction rule.
-#                   A rule expression must follow.
+#   !               We've just scanned a '!', indicating either a !!key word flag or a
+#                   !Reverse rule.
 #
+rev-option:
+    '!'                  n option-scan1   
+    default                reverse-rule           ^break-rule-end   doReverseDir
+    
+option-scan1:
+    name_start_char      n option-scan2                             doOptionStart
+    default                errorDeath                               doRuleError
+    
+option-scan2:
+    name_char            n option-scan2
+    default                option-scan3                             doOptionEnd
+    
+option-scan3:
+    ';'                  n start 
+    white_space          n option-scan3 
+    default                errorDeath                               doRuleError 
+    
+
 reverse-rule:
     default                term                   ^break-rule-end   doExprStart
     

Index: rbbiscan.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbiscan.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbiscan.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbiscan.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -32,6 +32,7 @@
 #include "rbbirb.h"
 #include "rbbinode.h"
 #include "rbbiscan.h"
+#include "rbbitblb.h"
 
 #include "uassert.h"
 
@@ -113,6 +114,8 @@
     fCharNum    = 0;
     fQuoteMode  = FALSE;
 
+    // Do not check status until after all critical fields are sufficiently initialized
+    //   that the destructor can run cleanly.
     if (U_FAILURE(*rb->fStatus)) {
         return;
     }
@@ -320,10 +323,11 @@
         // The ';' that terminates an expression really just functions as a '|' with
         //   a low operator prededence.
         //
-        // Forward and reverse rules are collected separately.  Or this rule into
-        //  the appropriate group of them.
+        // Each of the four sets of rules is collected separately.
+        //  (forward, reverse, safe_forward, safe_reverse)
+        //  OR this rule into the appropriate group of them.
         //
-        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : &fRB->fForwardTree);
+        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
 
         if (*destRules != NULL) {
             // This is not the first rule encounted.
@@ -460,6 +464,33 @@
         break;
 
 
+    case doOptionStart:
+        // Scanning a !!option.  Record the input index where the option keyword starts.
+        fOptionStart = fScanIndex;
+        break;
+
+    case doOptionEnd:
+        {
+            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
+            if (opt == "chain") {
+                fRB->fChainRules = TRUE;
+            } else if (opt == "LBCMNoChain") {
+                fRB->fLBCMNoChain = TRUE;
+            } else if (opt == "forward") {
+                fRB->fDefaultTree   = &fRB->fForwardTree;
+            } else if (opt == "reverse") {
+                fRB->fDefaultTree   = &fRB->fReverseTree;
+            } else if (opt == "safe_forward") {
+                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
+            } else if (opt == "safe_reverse") {
+                fRB->fDefaultTree   = &fRB->fSafeRevTree;
+            } else if (opt == "lookAheadHardBreak") {
+                fRB->fLookAheadHardBreak = TRUE;
+            } else {
+                error(U_BRK_UNRECOGNIZED_OPTION);
+            }
+        }
+        break;
 
     case doReverseDir:
         fReverseRule = TRUE;
@@ -853,7 +884,7 @@
 //---------------------------------------------------------------------------------
 //
 //  Parse RBBI rules.   The state machine for rules parsing is here.
-//                      The state tables are hand-written in the file TODO.txt,
+//                      The state tables are hand-written in the file rbbirpt.txt,
 //                      and converted to the form used here by a perl
 //                      script rbbicst.pl
 //
@@ -1002,13 +1033,17 @@
     // We now have a parse tree for the rule expressions
     // and a list of all UnicodeSets that are referenced.
     //
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->print();}
+    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
     {
         RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
-        fRB->fForwardTree->printTree();
+        fRB->fForwardTree->printTree(TRUE);
         RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
-        fRB->fReverseTree->printTree();
+        fRB->fReverseTree->printTree(TRUE);
+        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
+        fRB->fSafeFwdTree->printTree(TRUE);
+        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
+        fRB->fSafeRevTree->printTree(TRUE);
     }
 
 }
@@ -1022,7 +1057,7 @@
 void RBBIRuleScanner::printNodeStack(const char *title) {
     int i;
     RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
-    for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree();}
+    for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
 }
 
 
@@ -1078,8 +1113,8 @@
     pos.setIndex(fScanIndex);
     startPos = fScanIndex;
     UErrorCode localStatus = U_ZERO_ERROR;
-    uset = new UnicodeSet(fRB->fRules, pos,
-                         *fSymbolTable,
+    uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE,
+                         fSymbolTable,
                          localStatus);
     if (U_FAILURE(localStatus)) {
         //  TODO:  Get more accurate position of the error from UnicodeSet's return info.

Index: rbbiscan.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbiscan.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbiscan.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbiscan.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -18,7 +18,7 @@
 #include "unicode/parseerr.h"
 #include "uhash.h"
 #include "uvector.h"
-#include "symtable.h"     // For UnicodeSet parsing, is the interface that
+#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                           //    looks up references to $variables within a set.
 #include "rbbinode.h"
 //#include "rbbitblb.h"
@@ -144,6 +144,9 @@
                                                      //  See rbbirpt.h.
 
     int32_t                        fRuleNum;         // Counts each rule as it is scanned.
+
+    int32_t                        fOptionStart;     // Input index of start of a !!option
+                                                     //   keyword, while being scanned.
 
     UnicodeSet *gRuleSet_rule_char;
     UnicodeSet *gRuleSet_white_space;

Index: rbbisetb.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbisetb.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbisetb.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbisetb.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -135,10 +135,13 @@
     //  Initialize the process by creating a single range encompassing all characters
     //  that is in no sets.
     //
-    fRangeList                = new RangeDescriptor(*fStatus);
+    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
     fRangeList->fStartChar    = 0;
     fRangeList->fEndChar      = 0x10ffff;
 
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
 
     //
     //  Find the set of non-overlapping ranges of characters
@@ -176,6 +179,9 @@
             //     over
             if (rlRange->fStartChar < inputSetRangeBegin) {
                 rlRange->split(inputSetRangeBegin, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
                 continue;
             }
 
@@ -186,12 +192,18 @@
             //   wholly inside the Unicode set.
             if (rlRange->fEndChar > inputSetRangeEnd) {
                 rlRange->split(inputSetRangeEnd+1, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
             }
 
             // The current rlRange is now entirely within the UnicodeSet range.
             // Add this unicode set to the list of sets for this rlRange
             if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
             }
 
             // Advance over ranges that we are finished with.
@@ -237,6 +249,7 @@
                       NULL,    //  Data array  (utrie will allocate one)
                       100000,  //  Max Data Length
                       0,       //  Initial value for all code points
+                      0,       //  Lead surrogate unit value
                       TRUE);   //  Keep Latin 1 in separately
 
 
@@ -334,12 +347,32 @@
 
 //------------------------------------------------------------------------
 //
+//   getFirstChar      Given a runtime RBBI character category, find
+//                     the first UChar32 that is in the set of chars 
+//                     in the category.  Returns -1 if no range has it.
+//------------------------------------------------------------------------
+UChar32  RBBISetBuilder::getFirstChar(int32_t category) {
+    RangeDescriptor   *rlRange;
+    UChar32            retVal = (UChar32)-1;    // stays -1 when no range matches
+    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+        if (rlRange->fNum == category) {
+            retVal = rlRange->fStartChar;       // start of the first matching range in list order
+            break;
+        }
+    }
+    return retVal;
+}
+
+
+
+//------------------------------------------------------------------------
+//
 //   printRanges        A debugging function.
 //                      dump out all of the range definitions.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::printRanges() {
 #ifdef RBBI_DEBUG
+void RBBISetBuilder::printRanges() {
     RangeDescriptor       *rlRange;
     int                    i;
 
@@ -357,12 +390,12 @@
                     setName = varRef->fText;
                 }
             }
-            RBBINode::printUnicodeString(setName); RBBIDebugPrintf("  ");
+            RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
         }
         RBBIDebugPrintf("\n");
     }
-#endif
 }
+#endif
 
 
 //------------------------------------------------------------------------
@@ -371,6 +404,7 @@
 //                        dump out all of the range groups.
 //
 //------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
 void RBBISetBuilder::printRangeGroups() {
     RangeDescriptor       *rlRange;
     RangeDescriptor       *tRange;
@@ -396,7 +430,7 @@
                         setName = varRef->fText;
                     }
                 }
-                RBBINode::printUnicodeString(setName); RBBIDebugPrintf(" ");
+                RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
             }
 
             i = 0;
@@ -413,7 +447,7 @@
     }
     RBBIDebugPrintf("\n");
 }
-
+#endif
 
 
 //------------------------------------------------------------------------
@@ -422,8 +456,8 @@
 //                      dump out all of the set definitions.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::printSets() {
 #ifdef RBBI_DEBUG
+void RBBISetBuilder::printSets() {
     int                   i;
 
     RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
@@ -447,17 +481,17 @@
                 setName = varRef->fText;
             }
         }
-        RBBINode::printUnicodeString(setName);
+        RBBI_DEBUG_printUnicodeString(setName);
         RBBIDebugPrintf("   ");
-        RBBINode::printUnicodeString(usetNode->fText);
+        RBBI_DEBUG_printUnicodeString(usetNode->fText);
         RBBIDebugPrintf("\n");
         if (usetNode->fLeftChild != NULL) {
-            usetNode->fLeftChild->printTree();
+            usetNode->fLeftChild->printTree(TRUE);
         }
     }
     RBBIDebugPrintf("\n");
-#endif
 }
+#endif
 
 
 
@@ -474,7 +508,14 @@
     this->fEndChar      = other.fEndChar;
     this->fNum          = other.fNum;
     this->fNext         = NULL;
+    UErrorCode oldstatus = status;
     this->fIncludesSets = new UVector(status);
+    if (U_FAILURE(oldstatus)) {
+        status = oldstatus;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
     /* test for NULL */
     if (this->fIncludesSets == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
@@ -497,7 +538,14 @@
     this->fEndChar      = 0;
     this->fNum          = 0;
     this->fNext         = NULL;
+    UErrorCode oldstatus = status;
     this->fIncludesSets = new UVector(status);
+    if (U_FAILURE(oldstatus)) {
+        status = oldstatus;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
     /* test for NULL */
     if(this->fIncludesSets == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
@@ -525,6 +573,9 @@
 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
     U_ASSERT(where>fStartChar && where<=fEndChar);
     RangeDescriptor *nr = new RangeDescriptor(*this, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
     /* test for NULL */
     if(nr == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;

Index: rbbisetb.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbisetb.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbisetb.h	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbisetb.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -2,7 +2,7 @@
 //  rbbisetb.h
 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2001-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@@ -14,7 +14,6 @@
 #include "unicode/uobject.h"
 #include "rbbirb.h"
 #include "uvector.h"
-#include "uhash.h"
 
 struct  UNewTrie;
 
@@ -86,9 +85,16 @@
                                    //    columns in the DFA state table
     int32_t  getTrieSize();        // Size in bytes of the serialized Trie.
     void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
+    UChar32  getFirstChar(int32_t  val);
+#ifdef RBBI_DEBUG
     void     printSets();
     void     printRanges();
     void     printRangeGroups();
+#else
+    #define printSets()
+    #define printRanges()
+    #define printRangeGroups()
+#endif
 
 private:
     void           numberSets();

Index: rbbistbl.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbistbl.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbistbl.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbistbl.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -43,11 +43,12 @@
 {
     fHashTable       = NULL;
     fCachedSetLookup = NULL;
+    
+    fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &status);
+    // uhash_open checks status
     if (U_FAILURE(status)) {
         return;
     }
-
-    fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &status);
     uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter);
 }
 
@@ -223,7 +224,8 @@
 //
 //  RBBISymbolTable::print    Debugging function, dump out the symbol table contents.
 //
-void RBBISymbolTable::print() const {
+#ifdef RBBI_DEBUG
+void RBBISymbolTable::rbbiSymtablePrint() const {
     RBBIDebugPrintf("Variable Definitions\n"
            "Name               Node Val     String Val\n"
            "----------------------------------------------------------------------\n");
@@ -237,9 +239,9 @@
         }
         RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;
 
-        RBBINode::printUnicodeString(s->key, 15);
+        RBBI_DEBUG_printUnicodeString(s->key, 15);
         RBBIDebugPrintf("   %8p   ", (void *)s->val);
-        RBBINode::printUnicodeString(s->val->fLeftChild->fText);
+        RBBI_DEBUG_printUnicodeString(s->val->fLeftChild->fText);
         RBBIDebugPrintf("\n");
     }
 
@@ -251,12 +253,12 @@
             break;
         }
         RBBISymbolTableEntry  *s   = (RBBISymbolTableEntry *)e->value.pointer;
-        RBBINode::printUnicodeString(s->key);
-        s->val->fLeftChild->printTree();
+        RBBI_DEBUG_printUnicodeString(s->key);
+        s->val->fLeftChild->printTree(TRUE);
         RBBIDebugPrintf("\n");
     }
 }
-
+#endif
 
 
 

Index: rbbitblb.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbitblb.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbitblb.cpp	10 Sep 2003 02:42:02 -0000	1.1
+++ rbbitblb.cpp	6 Apr 2004 10:07:59 -0000	1.2
@@ -25,9 +25,20 @@
 
 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
  fTree(*rootNode) {
-    fRB             = rb;
-    fStatus         = fRB->fStatus;
-    fDStates        = new UVector(*fStatus);
+    fRB                 = rb;
+    fStatus             = fRB->fStatus;
+    UErrorCode status   = U_ZERO_ERROR;
+    fDStates            = new UVector(status);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+    if (U_FAILURE(status)) {
+        *fStatus = status;
+        return;
+    }
+    if (fDStates == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+    }
 }
 
 
@@ -109,6 +120,13 @@
     }
 
     //
+    //  For "chained" rules, modify the followPos sets
+    //
+    if (fRB->fChainRules) {
+        calcChainedFollowPos(fTree);
+    }
+
+    //
     // Build the DFA state transition tables.
     //
     buildStateTable();
@@ -301,6 +319,97 @@
 
 //-----------------------------------------------------------------------------
 //
+//   calcChainedFollowPos.    Modify the previously calculated followPos sets
+//                            to implement rule chaining.  NOT described by Aho
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
+
+    UVector         endMarkerNodes(*fStatus);
+    UVector         leafNodes(*fStatus);
+    int32_t         i;
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // get a list of all endmarker nodes.
+    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+
+    // get a list of all leaf nodes
+    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Get all nodes that can be the start of a match, which is FirstPosition(root)
+    UVector *matchStartNodes = tree->fFirstPosSet;
+
+
+    // Iterate over all leaf nodes,
+    //
+    int32_t  endNodeIx;
+    int32_t  startNodeIx;
+
+    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
+        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
+        RBBINode *endNode = NULL;
+
+        // Identify leaf nodes that correspond to overall rule match positions.
+        //   These include an endMarkerNode in their followPos sets.
+        for (i=0; i<endMarkerNodes.size(); i++) {
+            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
+                endNode = tNode;
+                break;
+            }
+        }
+        if (endNode == NULL) {
+            // node wasn't an end node.  Try again with the next.
+            continue;
+        }
+
+        // We've got a node that can end a match.
+
+        // Line Break Specific hack:  If this node's val corresponds to the $CM char class,
+        //                            don't chain from it.
+        // TODO:  Add rule syntax for this behavior, get specifics out of here and
+        //        into the rule file.
+        if (fRB->fLBCMNoChain) {
+            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
+            U_ASSERT(c != -1);
+            ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+            if (cLBProp == U_LB_COMBINING_MARK) {
+                continue;
+            }
+        }
+
+
+        // Now iterate over the nodes that can start a match, looking for ones
+        //   with the same char class as our ending node.
+        RBBINode *startNode;
+        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
+            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+            if (startNode->fType != RBBINode::leafChar) {
+                continue;
+            }
+
+            if (endNode->fVal == startNode->fVal) {
+                // The end val (character class) of one possible match is the
+                //   same as the start of another.
+
+                // Add all nodes from the followPos of the start node to the
+                //  followPos set of the end node, which will have the effect of
+                //  letting matches transition from a match state at endNode
+                //  to the second char of a match starting with startNode.
+                setAdd(endNode->fFollowPos, startNode->fFollowPos);
+            }
+        }
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+//
 //   buildStateTable()    Determine the set of runtime DFA states and the
 //                        transition tables for these states, by the algorithm
 //                        of fig. 3.44 in Aho.
@@ -309,19 +418,37 @@
 //
 //-----------------------------------------------------------------------------
 void RBBITableBuilder::buildStateTable() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     //
     // Add a dummy state 0 - the stop state.  Not from Aho.
     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
     RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
     failState->fPositions = new UVector(*fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     fDStates->addElement(failState, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
 
     // initially, the only unmarked state in Dstates is firstpos(root),
     //       where toot is the root of the syntax tree for (r)#;
     RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     initialState->fPositions = new UVector(*fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     setAdd(initialState->fPositions, fTree->fFirstPosSet);
     fDStates->addElement(initialState, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
 
     // while there is an unmarked state T in Dstates do begin
     for (;;) {
@@ -383,8 +510,14 @@
                 if (!UinDstates)
                 {
                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+                    if (U_FAILURE(*fStatus)) {
+                        return;
+                    }
                     newState->fPositions = U;
                     fDStates->addElement(newState, *fStatus);
+                    if (U_FAILURE(*fStatus)) {
+                        return;
+                    }
                     ux = fDStates->size()-1;
                 }
 
@@ -407,12 +540,22 @@
 //
 //-----------------------------------------------------------------------------
 void     RBBITableBuilder::flagAcceptingStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     UVector     endMarkerNodes(*fStatus);
     RBBINode    *endMarker;
     int32_t     i;
     int32_t     n;
 
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
 
     for (i=0; i<endMarkerNodes.size(); i++) {
         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
@@ -444,12 +587,18 @@
 //
 //-----------------------------------------------------------------------------
 void     RBBITableBuilder::flagLookAheadStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     UVector     lookAheadNodes(*fStatus);
     RBBINode    *lookAheadNode;
     int32_t     i;
     int32_t     n;
 
     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     for (i=0; i<lookAheadNodes.size(); i++) {
         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
 
@@ -471,12 +620,21 @@
 //
 //-----------------------------------------------------------------------------
 void     RBBITableBuilder::flagTaggedStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     UVector     tagNodes(*fStatus);
     RBBINode    *tagNode;
     int32_t     i;
     int32_t     n;
 
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
         tagNode = (RBBINode *)tagNodes.elementAt(i);
 
@@ -507,7 +665,7 @@
     int sourceSize       = source->size();
     int32_t  si, di;
 
-    for (si=0; si<sourceSize; si++) {
+    for (si=0; si<sourceSize && U_SUCCESS(*fStatus); si++) {
         void *elToAdd = source->elementAt(si);
         for (di=0; di<destOriginalSize; di++) {
             if (dest->elementAt(di) == elToAdd) {
@@ -515,7 +673,7 @@
             }
         }
         dest->addElement(elToAdd, *fStatus);
-    elementAlreadyInDest: ;
+        elementAlreadyInDest: ;
     }
 }
 
@@ -567,12 +725,12 @@
 //                 for each node in the tree.
 //
 //-----------------------------------------------------------------------------
-void RBBITableBuilder::printPosSets(RBBINode *n) {
 #ifdef RBBI_DEBUG
+void RBBITableBuilder::printPosSets(RBBINode *n) {
     if (n==NULL) {
         return;
     }
-    n->print();
+    n->printNode();
     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
 
     RBBIDebugPrintf("         firstpos:  ");
@@ -586,8 +744,8 @@
 
     printPosSets(n->fLeftChild);
     printPosSets(n->fRightChild);
-#endif
 }
+#endif
 
 
 
@@ -647,6 +805,11 @@
     table->fRowLen    = sizeof(RBBIStateTableRow) +
                             sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
     table->fNumStates = fDStates->size();
+    table->fFlags     = 0;
+    if (fRB->fLookAheadHardBreak) {
+        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
+    }
+    table->fReserved  = 0;
 
     for (state=0; state<table->fNumStates; state++) {
         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
@@ -669,16 +832,16 @@
 //   printSet    Debug function.   Print the contents of a UVector
 //
 //-----------------------------------------------------------------------------
-void RBBITableBuilder::printSet(UVector *s) {
 #ifdef RBBI_DEBUG
+void RBBITableBuilder::printSet(UVector *s) {
     int32_t  i;
     for (i=0; i<s->size(); i++) {
         void *v = s->elementAt(i);
         RBBIDebugPrintf("%10p", v);
     }
     RBBIDebugPrintf("\n");
-#endif
 }
+#endif
 
 
 //-----------------------------------------------------------------------------
@@ -686,8 +849,8 @@
 //   printStates    Debug Function.  Dump the fully constructed state transition table.
 //
 //-----------------------------------------------------------------------------
-void RBBITableBuilder::printStates() {
 #ifdef RBBI_DEBUG
+void RBBITableBuilder::printStates() {
     int     c;    // input "character"
     int     n;    // state number
 
@@ -709,8 +872,8 @@
         RBBIDebugPrintf("\n");
     }
     RBBIDebugPrintf("\n\n");
-#endif
 }
+#endif
 
 
 
@@ -730,10 +893,16 @@
     fTagVal    = 0;
     fPositions = NULL;
     fDtran     = NULL;
+    
+    UErrorCode status = U_ZERO_ERROR;
+    fDtran     = new UVector(lastInputSymbol+1, status);
     if (U_FAILURE(*fStatus)) {
         return;
     }
-    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
+    if (U_FAILURE(status)) {
+        *fStatus = status;
+        return;
+    }
     if (fDtran == NULL) {
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;

Index: rbbitblb.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/rbbitblb.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- rbbitblb.h	10 Sep 2003 02:42:03 -0000	1.1
+++ rbbitblb.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -4,7 +4,7 @@
 
 /*
 **********************************************************************
-*   Copyright (c) 2002, International Business Machines
+*   Copyright (c) 2002-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
@@ -44,11 +44,13 @@
                                         //     Sufficient memory must exist at
                                         //     the specified location.
 
+
 private:
     void     calcNullable(RBBINode *n);
     void     calcFirstPos(RBBINode *n);
     void     calcLastPos(RBBINode  *n);
     void     calcFollowPos(RBBINode *n);
+    void     calcChainedFollowPos(RBBINode *n);
     void     buildStateTable();
     void     flagAcceptingStates();
     void     flagLookAheadStates();
@@ -60,10 +62,15 @@
     void     setAdd(UVector *dest, UVector *source);
     UBool    setEquals(UVector *a, UVector *b);
 
+#ifdef RBBI_DEBUG
     void     printSet(UVector *s);
-    void     printPosSets(RBBINode *n = NULL);
+    void     printPosSets(RBBINode *n /* = NULL*/);
     void     printStates();
-
+#else
+    #define  printSet(s)
+    #define  printPosSets(n)
+    #define  printStates()
+#endif
 
 private:
     RBBIRuleBuilder  *fRB;
@@ -74,6 +81,7 @@
     UVector          *fDStates;            //  D states (Aho's terminology)
                                            //  Index is state number
                                            //  Contents are RBBIStateDescriptor pointers.
+
 
     RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
     RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class

Index: resbund.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/resbund.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- resbund.cpp	10 Sep 2003 02:42:03 -0000	1.3
+++ resbund.cpp	6 Apr 2004 10:07:59 -0000	1.4
@@ -167,7 +167,7 @@
  */
 //-----------------------------------------------------------------------------
 
-const char ResourceBundle::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ResourceBundle)
 
 ResourceBundle::ResourceBundle( const UnicodeString&    path,
                                 const Locale&           locale,
@@ -250,6 +250,11 @@
     }
 }
 
+ResourceBundle *
+ResourceBundle::clone() const {
+    return new ResourceBundle(*this);
+}
+
 void 
 ResourceBundle::constructForLocale(const UnicodeString& path,
                                    const Locale& locale,
@@ -390,6 +395,11 @@
     me->locName = new Locale(localeName);
   }
   return *locName;
+}
+
+const Locale ResourceBundle::getLocale(ULocDataLocaleType type, UErrorCode &status) const
+{
+  return ures_getLocaleByType(resource, type, &status);
 }
 
 //eof

Index: schriter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/schriter.cpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- schriter.cpp	10 Sep 2003 02:42:03 -0000	1.3
+++ schriter.cpp	6 Apr 2004 10:07:59 -0000	1.4
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation and   *
+* Copyright (C) 1998-2003, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 *
@@ -18,7 +18,7 @@
 
 U_NAMESPACE_BEGIN
 
-const char StringCharacterIterator::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringCharacterIterator)
 
 StringCharacterIterator::StringCharacterIterator()
   : UCharCharacterIterator(),

Index: sprpimpl.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/sprpimpl.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- sprpimpl.h	10 Sep 2003 02:42:03 -0000	1.1
+++ sprpimpl.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -18,84 +18,100 @@
 #define SPRPIMPL_H
 
 #include "unicode/utypes.h"
+
+#if !UCONFIG_NO_IDNA
+
 #include "unicode/ustring.h"
 #include "unicode/parseerr.h"
+#include "unicode/usprep.h"
+#include "unicode/udata.h"
+#include "utrie.h"
+#include "udataswp.h"
 
-#if !UCONFIG_NO_IDNA
+#define _SPREP_DATA_TYPE "spp"
+
+enum UStringPrepType{
+    USPREP_UNASSIGNED           = 0x0000 ,
+    USPREP_MAP                  = 0x0001 ,
+    USPREP_PROHIBITED           = 0x0002 , 
+    USPREP_DELETE               = 0x0003 ,
+    USPREP_TYPE_LIMIT           = 0x0004  
+};
+
+typedef enum UStringPrepType UStringPrepType;
+
+#ifdef USPREP_TYPE_NAMES_ARRAY
+static const char* usprepTypeNames[] ={
+    "UNASSIGNED" ,          
+    "MAP" , 
+    "PROHIBITED" ,        
+    "DELETE",
+    "TYPE_LIMIT" 
+};
+#endif
 
 enum{
-    UIDNA_NO_VALUE          = 0x0000 ,
-    UIDNA_UNASSIGNED        = 0x0001 , 
-    UIDNA_PROHIBITED        = 0x0002 , 
-    UIDNA_MAP_NFKC          = 0x0003 , 
-    UIDNA_LABEL_SEPARATOR   = 0x0004 
+    _SPREP_NORMALIZATION_ON = 0x0001,
+    _SPREP_CHECK_BIDI_ON    = 0x0002
 };
+
 enum{
-    _IDNA_LENGTH_IN_MAPPING_TABLE = 0x0003 /*11*/
+    _SPREP_TYPE_THRESHOLD       = 0xFFF0,
+    _SPREP_MAX_INDEX_VALUE      = 0x3FBF,   /*16139*/ 
+    _SPREP_MAX_INDEX_TOP_LENGTH = 0x0003
 };
+
 /* indexes[] value names */
 enum {
-    _IDNA_INDEX_TRIE_SIZE,             /* number of bytes in normalization trie */
-    _IDNA_INDEX_MAPPING_DATA_SIZE,     /* The array that contains the mapping   */
-    _IDNA_INDEX_TOP=3                  /* changing this requires a new formatVersion */
+    _SPREP_INDEX_TRIE_SIZE                  = 0, /* number of bytes in StringPrep trie */
+    _SPREP_INDEX_MAPPING_DATA_SIZE          = 1, /* The array that contains the mapping   */
+    _SPREP_NORM_CORRECTNS_LAST_UNI_VERSION  = 2, /* The index of Unicode version of last entry in NormalizationCorrections.txt */ 
+    _SPREP_ONE_UCHAR_MAPPING_INDEX_START    = 3, /* The starting index of 1 UChar mapping index in the mapping data array */
+    _SPREP_TWO_UCHARS_MAPPING_INDEX_START   = 4, /* The starting index of 2 UChars mapping index in the mapping data array */
+    _SPREP_THREE_UCHARS_MAPPING_INDEX_START = 5, /* The starting index of 3 UChars mapping index in the mapping data array */
+    _SPREP_FOUR_UCHARS_MAPPING_INDEX_START  = 6, /* The starting index of 4 UChars mapping index in the mapping data array */
+    _SPREP_OPTIONS                          = 7, /* Bit set of options to turn on in the profile */
+    _SPREP_INDEX_TOP=16                          /* changing this requires a new formatVersion */
 };
 
-enum {
-    _IDNA_MAPPING_DATA_SIZE = 2000,
-    _IDNA_MAP_TO_NOTHING = 0x7FF
+typedef struct UStringPrepKey UStringPrepKey;
+
+
+struct UStringPrepKey{
+    char* name;
+    char* path;
 };
 
-#if defined(XP_CPLUSPLUS)
-static inline 
-void uprv_syntaxError(const UChar* rules, 
+struct UStringPrepProfile{
+    int32_t indexes[_SPREP_INDEX_TOP];
+    UTrie sprepTrie;
+    const uint16_t* mappingData;
+    UDataMemory* sprepData;
+    UBool isDataLoaded;
+    int32_t refCount;
+    UBool doNFKC;
+    UBool checkBiDi;
+};
+
+/**
+ * Helper function for populating the UParseError struct
+ * @internal
+ */
+U_CAPI void U_EXPORT2
+uprv_syntaxError(const UChar* rules, 
                  int32_t pos,
                  int32_t rulesLen,
-                 UParseError* parseError)
-{
-    if(parseError == NULL){
-        return;
-    }
-    if(pos == rulesLen && rulesLen >0){
-        pos--;
-    }
-    parseError->offset = pos;
-    parseError->line = 0 ; // we are not using line numbers 
-    
-    // for pre-context
-    int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
-    int32_t stop  = pos;
-    
-    u_memcpy(parseError->preContext,rules+start,stop-start);
-    //null terminate the buffer
-    parseError->preContext[stop-start] = 0;
-    
-    //for post-context
-    start = pos;
-    if(start<rulesLen) {
-        U16_FWD_1(rules, start, rulesLen);
-    }
+                 UParseError* parseError);
 
-    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN)) : 
-                                                            rulesLen;
-    if(start < stop){
-        u_memcpy(parseError->postContext,rules+start,stop-start);
-        //null terminate the buffer
-        parseError->postContext[stop-start]= 0;
-    }
-    
-}
-#endif
 
-/* error codes for prototyping 
-#define U_IDNA_ERROR_START                      U_ERROR_LIMIT
-#define U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 1))
-#define U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR ((UErrorCode)(U_IDNA_ERROR_START + 2))
-#define U_IDNA_CHECK_BIDI_ERROR                 ((UErrorCode)(U_IDNA_ERROR_START + 3))
-#define U_IDNA_STD3_ASCII_RULES_ERROR           ((UErrorCode)(U_IDNA_ERROR_START + 4))
-#define U_IDNA_ACE_PREFIX_ERROR                 ((UErrorCode)(U_IDNA_ERROR_START + 5))
-#define U_IDNA_VERIFICATION_ERROR               ((UErrorCode)(U_IDNA_ERROR_START + 6))
-#define U_IDNA_LABEL_TOO_LONG_ERROR                  ((UErrorCode)(U_IDNA_ERROR_START + 8))   
-*/
+/**
+ * Swap StringPrep .spp profile data. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+usprep_swap(const UDataSwapper *ds,
+            const void *inData, int32_t length, void *outData,
+            UErrorCode *pErrorCode);
 
 #endif /* #if !UCONFIG_NO_IDNA */
 

Index: uassert.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uassert.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uassert.h	10 Sep 2003 02:42:03 -0000	1.1
+++ uassert.h	6 Apr 2004 10:07:59 -0000	1.2
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2002, International Business Machines
+*   Copyright (C) 2002-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -21,8 +21,12 @@
 #define U_ASSERT_H
 /* utypes.h is included to get the proper define for uint8_t */
 #include "unicode/utypes.h"
+#if U_RELEASE
+#define U_ASSERT(exp)
+#else
 #include <assert.h>
 #define U_ASSERT(exp) assert(exp)
+#endif
 #endif
 
 

Index: ubidi.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ubidi.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ubidi.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ubidi.c	6 Apr 2004 10:07:59 -0000	1.4
@@ -1,7 +1,7 @@
 /*  
 ******************************************************************************
 *
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -107,28 +107,6 @@
  * (L1) is not necessary in adjustWSLevels().
  */
 
-/* prototypes --------------------------------------------------------------- */
-
-static void
-getDirProps(UBiDi *pBiDi, const UChar *text);
-
-static UBiDiDirection
-resolveExplicitLevels(UBiDi *pBiDi);
-
-static UBiDiDirection
-checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode);
-
-static UBiDiDirection
-directionFromFlags(Flags flags);
-
-static void
-resolveImplicitLevels(UBiDi *pBiDi,
-                      int32_t start, int32_t limit,
-                      DirProp sor, DirProp eor);
-
-static void
-adjustWSLevels(UBiDi *pBiDi);
-
 /* to avoid some conditional statements, use tiny constant arrays */
 static const Flags flagLR[2]={ DIRPROP_FLAG(L), DIRPROP_FLAG(R) };
 static const Flags flagE[2]={ DIRPROP_FLAG(LRE), DIRPROP_FLAG(RLE) };
@@ -281,199 +259,6 @@
     }
 }
 
-/* ubidi_setPara ------------------------------------------------------------ */
-
-U_CAPI void U_EXPORT2
-ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length,
-              UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels,
-              UErrorCode *pErrorCode) {
-    UBiDiDirection direction;
-
-    /* check the argument values */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
-        return;
-    } else if(pBiDi==NULL || text==NULL ||
-              ((UBIDI_MAX_EXPLICIT_LEVEL<paraLevel) && !IS_DEFAULT_LEVEL(paraLevel)) ||
-              length<-1
-    ) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
-    if(length==-1) {
-        length=u_strlen(text);
-    }
-
-    /* initialize the UBiDi structure */
-    pBiDi->text=text;
-    pBiDi->length=length;
-    pBiDi->paraLevel=paraLevel;
-    pBiDi->direction=UBIDI_LTR;
-    pBiDi->trailingWSStart=length;  /* the levels[] will reflect the WS run */
-
-    pBiDi->dirProps=NULL;
-    pBiDi->levels=NULL;
-    pBiDi->runs=NULL;
-
-    if(length==0) {
-        /*
-         * For an empty paragraph, create a UBiDi object with the paraLevel and
-         * the flags and the direction set but without allocating zero-length arrays.
-         * There is nothing more to do.
-         */
-        if(IS_DEFAULT_LEVEL(paraLevel)) {
-            pBiDi->paraLevel&=1;
-        }
-        if(paraLevel&1) {
-            pBiDi->flags=DIRPROP_FLAG(R);
-            pBiDi->direction=UBIDI_RTL;
-        } else {
-            pBiDi->flags=DIRPROP_FLAG(L);
-            pBiDi->direction=UBIDI_LTR;
-        }
-
-        pBiDi->runCount=0;
-        return;
-    }
-
-    pBiDi->runCount=-1;
-
-    /*
-     * Get the directional properties,
-     * the flags bit-set, and
-     * determine the partagraph level if necessary.
-     */
-    if(getDirPropsMemory(pBiDi, length)) {
-        pBiDi->dirProps=pBiDi->dirPropsMemory;
-        getDirProps(pBiDi, text);
-    } else {
-        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-
-    /* are explicit levels specified? */
-    if(embeddingLevels==NULL) {
-        /* no: determine explicit levels according to the (Xn) rules */\
-        if(getLevelsMemory(pBiDi, length)) {
-            pBiDi->levels=pBiDi->levelsMemory;
-            direction=resolveExplicitLevels(pBiDi);
-        } else {
-            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-            return;
-        }
-    } else {
-        /* set BN for all explicit codes, check that all levels are paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
-        pBiDi->levels=embeddingLevels;
-        direction=checkExplicitLevels(pBiDi, pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
-            return;
-        }
-    }
-
-    /*
-     * The steps after (X9) in the UBiDi algorithm are performed only if
-     * the paragraph text has mixed directionality!
-     */
-    pBiDi->direction=direction;
-    switch(direction) {
-    case UBIDI_LTR:
-        /* make sure paraLevel is even */
-        pBiDi->paraLevel=(UBiDiLevel)((pBiDi->paraLevel+1)&~1);
-
-        /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
-        pBiDi->trailingWSStart=0;
-        break;
-    case UBIDI_RTL:
-        /* make sure paraLevel is odd */
-        pBiDi->paraLevel|=1;
-
-        /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
-        pBiDi->trailingWSStart=0;
-        break;
-    default:
-        /*
-         * If there are no external levels specified and there
-         * are no significant explicit level codes in the text,
-         * then we can treat the entire paragraph as one run.
-         * Otherwise, we need to perform the following rules on runs of
-         * the text with the same embedding levels. (X10)
-         * "Significant" explicit level codes are ones that actually
-         * affect non-BN characters.
-         * Examples for "insignificant" ones are empty embeddings
-         * LRE-PDF, LRE-RLE-PDF-PDF, etc.
-         */
-        if(embeddingLevels==NULL && !(pBiDi->flags&DIRPROP_FLAG_MULTI_RUNS)) {
-            resolveImplicitLevels(pBiDi, 0, length,
-                                    GET_LR_FROM_LEVEL(pBiDi->paraLevel),
-                                    GET_LR_FROM_LEVEL(pBiDi->paraLevel));
-        } else {
-            /* sor, eor: start and end types of same-level-run */
-            UBiDiLevel *levels=pBiDi->levels;
-            int32_t start, limit=0;
-            UBiDiLevel level, nextLevel;
-            DirProp sor, eor;
-
-            /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
-            level=pBiDi->paraLevel;
-            nextLevel=levels[0];
-            if(level<nextLevel) {
-                eor=GET_LR_FROM_LEVEL(nextLevel);
-            } else {
-                eor=GET_LR_FROM_LEVEL(level);
-            }
-
-            do {
-                /* determine start and limit of the run (end points just behind the run) */
-
-                /* the values for this run's start are the same as for the previous run's end */
-                sor=eor;
-                start=limit;
-                level=nextLevel;
-
-                /* search for the limit of this run */
-                while(++limit<length && levels[limit]==level) {}
-
-                /* get the correct level of the next run */
-                if(limit<length) {
-                    nextLevel=levels[limit];
-                } else {
-                    nextLevel=pBiDi->paraLevel;
-                }
-
-                /* determine eor from max(level, nextLevel); sor is last run's eor */
-                if((level&~UBIDI_LEVEL_OVERRIDE)<(nextLevel&~UBIDI_LEVEL_OVERRIDE)) {
-                    eor=GET_LR_FROM_LEVEL(nextLevel);
-                } else {
-                    eor=GET_LR_FROM_LEVEL(level);
-                }
-
-                /* if the run consists of overridden directional types, then there
-                   are no implicit types to be resolved */
-                if(!(level&UBIDI_LEVEL_OVERRIDE)) {
-                    resolveImplicitLevels(pBiDi, start, limit, sor, eor);
-                } else {
-                    /* remove the UBIDI_LEVEL_OVERRIDE flags */
-                    do {
-                        levels[start++]&=~UBIDI_LEVEL_OVERRIDE;
-                    } while(start<limit);
-                }
-            } while(limit<length);
-        }
-
-        /* reset the embedding levels for some non-graphic characters (L1), (X9) */
-        adjustWSLevels(pBiDi);
-
-        /* for "inverse BiDi", ubidi_getRuns() modifies the levels of numeric runs following RTL runs */
-        if(pBiDi->isInverse) {
-            if(!ubidi_getRuns(pBiDi)) {
-                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                return;
-            }
-        }
-        break;
-    }
-}
-
 /* perform (P2)..(P3) ------------------------------------------------------- */
 
 /*
@@ -544,6 +329,19 @@
 
 /* perform (X1)..(X9) ------------------------------------------------------- */
 
+/* determine if the text is mixed-directional or single-directional */
+static UBiDiDirection
+directionFromFlags(Flags flags) {
+    /* if the text contains AN and neutrals, then some neutrals may become RTL */
+    if(!(flags&MASK_RTL || ((flags&DIRPROP_FLAG(AN)) && (flags&MASK_POSSIBLE_N)))) {
+        return UBIDI_LTR;
+    } else if(!(flags&MASK_LTR)) {
+        return UBIDI_RTL;
+    } else {
+        return UBIDI_MIXED;
+    }
+}
+
 /*
  * Resolve the explicit levels as specified by explicit embedding codes.
  * Recalculate the flags to have them reflect the real properties
@@ -596,7 +394,6 @@
  *
  * This implementation assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd.
  */
-
 static UBiDiDirection
 resolveExplicitLevels(UBiDi *pBiDi) {
     const DirProp *dirProps=pBiDi->dirProps;
@@ -788,19 +585,6 @@
     return directionFromFlags(flags);
 }
 
-/* determine if the text is mixed-directional or single-directional */
-static UBiDiDirection
-directionFromFlags(Flags flags) {
-    /* if the text contains AN and neutrals, then some neutrals may become RTL */
-    if(!(flags&MASK_RTL || ((flags&DIRPROP_FLAG(AN)) && (flags&MASK_POSSIBLE_N)))) {
-        return UBIDI_LTR;
-    } else if(!(flags&MASK_LTR)) {
-        return UBIDI_RTL;
-    } else {
-        return UBIDI_MIXED;
-    }
-}
-
 /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */
 
 /*
@@ -1211,7 +995,198 @@
     }
 }
 
-/* -------------------------------------------------------------------------- */
+/* ubidi_setPara ------------------------------------------------------------ */
+
+U_CAPI void U_EXPORT2
+ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length,
+              UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels,
+              UErrorCode *pErrorCode) {
+    UBiDiDirection direction;
+
+    /* check the argument values */
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return;
+    } else if(pBiDi==NULL || text==NULL ||
+              ((UBIDI_MAX_EXPLICIT_LEVEL<paraLevel) && !IS_DEFAULT_LEVEL(paraLevel)) ||
+              length<-1
+    ) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if(length==-1) {
+        length=u_strlen(text);
+    }
+
+    /* initialize the UBiDi structure */
+    pBiDi->text=text;
+    pBiDi->length=length;
+    pBiDi->paraLevel=paraLevel;
+    pBiDi->direction=UBIDI_LTR;
+    pBiDi->trailingWSStart=length;  /* the levels[] will reflect the WS run */
+
+    pBiDi->dirProps=NULL;
+    pBiDi->levels=NULL;
+    pBiDi->runs=NULL;
+
+    if(length==0) {
+        /*
+         * For an empty paragraph, create a UBiDi object with the paraLevel and
+         * the flags and the direction set but without allocating zero-length arrays.
+         * There is nothing more to do.
+         */
+        if(IS_DEFAULT_LEVEL(paraLevel)) {
+            pBiDi->paraLevel&=1;
+        }
+        if(paraLevel&1) {
+            pBiDi->flags=DIRPROP_FLAG(R);
+            pBiDi->direction=UBIDI_RTL;
+        } else {
+            pBiDi->flags=DIRPROP_FLAG(L);
+            pBiDi->direction=UBIDI_LTR;
+        }
+
+        pBiDi->runCount=0;
+        return;
+    }
+
+    pBiDi->runCount=-1;
+
+    /*
+     * Get the directional properties,
+     * the flags bit-set, and
+     * determine the paragraph level if necessary.
+     */
+    if(getDirPropsMemory(pBiDi, length)) {
+        pBiDi->dirProps=pBiDi->dirPropsMemory;
+        getDirProps(pBiDi, text);
+    } else {
+        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    /* are explicit levels specified? */
+    if(embeddingLevels==NULL) {
+        /* no: determine explicit levels according to the (Xn) rules */
+        if(getLevelsMemory(pBiDi, length)) {
+            pBiDi->levels=pBiDi->levelsMemory;
+            direction=resolveExplicitLevels(pBiDi);
+        } else {
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+    } else {
+        /* set BN for all explicit codes, check that all levels are paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
+        pBiDi->levels=embeddingLevels;
+        direction=checkExplicitLevels(pBiDi, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            return;
+        }
+    }
+
+    /*
+     * The steps after (X9) in the UBiDi algorithm are performed only if
+     * the paragraph text has mixed directionality!
+     */
+    pBiDi->direction=direction;
+    switch(direction) {
+    case UBIDI_LTR:
+        /* make sure paraLevel is even */
+        pBiDi->paraLevel=(UBiDiLevel)((pBiDi->paraLevel+1)&~1);
+
+        /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
+        pBiDi->trailingWSStart=0;
+        break;
+    case UBIDI_RTL:
+        /* make sure paraLevel is odd */
+        pBiDi->paraLevel|=1;
+
+        /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
+        pBiDi->trailingWSStart=0;
+        break;
+    default:
+        /*
+         * If there are no external levels specified and there
+         * are no significant explicit level codes in the text,
+         * then we can treat the entire paragraph as one run.
+         * Otherwise, we need to perform the following rules on runs of
+         * the text with the same embedding levels. (X10)
+         * "Significant" explicit level codes are ones that actually
+         * affect non-BN characters.
+         * Examples for "insignificant" ones are empty embeddings
+         * LRE-PDF, LRE-RLE-PDF-PDF, etc.
+         */
+        if(embeddingLevels==NULL && !(pBiDi->flags&DIRPROP_FLAG_MULTI_RUNS)) {
+            resolveImplicitLevels(pBiDi, 0, length,
+                                    GET_LR_FROM_LEVEL(pBiDi->paraLevel),
+                                    GET_LR_FROM_LEVEL(pBiDi->paraLevel));
+        } else {
+            /* sor, eor: start and end types of same-level-run */
+            UBiDiLevel *levels=pBiDi->levels;
+            int32_t start, limit=0;
+            UBiDiLevel level, nextLevel;
+            DirProp sor, eor;
+
+            /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
+            level=pBiDi->paraLevel;
+            nextLevel=levels[0];
+            if(level<nextLevel) {
+                eor=GET_LR_FROM_LEVEL(nextLevel);
+            } else {
+                eor=GET_LR_FROM_LEVEL(level);
+            }
+
+            do {
+                /* determine start and limit of the run (end points just behind the run) */
+
+                /* the values for this run's start are the same as for the previous run's end */
+                sor=eor;
+                start=limit;
+                level=nextLevel;
+
+                /* search for the limit of this run */
+                while(++limit<length && levels[limit]==level) {}
+
+                /* get the correct level of the next run */
+                if(limit<length) {
+                    nextLevel=levels[limit];
+                } else {
+                    nextLevel=pBiDi->paraLevel;
+                }
+
+                /* determine eor from max(level, nextLevel); sor is last run's eor */
+                if((level&~UBIDI_LEVEL_OVERRIDE)<(nextLevel&~UBIDI_LEVEL_OVERRIDE)) {
+                    eor=GET_LR_FROM_LEVEL(nextLevel);
+                } else {
+                    eor=GET_LR_FROM_LEVEL(level);
+                }
+
+                /* if the run consists of overridden directional types, then there
+                   are no implicit types to be resolved */
+                if(!(level&UBIDI_LEVEL_OVERRIDE)) {
+                    resolveImplicitLevels(pBiDi, start, limit, sor, eor);
+                } else {
+                    /* remove the UBIDI_LEVEL_OVERRIDE flags */
+                    do {
+                        levels[start++]&=~UBIDI_LEVEL_OVERRIDE;
+                    } while(start<limit);
+                }
+            } while(limit<length);
+        }
+
+        /* reset the embedding levels for some non-graphic characters (L1), (X9) */
+        adjustWSLevels(pBiDi);
+
+        /* for "inverse BiDi", ubidi_getRuns() modifies the levels of numeric runs following RTL runs */
+        if(pBiDi->isInverse) {
+            if(!ubidi_getRuns(pBiDi)) {
+                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+        }
+        break;
+    }
+}
 
 U_CAPI UBiDiDirection U_EXPORT2
 ubidi_getDirection(const UBiDi *pBiDi) {

Index: ubidiln.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ubidiln.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ubidiln.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ubidiln.c	6 Apr 2004 10:07:59 -0000	1.4
@@ -1,7 +1,7 @@
 /*  
 ******************************************************************************
 *
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -76,21 +76,38 @@
  * change the now shared levels for (L1).
  */
 
-/* prototypes --------------------------------------------------------------- */
+/* handle trailing WS (L1) -------------------------------------------------- */
 
+/*
+ * setTrailingWSStart() sets the start index for a trailing
+ * run of WS in the line. This is necessary because we do not modify
+ * the paragraph's levels array that we just point into.
+ * Using trailingWSStart is another form of performing (L1).
+ *
+ * To make subsequent operations easier, we also include the run
+ * before the WS if it is at the paraLevel - we merge the two here.
+ */
 static void
-setTrailingWSStart(UBiDi *pBiDi);
+setTrailingWSStart(UBiDi *pBiDi) {
+    /* pBiDi->direction!=UBIDI_MIXED */
 
-static void
-getSingleRun(UBiDi *pBiDi, UBiDiLevel level);
+    const DirProp *dirProps=pBiDi->dirProps;
+    UBiDiLevel *levels=pBiDi->levels;
+    int32_t start=pBiDi->length;
+    UBiDiLevel paraLevel=pBiDi->paraLevel;
 
-static void
-reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel);
+    /* go backwards across all WS, BN, explicit codes */
+    while(start>0 && DIRPROP_FLAG(dirProps[start-1])&MASK_WS) {
+        --start;
+    }
 
-static UBool
-prepareReorder(const UBiDiLevel *levels, int32_t length,
-               int32_t *indexMap,
-               UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel);
+    /* if the WS run can be merged with the previous run then do so here */
+    while(start>0 && levels[start-1]==paraLevel) {
+        --start;
+    }
+
+    pBiDi->trailingWSStart=start;
+}
 
 /* ubidi_setLine ------------------------------------------------------------ */
 
@@ -295,39 +312,6 @@
     }
 }
 
-/* handle trailing WS (L1) -------------------------------------------------- */
-
-/*
- * setTrailingWSStart() sets the start index for a trailing
- * run of WS in the line. This is necessary because we do not modify
- * the paragraph's levels array that we just point into.
- * Using trailingWSStart is another form of performing (L1).
- *
- * To make subsequent operations easier, we also include the run
- * before the WS if it is at the paraLevel - we merge the two here.
- */
-static void
-setTrailingWSStart(UBiDi *pBiDi) {
-    /* pBiDi->direction!=UBIDI_MIXED */
-
-    const DirProp *dirProps=pBiDi->dirProps;
-    UBiDiLevel *levels=pBiDi->levels;
-    int32_t start=pBiDi->length;
-    UBiDiLevel paraLevel=pBiDi->paraLevel;
-
-    /* go backwards across all WS, BN, explicit codes */
-    while(start>0 && DIRPROP_FLAG(dirProps[start-1])&MASK_WS) {
-        --start;
-    }
-
-    /* if the WS run can be merged with the previous run then do so here */
-    while(start>0 && levels[start-1]==paraLevel) {
-        --start;
-    }
-
-    pBiDi->trailingWSStart=start;
-}
-
 /* runs API functions ------------------------------------------------------- */
 
 U_CAPI int32_t U_EXPORT2
@@ -367,6 +351,144 @@
     }
 }
 
+/* in trivial cases there is only one trivial run; called by ubidi_getRuns() */
+static void
+getSingleRun(UBiDi *pBiDi, UBiDiLevel level) {
+    /* simple, single-run case */
+    pBiDi->runs=pBiDi->simpleRuns;
+    pBiDi->runCount=1;
+
+    /* fill and reorder the single run */
+    pBiDi->runs[0].logicalStart=MAKE_INDEX_ODD_PAIR(0, level);
+    pBiDi->runs[0].visualLimit=pBiDi->length;
+}
+
+/* reorder the runs array (L2) ---------------------------------------------- */
+
+/*
+ * Reorder the same-level runs in the runs array.
+ * Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
+ * All the visualStart fields=logical start before reordering.
+ * The "odd" bits are not set yet.
+ *
+ * Reordering with this data structure lends itself to some handy shortcuts:
+ *
+ * Since each run is moved but not modified, and since at the initial maxLevel
+ * each sequence of same-level runs consists of only one run each, we
+ * don't need to do anything there and can predecrement maxLevel.
+ * In many simple cases, the reordering is thus done entirely in the
+ * index mapping.
+ * Also, reordering occurs only down to the lowest odd level that occurs,
+ * which is minLevel|1. However, if the lowest level itself is odd, then
+ * in the last reordering the sequence of the runs at this level or higher
+ * will be all runs, and we don't need the elaborate loop to search for them.
+ * This is covered by ++minLevel instead of minLevel|=1 followed
+ * by an extra reorder-all after the reorder-some loop.
+ * About a trailing WS run:
+ * Such a run would need special treatment because its level is not
+ * reflected in levels[] if this is not a paragraph object.
+ * Instead, all characters from trailingWSStart on are implicitly at
+ * paraLevel.
+ * However, for all maxLevel>paraLevel, this run will never be reordered
+ * and does not need to be taken into account. maxLevel==paraLevel is only reordered
+ * if minLevel==paraLevel is odd, which is done in the extra segment.
+ * This means that for the main reordering loop we don't need to consider
+ * this run and can --runCount. If it is later part of the all-runs
+ * reordering, then runCount is adjusted accordingly.
+ */
+static void
+reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel) {
+    Run *runs;
+    UBiDiLevel *levels;
+    int32_t firstRun, endRun, limitRun, runCount,
+    temp;
+
+    /* nothing to do? */
+    if(maxLevel<=(minLevel|1)) {
+        return;
+    }
+
+    /*
+     * Reorder only down to the lowest odd level
+     * and reorder at an odd minLevel in a separate, simpler loop.
+     * See comments above for why minLevel is always incremented.
+     */
+    ++minLevel;
+
+    runs=pBiDi->runs;
+    levels=pBiDi->levels;
+    runCount=pBiDi->runCount;
+
+    /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */
+    if(pBiDi->trailingWSStart<pBiDi->length) {
+        --runCount;
+    }
+
+    while(--maxLevel>=minLevel) {
+        firstRun=0;
+
+        /* loop for all sequences of runs */
+        for(;;) {
+            /* look for a sequence of runs that are all at >=maxLevel */
+            /* look for the first run of such a sequence */
+            while(firstRun<runCount && levels[runs[firstRun].logicalStart]<maxLevel) {
+                ++firstRun;
+            }
+            if(firstRun>=runCount) {
+                break;  /* no more such runs */
+            }
+
+            /* look for the limit run of such a sequence (the run behind it) */
+            for(limitRun=firstRun; ++limitRun<runCount && levels[runs[limitRun].logicalStart]>=maxLevel;) {}
+
+            /* Swap the entire sequence of runs from firstRun to limitRun-1. */
+            endRun=limitRun-1;
+            while(firstRun<endRun) {
+                temp=runs[firstRun].logicalStart;
+                runs[firstRun].logicalStart=runs[endRun].logicalStart;
+                runs[endRun].logicalStart=temp;
+
+                temp=runs[firstRun].visualLimit;
+                runs[firstRun].visualLimit=runs[endRun].visualLimit;
+                runs[endRun].visualLimit=temp;
+
+                ++firstRun;
+                --endRun;
+            }
+
+            if(limitRun==runCount) {
+                break;  /* no more such runs */
+            } else {
+                firstRun=limitRun+1;
+            }
+        }
+    }
+
+    /* now do maxLevel==old minLevel (==odd!), see above */
+    if(!(minLevel&1)) {
+        firstRun=0;
+
+        /* include the trailing WS run in this complete reordering */
+        if(pBiDi->trailingWSStart==pBiDi->length) {
+            --runCount;
+        }
+
+        /* Swap the entire sequence of all runs. (endRun==runCount) */
+        while(firstRun<runCount) {
+            temp=runs[firstRun].logicalStart;
+            runs[firstRun].logicalStart=runs[runCount].logicalStart;
+            runs[runCount].logicalStart=temp;
+
+            temp=runs[firstRun].visualLimit;
+            runs[firstRun].visualLimit=runs[runCount].visualLimit;
+            runs[runCount].visualLimit=temp;
+
+            ++firstRun;
+            --runCount;
+        }
+    }
+}
+
 /* compute the runs array --------------------------------------------------- */
 
 /*
@@ -485,15 +607,19 @@
                 /* now add the direction flags and adjust the visualLimit's to be just that */
                 ADD_ODD_BIT_FROM_LEVEL(runs[0].logicalStart, levels[runs[0].logicalStart]);
                 limit=runs[0].visualLimit;
-                for(i=1; i<runIndex; ++i) {
+
+				/* this loop will also handle the trailing WS run */
+                for(i=1; i<runCount; ++i) {
                     ADD_ODD_BIT_FROM_LEVEL(runs[i].logicalStart, levels[runs[i].logicalStart]);
                     limit=runs[i].visualLimit+=limit;
                 }
 
-                /* same for the trailing WS run */
+                /* Set the "odd" bit for the trailing WS run. */
+				/* For a RTL paragraph, it will be the *first* run in visual order. */
                 if(runIndex<runCount) {
-                    ADD_ODD_BIT_FROM_LEVEL(runs[i].logicalStart, pBiDi->paraLevel);
-                    runs[runIndex].visualLimit+=limit;
+					int32_t trailingRun = ((pBiDi->paraLevel & 1) != 0)? 0 : runIndex;
+
+                    ADD_ODD_BIT_FROM_LEVEL(runs[trailingRun].logicalStart, pBiDi->paraLevel);
                 }
             }
         }
@@ -501,142 +627,42 @@
     return TRUE;
 }
 
-/* in trivial cases there is only one trivial run; called by ubidi_getRuns() */
-static void
-getSingleRun(UBiDi *pBiDi, UBiDiLevel level) {
-    /* simple, single-run case */
-    pBiDi->runs=pBiDi->simpleRuns;
-    pBiDi->runCount=1;
-
-    /* fill and reorder the single run */
-    pBiDi->runs[0].logicalStart=MAKE_INDEX_ODD_PAIR(0, level);
-    pBiDi->runs[0].visualLimit=pBiDi->length;
-}
-
-/* reorder the runs array (L2) ---------------------------------------------- */
-
-/*
- * Reorder the same-level runs in the runs array.
- * Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
- * All the visualStart fields=logical start before reordering.
- * The "odd" bits are not set yet.
- *
- * Reordering with this data structure lends itself to some handy shortcuts:
- *
- * Since each run is moved but not modified, and since at the initial maxLevel
- * each sequence of same-level runs consists of only one run each, we
- * don't need to do anything there and can predecrement maxLevel.
- * In many simple cases, the reordering is thus done entirely in the
- * index mapping.
- * Also, reordering occurs only down to the lowest odd level that occurs,
- * which is minLevel|1. However, if the lowest level itself is odd, then
- * in the last reordering the sequence of the runs at this level or higher
- * will be all runs, and we don't need the elaborate loop to search for them.
- * This is covered by ++minLevel instead of minLevel|=1 followed
- * by an extra reorder-all after the reorder-some loop.
- * About a trailing WS run:
- * Such a run would need special treatment because its level is not
- * reflected in levels[] if this is not a paragraph object.
- * Instead, all characters from trailingWSStart on are implicitly at
- * paraLevel.
- * However, for all maxLevel>paraLevel, this run will never be reordered
- * and does not need to be taken into account. maxLevel==paraLevel is only reordered
- * if minLevel==paraLevel is odd, which is done in the extra segment.
- * This means that for the main reordering loop we don't need to consider
- * this run and can --runCount. If it is later part of the all-runs
- * reordering, then runCount is adjusted accordingly.
- */
-static void
-reorderLine(UBiDi *pBiDi, UBiDiLevel minLevel, UBiDiLevel maxLevel) {
-    Run *runs;
-    UBiDiLevel *levels;
-    int32_t firstRun, endRun, limitRun, runCount,
-    temp;
-
-    /* nothing to do? */
-    if(maxLevel<=(minLevel|1)) {
-        return;
-    }
-
-    /*
-     * Reorder only down to the lowest odd level
-     * and reorder at an odd minLevel in a separate, simpler loop.
-     * See comments above for why minLevel is always incremented.
-     */
-    ++minLevel;
-
-    runs=pBiDi->runs;
-    levels=pBiDi->levels;
-    runCount=pBiDi->runCount;
+static UBool
+prepareReorder(const UBiDiLevel *levels, int32_t length,
+               int32_t *indexMap,
+               UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel) {
+    int32_t start;
+    UBiDiLevel level, minLevel, maxLevel;
 
-    /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */
-    if(pBiDi->trailingWSStart<pBiDi->length) {
-        --runCount;
+    if(levels==NULL || length<=0) {
+        return FALSE;
     }
 
-    while(--maxLevel>=minLevel) {
-        firstRun=0;
-
-        /* loop for all sequences of runs */
-        for(;;) {
-            /* look for a sequence of runs that are all at >=maxLevel */
-            /* look for the first run of such a sequence */
-            while(firstRun<runCount && levels[runs[firstRun].logicalStart]<maxLevel) {
-                ++firstRun;
-            }
-            if(firstRun>=runCount) {
-                break;  /* no more such runs */
-            }
-
-            /* look for the limit run of such a sequence (the run behind it) */
-            for(limitRun=firstRun; ++limitRun<runCount && levels[runs[limitRun].logicalStart]>=maxLevel;) {}
-
-            /* Swap the entire sequence of runs from firstRun to limitRun-1. */
-            endRun=limitRun-1;
-            while(firstRun<endRun) {
-                temp=runs[firstRun].logicalStart;
-                runs[firstRun].logicalStart=runs[endRun].logicalStart;
-                runs[endRun].logicalStart=temp;
-
-                temp=runs[firstRun].visualLimit;
-                runs[firstRun].visualLimit=runs[endRun].visualLimit;
-                runs[endRun].visualLimit=temp;
-
-                ++firstRun;
-                --endRun;
-            }
-
-            if(limitRun==runCount) {
-                break;  /* no more such runs */
-            } else {
-                firstRun=limitRun+1;
-            }
+    /* determine minLevel and maxLevel */
+    minLevel=UBIDI_MAX_EXPLICIT_LEVEL+1;
+    maxLevel=0;
+    for(start=length; start>0;) {
+        level=levels[--start];
+        if(level>UBIDI_MAX_EXPLICIT_LEVEL+1) {
+            return FALSE;
         }
-    }
-
-    /* now do maxLevel==old minLevel (==odd!), see above */
-    if(!(minLevel&1)) {
-        firstRun=0;
-
-        /* include the trailing WS run in this complete reordering */
-        if(pBiDi->trailingWSStart==pBiDi->length) {
-            --runCount;
+        if(level<minLevel) {
+            minLevel=level;
         }
-
-        /* Swap the entire sequence of all runs. (endRun==runCount) */
-        while(firstRun<runCount) {
-            temp=runs[firstRun].logicalStart;
-            runs[firstRun].logicalStart=runs[runCount].logicalStart;
-            runs[runCount].logicalStart=temp;
-
-            temp=runs[firstRun].visualLimit;
-            runs[firstRun].visualLimit=runs[runCount].visualLimit;
-            runs[runCount].visualLimit=temp;
-
-            ++firstRun;
-            --runCount;
+        if(level>maxLevel) {
+            maxLevel=level;
         }
     }
+    *pMinLevel=minLevel;
+    *pMaxLevel=maxLevel;
+
+    /* initialize the index map */
+    for(start=length; start>0;) {
+        --start;
+        indexMap[start]=start;
+    }
+
+    return TRUE;
 }
 
 /* reorder a line based on a levels array (L2) ------------------------------ */
@@ -762,44 +788,6 @@
             }
         }
     } while(--maxLevel>=minLevel);
-}
-
-static UBool
-prepareReorder(const UBiDiLevel *levels, int32_t length,
-               int32_t *indexMap,
-               UBiDiLevel *pMinLevel, UBiDiLevel *pMaxLevel) {
-    int32_t start;
-    UBiDiLevel level, minLevel, maxLevel;
-
-    if(levels==NULL || length<=0) {
-        return FALSE;
-    }
-
-    /* determine minLevel and maxLevel */
-    minLevel=UBIDI_MAX_EXPLICIT_LEVEL+1;
-    maxLevel=0;
-    for(start=length; start>0;) {
-        level=levels[--start];
-        if(level>UBIDI_MAX_EXPLICIT_LEVEL+1) {
-            return FALSE;
-        }
-        if(level<minLevel) {
-            minLevel=level;
-        }
-        if(level>maxLevel) {
-            maxLevel=level;
-        }
-    }
-    *pMinLevel=minLevel;
-    *pMaxLevel=maxLevel;
-
-    /* initialize the index map */
-    for(start=length; start>0;) {
-        --start;
-        indexMap[start]=start;
-    }
-
-    return TRUE;
 }
 
 /* API functions for logical<->visual mapping ------------------------------- */

Index: ubrk.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ubrk.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ubrk.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ ubrk.cpp	6 Apr 2004 10:08:00 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 *****************************************************************************************
-*   Copyright (C) 1996-2001, International Business Machines
+*   Copyright (C) 1996-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *****************************************************************************************
 */
@@ -264,5 +264,20 @@
 {
     return ((RuleBasedBreakIterator *)bi)->getRuleStatus();
 }
+
+U_CAPI const char* U_EXPORT2
+ubrk_getLocaleByType(const UBreakIterator *bi, 
+                     ULocDataLocaleType type, 
+                     UErrorCode* status)
+{
+    if (bi == NULL) {
+        if (U_SUCCESS(*status)) {
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+        return NULL;
+    }
+    return ((BreakIterator*)bi)->getLocaleID(type, *status);
+}
+
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

Index: uchar.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uchar.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- uchar.c	10 Sep 2003 02:42:03 -0000	1.5
+++ uchar.c	6 Apr 2004 10:08:00 -0000	1.6
@@ -30,7 +30,9 @@
 #include "ucln_cmn.h"
 #include "utrie.h"
 #include "ustr_imp.h"
+#include "udataswp.h"
 #include "uprops.h"
+#include "uassert.h"
 
 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
 
@@ -54,7 +56,10 @@
 static const UChar *ucharsTable=NULL;
 static int32_t countPropsVectors=0, propsVectorsColumns=0;
 
-static int8_t havePropsData=0;
+static int8_t havePropsData=0;     /*  == 0   ->  Data has not been loaded.
+                                    *   < 0   ->  Error occurred attempting to load data.
+                                    *   > 0   ->  Data has been successfully loaded.
+                                    */
 
 /* index values loaded from uprops.dat */
 static int32_t indexes[UPROPS_INDEX_COUNT];
@@ -107,13 +112,14 @@
     propsVectors=NULL;
     countPropsVectors=0;
     dataErrorCode=U_ZERO_ERROR;
-    havePropsData=FALSE;
+    havePropsData=0;
 
     return TRUE;
 }
 
-static int8_t
-loadPropsData(void) {
+
+U_CFUNC int8_t
+uprv_loadPropsData(UErrorCode *errorCode) {
     /* load Unicode character properties data from file if necessary */
 
     /*
@@ -123,15 +129,14 @@
      */
     if(havePropsData==0) {
         UTrie trie={ 0 }, trie2={ 0 };
-        UErrorCode errorCode=U_ZERO_ERROR;
         UDataMemory *data;
         const uint32_t *p=NULL;
         int32_t length;
 
         /* open the data outside the mutex block */
-        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
-        dataErrorCode=errorCode;
-        if(U_FAILURE(errorCode)) {
+        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, errorCode);
+        dataErrorCode=*errorCode;
+        if(U_FAILURE(*errorCode)) {
             return havePropsData=-1;
         }
 
@@ -139,9 +144,9 @@
 
         /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
         length=(int32_t)p[UPROPS_PROPS32_INDEX]*4;
-        length=utrie_unserialize(&trie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, &errorCode);
-        if(U_FAILURE(errorCode)) {
-            dataErrorCode=errorCode;
+        length=utrie_unserialize(&trie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, errorCode);
+        if(U_FAILURE(*errorCode)) {
+            dataErrorCode=*errorCode;
             udata_close(data);
             return havePropsData=-1;
         }
@@ -152,8 +157,8 @@
             p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
         ) {
             length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
-            length=utrie_unserialize(&trie2, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, &errorCode);
-            if(U_FAILURE(errorCode)) {
+            length=utrie_unserialize(&trie2, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, errorCode);
+            if(U_FAILURE(*errorCode)) {
                 uprv_memset(&trie2, 0, sizeof(trie2));
             } else {
                 trie2.getFoldingOffset=getFoldingPropsOffset;
@@ -196,10 +201,150 @@
     return havePropsData;
 }
 
-/* constants and macros for access to the data */
+
+static int8_t 
+loadPropsData(void) {
+    UErrorCode   errorCode = U_ZERO_ERROR;
+    int8_t       retVal    = uprv_loadPropsData(&errorCode);
+    return retVal;
+}
+
+
+/* Unicode properties data swapping ----------------------------------------- */
+
+U_CAPI int32_t U_EXPORT2
+uprops_swap(const UDataSwapper *ds,
+            const void *inData, int32_t length, void *outData,
+            UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize, i;
+
+    int32_t dataIndexes[UPROPS_INDEX_COUNT];
+    const int32_t *inData32;
+
+    /* udata_swapDataHeader checks the arguments */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x55 &&   /* dataFormat="UPro" */
+        pInfo->dataFormat[1]==0x50 &&
+        pInfo->dataFormat[2]==0x72 &&
+        pInfo->dataFormat[3]==0x6f &&
+        pInfo->formatVersion[0]==3 &&
+        pInfo->formatVersion[2]==UTRIE_SHIFT &&
+        pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
+    )) {
+        udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    /* the properties file must contain at least the indexes array */
+    if(length>=0 && (length-headerSize)<sizeof(dataIndexes)) {
+        udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
+                         length-headerSize);
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    /* read the indexes */
+    inData32=(const int32_t *)((const char *)inData+headerSize);
+    for(i=0; i<UPROPS_INDEX_COUNT; ++i) {
+        dataIndexes[i]=udata_readInt32(ds, inData32[i]);
+    }
+
+    /*
+     * comments are copied from the data format description in genprops/store.c
+     * indexes[] constants are in uprops.h
+     */
+    if(length>=0) {
+        int32_t *outData32;
+
+        if((length-headerSize)<(4*dataIndexes[UPROPS_RESERVED_INDEX])) {
+            udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
+                             length-headerSize);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        outData32=(int32_t *)((char *)outData+headerSize);
+
+        /* copy everything for inaccessible data (padding) */
+        if(inData32!=outData32) {
+            uprv_memcpy(outData32, inData32, 4*dataIndexes[UPROPS_RESERVED_INDEX]);
+        }
+
+        /* swap the indexes[16] */
+        ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode);
+
+        /*
+         * swap the main properties UTrie
+         * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
+         */
+        utrie_swap(ds,
+            inData32+UPROPS_INDEX_COUNT,
+            4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT),
+            outData32+UPROPS_INDEX_COUNT,
+            pErrorCode);
+
+        /*
+         * swap the properties and exceptions words
+         * P  const uint32_t props32[i1-i0];
+         * E  const uint32_t exceptions[i2-i1];
+         */
+        ds->swapArray32(ds,
+            inData32+dataIndexes[UPROPS_PROPS32_INDEX],
+            4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]),
+            outData32+dataIndexes[UPROPS_PROPS32_INDEX],
+            pErrorCode);
+
+        /*
+         * swap the UChars
+         * U  const UChar uchars[2*(i3-i2)];
+         */
+        ds->swapArray16(ds,
+            inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+            4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]),
+            outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+            pErrorCode);
+
+        /*
+         * swap the additional UTrie
+         * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
+         */
+        utrie_swap(ds,
+            inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+            4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]),
+            outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+            pErrorCode);
+
+        /*
+         * swap the properties vectors
+         * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
+         */
+        ds->swapArray32(ds,
+            inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+            4*(dataIndexes[UPROPS_RESERVED_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
+            outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+            pErrorCode);
+    }
+
+    /* i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table */
+    return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX];
+}
+
+/* constants and macros for access to the data ------------------------------ */
 
 /* getting a uint32_t properties word from the data */
-#define HAVE_DATA (havePropsData>0 || (havePropsData==0 && loadPropsData()>0))
+#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
 #define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
 #define GET_PROPS_UNSAFE(c, result) \
     UTRIE_GET16(&propsTrie, c, result); \
@@ -246,13 +391,12 @@
 
 U_CFUNC UBool
 uprv_haveProperties(UErrorCode *pErrorCode) {
-    if(HAVE_DATA) {
-        return TRUE;
-    } else {
-        *pErrorCode=dataErrorCode;
-        return FALSE;
+    if (havePropsData == 0) {
+        uprv_loadPropsData(pErrorCode);    /* only called while havePropsData is still 0 */
     }
+    return (havePropsData>0);    /* NOTE(review): when havePropsData<0 on entry this returns FALSE without writing *pErrorCode, unlike the removed code */
 }
+
 
 /* API functions ------------------------------------------------------------ */
 

Index: uchriter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uchriter.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uchriter.cpp	10 Sep 2003 02:42:03 -0000	1.4
+++ uchriter.cpp	6 Apr 2004 10:08:00 -0000	1.5
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1998-2001, International Business Machines Corporation and   *
+* Copyright (C) 1998-2003, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */
@@ -11,7 +11,7 @@
 
 U_NAMESPACE_BEGIN
 
-const char UCharCharacterIterator::fgClassID = 0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCharCharacterIterator)
 
 UCharCharacterIterator::UCharCharacterIterator()
   : CharacterIterator(),

Index: ucln_cmn.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucln_cmn.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucln_cmn.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucln_cmn.c	6 Apr 2004 10:08:00 -0000	1.5
@@ -16,11 +16,18 @@
 
 #include "unicode/utypes.h"
 #include "unicode/uclean.h"
+#include "utracimp.h"
 #include "ustr_imp.h"
 #include "unormimp.h"
 #include "ucln_cmn.h"
 #include "umutex.h"
 #include "ucln.h"
+#include "cmemory.h"
+#include "uassert.h"
+
+static UBool gICUInitialized = FALSE;
+static UMTX  gICUInitMutex   = NULL;
+
 
 static cleanupFunc *gCleanupFunctions[UCLN_COMMON] = {
     NULL,
@@ -34,6 +41,7 @@
 ucln_registerCleanup(ECleanupLibraryType type,
                      cleanupFunc *func)
 {
+    U_ASSERT(UCLN_START < type && type < UCLN_COMMON);
     if (UCLN_START < type && type < UCLN_COMMON)
     {
         gCleanupFunctions[type] = func;
@@ -47,18 +55,19 @@
 U_CAPI void U_EXPORT2
 u_cleanup(void)
 {
+    ECleanupLibraryType libType;
 
-    ECleanupLibraryType libType = UCLN_START;
-    while (++libType < UCLN_COMMON)
-    {
+    UTRACE_ENTRY_OC(UTRACE_U_CLEANUP);
+    for (libType = UCLN_START+1; libType<UCLN_COMMON; libType++) {
         if (gCleanupFunctions[libType])
         {
             gCleanupFunctions[libType]();
+            gCleanupFunctions[libType] = NULL;
         }
-
     }
+
 #if !UCONFIG_NO_IDNA
-    ustrprep_cleanup();
+    usprep_cleanup();
 #endif
 #if !UCONFIG_NO_BREAK_ITERATION
 	breakiterator_cleanup();
@@ -80,15 +89,13 @@
     ucnv_io_cleanup();
     udata_cleanup();
     putil_cleanup();
-    /*
-     * WARNING! Destroying the global mutex can cause synchronization
-     * problems.  ICU must be reinitialized from a single thread
-     * before the library is used again.  You never want two
-     * threads trying to initialize the global mutex at the same
-     * time. The global mutex is being destroyed so that heap and
-     * resource checkers don't complain. [grhoten]
-     */
-    umtx_destroy(NULL);
+
+    umtx_destroy(&gICUInitMutex);
+    umtx_cleanup();
+    cmemory_cleanup();       /* undo any heap functions set by u_setMemoryFunctions(). */
+    gICUInitialized = FALSE;
+    UTRACE_EXIT();           /* Must be before utrace_cleanup(), which turns off tracing. */
+    utrace_cleanup();       
 }
 
 
@@ -102,15 +109,15 @@
 
 U_CAPI void U_EXPORT2
 u_init(UErrorCode *status) {
+    UTRACE_ENTRY_OC(UTRACE_U_INIT);
     /* Make sure the global mutexes are initialized. */
-    /*
-     * NOTE:  This section of code replicates functionality from GlobalMutexInitialize()
-     *        in the file mutex.cpp.  Any changes must be made in both places.
-     *        TODO:  combine them.
-     */
     umtx_init(NULL);
-    ucnv_init(status);
-    ures_init(status);
+    umtx_lock(&gICUInitMutex);
+    if (gICUInitialized || U_FAILURE(*status)) {
+        umtx_unlock(&gICUInitMutex);
+        UTRACE_EXIT_STATUS(*status);
+        return;
+    }
 
     /* Do any required init for services that don't have open operations
      * and use "only" the double-check initialization method for performance
@@ -119,10 +126,14 @@
      */
 
     /* Char Properties */
-    uprv_haveProperties(status);
+    uprv_loadPropsData(status);
 
 #if !UCONFIG_NO_NORMALIZATION
     /*  Normalization  */
     unorm_haveData(status);
 #endif
+    gICUInitialized = TRUE;    /* TODO:  don't set if U_FAILURE? */
+    umtx_unlock(&gICUInitMutex);
+    UTRACE_EXIT_STATUS(*status);
 }
+

Index: ucln_cmn.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucln_cmn.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucln_cmn.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucln_cmn.h	6 Apr 2004 10:08:00 -0000	1.4
@@ -35,7 +35,7 @@
 
 U_CFUNC UBool breakiterator_cleanup(void);
 
-U_CFUNC UBool ustrprep_cleanup(void);
+U_CFUNC UBool usprep_cleanup(void);
 
 U_CFUNC UBool U_EXPORT2 ucnv_cleanup(void);
 
@@ -51,11 +51,17 @@
 
 U_CFUNC UBool service_cleanup(void);
 
+U_CFUNC UBool cmemory_cleanup(void);
+
+U_CFUNC UBool umtx_cleanup(void);
+
+U_CFUNC UBool utrace_cleanup(void);
 
 /* Only mutexes should be initialized in these functions. */
 
 U_CFUNC void ucnv_init(UErrorCode *status);
 
 U_CFUNC void ures_init(UErrorCode *status);
+
 
 #endif

Index: ucmndata.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucmndata.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucmndata.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucmndata.c	6 Apr 2004 10:08:00 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************/
@@ -26,6 +26,33 @@
 #include "ucmndata.h"
 #include "udatamem.h"
 
+U_CFUNC uint16_t
+udata_getHeaderSize(const DataHeader *udh) {    /* returns udh->dataHeader.headerSize, byte-swapped if needed; 0 if udh is NULL */
+    if(udh==NULL) {
+        return 0;
+    } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) {
+        /* same endianness as U_IS_BIG_ENDIAN: the stored value is returned unchanged */
+        return udh->dataHeader.headerSize;
+    } else {
+        /* opposite endianness: exchange the two bytes of the 16-bit value */
+        uint16_t x=udh->dataHeader.headerSize;
+        return (uint16_t)((x<<8)|(x>>8));
+    }
+}
+
+U_CFUNC uint16_t
+udata_getInfoSize(const UDataInfo *info) {    /* returns info->size, byte-swapped if needed; 0 if info is NULL */
+    if(info==NULL) {
+        return 0;
+    } else if(info->isBigEndian==U_IS_BIG_ENDIAN) {
+        /* same endianness as U_IS_BIG_ENDIAN: the stored value is returned unchanged */
+        return info->size;
+    } else {
+        /* opposite endianness: exchange the two bytes of the 16-bit value */
+        uint16_t x=info->size;
+        return (uint16_t)((x<<8)|(x>>8));
+    }
+}
 
 /*----------------------------------------------------------------------------------*
  *                                                                                  *
@@ -48,18 +75,7 @@
 }  PointerTOC;
 
 
-
-typedef struct {
-    int32_t           nameOffset;
-    int32_t           dataOffset;
-}  OffsetTOCEntry;
-
-
-typedef struct {
-    uint32_t          count;
-    OffsetTOCEntry    entry[2];    /* Acutal size of array is from count. */
-}  OffsetTOC;
-
+/* definition of OffsetTOC struct types moved to ucmndata.h */
 
 /*----------------------------------------------------------------------------------*
  *                                                                                  *
@@ -68,7 +84,7 @@
  *----------------------------------------------------------------------------------*/
 static uint32_t offsetTOCEntryCount(const UDataMemory *pData) {
     int32_t          retVal=0;
-    const OffsetTOC *toc = (OffsetTOC *)pData->toc;
+    const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc;
     if (toc != NULL) {
         retVal = toc->count;
     } 
@@ -79,8 +95,9 @@
 static const DataHeader *
 offsetTOCLookupFn(const UDataMemory *pData,
                   const char *tocEntryName,
+                  int32_t *pLength,
                   UErrorCode *pErrorCode) {
-    const OffsetTOC  *toc = (OffsetTOC *)pData->toc;
+    const UDataOffsetTOC  *toc = (UDataOffsetTOC *)pData->toc;
     if(toc!=NULL) {
         const char *base=(const char *)pData->toc;
         uint32_t start, limit, number;
@@ -106,6 +123,11 @@
 /*      fprintf(stderr, "Found: %p\n",(base+toc[2*start+1])) */
             fprintf(stderr, "Found it\n");
 #endif
+            if((start+1)<toc->count) {
+                *pLength=(int32_t)(toc->entry[start+1].dataOffset-toc->entry[start].dataOffset);
+            } else {
+                *pLength=-1;
+            }
             return (const DataHeader *)&base[toc->entry[start].dataOffset];
         } else {
 #ifdef UDATA_DEBUG
@@ -135,6 +157,7 @@
 
 static const DataHeader *pointerTOCLookupFn(const UDataMemory *pData,
                    const char *name,
+                   int32_t *pLength,
                    UErrorCode *pErrorCode) {
     if(pData->toc!=NULL) {
         const PointerTOC *toc = (PointerTOC *)pData->toc;
@@ -159,6 +182,7 @@
 
         if(uprv_strcmp(name, toc->entry[start].entryName)==0) {
             /* found it */
+            *pLength=-1;
             return UDataMemory_normalizeDataPointer(toc->entry[start].pHeader);
         } else {
             return NULL;
@@ -202,7 +226,7 @@
         ) {
         /* dataFormat="CmnD" */
         udm->vFuncs = &CmnDFuncs;
-        udm->toc=(const char *)udm->pHeader+udm->pHeader->dataHeader.headerSize;
+        udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
     }
     else if(udm->pHeader->info.dataFormat[0]==0x54 &&
         udm->pHeader->info.dataFormat[1]==0x6f &&
@@ -212,7 +236,7 @@
         ) {
         /* dataFormat="ToCP" */
         udm->vFuncs = &ToCPFuncs;
-        udm->toc=(const char *)udm->pHeader+udm->pHeader->dataHeader.headerSize;
+        udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
     }
     else {
         /* dataFormat not recognized */
@@ -228,3 +252,22 @@
     }
 }
 
+/*
+ * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package
+ * header but not its sub-items.
+ * This function will be needed for automatic runtime swapping.
+ * Sub-items should not be swapped to limit the swapping to the parts of the
+ * package that are actually used.
+ *
+ * Since lengths of items are implicit in the order and offsets of their
+ * ToC entries, and since offsets are relative to the start of the ToC,
+ * a swapped version may need to generate a different data structure
+ * with pointers to the original data items and with their lengths
+ * (-1 for the last one if it is not known), and maybe even pointers to the
+ * swapped versions of the items.
+ * These pointers to swapped versions would establish a cache;
+ * instead, each open data item could simply own the storage for its swapped
+ * data. This fits better with the current design.
+ *
+ * markus 2003sep18 Jitterbug 2235
+ */

Index: ucmndata.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucmndata.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucmndata.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucmndata.h	6 Apr 2004 10:08:00 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************/
@@ -43,6 +43,33 @@
     UDataInfo   info;
 } DataHeader;
 
+typedef struct {
+    uint32_t nameOffset;    /* presumably locates the entry's name the same way dataOffset locates its data -- confirm against the lookup code */
+    uint32_t dataOffset;    /* byte offset of the item's DataHeader, counted from the start of the ToC */
+} UDataOffsetTOCEntry;
+
+typedef struct {
+    uint32_t count;    /* number of valid elements in entry[] */
+    UDataOffsetTOCEntry entry[2];    /* Actual size of array is from count. */
+} UDataOffsetTOC;
+
+/**
+ * Get the header size from a const DataHeader *udh.
+ * Handles opposite-endian data.
+ *
+ * @internal
+ */
+U_CFUNC uint16_t
+udata_getHeaderSize(const DataHeader *udh);
+
+/**
+ * Get the UDataInfo.size from a const UDataInfo *info.
+ * Handles opposite-endian data.
+ *
+ * @internal
+ */
+U_CFUNC uint16_t
+udata_getInfoSize(const UDataInfo *info);
 
 /*
  *  "Virtual" functions for data lookup.
@@ -54,6 +81,7 @@
 typedef const DataHeader *
 (* LookupFn)(const UDataMemory *pData,
              const char *tocEntryName,
+             int32_t *pLength,
              UErrorCode *pErrorCode);
 
 typedef uint32_t

Index: ucmp8.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucmp8.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucmp8.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucmp8.c	6 Apr 2004 10:08:00 -0000	1.5
@@ -1,7 +1,7 @@
 /*
 ********************************************************************
-* COPYRIGHT: 
-* Copyright (c) 1997-2001, International Business Machines Corporation and
+* COPYRIGHT:
+* Copyright (c) 1997-2003, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************
 */
@@ -9,12 +9,6 @@
 #include "ucmp8.h"
 #include "cmemory.h"
 
-static int32_t findOverlappingPosition(CompactByteArray* this_obj,
-                       uint32_t start, 
-                       const UChar *tempIndex, 
-                       int32_t tempIndexCount, 
-                       uint32_t cycle);
-
 /* internal constants*/
 
 
@@ -27,18 +21,18 @@
 U_CAPI void U_EXPORT2
 ucmp8_initBogus(CompactByteArray* array)
 {
-  CompactByteArray* this_obj = array;
+    CompactByteArray* this_obj = array;
 
-  if (this_obj == NULL) return;
+    if (this_obj == NULL) return;
 
-  this_obj->fStructSize = sizeof(CompactByteArray);
-  this_obj->fArray = NULL; 
-  this_obj->fIndex = NULL;
-  this_obj->fCount = UCMP8_kUnicodeCount;
-  this_obj->fCompact = FALSE; 
-  this_obj->fBogus = TRUE;
-  this_obj->fAlias = FALSE;
-  this_obj->fIAmOwned = TRUE;
+    this_obj->fStructSize = sizeof(CompactByteArray);
+    this_obj->fArray = NULL;
+    this_obj->fIndex = NULL;
+    this_obj->fCount = UCMP8_kUnicodeCount;
+    this_obj->fCompact = FALSE;
+    this_obj->fBogus = TRUE;
+    this_obj->fAlias = FALSE;
+    this_obj->fIAmOwned = TRUE;
 }
 
 /* debug flags*/
@@ -66,42 +60,42 @@
  * to data position number 8, which has elements "bced". In the compressed
  * version, index# 2 points to data position 1, which also has "bced"
  */
-  CompactByteArray* this_obj = array;
-  int32_t i;
-  
-  if (this_obj == NULL) return;
+    CompactByteArray* this_obj = array;
+    int32_t i;
 
-  this_obj->fStructSize = sizeof(CompactByteArray);
-  this_obj->fArray = NULL; 
-  this_obj->fIndex = NULL;
-  this_obj->fCount = UCMP8_kUnicodeCount;
-  this_obj->fCompact = FALSE; 
-  this_obj->fBogus = FALSE;
-  this_obj->fAlias = FALSE;
-  this_obj->fIAmOwned = TRUE;
+    if (this_obj == NULL) return;
 
+    this_obj->fStructSize = sizeof(CompactByteArray);
+    this_obj->fArray = NULL;
+    this_obj->fIndex = NULL;
+    this_obj->fCount = UCMP8_kUnicodeCount;
+    this_obj->fCompact = FALSE;
+    this_obj->fBogus = FALSE;
+    this_obj->fAlias = FALSE;
+    this_obj->fIAmOwned = TRUE;
 
-  this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
-  if (!this_obj->fArray) 
+
+    this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
+    if (!this_obj->fArray)
     {
-      this_obj->fBogus = TRUE;
-      return;
+        this_obj->fBogus = TRUE;
+        return;
     }
-  this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
-  if (!this_obj->fIndex) 
+    this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
+    if (!this_obj->fIndex)
     {
-      uprv_free(this_obj->fArray);
-      this_obj->fArray = NULL;
-      this_obj->fBogus = TRUE;
-      return;
+        uprv_free(this_obj->fArray);
+        this_obj->fArray = NULL;
+        this_obj->fBogus = TRUE;
+        return;
     }
-  for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
+    for (i = 0; i < UCMP8_kUnicodeCount; ++i)
     {
-      this_obj->fArray[i] = defaultValue;
+        this_obj->fArray[i] = defaultValue;
     }
-  for (i = 0; i < UCMP8_kIndexCount; ++i) 
+    for (i = 0; i < UCMP8_kIndexCount; ++i)
     {
-      this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
+        this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
     }
 }
 
@@ -128,46 +122,45 @@
  * to data position number 8, which has elements "bced". In the compressed
  * version, index# 2 points to data position 1, which also has "bced"
  */
-  CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
-  int32_t i;
-
-  if (this_obj == NULL) return NULL;
+    CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
+    int32_t i;
 
-  this_obj->fStructSize = sizeof(CompactByteArray);
-  this_obj->fArray = NULL; 
-  this_obj->fIndex = NULL;
-  this_obj->fCount = UCMP8_kUnicodeCount;
-  this_obj->fCompact = FALSE; 
-  this_obj->fBogus = FALSE;
-  this_obj->fAlias = FALSE;
-  this_obj->fIAmOwned = FALSE;
+    if (this_obj == NULL) return NULL;
 
+    this_obj->fStructSize = sizeof(CompactByteArray);
+    this_obj->fArray = NULL;
+    this_obj->fIndex = NULL;
+    this_obj->fCount = UCMP8_kUnicodeCount;
+    this_obj->fCompact = FALSE;
+    this_obj->fBogus = FALSE;
+    this_obj->fAlias = FALSE;
+    this_obj->fIAmOwned = FALSE;
 
 
-  this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
-  if (!this_obj->fArray) 
+    this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
+    if (!this_obj->fArray)
     {
-      this_obj->fBogus = TRUE;
-      return NULL;
+        this_obj->fBogus = TRUE;
+        return NULL;
     }
-  this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
-  if (!this_obj->fIndex) 
+    this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
+    if (!this_obj->fIndex)
     {
-      uprv_free(this_obj->fArray);
-      this_obj->fArray = NULL;
-      this_obj->fBogus = TRUE;
-      return NULL;
+        uprv_free(this_obj->fArray);
+        this_obj->fArray = NULL;
+        this_obj->fBogus = TRUE;
+        return NULL;
     }
-  for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
+    for (i = 0; i < UCMP8_kUnicodeCount; ++i)
     {
-      this_obj->fArray[i] = defaultValue;
+        this_obj->fArray[i] = defaultValue;
     }
-  for (i = 0; i < UCMP8_kIndexCount; ++i) 
+    for (i = 0; i < UCMP8_kIndexCount; ++i)
     {
-      this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
+        this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
     }
 
-  return this_obj;
+    return this_obj;
 }
 
 U_CAPI CompactByteArray* U_EXPORT2
@@ -206,19 +199,19 @@
                   int8_t *newValues,
                   int32_t count)
 {
-  if (this_obj) {
-    this_obj->fCount = count;
-    this_obj->fBogus = FALSE;
-    this_obj->fStructSize = sizeof(CompactByteArray);
+    if (this_obj) {
+        this_obj->fCount = count;
+        this_obj->fBogus = FALSE;
+        this_obj->fStructSize = sizeof(CompactByteArray);
 
-    this_obj->fArray = newValues;
-    this_obj->fIndex = indexArray;
-    this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
-    this_obj->fAlias = FALSE;
-    this_obj->fIAmOwned = TRUE;
-  }
+        this_obj->fArray = newValues;
+        this_obj->fIndex = indexArray;
+        this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
+        this_obj->fAlias = FALSE;
+        this_obj->fIAmOwned = TRUE;
+    }
 
-  return this_obj;
+    return this_obj;
 }
 
 U_CAPI CompactByteArray* U_EXPORT2
@@ -227,78 +220,78 @@
                   int8_t *newValues,
                   int32_t count)
 {
-  if (this_obj) {
-    this_obj->fArray = NULL;
-    this_obj->fIndex = NULL; 
-    this_obj->fCount = count;
-    this_obj->fBogus = FALSE;
-    this_obj->fStructSize = sizeof(CompactByteArray);
+    if (this_obj) {
+        this_obj->fArray = NULL;
+        this_obj->fIndex = NULL;
+        this_obj->fCount = count;
+        this_obj->fBogus = FALSE;
+        this_obj->fStructSize = sizeof(CompactByteArray);
 
-    this_obj->fArray = newValues;
-    this_obj->fIndex = indexArray;
-    this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
-    this_obj->fAlias = TRUE;
-    this_obj->fIAmOwned = TRUE;
-  }
+        this_obj->fArray = newValues;
+        this_obj->fIndex = indexArray;
+        this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
+        this_obj->fAlias = TRUE;
+        this_obj->fIAmOwned = TRUE;
+    }
 
-  return this_obj;
+    return this_obj;
 }
 
 /*=======================================================*/
 
 U_CAPI void U_EXPORT2
-ucmp8_close(CompactByteArray* this_obj) 
+ucmp8_close(CompactByteArray* this_obj)
 {
-  if(this_obj != NULL) {
-    if(!this_obj->fAlias) {
-      if(this_obj->fArray != NULL) {
-        uprv_free(this_obj->fArray);
-      }
-      if(this_obj->fIndex != NULL) {
-        uprv_free(this_obj->fIndex);
-      }
+    if(this_obj != NULL) {
+        if(!this_obj->fAlias) {
+            if(this_obj->fArray != NULL) {
+                uprv_free(this_obj->fArray);
+            }
+            if(this_obj->fIndex != NULL) {
+                uprv_free(this_obj->fIndex);
+            }
+        }
+        if(!this_obj->fIAmOwned) /* Free the struct itself only when fIAmOwned is FALSE, i.e. it was malloc'ed by the allocating ('open') variant; the 'init' variants set fIAmOwned to TRUE. */
+        {
+            uprv_free(this_obj);
+        }
     }
-    if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
-      {
-        uprv_free(this_obj);
-      }
-  }
 }
 
 
 /*=======================================================*/
- 
+
 U_CAPI void U_EXPORT2
-ucmp8_expand(CompactByteArray* this_obj) 
+ucmp8_expand(CompactByteArray* this_obj)
 {
-  /* can optimize later.
-   * if we have to expand, then walk through the blocks instead of using Get
-   * this code unpacks the array by copying the blocks to the normalized position.
-   * Example: Compressed
-   * INDEX# 0   1   2   3   4
-   * INDEX  0   4   1   8   2 ...
-   * ARRAY  abcdeabazyabc...
-   *  turns into
-   * Example: Expanded
-   * INDEX# 0   1   2   3   4
-   * INDEX  0   4   8   12  16 ...
-   * ARRAY  abcdeababcedzyabcdea...
-   */
+    /* can optimize later.
+     * if we have to expand, then walk through the blocks instead of using Get
+     * this code unpacks the array by copying the blocks to the normalized position.
+     * Example: Compressed
+     * INDEX# 0   1   2   3   4
+     * INDEX  0   4   1   8   2 ...
+     * ARRAY  abcdeabazyabc...
+     *  turns into
+     * Example: Expanded
+     * INDEX# 0   1   2   3   4
+     * INDEX  0   4   8   12  16 ...
+     * ARRAY  abcdeababcedzyabcdea...
+     */
     int32_t i;
-    if (this_obj->fCompact) 
+    if (this_obj->fCompact)
     {
       int8_t* tempArray;
       tempArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
-      if (!tempArray) 
+      if (!tempArray)
       {
           this_obj->fBogus = TRUE;
           return;
       }
-      for (i = 0; i < UCMP8_kUnicodeCount; ++i) 
+      for (i = 0; i < UCMP8_kUnicodeCount; ++i)
       {
           tempArray[i] = ucmp8_get(this_obj,(UChar)i);  /* HSYS : How expand?*/
       }
-      for (i = 0; i < UCMP8_kIndexCount; ++i) 
+      for (i = 0; i < UCMP8_kIndexCount; ++i)
       {
           this_obj->fIndex[i] = (uint16_t)(i<< UCMP8_kBlockShift);
       }
@@ -309,7 +302,7 @@
 
     }
 }
- 
+
 
 /*=======================================================*/
 /* this_obj->fArray:    an array to be overlapped
@@ -319,60 +312,62 @@
  *      inputHash[i] = XOR of values from i-count+1 to i
  */
 static int32_t
-findOverlappingPosition(CompactByteArray* this_obj, 
+findOverlappingPosition(CompactByteArray* this_obj,
             uint32_t start,
             const UChar* tempIndex,
             int32_t tempIndexCount,
-            uint32_t cycle) 
+            uint32_t cycle)
 {
-  /* this_obj is a utility routine for finding blocks that overlap.
-   * IMPORTANT: the cycle number is very important. Small cycles take a lot
-   * longer to work. In some cases, they may be able to get better compaction.
-   */
-    
-  int32_t i;
-  int32_t j;
-  int32_t currentCount;
-  
-  for (i = 0; i < tempIndexCount; i += cycle) 
-    {
-      currentCount = UCMP8_kBlockCount;
-      if (i + UCMP8_kBlockCount > tempIndexCount) 
-    {
-      currentCount = tempIndexCount - i;
-        } 
-      for (j = 0; j < currentCount; ++j) 
+    /* This is a utility routine for finding blocks that overlap.
+     * IMPORTANT: the cycle number is very important. Small cycles take a lot
+     * longer to work. In some cases, they may be able to get better compaction.
+     */
+
+    int32_t i;
+    int32_t j;
+    int32_t currentCount;
+
+    for (i = 0; i < tempIndexCount; i += cycle)
     {
-      if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]]) break;
+        currentCount = UCMP8_kBlockCount;
+        if (i + UCMP8_kBlockCount > tempIndexCount)
+        {
+            currentCount = tempIndexCount - i;
         }
-      if (j == currentCount) break;
+        for (j = 0; j < currentCount; ++j)
+        {
+            if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]])
+                break;
+        }
+        if (j == currentCount)
+            break;
     }
-  
-  return i;
+
+    return i;
 }
 
 U_CAPI UBool U_EXPORT2
 ucmp8_isBogus(const CompactByteArray* this_obj)
 {
-  return (UBool)(this_obj == NULL || this_obj->fBogus);
+    return (UBool)(this_obj == NULL || this_obj->fBogus);
 }
 
 U_CAPI const int8_t* U_EXPORT2
 ucmp8_getArray(const CompactByteArray* this_obj)
 {
-  return this_obj->fArray;
+    return this_obj->fArray;
 }
 
 U_CAPI const uint16_t* U_EXPORT2
 ucmp8_getIndex(const CompactByteArray* this_obj)
 {
-  return this_obj->fIndex;
+    return this_obj->fIndex;
 }
 
 U_CAPI int32_t U_EXPORT2
 ucmp8_getCount(const CompactByteArray* this_obj)
 {
-  return this_obj->fCount;
+    return this_obj->fCount;
 }
 
 
@@ -381,12 +376,12 @@
       UChar c,
       int8_t value)
 {
-  if (this_obj->fCompact == TRUE) 
+    if (this_obj->fCompact == TRUE)
     {
-      ucmp8_expand(this_obj);
-      if (this_obj->fBogus) return;
+        ucmp8_expand(this_obj);
+        if (this_obj->fBogus) return;
     }
-  this_obj->fArray[(int32_t)c] = value;
+    this_obj->fArray[(int32_t)c] = value;
 }
 
 
@@ -396,176 +391,179 @@
            UChar end,
            int8_t value)
 {
-  int32_t i;
-  if (this_obj->fCompact == TRUE) 
+    int32_t i;
+    if (this_obj->fCompact == TRUE)
     {
-      ucmp8_expand(this_obj);
-      if (this_obj->fBogus) return;
+        ucmp8_expand(this_obj);
+        if (this_obj->fBogus)
+            return;
     }
-  for (i = start; i <= end; ++i) 
+    for (i = start; i <= end; ++i)
     {
-      this_obj->fArray[i] = value;
+        this_obj->fArray[i] = value;
     }
 }
 
 
 /*=======================================================*/
- 
+
 U_CAPI void U_EXPORT2
 ucmp8_compact(CompactByteArray* this_obj,
-          uint32_t cycle) 
+          uint32_t cycle)
 {
-  if (!this_obj->fCompact) 
-    {
-      /* this_obj actually does the compaction.
-       * it walks throught the contents of the expanded array, finding the
-       * first block in the data that matches the contents of the current index.
-       * As it works, it keeps an updated pointer to the last position,
-       * so that it knows how big to make the final array
-       * If the matching succeeds, then the index will point into the data
-       * at some earlier position.
-       * If the matching fails, then last position pointer will be bumped,
-       * and the index will point to that last block of data.
-       */
-      UChar*    tempIndex;
-      int32_t     tempIndexCount;
-      int8_t*     tempArray;
-      int32_t     iBlock, iIndex;
-      
-      /* fix cycle, must be 0 < cycle <= blockcount*/
-      if (cycle < 0) cycle = 1;
-      else if (cycle > (uint32_t)UCMP8_kBlockCount) cycle = UCMP8_kBlockCount;
-      
-      /* make temp storage, larger than we need*/
-      tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
-      if (!tempIndex) 
-    {
-      this_obj->fBogus = TRUE;
-      return;
-        }               
-      /* set up first block.*/
-      tempIndexCount = UCMP8_kBlockCount;
-      for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex) 
-    {
-      tempIndex[iIndex] = (uint16_t)iIndex;
-        }; /* endfor (iIndex = 0; .....)*/
-      this_obj->fIndex[0] = 0;
-      
-      /* for each successive block, find out its first position in the compacted array*/
-      for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock) 
+    if (!this_obj->fCompact)
     {
-      int32_t newCount, firstPosition, block;
-      block = iBlock << UCMP8_kBlockShift;
-      /*      if (debugSmall) if (block > debugSmallLimit) break;*/
-      firstPosition = findOverlappingPosition(this_obj, 
-                          block,
-                          tempIndex,
-                          tempIndexCount,
-                          cycle);
-      
-      /* if not contained in the current list, copy the remainder
-       * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
-       * we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount]
-       */
-      newCount = firstPosition + UCMP8_kBlockCount;
-      if (newCount > tempIndexCount) 
+        /* This actually does the compaction.
+        * It walks through the contents of the expanded array, finding the
+        * first block in the data that matches the contents of the current index.
+        * As it works, it keeps an updated pointer to the last position,
+        * so that it knows how big to make the final array
+        * If the matching succeeds, then the index will point into the data
+        * at some earlier position.
+        * If the matching fails, then last position pointer will be bumped,
+        * and the index will point to that last block of data.
+        */
+        UChar*    tempIndex;
+        int32_t     tempIndexCount;
+        int8_t*     tempArray;
+        int32_t     iBlock, iIndex;
+
+        /* fix cycle, must be 0 < cycle <= blockcount*/
+        if (cycle <= 0)
+            cycle = 1;
+        else if (cycle > (uint32_t)UCMP8_kBlockCount)
+            cycle = UCMP8_kBlockCount;
+
+        /* make temp storage, larger than we need*/
+        tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
+        if (!tempIndex)
         {
-          for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex) 
+            this_obj->fBogus = TRUE;
+            return;
+        }
+        /* set up first block.*/
+        tempIndexCount = UCMP8_kBlockCount;
+        for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex)
         {
-          tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
-        } /* endfor (iIndex = tempIndexCount....)*/
+            tempIndex[iIndex] = (uint16_t)iIndex;
+        } /* endfor (iIndex = 0; .....)*/
+        this_obj->fIndex[0] = 0;
+
+        /* for each successive block, find out its first position in the compacted array*/
+        for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock)
+        {
+            int32_t newCount, firstPosition, block;
+            block = iBlock << UCMP8_kBlockShift;
+            /*      if (debugSmall) if (block > debugSmallLimit) break;*/
+            firstPosition = findOverlappingPosition(this_obj,
+                block,
+                tempIndex,
+                tempIndexCount,
+                cycle);
+
+            /* if not contained in the current list, copy the remainder
+            * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
+            * we do this by XORing out cumulativeHash[iBlock-kBlockCount]
+            */
+            newCount = firstPosition + UCMP8_kBlockCount;
+            if (newCount > tempIndexCount)
+            {
+                for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex)
+                {
+                    tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
+                } /* endfor (iIndex = tempIndexCount....)*/
                 tempIndexCount = newCount;
             } /* endif (newCount > tempIndexCount)*/
-      this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
+            this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
         } /* endfor (iBlock = 1.....)*/
-      
-      /* now allocate and copy the items into the array*/
-      tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
-      if (!tempArray) 
-    {
-      this_obj->fBogus = TRUE;
-      uprv_free(tempIndex);
-      return;
+
+        /* now allocate and copy the items into the array*/
+        tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
+        if (!tempArray)
+        {
+            this_obj->fBogus = TRUE;
+            uprv_free(tempIndex);
+            return;
         }
-      for (iIndex = 0; iIndex < tempIndexCount; ++iIndex) 
-    {
-      tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
+        for (iIndex = 0; iIndex < tempIndexCount; ++iIndex)
+        {
+            tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
         }
-      uprv_free(this_obj->fArray);
-      this_obj->fArray = tempArray;
-      this_obj->fCount = tempIndexCount;
-      
-      
-      /* free up temp storage*/
-      uprv_free(tempIndex);
-      this_obj->fCompact = TRUE;
+        uprv_free(this_obj->fArray);
+        this_obj->fArray = tempArray;
+        this_obj->fCount = tempIndexCount;
+
+
+        /* free up temp storage*/
+        uprv_free(tempIndex);
+        this_obj->fCompact = TRUE;
     } /* endif (!this_obj->fCompact)*/
 }
 
 U_CAPI  uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, UMemoryStream *MS)
 {
-  int32_t size = 0;
+    int32_t size = 0;
 
-  uprv_mstrm_write32(MS, ICU_UCMP8_VERSION);
-  size += 4;
-  
-  uprv_mstrm_write32(MS, array->fCount);
-  size += 4;
-  
-  uprv_mstrm_writeBlock(MS, array->fIndex, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
-  size += sizeof(array->fIndex[0])*UCMP8_kIndexCount;
-  
-  uprv_mstrm_writeBlock(MS, array->fArray, sizeof(array->fArray[0])*array->fCount);
-  size += sizeof(array->fArray[0])*array->fCount;
-  
-  while(size%4) /* end padding */
-  {
-      uprv_mstrm_writePadding(MS, 1); /* Pad total so far to even size */
-      size += 1;
-  }
+    uprv_mstrm_write32(MS, ICU_UCMP8_VERSION);
+    size += 4;
 
-  return size;
+    uprv_mstrm_write32(MS, array->fCount);
+    size += 4;
+
+    uprv_mstrm_writeBlock(MS, array->fIndex, sizeof(array->fIndex[0])*UCMP8_kIndexCount);
+    size += sizeof(array->fIndex[0])*UCMP8_kIndexCount;
+
+    uprv_mstrm_writeBlock(MS, array->fArray, sizeof(array->fArray[0])*array->fCount);
+    size += sizeof(array->fArray[0])*array->fCount;
+
+    while(size%4) /* end padding */
+    {
+        uprv_mstrm_writePadding(MS, 1); /* Pad total so far to a multiple of 4 bytes */
+        size += 1;
+    }
+
+    return size;
 }
 
 /* We use sizeof(*array), etc so that this code can be as portable as 
-   possible between the ucmpX_ family. 
+   possible between the ucmpX_ family.
 */
 
 U_CAPI  void U_EXPORT2 ucmp8_initFromData(CompactByteArray *this_obj, const uint8_t **source, UErrorCode *status)
 {
-  uint32_t i;
-  const uint8_t *oldSource = *source;
+    uint32_t i;
+    const uint8_t *oldSource = *source;
 
-  if(U_FAILURE(*status))
-    return;
+    if(U_FAILURE(*status))
+        return;
 
- this_obj->fArray = NULL;
- this_obj->fIndex = NULL; 
- this_obj->fBogus = FALSE;
- this_obj->fStructSize = sizeof(CompactByteArray);
- this_obj->fCompact = TRUE;
- this_obj->fAlias = TRUE;
- this_obj->fIAmOwned = TRUE;
-  
- i = * ((const uint32_t*) *source);
- (*source) += 4;
+    this_obj->fArray = NULL;
+    this_obj->fIndex = NULL;
+    this_obj->fBogus = FALSE;
+    this_obj->fStructSize = sizeof(CompactByteArray);
+    this_obj->fCompact = TRUE;
+    this_obj->fAlias = TRUE;
+    this_obj->fIAmOwned = TRUE;
 
- if(i != ICU_UCMP8_VERSION)
- {
-   *status = U_INVALID_FORMAT_ERROR;
-   return;
- }
-  
- this_obj->fCount = * ((const uint32_t*)*source);
- (*source) += 4;
+    i = * ((const uint32_t*) *source);
+    (*source) += 4;
 
- this_obj->fIndex = (uint16_t*) *source;
- (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
+    if(i != ICU_UCMP8_VERSION)
+    {
+        *status = U_INVALID_FORMAT_ERROR;
+        return;
+    }
 
- this_obj->fArray = (int8_t*) *source;
- (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
+    this_obj->fCount = * ((const uint32_t*)*source);
+    (*source) += 4;
 
- /* eat up padding */
- while((*source-(oldSource))%4)
-    (*source)++;
+    this_obj->fIndex = (uint16_t*) *source;
+    (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;
+
+    this_obj->fArray = (int8_t*) *source;
+    (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;
+
+    /* eat up padding */
+    while((*source-(oldSource))%4)
+        (*source)++;
 }

Index: ucnv.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnv.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnv.c	6 Apr 2004 10:08:00 -0000	1.5
@@ -27,71 +27,14 @@
 #include "unicode/uset.h"
 #include "cmemory.h"
 #include "cstring.h"
+#include "uassert.h"
+#include "utracimp.h"
 #include "ustr_imp.h"
 #include "ucnv_imp.h"
 #include "ucnv_io.h"
 #include "ucnv_cnv.h"
 #include "ucnv_bld.h"
 
[...1650 lines suppressed...]
+        if((length=cnv->UCharErrorBufferLength)>0) {
+            uprv_memmove(cnv->UCharErrorBuffer+delta, cnv->UCharErrorBuffer,
+                         length*U_SIZEOF_UCHAR);
+        }
+        cnv->UCharErrorBufferLength=(int8_t)(length+delta);
+
+        cnv->UCharErrorBuffer[0]=buffer[i++];
+        if(delta>1) {
+            cnv->UCharErrorBuffer[1]=buffer[i];
+        }
+    }
+
+    *source=args.source;
+    return c;
 }
+
+/* ucnv_convert() and siblings ---------------------------------------------- */
 
 U_CAPI void U_EXPORT2
 ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,

Index: ucnv2022.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv2022.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- ucnv2022.c	10 Sep 2003 02:42:03 -0000	1.5
+++ ucnv2022.c	6 Apr 2004 10:08:00 -0000	1.6
@@ -34,12 +34,49 @@
 #include "unicode/uset.h"
 #include "unicode/ucnv_err.h"
 #include "unicode/ucnv_cb.h"
+#include "ucnv_imp.h"
 #include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 #include "ucnvmbcs.h"
 #include "cstring.h"
 #include "cmemory.h"
 
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
[...4985 lines suppressed...]
+    FALSE,
+    FALSE,
+    0,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+static const UConverterSharedData _ISO2022CNData={
+    sizeof(UConverterSharedData),
+    ~((uint32_t) 0),
+    NULL,
+    NULL,
+    &_ISO2022CNStaticData,
+    FALSE,
+    &_ISO2022CNImpl,
+    0
+};
+
+
 
 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

Index: ucnv_bld.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_bld.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnv_bld.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnv_bld.c	6 Apr 2004 10:08:00 -0000	1.5
@@ -24,8 +24,11 @@
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_err.h"
 #include "unicode/uloc.h"
+#include "utracimp.h"
 #include "ucnv_io.h"
 #include "ucnv_bld.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
 #include "ucnv_cnv.h"
 #include "ucnv_imp.h"
 #include "uhash.h"
@@ -95,7 +98,9 @@
   { "cesu8", UCNV_CESU8 },
 #if !UCONFIG_NO_LEGACY_CONVERSION
   { "hz",UCNV_HZ },
+#endif
   { "imapmailboxname", UCNV_IMAP_MAILBOX },
+#if !UCONFIG_NO_LEGACY_CONVERSION
   { "iscii", UCNV_ISCII },
   { "iso2022", UCNV_ISO_2022 },
 #endif
@@ -193,7 +198,7 @@
  * Un flatten shared data from a UDATA..
  */
 static UConverterSharedData*
-ucnv_data_unFlattenClone(UDataMemory *pData, UErrorCode *status)
+ucnv_data_unFlattenClone(UConverterLoadArgs *pArgs, UDataMemory *pData, UErrorCode *status)
 {
     /* UDataInfo info; -- necessary only if some converters have different formatVersion */
     const uint8_t *raw = (const uint8_t *)udata_getMemory(pData);
@@ -222,12 +227,16 @@
     /* copy initial values from the static structure for this type */
     uprv_memcpy(data, converterData[type], sizeof(UConverterSharedData));
 
+#if 0 /* made UConverterMBCSTable part of UConverterSharedData -- markus 20031107 */
     /*
      * It would be much more efficient if the table were a direct member, not a pointer.
      * However, that would add to the size of all UConverterSharedData objects
      * even if they do not use this table (especially algorithmic ones).
      * If this changes, then the static templates from converterData[type]
      * need more entries.
+     *
+     * In principle, it would be cleaner if the load() function below
+     * allocated the table.
      */
     data->table = (UConverterTable *)uprv_malloc(sizeof(UConverterTable));
     if(data->table == NULL) {
@@ -236,7 +245,8 @@
         return NULL;
     }
     uprv_memset(data->table, 0, sizeof(UConverterTable));
-    
+#endif
+
     data->staticData = source;
     
     data->sharedDataCached = FALSE;
@@ -245,7 +255,7 @@
     data->dataMemory = (void*)pData; /* for future use */
 
     if(data->impl->load != NULL) {
-        data->impl->load(data, raw + source->structSize, status);
+        data->impl->load(data, pArgs, raw + source->structSize, status);
         if(U_FAILURE(*status)) {
             uprv_free(data->table);
             uprv_free(data);
@@ -259,28 +269,43 @@
  *goes to disk and opens it.
  *allocates the memory and returns a new UConverter object
  */
-static UConverterSharedData *createConverterFromFile(const char* pkg, const char *fileName, UErrorCode * err)
+static UConverterSharedData *createConverterFromFile(UConverterLoadArgs *pArgs, UErrorCode * err)
 {
     UDataMemory *data;
     UConverterSharedData *sharedData;
 
+    UTRACE_ENTRY_OC(UTRACE_UCNV_LOAD);
+
     if (err == NULL || U_FAILURE (*err)) {
+        UTRACE_EXIT_STATUS(*err);
         return NULL;
     }
 
-    data = udata_openChoice(pkg, DATA_TYPE, fileName, isCnvAcceptable, NULL, err);
+    UTRACE_DATA2(UTRACE_OPEN_CLOSE, "load converter %s from package %s", pArgs->name, pArgs->pkg);
+
+    data = udata_openChoice(pArgs->pkg, DATA_TYPE, pArgs->name, isCnvAcceptable, NULL, err);
     if(U_FAILURE(*err))
     {
+        UTRACE_EXIT_STATUS(*err);
         return NULL;
     }
 
-    sharedData = ucnv_data_unFlattenClone(data, err);
+    sharedData = ucnv_data_unFlattenClone(pArgs, data, err);
     if(U_FAILURE(*err))
     {
         udata_close(data);
+        UTRACE_EXIT_STATUS(*err);
         return NULL;
     }
 
+    /*
+     * TODO Store pkg in a field in the shared data so that delta-only converters
+     * can load base converters from the same package.
+     * If the pkg name is longer than the field, then either do not load the converter
+     * in the first place, or just set the pkg field to "".
+     */
+
+    UTRACE_EXIT_PTR_STATUS(sharedData, *err);
     return sharedData;
 }
 
@@ -423,8 +448,13 @@
 static UBool
 ucnv_deleteSharedConverterData(UConverterSharedData * deadSharedData)
 {
-    if (deadSharedData->referenceCounter > 0)
+    UTRACE_ENTRY_OC(UTRACE_UCNV_UNLOAD);
+    UTRACE_DATA2(UTRACE_OPEN_CLOSE, "unload converter %s shared data %p", deadSharedData->staticData->name, deadSharedData);
+
+    if (deadSharedData->referenceCounter > 0) {
+        UTRACE_EXIT_VALUE((int32_t)FALSE);
         return FALSE;
+    }
 
     if (deadSharedData->impl->unload != NULL) {
         deadSharedData->impl->unload(deadSharedData);
@@ -456,42 +486,94 @@
 #endif
 
     uprv_free(deadSharedData);
-    
+
+    UTRACE_EXIT_VALUE((int32_t)TRUE);
     return TRUE;
 }
 
+/**
+ * Load a non-algorithmic converter.
+ * If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+UConverterSharedData *
+ucnv_load(UConverterLoadArgs *pArgs, UErrorCode *err) {
+    UConverterSharedData *mySharedConverterData;
+
+    if(err == NULL || U_FAILURE(*err)) {
+        return NULL;
+    }
+
+    if(pArgs->pkg != NULL && *pArgs->pkg != 0) {
+        /* application-provided converters are not currently cached */
+        return createConverterFromFile(pArgs, err);
+    }
+
+    mySharedConverterData = ucnv_getSharedConverterData(pArgs->name);
+    if (mySharedConverterData == NULL)
+    {
+        /*Not cached, we need to stream it in from file */
+        mySharedConverterData = createConverterFromFile(pArgs, err);
+        if (U_FAILURE (*err) || (mySharedConverterData == NULL))
+        {
+            return NULL;
+        }
+        else
+        {
+            /* share it with other library clients */
+            ucnv_shareConverterData(mySharedConverterData);
+        }
+    }
+    else
+    {
+        /* The data for this converter was already in the cache.            */
+        /* Update the reference counter on the shared data: one more client */
+        mySharedConverterData->referenceCounter++;
+    }
+
+    return mySharedConverterData;
+}
+
+/**
+ * Unload a non-algorithmic converter.
+ * The caller must ensure that sharedData->referenceCounter != ~0,
+ * and this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
 void
-ucnv_unloadSharedDataIfReady(UConverterSharedData *sharedData)
-{
-    umtx_lock(&cnvCacheMutex);
-    /*
-    Double checking doesn't work on some platforms.
-    Don't check referenceCounter outside of a mutex block.
-    */
-    if (sharedData->referenceCounter != ~0) {
+ucnv_unload(UConverterSharedData *sharedData) {
+    if(sharedData != NULL) {
         if (sharedData->referenceCounter > 0) {
             sharedData->referenceCounter--;
         }
-        
+    
         if((sharedData->referenceCounter <= 0)&&(sharedData->sharedDataCached == FALSE)) {
             ucnv_deleteSharedConverterData(sharedData);
         }
     }
-    umtx_unlock(&cnvCacheMutex);
 }
 
 void
-ucnv_incrementRefCount(UConverterSharedData *sharedData)
+ucnv_unloadSharedDataIfReady(UConverterSharedData *sharedData)
 {
-    umtx_lock(&cnvCacheMutex);
     /*
-    Double checking doesn't work on some platforms.
-    Don't check referenceCounter outside of a mutex block.
+    Checking whether it's an algorithmic converter is okay
+    in multithreaded applications because the value never changes.
+    Don't check referenceCounter for any other value.
     */
-    if (sharedData->referenceCounter != ~0) {
+    if(sharedData != NULL && sharedData->referenceCounter != ~0) {
+        umtx_lock(&cnvCacheMutex);
+        ucnv_unload(sharedData);
+        umtx_unlock(&cnvCacheMutex);
+    }
+}
+
+void
+ucnv_incrementRefCount(UConverterSharedData *sharedData)
+{
+    if(sharedData != NULL && sharedData->referenceCounter != ~0) {
+        umtx_lock(&cnvCacheMutex);
         sharedData->referenceCounter++;
+        umtx_unlock(&cnvCacheMutex);
     }
-    umtx_unlock(&cnvCacheMutex);
 }
 
 static void
@@ -575,54 +657,58 @@
  * -Call dataConverter initializer (Data=TRUE, Cached=TRUE)
  * -Call AlgorithmicConverter initializer (Data=FALSE, Cached=TRUE)
  */
-UConverter *
-ucnv_createConverter(UConverter *myUConverter, const char *converterName, UErrorCode * err)
-{
-    char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
-    const char *realName;
+UConverterSharedData *
+ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UErrorCode * err) {
+    UConverterLookupData stackLookup;
     UConverterSharedData *mySharedConverterData = NULL;
     UErrorCode internalErrorCode = U_ZERO_ERROR;
-    uint32_t options = 0;
-    if (U_FAILURE (*err))
+
+    if (U_FAILURE (*err)) {
         return NULL;
+    }
 
-    locale[0] = 0;
+    if(lookup == NULL) {
+        lookup = &stackLookup;
+    }
+
+    lookup->locale[0] = 0;
+    lookup->options = 0;
 
     /* In case "name" is NULL we want to open the default converter. */
     if (converterName == NULL) {
-        realName = ucnv_io_getDefaultConverterName();
-        if (realName == NULL) {
+        lookup->realName = ucnv_io_getDefaultConverterName();
+        if (lookup->realName == NULL) {
             *err = U_MISSING_RESOURCE_ERROR;
             return NULL;
         }
         /* the default converter name is already canonical */
     } else {
         /* separate the converter name from the options */
-        parseConverterOptions(converterName, cnvName, locale, &options, err);
+        parseConverterOptions(converterName, lookup->cnvName, lookup->locale, &lookup->options, err);
         if (U_FAILURE(*err)) {
             /* Very bad name used. */
             return NULL;
         }
 
         /* get the canonical converter name */
-        realName = ucnv_io_getConverterName(cnvName, &internalErrorCode);
-        if (U_FAILURE(internalErrorCode) || realName == NULL) {
+        lookup->realName = ucnv_io_getConverterName(lookup->cnvName, &internalErrorCode);
+        if (U_FAILURE(internalErrorCode) || lookup->realName == NULL) {
             /*
             * set the input name in case the converter was added
             * without updating the alias table, or when there is no alias table
             */
-            realName = cnvName;
+            lookup->realName = lookup->cnvName;
         }
     }
 
     /* separate the converter name from the options */
-    if(realName != cnvName) {
-        parseConverterOptions(realName, cnvName, locale, &options, err);
-        realName = cnvName;
+    if(lookup->realName != lookup->cnvName) {
+        parseConverterOptions(lookup->realName, lookup->cnvName, lookup->locale, &lookup->options, err);
+        lookup->realName = lookup->cnvName;
     }
     
     /* get the shared data for an algorithmic converter, if it is one */
-    mySharedConverterData = (UConverterSharedData *)getAlgorithmicTypeFromName(realName);
+    mySharedConverterData = (UConverterSharedData *)getAlgorithmicTypeFromName(lookup->realName);
     if (mySharedConverterData == NULL)
     {
         /* it is a data-based converter, get its shared data.               */
@@ -630,50 +716,57 @@
         /*   converter data cache, and adding new entries to the cache      */
         /*   to prevent other threads from modifying the cache during the   */
         /*   process.                                                       */
+        UConverterLoadArgs args={ 0 };
+
+        args.size=sizeof(UConverterLoadArgs);
+        args.nestedLoads=1;
+        args.options=lookup->options;
+        args.pkg=NULL;
+        args.name=lookup->realName;
+
         umtx_lock(&cnvCacheMutex);
-        mySharedConverterData = ucnv_getSharedConverterData(realName);
-        if (mySharedConverterData == NULL)
-        {
-            /*Not cached, we need to stream it in from file */
-            mySharedConverterData = createConverterFromFile(NULL, realName, err);
-            if (U_FAILURE (*err) || (mySharedConverterData == NULL))
-            {
-                umtx_unlock(&cnvCacheMutex);
-                return NULL;
-            }
-            else
-            {
-                /* share it with other library clients */
-                ucnv_shareConverterData(mySharedConverterData);
-            }
-        }
-        else
+        mySharedConverterData = ucnv_load(&args, err);
+        umtx_unlock(&cnvCacheMutex);
+        if (U_FAILURE (*err) || (mySharedConverterData == NULL))
         {
-            /* The data for this converter was already in the cache.            */
-            /* Update the reference counter on the shared data: one more client */
-            mySharedConverterData->referenceCounter++;
+            return NULL;
         }
-        umtx_unlock(&cnvCacheMutex);
     }
 
-    myUConverter = ucnv_createConverterFromSharedData(myUConverter, mySharedConverterData, realName, locale, options, err);
+    return mySharedConverterData;
+}
 
-    if (U_FAILURE(*err))
-    {
-        /*
-        Checking whether it's an algorithic converter is okay
-        in multithreaded applications because the value never changes.
-        Don't check referenceCounter for any other value.
-        */
-        if (mySharedConverterData->referenceCounter != ~0) {
-            umtx_lock(&cnvCacheMutex);
-            --mySharedConverterData->referenceCounter;
-            umtx_unlock(&cnvCacheMutex);
+UConverter *
+ucnv_createConverter(UConverter *myUConverter, const char *converterName, UErrorCode * err)
+{
+    UConverterLookupData stackLookup;
+    UConverterSharedData *mySharedConverterData;
+
+    UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN);
+
+    if(U_SUCCESS(*err)) {
+        UTRACE_DATA1(UTRACE_OPEN_CLOSE, "open converter %s", converterName);
+
+        mySharedConverterData = ucnv_loadSharedData(converterName, &stackLookup, err);
+
+        if(U_SUCCESS(*err)) {
+            myUConverter = ucnv_createConverterFromSharedData(
+                myUConverter, mySharedConverterData,
+                stackLookup.realName, stackLookup.locale, stackLookup.options,
+                err);
+
+            if(U_SUCCESS(*err)) {
+                UTRACE_EXIT_PTR_STATUS(myUConverter, *err);
+                return myUConverter;
+            } else {
+                ucnv_unloadSharedDataIfReady(mySharedConverterData);
+            }
         }
-        return NULL;
     }
 
-    return myUConverter;
+    /* exit with error */
+    UTRACE_EXIT_STATUS(*err);
+    return NULL;
 }
 
 UConverter *
@@ -681,11 +774,16 @@
                                 UConverterType type,
                                 const char *locale, uint32_t options,
                                 UErrorCode *err) {
+    UConverter *cnv;
     const UConverterSharedData *sharedData;
     UBool isAlgorithmicConverter;
 
+    UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN_ALGORITHMIC);
+    UTRACE_DATA1(UTRACE_OPEN_CLOSE, "open algorithmic converter type %d", (int32_t)type);
+
     if(type<0 || UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES<=type) {
         *err = U_ILLEGAL_ARGUMENT_ERROR;
+        UTRACE_EXIT_STATUS(U_ILLEGAL_ARGUMENT_ERROR);
         return NULL;
     }
 
@@ -696,47 +794,66 @@
     if (isAlgorithmicConverter) {
         /* not a valid type, or not an algorithmic converter */
         *err = U_ILLEGAL_ARGUMENT_ERROR;
+        UTRACE_EXIT_STATUS(U_ILLEGAL_ARGUMENT_ERROR);
         return NULL;
     }
 
-    return ucnv_createConverterFromSharedData(myUConverter, (UConverterSharedData *)sharedData, "",
+    cnv = ucnv_createConverterFromSharedData(myUConverter, (UConverterSharedData *)sharedData, "",
                 locale != NULL ? locale : "", options, err);
+
+    UTRACE_EXIT_PTR_STATUS(cnv, *err);
+    return cnv;
 }
 
 UConverter*
 ucnv_createConverterFromPackage(const char *packageName, const char *converterName, UErrorCode * err)
 {
     char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
-    uint32_t options=0;
     UConverter *myUConverter;
-    UConverterSharedData *mySharedConverterData = NULL;
+    UConverterSharedData *mySharedConverterData;
+
+    UConverterLoadArgs args={ 0 };
+
+    UTRACE_ENTRY_OC(UTRACE_UCNV_OPEN_PACKAGE);
 
     if(U_FAILURE(*err)) {
+        UTRACE_EXIT_STATUS(*err);
         return NULL; 
     }
 
-    /* first, get the options out of the convertername string */
-    parseConverterOptions(converterName, cnvName, locale, &options, err);
+    UTRACE_DATA2(UTRACE_OPEN_CLOSE, "open converter %s from package %s", converterName, packageName);
+
+    args.size=sizeof(UConverterLoadArgs);
+    args.nestedLoads=1;
+    args.pkg=packageName;
+
+    /* first, get the options out of the converterName string */
+    parseConverterOptions(converterName, cnvName, locale, &args.options, err);
     if (U_FAILURE(*err)) {
         /* Very bad name used. */
+        UTRACE_EXIT_STATUS(*err);
         return NULL;
     }
+    args.name=cnvName;
     
     /* open the data, unflatten the shared structure */
-    mySharedConverterData = createConverterFromFile(packageName, cnvName, err);
+    mySharedConverterData = createConverterFromFile(&args, err);
     
     if (U_FAILURE(*err)) {
+        UTRACE_EXIT_STATUS(*err);
         return NULL; 
     }
 
     /* create the actual converter */
-    myUConverter = ucnv_createConverterFromSharedData(NULL, mySharedConverterData, cnvName, locale, options, err);
+    myUConverter = ucnv_createConverterFromSharedData(NULL, mySharedConverterData, cnvName, locale, args.options, err);
     
     if (U_FAILURE(*err)) {
         ucnv_close(myUConverter);
+        UTRACE_EXIT_STATUS(*err);
         return NULL; 
     }
     
+    UTRACE_EXIT_PTR_STATUS(myUConverter, *err);
     return myUConverter;
 }
 
@@ -768,13 +885,14 @@
     myUConverter->isExtraLocal = FALSE;
     myUConverter->sharedData = mySharedConverterData;
     myUConverter->options = options;
-    myUConverter->mode = UCNV_SI;
     myUConverter->fromCharErrorBehaviour = (UConverterToUCallback) UCNV_TO_U_CALLBACK_SUBSTITUTE;
     myUConverter->fromUCharErrorBehaviour = (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE;
     myUConverter->toUnicodeStatus = myUConverter->sharedData->toUnicodeStatus;
+    myUConverter->maxBytesPerUChar = myUConverter->sharedData->staticData->maxBytesPerChar;
     myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1;
     myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen;
     uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, myUConverter->subCharLen);
+    myUConverter->preFromUFirstCP = U_SENTINEL;
 
     if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) {
         myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err);
@@ -793,10 +911,13 @@
 ucnv_flushCache ()
 {
     UConverterSharedData *mySharedData = NULL;
-    int32_t pos = -1;
+    int32_t pos;
     int32_t tableDeletedNum = 0;
     const UHashElement *e;
     UErrorCode status = U_ILLEGAL_ARGUMENT_ERROR;
+    int32_t i, remaining;
+
+    UTRACE_ENTRY_OC(UTRACE_UCNV_FLUSH_CACHE);
 
     /* Close the default converter without creating a new one so that everything will be flushed. */
     ucnv_close(u_getDefaultConverter(&status));
@@ -804,8 +925,10 @@
     /*if shared data hasn't even been lazy evaluated yet
     * return 0
     */
-    if (SHARED_DATA_HASHTABLE == NULL)
+    if (SHARED_DATA_HASHTABLE == NULL) {
+        UTRACE_EXIT_VALUE((int32_t)0);
         return 0;
+    }
 
     /*creates an enumeration to iterate through every element in the
     * table
@@ -819,25 +942,354 @@
     *                   is protected by cnvCacheMutex.
     */
     umtx_lock(&cnvCacheMutex);
-    while ((e = uhash_nextElement (SHARED_DATA_HASHTABLE, &pos)) != NULL)
-    {
-        mySharedData = (UConverterSharedData *) e->value.pointer;
-        /*deletes only if reference counter == 0 */
-        if (mySharedData->referenceCounter == 0)
+    /*
+     * double loop: A delta/extension-only converter has a pointer to its base table's
+     * shared data; the first iteration of the outer loop may see the delta converter
+     * before the base converter, and unloading the delta converter may get the base
+     * converter's reference counter down to 0.
+     */
+    i = 0;
+    do {
+        remaining = 0;
+        pos = -1;
+        while ((e = uhash_nextElement (SHARED_DATA_HASHTABLE, &pos)) != NULL)
         {
-            tableDeletedNum++;
+            mySharedData = (UConverterSharedData *) e->value.pointer;
+            /*deletes only if reference counter == 0 */
+            if (mySharedData->referenceCounter == 0)
+            {
+                tableDeletedNum++;
             
-            UCNV_DEBUG_LOG("del",mySharedData->staticData->name,mySharedData);
+                UCNV_DEBUG_LOG("del",mySharedData->staticData->name,mySharedData);
             
-            uhash_removeElement(SHARED_DATA_HASHTABLE, e);
-            mySharedData->sharedDataCached = FALSE;
-            ucnv_deleteSharedConverterData (mySharedData);
+                uhash_removeElement(SHARED_DATA_HASHTABLE, e);
+                mySharedData->sharedDataCached = FALSE;
+                ucnv_deleteSharedConverterData (mySharedData);
+            } else {
+                ++remaining;
+            }
         }
-    }
+    } while(++i == 1 && remaining > 0);
     umtx_unlock(&cnvCacheMutex);
 
+    UTRACE_DATA1(UTRACE_INFO, "ucnv_flushCache() exits with %d converters remaining", remaining);
+
     ucnv_io_flushAvailableConverterCache();
 
+    UTRACE_EXIT_VALUE(tableDeletedNum);
     return tableDeletedNum;
 }
 
+/* data swapping ------------------------------------------------------------ */
+
+/* most of this might belong more properly into ucnvmbcs.c, but that is so large */
+
+#if !UCONFIG_NO_LEGACY_CONVERSION
+
+/*
+ * Swap an ICU .cnv conversion table as directed by ds; only UCNV_MBCS is supported.
+ * If length<0 (preflighting), this function swaps nothing and only computes the size.
+ * Returns the byte size of data header + static data + MBCS data, or 0 with *pErrorCode
+ * set (U_UNSUPPORTED_ERROR: unrecognized format; U_INDEX_OUTOFBOUNDS_ERROR: too few bytes).
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swap(const UDataSwapper *ds,
+          const void *inData, int32_t length, void *outData,
+          UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+    const uint8_t *inBytes;
+    uint8_t *outBytes;
+    uint32_t offset, count, staticDataSize;
+    int32_t size;
+    const UConverterStaticData *inStaticData;
+    UConverterStaticData *outStaticData;
+    const _MBCSHeader *inMBCSHeader;
+    _MBCSHeader *outMBCSHeader;
+    _MBCSHeader mbcsHeader;
+    uint8_t outputType;
+    const int32_t *inExtIndexes;
+    int32_t extOffset;
+
+    /* udata_swapDataHeader checks the arguments */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x63 &&   /* dataFormat="cnvt" */
+        pInfo->dataFormat[1]==0x6e &&
+        pInfo->dataFormat[2]==0x76 &&
+        pInfo->dataFormat[3]==0x74 &&
+        pInfo->formatVersion[0]==6 &&
+        pInfo->formatVersion[1]>=2
+    )) {
+        udata_printError(ds, "ucnv_swap(): data format %02x.%02x.%02x.%02x (format version %02x.%02x) is not recognized as an ICU .cnv conversion table\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0], pInfo->formatVersion[1]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    inBytes=(const uint8_t *)inData+headerSize;
+    outBytes=(uint8_t *)outData+headerSize;
+
+    /* read the initial UConverterStaticData structure after the UDataInfo header */
+    inStaticData=(const UConverterStaticData *)inBytes;
+    outStaticData=(UConverterStaticData *)outBytes;
+
+    if(length<0) {
+        staticDataSize=ds->readUInt32(inStaticData->structSize);
+    } else {
+        length-=headerSize;
+        if( length<(int32_t)sizeof(UConverterStaticData) ||
+            (uint32_t)length<(staticDataSize=ds->readUInt32(inStaticData->structSize))
+        ) {
+            udata_printError(ds, "ucnv_swap(): too few bytes (%d after header) for an ICU .cnv conversion table\n",
+                             length);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+    }
+
+    if(length>=0) {
+        /* swap the static data */
+        if(inStaticData!=outStaticData) {
+            uprv_memcpy(outStaticData, inStaticData, staticDataSize);
+        }
+
+        ds->swapArray32(ds, &inStaticData->structSize, 4,
+                           &outStaticData->structSize, pErrorCode);
+        ds->swapArray32(ds, &inStaticData->codepage, 4,
+                           &outStaticData->codepage, pErrorCode);
+
+        ds->swapInvChars(ds, inStaticData->name, (int32_t)uprv_strlen(inStaticData->name),
+                            outStaticData->name, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "ucnv_swap(): error swapping converter name - %s\n",
+                             u_errorName(*pErrorCode));
+            return 0;
+        }
+    }
+
+    inBytes+=staticDataSize;
+    outBytes+=staticDataSize;
+    if(length>=0) {
+        length-=(int32_t)staticDataSize;
+    }
+
+    /* check for supported conversionType values */
+    if(inStaticData->conversionType==UCNV_MBCS) {
+        /* swap MBCS data */
+        inMBCSHeader=(const _MBCSHeader *)inBytes;
+        outMBCSHeader=(_MBCSHeader *)outBytes;
+
+        if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) { /* need 4.x with x>=1 */
+            udata_printError(ds, "ucnv_swap(): unsupported _MBCSHeader.version %d.%d\n",
+                             inMBCSHeader->version[0], inMBCSHeader->version[1]);
+            *pErrorCode=U_UNSUPPORTED_ERROR;
+            return 0;
+        }
+
+        uprv_memcpy(mbcsHeader.version, inMBCSHeader->version, 4);
+        mbcsHeader.countStates=         ds->readUInt32(inMBCSHeader->countStates);
+        mbcsHeader.countToUFallbacks=   ds->readUInt32(inMBCSHeader->countToUFallbacks);
+        mbcsHeader.offsetToUCodeUnits=  ds->readUInt32(inMBCSHeader->offsetToUCodeUnits);
+        mbcsHeader.offsetFromUTable=    ds->readUInt32(inMBCSHeader->offsetFromUTable);
+        mbcsHeader.offsetFromUBytes=    ds->readUInt32(inMBCSHeader->offsetFromUBytes);
+        mbcsHeader.flags=               ds->readUInt32(inMBCSHeader->flags);
+        mbcsHeader.fromUBytesLength=    ds->readUInt32(inMBCSHeader->fromUBytesLength);
+
+        extOffset=(int32_t)(mbcsHeader.flags>>8); /* shift while unsigned, then convert */
+        outputType=(uint8_t)mbcsHeader.flags;
+
+        /* make sure that the output type is known */
+        switch(outputType) {
+        case MBCS_OUTPUT_1:
+        case MBCS_OUTPUT_2:
+        case MBCS_OUTPUT_3:
+        case MBCS_OUTPUT_4:
+        case MBCS_OUTPUT_3_EUC:
+        case MBCS_OUTPUT_4_EUC:
+        case MBCS_OUTPUT_2_SISO:
+        case MBCS_OUTPUT_EXT_ONLY:
+            /* OK */
+            break;
+        default:
+            udata_printError(ds, "ucnv_swap(): unsupported MBCS output type 0x%x\n",
+                             outputType);
+            *pErrorCode=U_UNSUPPORTED_ERROR;
+            return 0;
+        }
+
+        /* calculate the length of the MBCS data */
+        if(extOffset==0) {
+            size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsHeader.fromUBytesLength);
+            /* avoid compiler warnings - not otherwise necessary, and the value does not matter */
+            inExtIndexes=NULL;
+        } else {
+            /* there is extension data after the base data, see ucnv_ext.h */
+            if(length>=0 && length<(extOffset+UCNV_EXT_INDEXES_MIN_LENGTH*4)) {
+                udata_printError(ds, "ucnv_swap(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table with extension data\n",
+                                 length);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+
+            inExtIndexes=(const int32_t *)(inBytes+extOffset);
+            size=extOffset+udata_readInt32(ds, inExtIndexes[UCNV_EXT_SIZE]);
+        }
+
+        if(length>=0) {
+            if(length<size) {
+                udata_printError(ds, "ucnv_swap(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table\n",
+                                 length);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
+
+            /* copy the data for inaccessible bytes */
+            if(inBytes!=outBytes) {
+                uprv_memcpy(outBytes, inBytes, size);
+            }
+
+            /* swap the _MBCSHeader */
+            ds->swapArray32(ds, &inMBCSHeader->countStates, 7*4,
+                               &outMBCSHeader->countStates, pErrorCode);
+
+            if(outputType==MBCS_OUTPUT_EXT_ONLY) {
+                /*
+                 * extension-only file,
+                 * contains a base name instead of normal base table data
+                 */
+
+                /* swap the base name, between the header and the extension data */
+                ds->swapInvChars(ds, inMBCSHeader+1, (int32_t)uprv_strlen((const char *)(inMBCSHeader+1)),
+                                    outMBCSHeader+1, pErrorCode);
+            } else {
+                /* normal file with base table data */
+
+                /* swap the state table, 1kB per state */
+                ds->swapArray32(ds, inMBCSHeader+1, (int32_t)(mbcsHeader.countStates*1024),
+                                   outMBCSHeader+1, pErrorCode);
+
+                /* swap the toUFallbacks[] */
+                offset=sizeof(_MBCSHeader)+mbcsHeader.countStates*1024;
+                ds->swapArray32(ds, inBytes+offset, (int32_t)(mbcsHeader.countToUFallbacks*8),
+                                   outBytes+offset, pErrorCode);
+
+                /* swap the unicodeCodeUnits[] */
+                offset=mbcsHeader.offsetToUCodeUnits;
+                count=mbcsHeader.offsetFromUTable-offset;
+                ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+                                   outBytes+offset, pErrorCode);
+
+                /* offset to the stage 1 table, independent of the outputType */
+                offset=mbcsHeader.offsetFromUTable;
+
+                if(outputType==MBCS_OUTPUT_1) {
+                    /* SBCS: swap the fromU tables, all 16 bits wide */
+                    count=(mbcsHeader.offsetFromUBytes-offset)+mbcsHeader.fromUBytesLength;
+                    ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+                                       outBytes+offset, pErrorCode);
+                } else {
+                    /* otherwise: swap the stage tables separately */
+
+                    /* stage 1 table: uint16_t[0x440 or 0x40] */
+                    if(inStaticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
+                        count=0x440*2; /* for all of Unicode */
+                    } else {
+                        count=0x40*2; /* only BMP */
+                    }
+                    ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+                                       outBytes+offset, pErrorCode);
+
+                    /* stage 2 table: uint32_t[] */
+                    offset+=count;
+                    count=mbcsHeader.offsetFromUBytes-offset;
+                    ds->swapArray32(ds, inBytes+offset, (int32_t)count,
+                                       outBytes+offset, pErrorCode);
+
+                    /* stage 3/result bytes: sometimes uint16_t[] or uint32_t[] */
+                    offset=mbcsHeader.offsetFromUBytes;
+                    count=mbcsHeader.fromUBytesLength;
+                    switch(outputType) {
+                    case MBCS_OUTPUT_2:
+                    case MBCS_OUTPUT_3_EUC:
+                    case MBCS_OUTPUT_2_SISO:
+                        ds->swapArray16(ds, inBytes+offset, (int32_t)count,
+                                           outBytes+offset, pErrorCode);
+                        break;
+                    case MBCS_OUTPUT_4:
+                        ds->swapArray32(ds, inBytes+offset, (int32_t)count,
+                                           outBytes+offset, pErrorCode);
+                        break;
+                    default:
+                        /* just uint8_t[], nothing to swap */
+                        break;
+                    }
+                }
+            }
+
+            if(extOffset!=0) {
+                /* swap the extension data */
+                inBytes+=extOffset;
+                outBytes+=extOffset;
+
+                /* swap toUTable[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_LENGTH]);
+                ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+                /* swap toUUChars[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_UCHARS_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_TO_U_UCHARS_LENGTH]);
+                ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+                /* swap fromUTableUChars[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_UCHARS_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_LENGTH]);
+                ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+                /* swap fromUTableValues[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_VALUES_INDEX]);
+                /* same length as for fromUTableUChars[] */
+                ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+                /* no need to swap fromUBytes[] */
+
+                /* swap fromUStage12[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]);
+                ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+                /* swap fromUStage3[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]);
+                ds->swapArray16(ds, inBytes+offset, length*2, outBytes+offset, pErrorCode);
+
+                /* swap fromUStage3b[] */
+                offset=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]);
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]);
+                ds->swapArray32(ds, inBytes+offset, length*4, outBytes+offset, pErrorCode);
+
+                /* swap indexes[] */
+                length=udata_readInt32(ds, inExtIndexes[UCNV_EXT_INDEXES_LENGTH]);
+                ds->swapArray32(ds, inBytes, length*4, outBytes, pErrorCode);
+            }
+        }
+    } else {
+        udata_printError(ds, "ucnv_swap(): unknown conversionType=%d!=UCNV_MBCS\n",
+                         inStaticData->conversionType);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    return headerSize+(int32_t)staticDataSize+size;
+}
+
+#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

Index: ucnv_bld.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_bld.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnv_bld.h	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnv_bld.h	6 Apr 2004 10:08:00 -0000	1.5
@@ -20,13 +20,20 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_err.h"
-
+#include "ucnv_cnv.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "udataswp.h"
 
 /* size of the overflow buffers in UConverter, enough for escaping callbacks */
 #define UCNV_ERROR_BUFFER_LENGTH 32
 
+/* at most 4 bytes per substitution character (part of .cnv file format! see UConverterStaticData) */
 #define UCNV_MAX_SUBCHAR_LEN 4
 
+/* at most 8 bytes per character in toUBytes[] (UTF-8 uses up to 6) */
+#define UCNV_MAX_CHAR_LEN 8
+
 /* converter options bits */
 #define UCNV_OPTION_VERSION     0xf
 #define UCNV_OPTION_SWAP_LFNL   0x10
@@ -37,7 +44,10 @@
                  work.
               */
 
-union UConverterTable;
+union UConverterTable {
+    UConverterMBCSTable mbcs;
+};
+
 typedef union UConverterTable UConverterTable;
 
 struct UConverterImpl;
@@ -59,7 +69,7 @@
     int8_t conversionType;          /* +69: 1 conversion type */
 
     int8_t minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
-    int8_t maxBytesPerChar;         /* +71: 1 Maximum # bytes per char in this codepage */
+    int8_t maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */
 
     uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* +72: 4  [note:  4 and 8 byte boundary] */
     int8_t subCharLen;              /* +76: 1 */
@@ -81,7 +91,7 @@
     uint32_t referenceCounter;      /* used to count number of clients, 0xffffffff for static SharedData */
 
     const void *dataMemory;         /* from udata_openChoice() - for cleanup */
-    UConverterTable *table;         /* Pointer to conversion data */
+    void *table;                    /* Unused. This used to be a UConverterTable - Pointer to conversion data - see mbcs below */
 
     const UConverterStaticData *staticData; /* pointer to the static (non changing) data. */
 
@@ -92,9 +102,23 @@
 
     /*initial values of some members of the mutable part of object */
     uint32_t toUnicodeStatus;
-};
 
-typedef struct UConverterSharedData UConverterSharedData;
+    /*
+     * Shared data structures currently come in two flavors:
+     * - readonly for built-in algorithmic converters
+     * - allocated for MBCS, with a pointer to an allocated UConverterTable
+     *   which always has a UConverterMBCSTable
+     *
+     * To eliminate one allocation, I am making the UConverterMBCSTable
+     * a member of the shared data. It is the last member so that static
+     * definitions of UConverterSharedData work as before.
+     * The table field above also remains to avoid updating all static
+     * definitions, but is now unused.
+     *
+     * markus 2003-nov-07
+     */
+    UConverterMBCSTable mbcs;
+};
 
 /* Defines a UConverter, the lightweight mutable part the user sees */
 
@@ -112,7 +136,7 @@
                                      UErrorCode *);
     /*
      * Error function pointer called when conversion issues
-     * occur during a T_UConverter_toUnicode call
+     * occur during a ucnv_toUnicode call
      */
     void (U_EXPORT2 *fromCharErrorBehaviour) (const void *context,
                                     UConverterToUnicodeArgs *args,
@@ -140,11 +164,29 @@
 
     UBool  useFallback;
     int8_t toULength;                   /* number of bytes in toUBytes */
-    uint8_t toUBytes[7];                /* more "toU status"; keeps the bytes of the current character */
+    uint8_t toUBytes[UCNV_MAX_CHAR_LEN-1];/* more "toU status"; keeps the bytes of the current character */
     uint32_t toUnicodeStatus;           /* Used to internalize stream status information */
     int32_t mode;
     uint32_t fromUnicodeStatus;
-    UChar    fromUSurrogateLead;        /* similar to toUBytes; keeps the lead surrogate of the current character */
+
+    /*
+     * More fromUnicode() status. Serves 3 purposes:
+     * - keeps a lead surrogate between buffers (similar to toUBytes[])
+     * - keeps a lead surrogate at the end of the stream,
+     *   which the framework handles as truncated input
+     * - if the fromUnicode() implementation returns to the framework
+     *   (ucnv.c ucnv_fromUnicode()), then the framework calls the callback
+     *   for this code point
+     */
+    UChar32 fromUChar32;
+
+    /*
+     * value for ucnv_getMaxCharSize()
+     *
+     * usually simply copied from the static data, but ucnvmbcs.c modifies
+     * the value depending on the converter type and options
+     */
+    int8_t maxBytesPerUChar;
 
     int8_t subCharLen;                  /* length of the codepage specific character sequence */
     int8_t invalidCharLength;
@@ -154,26 +196,50 @@
     int8_t UCharErrorBufferLength;      /* number of valid UChars in charErrorBuffer */
 
     uint8_t subChar1;                                   /* single-byte substitution character if different from subChar */
+    UBool useSubChar1;
     uint8_t subChar[UCNV_MAX_SUBCHAR_LEN];              /* codepage specific character sequence */
-    char invalidCharBuffer[UCNV_MAX_SUBCHAR_LEN];       /* bytes from last error/callback situation */
+    char invalidCharBuffer[UCNV_MAX_CHAR_LEN];          /* bytes from last error/callback situation */
     uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH];  /* codepage output from Error functions */
 
-    UChar invalidUCharBuffer[3];                        /* UChars from last error/callback situation */
+    UChar invalidUCharBuffer[U16_MAX_LENGTH];           /* UChars from last error/callback situation */
     UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH];   /* unicode output from Error functions */
 
+    /* fields for conversion extension */
+
+    /* store previous UChars/chars to continue partial matches */
+    UChar32 preFromUFirstCP;                /* >=0: partial match */
+    UChar preFromU[UCNV_EXT_MAX_UCHARS];
+    char preToU[UCNV_EXT_MAX_BYTES];
+    int8_t preFromULength, preToULength;    /* negative: replay */
+    int8_t preToUFirstLength;               /* length of first character */
 };
 
 U_CDECL_END /* end of UConverter */
 
-typedef struct
-  {
-    UConverter *OptGrpConverter[0x20];    /* Converter per Opt. grp. */
-    uint8_t    OptGroup;                  /* default Opt. grp. for this LMBCS session */
-    uint8_t    localeConverterIndex;      /* reasonable locale match for index */
+#define CONVERTER_FILE_EXTENSION ".cnv"
 
-  }
-UConverterDataLMBCS;
+/**
+ * Load a non-algorithmic converter.
+ * If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+UConverterSharedData *
+ucnv_load(UConverterLoadArgs *pArgs, UErrorCode *err);
 
-#define CONVERTER_FILE_EXTENSION ".cnv"
+/**
+ * Unload a non-algorithmic converter.
+ * The caller must ensure that sharedData->referenceCounter != ~0
+ * and this function must be called inside umtx_lock(&cnvCacheMutex).
+ */
+void
+ucnv_unload(UConverterSharedData *sharedData);
+
+/**
+ * Swap ICU .cnv conversion tables. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swap(const UDataSwapper *ds,
+          const void *inData, int32_t length, void *outData,
+          UErrorCode *pErrorCode);
 
 #endif /* _UCNV_BLD */

Index: ucnv_cb.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_cb.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnv_cb.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnv_cb.c	6 Apr 2004 10:08:01 -0000	1.5
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2000-2001, International Business Machines
+*   Copyright (C) 2000-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
  *  ucnv_cb.c:
@@ -35,50 +35,16 @@
                        int32_t offsetIndex,
                        UErrorCode * err)
 {
-    int32_t togo;
-    int8_t toerr;
-    int32_t i;
-
-    if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
-    {
-        uprv_memcpy(args->target, source, length);
-        args->target += length;
-        if(args->offsets) /* set all the offsets to the same # */
-        {
-            for(i=0;i<length;i++)
-            {
-                *(args->offsets++) = offsetIndex;
-            }
-        }
+    if(U_FAILURE(*err)) {
+        return;
     }
-    else
-    {
-        togo = (int32_t)(args->targetLimit - args->target);
-
-        uprv_memcpy(args->target, source, togo);
-        args->target += togo;
 
-        if(args->offsets)
-        {
-            for(i=0;i<togo;i++)
-            {
-                *(args->offsets++) = offsetIndex;
-            }
-        }
-
-        /* Now, copy the remainder into the errbuff */
-        source += togo;
-        toerr = (int8_t)(length - togo);
-
-        uprv_memcpy(args->converter->charErrorBuffer +
-            args->converter->charErrorBufferLength,
-            source,
-            toerr * sizeof(source[0]));
-        args->converter->charErrorBufferLength += toerr;
-
-        *err = U_BUFFER_OVERFLOW_ERROR;
-
-    }
+    ucnv_fromUWriteBytes(
+        args->converter,
+        source, length,
+        &args->target, args->targetLimit,
+        &args->offsets, offsetIndex,
+        err);
 }
 
 U_CAPI void  U_EXPORT2
@@ -232,55 +198,16 @@
                             int32_t offsetIndex,
                             UErrorCode * err)
 {
-    int32_t togo;
-    int8_t toerr;
-    int32_t i;
-
-    if(U_FAILURE(*err))
-    {
+    if(U_FAILURE(*err)) {
         return;
     }
 
-
-    if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
-    {
-        uprv_memcpy(args->target, source, length * sizeof(args->target[0]) );
-        args->target += length;
-        if(args->offsets) /* set all the offsets to the same # */
-        {
-            for(i=0;i<length;i++)
-            {
-                *(args->offsets++) = offsetIndex;
-            }
-        }
-    }
-    else
-    {
-        togo = (int32_t)(args->targetLimit - args->target);
-
-        uprv_memcpy(args->target, source, togo * sizeof(args->target[0])  );
-        args->target += togo;
-
-        if(args->offsets)
-        {
-            for(i=0;i<togo;i++)
-            {
-                *(args->offsets++) = offsetIndex;
-            }
-        }
-
-        /* Now, copy the remainder into the errbuff */
-        source += togo;
-        toerr = (int8_t)(length - togo);
-
-        uprv_memcpy(args->converter->UCharErrorBuffer +
-                    args->converter->UCharErrorBufferLength,
-                    source,
-                    toerr * sizeof(source[0]));
-        args->converter->UCharErrorBufferLength += toerr;
-
-        *err = U_BUFFER_OVERFLOW_ERROR;
-    }
+    ucnv_toUWriteUChars(
+        args->converter,
+        source, length,
+        &args->target, args->targetLimit,
+        &args->offsets, offsetIndex,
+        err);
 }
 
 U_CAPI void  U_EXPORT2

Index: ucnv_cnv.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_cnv.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnv_cnv.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnv_cnv.c	6 Apr 2004 10:08:01 -0000	1.4
@@ -20,240 +20,150 @@
 #include "unicode/ucnv.h"
 #include "unicode/uset.h"
 #include "ucnv_cnv.h"
+#include "ucnv_bld.h"
 #include "cmemory.h"
 
-/*Empties the internal unicode output buffer */
-void  ucnv_flushInternalUnicodeBuffer (UConverter * _this,
-                                  UChar * myTarget,
-                                  int32_t * myTargetIndex,
-                                  int32_t targetLength,
-                                  int32_t** offsets,
-                                  UErrorCode * err)
-{
-    int32_t myUCharErrorBufferLength = _this->UCharErrorBufferLength;
-    
-    if (myUCharErrorBufferLength <= targetLength)
-    {
-        /*we have enough space
-        *So we just copy the whole Error Buffer in to the output stream
-        */
-        uprv_memcpy (myTarget,
-            _this->UCharErrorBuffer,
-            sizeof (UChar) * myUCharErrorBufferLength);
-        if (offsets) 
-        {
-            int32_t i=0;
-            for (i=0; i<myUCharErrorBufferLength;i++) (*offsets)[i] = -1; 
-            *offsets += myUCharErrorBufferLength;
-        }
-        *myTargetIndex += myUCharErrorBufferLength;
-        _this->UCharErrorBufferLength = 0;
-    }
-    else
-    {
-        /* We don't have enough space so we copy as much as we can
-        * on the output stream and update the object
-        * by updating the internal buffer*/
-        uprv_memcpy (myTarget, _this->UCharErrorBuffer, sizeof (UChar) * targetLength);
-        if (offsets) 
-        {
-            int32_t i=0;
-            for (i=0; i< targetLength;i++) (*offsets)[i] = -1; 
-            *offsets += targetLength;
-        }
-        uprv_memmove (_this->UCharErrorBuffer,
-                    _this->UCharErrorBuffer + targetLength,
-                    sizeof (UChar) * (myUCharErrorBufferLength - targetLength));
-        _this->UCharErrorBufferLength -= (int8_t) targetLength;
-        *myTargetIndex = targetLength;
-        *err = U_BUFFER_OVERFLOW_ERROR;
-    }
+U_CFUNC void
+ucnv_getCompleteUnicodeSet(const UConverter *cnv,
+                   USet *set,
+                   UConverterUnicodeSet which,
+                   UErrorCode *pErrorCode) {
+    uset_addRange(set, 0, 0x10ffff);
 }
 
-/*Empties the internal codepage output buffer */
-void  ucnv_flushInternalCharBuffer (UConverter * _this,
-                               char *myTarget,
-                               int32_t * myTargetIndex,
-                               int32_t targetLength,
-                               int32_t** offsets,
-                               UErrorCode * err)
-{
-    int32_t myCharErrorBufferLength = _this->charErrorBufferLength;
-    
-    /*we have enough space */
-    if (myCharErrorBufferLength <= targetLength)
-    {
-        uprv_memcpy (myTarget, _this->charErrorBuffer, myCharErrorBufferLength);
-        if (offsets) 
-        {
-            int32_t i=0;
-            for (i=0; i<myCharErrorBufferLength;i++) (*offsets)[i] = -1; 
-            *offsets += myCharErrorBufferLength;
-        }
-        
-        *myTargetIndex += myCharErrorBufferLength;
-        _this->charErrorBufferLength = 0;
-    }
-    else
-    {
-        /* We don't have enough space so we copy as much as we can
-        * on the output stream and update the object
-        */
-        uprv_memcpy (myTarget, _this->charErrorBuffer, targetLength);
-        if (offsets) 
-        {
-            int32_t i=0;
-            for (i=0; i< targetLength;i++) (*offsets)[i] = -1; 
-            *offsets += targetLength;
-        }
-        uprv_memmove (_this->charErrorBuffer,
-            _this->charErrorBuffer + targetLength,
-            (myCharErrorBufferLength - targetLength));
-        _this->charErrorBufferLength -= (int8_t) targetLength;
-        *myTargetIndex = targetLength;
-        *err = U_BUFFER_OVERFLOW_ERROR;
-    }
+U_CFUNC void
+ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
+                               USet *set,
+                               UConverterUnicodeSet which,
+                               UErrorCode *pErrorCode) {
+    uset_addRange(set, 0, 0xd7ff);
+    uset_addRange(set, 0xe000, 0x10ffff);
 }
 
-/**
- * This function is useful for implementations of getNextUChar().
- * After a call to a callback function or to toUnicode(), an output buffer
- * begins with a Unicode code point that needs to be returned as UChar32,
- * and all following code units must be prepended to the - potentially
- * prefilled - overflow buffer in the UConverter.
- * The buffer should be at least of capacity UTF_MAX_CHAR_LENGTH so that a
- * complete UChar32's UChars fit into it.
- *
- * @param cnv    The converter that will get remaining UChars copied to its overflow area.
- * @param buffer An array of UChars that was passed into a callback function
- *               or a toUnicode() function.
- * @param length The number of code units (UChars) that are actually in the buffer.
- *               This must be >0.
- * @return The code point from the first UChars in the buffer.
- */
-U_CFUNC UChar32
-ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length) {
-    UChar32 c;
-    int32_t i;
+U_CFUNC void
+ucnv_fromUWriteBytes(UConverter *cnv,
+                     const char *bytes, int32_t length,
+                     char **target, const char *targetLimit,
+                     int32_t **offsets,
+                     int32_t sourceIndex,
+                     UErrorCode *pErrorCode) {
+    char *t=*target;
+    int32_t *o;
 
-    if(length<=0) {
-        return 0xffff;
+    /* write bytes */
+    if(offsets==NULL || (o=*offsets)==NULL) {
+        while(length>0 && t<targetLimit) {
+            *t++=*bytes++;
+            --length;
+        }
+    } else {
+        /* output with offsets */
+        while(length>0 && t<targetLimit) {
+            *t++=*bytes++;
+            *o++=sourceIndex;
+            --length;
+        }
+        *offsets=o;
     }
+    *target=t;
 
-    /* get the first code point in the buffer */
-    i=0;
-    UTF_NEXT_CHAR(buffer, i, length, c);
-    if(i<length) {
-        /* there are UChars left in the buffer that need to go into the overflow buffer */
-        UChar *overflow=cnv->UCharErrorBuffer;
-        int32_t j=cnv->UCharErrorBufferLength;
-
-        if(j>0) {
-            /* move the overflow buffer contents to make room for the extra UChars */
-            int32_t k;
-
-            cnv->UCharErrorBufferLength=(int8_t)(k=(length-i)+j);
+    /* write overflow */
+    if(length>0) {
+        if(cnv!=NULL) {
+            t=(char *)cnv->charErrorBuffer;
+            cnv->charErrorBufferLength=(int8_t)length;
             do {
-                overflow[--k]=overflow[--j];
-            } while(j>0);
-        } else {
-            cnv->UCharErrorBufferLength=(int8_t)(length-i);
+                *t++=(uint8_t)*bytes++;
+            } while(--length>0);
         }
-
-        /* copy the remaining UChars to the beginning of the overflow buffer */
-        do {
-            overflow[j++]=buffer[i++];
-        } while(i<length);
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     }
-    return c;
 }
 
-/* update target offsets after a callback call */
-U_CFUNC int32_t *
-ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex) {
-    if(offsets!=NULL) {
-        if(sourceIndex>=0) {
-            /* add the sourceIndex to the relative offsets that the callback wrote */
-            while(length>0) {
-                *offsets+=sourceIndex;
-                ++offsets;
-                --length;
-            }
-        } else {
-            /* sourceIndex==-1, set -1 offsets */
-            while(length>0) {
-                *offsets=-1;
-                ++offsets;
-                --length;
-            }
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+                    const UChar *uchars, int32_t length,
+                    UChar **target, const UChar *targetLimit,
+                    int32_t **offsets,
+                    int32_t sourceIndex,
+                    UErrorCode *pErrorCode) {
+    UChar *t=*target;
+    int32_t *o;
+
+    /* write UChars */
+    if(offsets==NULL || (o=*offsets)==NULL) {
+        while(length>0 && t<targetLimit) {
+            *t++=*uchars++;
+            --length;
         }
-        return offsets;
     } else {
-        return NULL;
+        /* output with offsets */
+        while(length>0 && t<targetLimit) {
+            *t++=*uchars++;
+            *o++=sourceIndex;
+            --length;
+        }
+        *offsets=o;
     }
-}
+    *target=t;
 
-/*
- * This is a simple implementation of ucnv_getNextUChar() that uses the
- * converter's toUnicode() function. See ucnv_cnv.h for details.
- */
-U_CFUNC UChar32
-ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
-                             T_ToUnicodeFunction toU,
-                             UBool collectPairs,
-                             UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
+    /* write overflow */
+    if(length>0) {
+        if(cnv!=NULL) {
+            t=cnv->UCharErrorBuffer;
+            cnv->UCharErrorBufferLength=(int8_t)length;
+            do {
+                *t++=*uchars++;
+            } while(--length>0);
+        }
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+    }
+}
 
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
+U_CFUNC void
+ucnv_toUWriteCodePoint(UConverter *cnv,
+                       UChar32 c,
+                       UChar **target, const UChar *targetLimit,
+                       int32_t **offsets,
+                       int32_t sourceIndex,
+                       UErrorCode *pErrorCode) {
+    UChar *t;
+    int32_t *o;
 
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
+    t=*target;
 
-        /* convert this byte and check the result */
-        toU(pArgs, pErrorCode);
-        if(U_SUCCESS(*pErrorCode)) {
-            int32_t length=(int32_t)(pArgs->target-buffer);
+    if(t<targetLimit) {
+        if(c<=0xffff) {
+            *t++=(UChar)c;
+            c=U_SENTINEL;
+        } else /* c is a supplementary code point */ {
+            *t++=U16_LEAD(c);
+            c=U16_TRAIL(c);
+            if(t<targetLimit) {
+                *t++=(UChar)c;
+                c=U_SENTINEL;
+            }
+        }
 
-            /* this test is UTF-16 specific */
-            if(/* some output and
-                  (source consumed or don't collect surrogate pairs or not a surrogate or a surrogate pair) */
-               length>0 &&
-               (pArgs->flush || !collectPairs || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2)
-            ) {
-                return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length);
+        /* write offsets */
+        if(offsets!=NULL && (o=*offsets)!=NULL) {
+            *o++=sourceIndex;
+            if((*target+1)<t) {
+                *o++=sourceIndex;
             }
-            /* else continue with the loop */
-        } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-            *pErrorCode=U_ZERO_ERROR;
-            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, UTF_MAX_CHAR_LENGTH);
-        } else {
-            /* U_FAILURE() */
-            return 0xffff;
+            *offsets=o;
         }
     }
 
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
-}
-
-U_CFUNC void
-ucnv_getCompleteUnicodeSet(const UConverter *cnv,
-                   USet *set,
-                   UConverterUnicodeSet which,
-                   UErrorCode *pErrorCode) {
-    uset_addRange(set, 0, 0x10ffff);
-}
+    *target=t;
 
-U_CFUNC void
-ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
-                               USet *set,
-                               UConverterUnicodeSet which,
-                               UErrorCode *pErrorCode) {
-    uset_addRange(set, 0, 0xd7ff);
-    uset_addRange(set, 0xe000, 0x10ffff);
+    /* write overflow from c */
+    if(c>=0) {
+        if(cnv!=NULL) {
+            int8_t i=0;
+            U16_APPEND_UNSAFE(cnv->UCharErrorBuffer, i, c);
+            cnv->UCharErrorBufferLength=i;
+        }
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+    }
 }

Index: ucnv_cnv.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_cnv.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnv_cnv.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnv_cnv.h	6 Apr 2004 10:08:01 -0000	1.4
@@ -21,14 +21,6 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_err.h"
-#include "ucnv_bld.h"
-#include "ucnvmbcs.h"
-
-union UConverterTable
-  {
-    UConverterMBCSTable mbcs;
-  };
-
 
 U_CDECL_BEGIN
 
@@ -38,14 +30,29 @@
 /*
  * #define missingUCharMarker 0xfffe
  *
- * there are actually two values used in toUnicode tables:
+ * commented out because there are actually two values used in toUnicode tables:
  * U+fffe "unassigned"
  * U+ffff "illegal"
  */
 
+/** Forward declaration, see ucnv_bld.h */
+struct UConverterSharedData;
+typedef struct UConverterSharedData UConverterSharedData;
+
+/* function types for UConverterImpl ---------------------------------------- */
 
+/* struct with arguments for UConverterLoad and ucnv_load() */
+typedef struct {
+    int32_t size;               /* sizeof(UConverterLoadArgs) */
+    int32_t nestedLoads;        /* count nested ucnv_load() calls */
+    int32_t reserved;           /* reserved - for good alignment of the pointers */
+    uint32_t options;
+    const char *pkg, *name;
+} UConverterLoadArgs;
 
-typedef void (*UConverterLoad) (UConverterSharedData *sharedData, const uint8_t *raw, UErrorCode *pErrorCode);
+typedef void (*UConverterLoad) (UConverterSharedData *sharedData,
+                                UConverterLoadArgs *pArgs,
+                                const uint8_t *raw, UErrorCode *pErrorCode);
 typedef void (*UConverterUnload) (UConverterSharedData *sharedData);
 
 typedef void (*UConverterOpen) (UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *pErrorCode);
@@ -59,11 +66,60 @@
 
 typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
 
-typedef void (*T_ToUnicodeFunction) (UConverterToUnicodeArgs *, UErrorCode *);
+/*
+ * Converter implementation function(s) for ucnv_toUnicode().
+ * If the toUnicodeWithOffsets function pointer is NULL,
+ * then the toUnicode function will be used and the offsets will be set to -1.
+ *
+ * Must maintain state across buffers. Use toUBytes[toULength] for partial input
+ * sequences; it will be checked in ucnv.c at the end of the input stream
+ * to detect truncated input.
+ * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND.
+ *
+ * The toUnicodeWithOffsets must write exactly as many offset values as target
+ * units. Write offset values of -1 for when the source index corresponding to
+ * the output unit is not known (e.g., the character started in an earlier buffer).
+ * The pArgs->offsets pointer need not be moved forward.
+ *
+ * At function return, one of the following conditions must be true:
+ * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit
+ * - another error code with toUBytes[toULength] set to the offending input
+ * - no error, and the source is consumed: source==sourceLimit
+ *
+ * The ucnv.c code will handle the end of the input
+ * (reset, and truncation detection) and callbacks.
+ */
+typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);
 
-typedef void (*T_FromUnicodeFunction) (UConverterFromUnicodeArgs *, UErrorCode *);
+/*
+ * Same rules as for UConverterToUnicode.
+ * A lead surrogate is kept in fromUChar32 across buffers, and if an error
+ * occurs, then the offending input code point must be put into fromUChar32
+ * as well.
+ */
+typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);
 
-typedef UChar32 (*T_GetNextUCharFunction) (UConverterToUnicodeArgs *, UErrorCode *);
+/*
+ * Converter implementation function for ucnv_getNextUChar().
+ * If the function pointer is NULL, then the toUnicode function will be used.
+ *
+ * Will be called at a character boundary (toULength==0).
+ * May return with
+ * - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input
+ *   (the return value will be ignored)
+ * - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!)
+ *   with toUBytes[toULength] set to the offending input
+ *   (the return value will be ignored)
+ * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer,
+ *   to indicate that the ucnv.c code shall call the toUnicode function instead
+ * - return a real code point result
+ *
+ * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed.
+ *
+ * The ucnv.c code will handle the end of the input (reset)
+ * (except for truncation detection!) and callbacks.
+ */
+typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *);
 
 typedef void (*UConverterGetStarters)(const UConverter* converter,
                                       UBool starters[256],
@@ -116,20 +172,6 @@
 
 UBool CONVERSION_U_SUCCESS (UErrorCode err);
 
-void ucnv_flushInternalUnicodeBuffer (UConverter * _this,
-                                 UChar * myTarget,
-                                 int32_t * myTargetIndex,
-                                 int32_t targetLength,
-                                 int32_t** offsets,
-                                 UErrorCode * err);
-
-void ucnv_flushInternalCharBuffer (UConverter * _this,
-                              char *myTarget,
-                              int32_t * myTargetIndex,
-                              int32_t targetLength,
-                              int32_t** offsets,
-                              UErrorCode * err);
-
 /**
  * UConverterImpl contains all the data and functions for a converter type.
  * Its function pointers work much like a C++ vtable.
@@ -156,11 +198,11 @@
     UConverterClose close;
     UConverterReset reset;
 
-    T_ToUnicodeFunction toUnicode;
-    T_ToUnicodeFunction toUnicodeWithOffsets;
-    T_FromUnicodeFunction fromUnicode;
-    T_FromUnicodeFunction fromUnicodeWithOffsets;
-    T_GetNextUCharFunction getNextUChar;
+    UConverterToUnicode toUnicode;
+    UConverterToUnicode toUnicodeWithOffsets;
+    UConverterFromUnicode fromUnicode;
+    UConverterFromUnicode fromUnicodeWithOffsets;
+    UConverterGetNextUChar getNextUChar;
 
     UConverterGetStarters getStarters;
     UConverterGetName getName;
@@ -180,40 +222,6 @@
 
 U_CDECL_END
 
-/**
- * This function is useful for implementations of getNextUChar().
- * After a call to a callback function or to toUnicode(), an output buffer
- * begins with a Unicode code point that needs to be returned as UChar32,
- * and all following code units must be prepended to the - potentially
- * prefilled - overflow buffer in the UConverter.
- * The buffer should be at least of capacity UTF_MAX_CHAR_LENGTH so that a
- * complete UChar32's UChars fit into it.
- *
- * @param cnv    The converter that will get remaining UChars copied to its overflow area.
- * @param buffer An array of UChars that was passed into a callback function
- *               or a toUnicode() function.
- * @param length The number of code units (UChars) that are actually in the buffer.
- *               This must be >0.
- * @return The code point from the first UChars in the buffer.
- */
-U_CFUNC UChar32
-ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length);
-
-/**
- * This helper function updates the offsets array after a callback function call.
- * It adds the sourceIndex to each offsets item, or sets each of them to -1 if
- * sourceIndex==-1.
- *
- * @param offsets The pointer to offsets entry that corresponds to the first target
- *                unit that the callback wrote.
- * @param length  The number of output units that the callback wrote.
- * @param sourceIndex The sourceIndex of the input sequence that the callback
- *                    function was called for.
- * @return offsets+length if offsets!=NULL, otherwise NULL
- */
-U_CFUNC int32_t *
-ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex);
-
 /** Always use fallbacks from codepage to Unicode */
 #define TO_U_USE_FALLBACK(useFallback) TRUE
 #define UCNV_TO_U_USE_FALLBACK(cnv) TRUE
@@ -224,30 +232,12 @@
 #define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
 
 /**
- * This is a simple implementation of ucnv_getNextUChar() that uses the
- * converter's toUnicode() function.
- *
- * \par
- * A surrogate pair from a single byte sequence is always
- * combined to a supplementary code point.
- * A surrogate pair from consecutive byte sequences is only combined
- * if collectPairs is set. This is necessary for SCSU
- * but not allowed for most legacy codepages.
- *
- * @param pArgs The argument structure supplied by ucnv_getNextUChar()
- * @param toU   A function pointer to the converter's toUnicode() function
- * @param collectPairs indicates whether separate surrogate results from
- *                     consecutive byte sequences should be combined into
- *                     a single code point
- * @param pErrorCode An ICU error code parameter
- * @return The Unicode code point as a result of a conversion of a minimal
- *         number of input bytes
+ * Magic number for ucnv_getNextUChar(), returned by a
+ * getNextUChar() implementation to indicate that the converter's toUnicode()
+ * should be used instead of the native function.
+ * @internal
  */
-U_CFUNC UChar32
-ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
-                             T_ToUnicodeFunction toU,
-                             UBool collectPairs,
-                             UErrorCode *pErrorCode);
+#define UCNV_GET_NEXT_UCHAR_USE_TO_U -9
 
 U_CFUNC void
 ucnv_getCompleteUnicodeSet(const UConverter *cnv,
@@ -260,5 +250,28 @@
                                USet *set,
                                UConverterUnicodeSet which,
                                UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_fromUWriteBytes(UConverter *cnv,
+                     const char *bytes, int32_t length,
+                     char **target, const char *targetLimit,
+                     int32_t **offsets,
+                     int32_t sourceIndex,
+                     UErrorCode *pErrorCode);
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+                    const UChar *uchars, int32_t length,
+                    UChar **target, const UChar *targetLimit,
+                    int32_t **offsets,
+                    int32_t sourceIndex,
+                    UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_toUWriteCodePoint(UConverter *cnv,
+                       UChar32 c,
+                       UChar **target, const UChar *targetLimit,
+                       int32_t **offsets,
+                       int32_t sourceIndex,
+                       UErrorCode *pErrorCode);
 
 #endif /* UCNV_CNV */

Index: ucnv_imp.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_imp.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnv_imp.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnv_imp.h	6 Apr 2004 10:08:01 -0000	1.4
@@ -21,6 +21,7 @@
 #define UCNV_IMP_H
 
 #include "unicode/utypes.h"
+#include "unicode/uloc.h"
 #include "ucnv_bld.h"
 
 /* figures out if we need to go to file to read in the data tables.
@@ -54,6 +55,21 @@
 
 UConverter* ucnv_createConverterFromPackage(const char *packageName, const char *converterName,  
                                             UErrorCode *err);
+
+typedef struct {
+    char cnvName[UCNV_MAX_CONVERTER_NAME_LENGTH], locale[ULOC_FULLNAME_CAPACITY];
+    const char *realName;
+    uint32_t options;
+} UConverterLookupData;
+
+/**
+ * Load a converter but do not create a UConverter object.
+ * Simply return the UConverterSharedData.
+ * Performs alias lookup etc.
+ * @internal
+ */
+UConverterSharedData *
+ucnv_loadSharedData(const char *converterName, UConverterLookupData *lookup, UErrorCode * err);
 
 /**
  * This may unload the shared data in a thread safe manner.

Index: ucnv_io.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_io.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- ucnv_io.c	10 Sep 2003 02:42:03 -0000	1.5
+++ ucnv_io.c	6 Apr 2004 10:08:01 -0000	1.6
@@ -33,6 +33,8 @@
 #include "unicode/udata.h"
 
 #include "umutex.h"
+#include "uarrsort.h"
+#include "udataswp.h"
 #include "cstring.h"
 #include "cmemory.h"
 #include "ucnv_io.h"
@@ -51,14 +53,18 @@
  * First there is the size of the Table of Contents (TOC). The TOC
  * entries contain the size of each section. In order to find the offset
  * you just need to sum up the previous offsets.
+ * The TOC length and entries are an array of uint32_t values.
+ * The first section after the TOC starts immediately after the TOC.
  *
  * 1) This section contains a list of converters. This list contains indexes
  * into the string table for the converter name. The index of this list is
  * also used by other sections, which are mentioned later on.
+ * This list is not sorted.
  *
  * 2) This section contains a list of tags. This list contains indexes
  * into the string table for the tag name. The index of this list is
  * also used by other sections, which are mentioned later on.
+ * This list is in priority order of standards.
  *
  * 3) This section contains a list of sorted unique aliases. This
  * list contains indexes into the string table for the alias name. The
@@ -157,6 +163,20 @@
 
 static UDataMemory *gAliasData=NULL;
 
+enum {
+    tocLengthIndex=0,
+    converterListIndex=1,
+    tagListIndex=2,
+    aliasListIndex=3,
+    untaggedConvArrayIndex=4,
+    taggedAliasArrayIndex=5,
+    taggedAliasListsIndex=6,
+    reservedIndex1=7,
+    stringTableIndex=8,
+    minTocLength=8, /* min. tocLength in the file, does not count the tocLengthIndex! */
+    offsetsCount    /* length of the swapper's temporary offsets[] */
+};
+
 static const uint16_t *gConverterList = NULL;
 static const uint16_t *gTagList = NULL;
 static const uint16_t *gAliasList = NULL;
@@ -224,7 +244,7 @@
         table = (const uint16_t *)udata_getMemory(data);
 
         tableStart      = ((const uint32_t *)(table))[0];
-        if (tableStart < 8) {
+        if (tableStart < minTocLength) {
             *pErrorCode = U_INVALID_FORMAT_ERROR;
             udata_close(data);
             return FALSE;
@@ -341,18 +361,36 @@
 
 /* @see ucnv_compareNames */
 U_CFUNC char * U_EXPORT2
-ucnv_io_stripForCompare(char *dst, const char *name) {
+ucnv_io_stripASCIIForCompare(char *dst, const char *name) {
     char c1 = *name;
     char *dstItr = dst;
 
     while (c1) {
         /* Ignore delimiters '-', '_', and ' ' */
-        while ((c1 = *name) == '-' || c1 == '_' || c1 == ' ') {
+        while ((c1 = *name) == 0x2d || c1 == 0x5f || c1 == 0x20) {
             ++name;
         }
 
         /* lowercase for case-insensitive comparison */
-        *(dstItr++) = uprv_tolower(c1);
+        *(dstItr++) = uprv_asciitolower(c1);
+        ++name;
+    }
+    return dst;
+}
+
+U_CFUNC char * U_EXPORT2
+ucnv_io_stripEBCDICForCompare(char *dst, const char *name) {
+    char c1 = *name;
+    char *dstItr = dst;
+
+    while (c1) {
+        /* Ignore delimiters '-', '_', and ' ' */
+        while ((c1 = *name) == 0x60 || c1 == 0x6d || c1 == 0x40) {
+            ++name;
+        }
+
+        /* lowercase for case-insensitive comparison */
+        *(dstItr++) = uprv_ebcdictolower(c1);
         ++name;
     }
     return dst;
@@ -612,7 +650,7 @@
         if (myContext->listIdx < listCount) {
             const char *myStr = GET_STRING(currList[myContext->listIdx++]);
             if (resultLength) {
-                *resultLength = uprv_strlen(myStr);
+                *resultLength = (int32_t)uprv_strlen(myStr);
             }
             return myStr;
         }
@@ -891,7 +929,7 @@
     if (*myContext < gConverterListSize) {
         const char *myStr = GET_STRING(gConverterList[(*myContext)++]);
         if (resultLength) {
-            *resultLength = uprv_strlen(myStr);
+            *resultLength = (int32_t)uprv_strlen(myStr);
         }
         return myStr;
     }
@@ -1043,6 +1081,255 @@
     }
 }
 
+/* alias table swapping ----------------------------------------------------- */
+
+typedef char * U_CALLCONV StripForCompareFn(char *dst, const char *name);
+
+/*
+ * row of a temporary array
+ *
+ * gets platform-endian charset string indexes and sorting indexes;
+ * after sorting this array by strings, the actual arrays are permuted
+ * according to the sorting indexes
+ */
+typedef struct Row {
+    uint16_t strIndex, sortIndex;
+} Row;
+
+typedef struct TempTable {
+    const char *chars;
+    Row *rows;
+    uint16_t *resort;
+    StripForCompareFn *stripForCompare;
+} TempTable;
+
+enum {
+    STACK_ROW_CAPACITY=500
+};
+
+static int32_t
+io_compareRows(const void *context, const void *left, const void *right) {
+    char strippedLeft[UCNV_MAX_CONVERTER_NAME_LENGTH],
+         strippedRight[UCNV_MAX_CONVERTER_NAME_LENGTH];
+
+    TempTable *tempTable=(TempTable *)context;
+    const char *chars=tempTable->chars;
+
+    return (int32_t)uprv_strcmp(tempTable->stripForCompare(strippedLeft, chars+2*((const Row *)left)->strIndex),
+                                tempTable->stripForCompare(strippedRight, chars+2*((const Row *)right)->strIndex));
+}
+
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+                 const void *inData, int32_t length, void *outData,
+                 UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+
+    const uint16_t *inTable;
+    uint32_t toc[offsetsCount];
+    uint32_t offsets[offsetsCount]; /* 16-bit-addressed offsets from inTable/outTable */
+    uint32_t i, count, tocLength, topOffset;
+
+    Row rows[STACK_ROW_CAPACITY];
+    uint16_t resort[STACK_ROW_CAPACITY];
+    TempTable tempTable;
+
+    /* udata_swapDataHeader checks the arguments */
+    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CvAl" */
+        pInfo->dataFormat[1]==0x76 &&
+        pInfo->dataFormat[2]==0x41 &&
+        pInfo->dataFormat[3]==0x6c &&
+        pInfo->formatVersion[0]==3
+    )) {
+        udata_printError(ds, "ucnv_swapAliases(): data format %02x.%02x.%02x.%02x (format version %02x) is not an alias table\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    /* an alias table must contain at least the table of contents array */
+    if(length>=0 && (length-headerSize)<4*(1+minTocLength)) {
+        udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+                         length-headerSize);
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    inTable=(const uint16_t *)((const char *)inData+headerSize);
+    toc[tocLengthIndex]=tocLength=ds->readUInt32(((const uint32_t *)inTable)[tocLengthIndex]);
+    if(tocLength<minTocLength) {
+        udata_printError(ds, "ucnv_swapAliases(): table of contents too short (%u sections)\n", tocLength);
+        *pErrorCode=U_INVALID_FORMAT_ERROR;
+        return 0;
+    }
+
+    /* read the known part of the table of contents */
+    for(i=converterListIndex; i<=minTocLength; ++i) {
+        toc[i]=ds->readUInt32(((const uint32_t *)inTable)[i]);
+    }
+
+    /* compute offsets */
+    offsets[tocLengthIndex]=0;
+    offsets[converterListIndex]=2*(1+tocLength); /* count two 16-bit units per toc entry */
+    for(i=tagListIndex; i<=stringTableIndex; ++i) {
+        offsets[i]=offsets[i-1]+toc[i-1];
+    }
+
+    /* compute the overall size of the after-header data, in numbers of 16-bit units */
+    topOffset=offsets[i-1]+toc[i-1];
+
+    if(length>=0) {
+        uint16_t *outTable;
+        const uint16_t *p, *p2;
+        uint16_t *q, *q2;
+        uint16_t oldIndex;
+
+        if((length-headerSize)<(2*(int32_t)topOffset)) {
+            udata_printError(ds, "ucnv_swapAliases(): too few bytes (%d after header) for an alias table\n",
+                             length-headerSize);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        outTable=(uint16_t *)((char *)outData+headerSize);
+
+        /* swap the entire table of contents */
+        ds->swapArray32(ds, inTable, 4*(1+tocLength), outTable, pErrorCode);
+
+        /* swap strings */
+        ds->swapInvChars(ds, inTable+offsets[stringTableIndex], 2*(int32_t)toc[stringTableIndex],
+                             outTable+offsets[stringTableIndex], pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "ucnv_swapAliases().swapInvChars(charset names) failed - %s\n",
+                             u_errorName(*pErrorCode));
+            return 0;
+        }
+
+        if(ds->inCharset==ds->outCharset) {
+            /* no need to sort, just swap all 16-bit values together */
+            ds->swapArray16(ds,
+                            inTable+offsets[converterListIndex],
+                            2*(int32_t)(offsets[stringTableIndex]-offsets[converterListIndex]),
+                            outTable+offsets[converterListIndex],
+                            pErrorCode);
+        } else {
+            /* allocate the temporary table for sorting */
+            count=toc[aliasListIndex];
+
+            tempTable.chars=(const char *)(outTable+offsets[stringTableIndex]); /* sort by outCharset */
+
+            if(count<=STACK_ROW_CAPACITY) {
+                tempTable.rows=rows;
+                tempTable.resort=resort;
+            } else {
+                tempTable.rows=(Row *)uprv_malloc(count*sizeof(Row)+count*2);
+                if(tempTable.rows==NULL) {
+                    udata_printError(ds, "ucnv_swapAliases(): unable to allocate memory for sorting tables (max length: %u)\n",
+                                     count);
+                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+                    return 0;
+                }
+                tempTable.resort=(uint16_t *)(tempTable.rows+count);
+            }
+
+            if(ds->outCharset==U_ASCII_FAMILY) {
+                tempTable.stripForCompare=ucnv_io_stripASCIIForCompare;
+            } else /* U_EBCDIC_FAMILY */ {
+                tempTable.stripForCompare=ucnv_io_stripEBCDICForCompare;
+            }
+
+            /*
+             * Sort unique aliases+mapped names.
+             *
+             * We need to sort the list again by outCharset strings because they
+             * sort differently for different charset families.
+             * First we set up a temporary table with the string indexes and
+             * sorting indexes and sort that.
+             * Then we permute and copy/swap the actual values.
+             */
+            p=inTable+offsets[aliasListIndex];
+            q=outTable+offsets[aliasListIndex];
+
+            p2=inTable+offsets[untaggedConvArrayIndex];
+            q2=outTable+offsets[untaggedConvArrayIndex];
+
+            for(i=0; i<count; ++i) {
+                tempTable.rows[i].strIndex=ds->readUInt16(p[i]);
+                tempTable.rows[i].sortIndex=(uint16_t)i;
+            }
+
+            uprv_sortArray(tempTable.rows, (int32_t)count, sizeof(Row),
+                           io_compareRows, &tempTable,
+                           FALSE, pErrorCode);
+
+            if(U_SUCCESS(*pErrorCode)) {
+                /* copy/swap/permute items */
+                if(p!=q) {
+                    for(i=0; i<count; ++i) {
+                        oldIndex=tempTable.rows[i].sortIndex;
+                        ds->swapArray16(ds, p+oldIndex, 2, q+i, pErrorCode);
+                        ds->swapArray16(ds, p2+oldIndex, 2, q2+i, pErrorCode);
+                    }
+                } else {
+                    /*
+                     * If we swap in-place, then the permutation must use another
+                     * temporary array (tempTable.resort)
+                     * before the results are copied to the outTable.
+                     */
+                    uint16_t *r=tempTable.resort;
+
+                    for(i=0; i<count; ++i) {
+                        oldIndex=tempTable.rows[i].sortIndex;
+                        ds->swapArray16(ds, p+oldIndex, 2, r+i, pErrorCode);
+                    }
+                    uprv_memcpy(q, r, 2*count);
+
+                    for(i=0; i<count; ++i) {
+                        oldIndex=tempTable.rows[i].sortIndex;
+                        ds->swapArray16(ds, p2+oldIndex, 2, r+i, pErrorCode);
+                    }
+                    uprv_memcpy(q2, r, 2*count);
+                }
+            }
+
+            if(tempTable.rows!=rows) {
+                uprv_free(tempTable.rows);
+            }
+
+            if(U_FAILURE(*pErrorCode)) {
+                udata_printError(ds, "ucnv_swapAliases().uprv_sortArray(%u items) failed - %s\n",
+                                 count, u_errorName(*pErrorCode));
+                return 0;
+            }
+
+            /* swap remaining 16-bit values */
+            ds->swapArray16(ds,
+                            inTable+offsets[converterListIndex],
+                            2*(int32_t)(offsets[aliasListIndex]-offsets[converterListIndex]),
+                            outTable+offsets[converterListIndex],
+                            pErrorCode);
+            ds->swapArray16(ds,
+                            inTable+offsets[taggedAliasArrayIndex],
+                            2*(int32_t)(offsets[stringTableIndex]-offsets[taggedAliasArrayIndex]),
+                            outTable+offsets[taggedAliasArrayIndex],
+                            pErrorCode);
+        }
+    }
+
+    return headerSize+2*(int32_t)topOffset;
+}
+
 /*
  * Hey, Emacs, please set the following:
  *
@@ -1051,4 +1338,3 @@
  * End:
  *
  */
-

Index: ucnv_io.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_io.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnv_io.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnv_io.h	6 Apr 2004 10:08:01 -0000	1.4
@@ -14,6 +14,7 @@
 #define UCNV_IO_H
 
 #include "unicode/utypes.h"
+#include "udataswp.h"
 
 #define UCNV_AMBIGUOUS_ALIAS_MAP_BIT 0x8000
 #define UCNV_CONVERTER_INDEX_MASK 0xFFF
@@ -21,14 +22,26 @@
 #define UCNV_NUM_HIDDEN_TAGS 1
 
 /**
+ * \var ucnv_io_stripForCompare
  * Remove the underscores, dashes and spaces from the name, and convert
  * the name to lower case.
  * @param dst The destination buffer, which is <= the buffer of name.
  * @param dst The destination buffer, which is <= the buffer of name.
  * @return the destination buffer.
  */
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define ucnv_io_stripForCompare ucnv_io_stripASCIIForCompare
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+#   define ucnv_io_stripForCompare ucnv_io_stripEBCDICForCompare
+#else
+#   error U_CHARSET_FAMILY is not valid
+#endif
+
 U_CFUNC char * U_EXPORT2
-ucnv_io_stripForCompare(char *dst, const char *name);
+ucnv_io_stripASCIIForCompare(char *dst, const char *name);
+
+U_CFUNC char * U_EXPORT2
+ucnv_io_stripEBCDICForCompare(char *dst, const char *name);
 
 /**
  * Map a converter alias name to a canonical converter name.
@@ -136,6 +149,15 @@
  */
 U_CFUNC void
 ucnv_io_setDefaultConverterName(const char *name);
+
+/**
+ * Swap an ICU converter alias table. See ucnv_io.c.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ucnv_swapAliases(const UDataSwapper *ds,
+                 const void *inData, int32_t length, void *outData,
+                 UErrorCode *pErrorCode);
 
 #endif /* _UCNV_IO */
 

Index: ucnv_lmb.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_lmb.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnv_lmb.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnv_lmb.c	6 Apr 2004 10:08:01 -0000	1.4
@@ -27,12 +27,18 @@
 
 #if !UCONFIG_NO_LEGACY_CONVERSION
 
-#include "cmemory.h"
 #include "unicode/ucnv_err.h"
-#include "ucnv_bld.h"
 #include "unicode/ucnv.h"
+#include "unicode/uset.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uassert.h"
+#include "ucnv_imp.h"
+#include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 /*
   LMBCS
 
@@ -218,7 +224,13 @@
 keeps a mapping between optimization groups and IBM character sets, so that
 ICU converters can be created and used as needed. */
 
-static const char * const OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = {
+/* As the table below shows, even though any byte below 0x20 could be an
+optimization byte, only those at 0x13 or below can map to an actual converter.
+To limit some loops and searches, we define a value for that last group converter: */
+
+#define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */
+
+static const char * const OptGroupByteToCPName[ULMBCS_GRP_LAST + 1] = {
    /* 0x0000 */ "lmb-excp", /* internal home for the LOTUS exceptions list */
    /* 0x0001 */ "ibm-850",
    /* 0x0002 */ "ibm-851",
@@ -244,12 +256,6 @@
    and 0x0019, the 1-2-3 system range control char */      
 };
 
-/* As you can see, even though any byte below 0x20 could be an optimization 
-byte, only those at 0x13 or below can map to an actual converter. To limit
-some loops and searches, we define a value for that last group converter:*/
-
-#define ULMBCS_GRP_LAST       0x13   /* last LMBCS group that has a converter */
-
 
 /* That's approximately all the data that's needed for translating 
   LMBCS to Unicode. 
@@ -480,7 +486,7 @@
       if (*pTable->LocaleID == *LocaleID) /* Check only first char for speed */
       {
          /* First char matches - check whole name, for entry-length */
-         if (strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
+         if (uprv_strncmp(pTable->LocaleID, LocaleID, strlen(pTable->LocaleID)) == 0)
             return pTable->OptGroup;
       }
       else
@@ -505,6 +511,13 @@
   the definitions of these structures, see unicode\ucnv_bld.h
 */
 
+typedef struct
+  {
+    UConverterSharedData *OptGrpConverter[ULMBCS_GRP_LAST+1];    /* Converter per Opt. grp. */
+    uint8_t    OptGroup;                  /* default Opt. grp. for this LMBCS session */
+    uint8_t    localeConverterIndex;      /* reasonable locale match for index */
+  }
+UConverterDataLMBCS;
 
 
 #define DECLARE_LMBCS_DATA(n) \
@@ -518,17 +531,17 @@
     _LMBCSToUnicodeWithOffsets,\
     _LMBCSFromUnicode,\
     _LMBCSFromUnicode,\
-    _LMBCSGetNextUChar,\
     NULL,\
     NULL,\
     NULL,\
     NULL,\
-    ucnv_getCompleteUnicodeSet\
+    _LMBCSSafeClone,\
+    _LMBCSGetUnicodeSet\
 };\
 static const UConverterStaticData _LMBCSStaticData##n={\
   sizeof(UConverterStaticData),\
  "LMBCS-"  #n,\
-    0, UCNV_IBM, UCNV_LMBCS_##n, 1, 2,\
+    0, UCNV_IBM, UCNV_LMBCS_##n, 1, 3,\
     { 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
 };\
 const UConverterSharedData _LMBCSData##n={\
@@ -558,21 +571,32 @@
                        ulmbcs_byte_t OptGroup
                        )
 {
-   UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
-   if(extraInfo != NULL)
+    UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS*)uprv_malloc (sizeof (UConverterDataLMBCS));
+    if(extraInfo != NULL)
     {
-       ulmbcs_byte_t i;
-       ulmbcs_byte_t imax;
-       imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
+        ulmbcs_byte_t i;
 
-       for (i=0; i < imax; i++)         
-       {
-            extraInfo->OptGrpConverter[i] =
-               (OptGroupByteToCPName[i] != NULL) ? 
-               ucnv_open(OptGroupByteToCPName[i], err) : NULL;
-       }
-       extraInfo->OptGroup = OptGroup;
-       extraInfo->localeConverterIndex = FindLMBCSLocale(locale);
+        uprv_memset(extraInfo, 0, sizeof(UConverterDataLMBCS));
+
+        for (i=0; i <= ULMBCS_GRP_LAST && U_SUCCESS(*err); i++)         
+        {
+            if(OptGroupByteToCPName[i] != NULL) {
+                extraInfo->OptGrpConverter[i] = ucnv_loadSharedData(OptGroupByteToCPName[i], NULL, err);
+            }
+        }
+
+        if(U_SUCCESS(*err)) {
+            extraInfo->OptGroup = OptGroup;
+            extraInfo->localeConverterIndex = FindLMBCSLocale(locale);
+        } else {
+            /* one of the subconverters could not be loaded, unload the previous ones */
+            while(i > 0) {
+                if(extraInfo->OptGrpConverter[--i] != NULL) {
+                    ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[i]);
+                    extraInfo->OptGrpConverter[i] = NULL;
+                }
+            }
+        }
    } 
    else
    {
@@ -584,30 +608,69 @@
 static void 
 _LMBCSClose(UConverter *   _this) 
 {
-    if (_this->extraInfo != NULL && !_this->isExtraLocal)
+    if (_this->extraInfo != NULL)
     {
         ulmbcs_byte_t Ix;
         UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
 
-        for (Ix=0; Ix < ULMBCS_GRP_UNICODE; Ix++)
+        for (Ix=0; Ix <= ULMBCS_GRP_LAST; Ix++)
         {
            if (extraInfo->OptGrpConverter[Ix] != NULL)
-              ucnv_close (extraInfo->OptGrpConverter[Ix]);
+              ucnv_unloadSharedDataIfReady(extraInfo->OptGrpConverter[Ix]);
+        }
+        if (!_this->isExtraLocal) {
+            uprv_free (_this->extraInfo);
         }
-        uprv_free (_this->extraInfo);
     }
 }
 
-/* 
-Here's an all-crash stop for debugging, since ICU does not have asserts.
-Turn this on by defining LMBCS_DEBUG, or by changing it to 
-#if 1 
-*/
-#if LMBCS_DEBUG
-#define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}}
-#else
-#define MyAssert(b) 
-#endif
+typedef struct LMBCSClone {
+    UConverter cnv;
+    UConverterDataLMBCS lmbcs;
+} LMBCSClone;
+
+static UConverter * 
+_LMBCSSafeClone(const UConverter *cnv, 
+                void *stackBuffer, 
+                int32_t *pBufferSize, 
+                UErrorCode *status) {
+    LMBCSClone *newLMBCS;
+    UConverterDataLMBCS *extraInfo;
+    int32_t i;
+
+    if(*pBufferSize<=0) {
+        *pBufferSize=(int32_t)sizeof(LMBCSClone);
+        return NULL;
+    }
+
+    extraInfo=(UConverterDataLMBCS *)cnv->extraInfo;
+    newLMBCS=(LMBCSClone *)stackBuffer;
+
+    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
+
+    uprv_memcpy(&newLMBCS->lmbcs, extraInfo, sizeof(UConverterDataLMBCS));
+
+    /* share the subconverters */
+    for(i = 0; i <= ULMBCS_GRP_LAST; ++i) {
+        if(extraInfo->OptGrpConverter[i] != NULL) {
+            ucnv_incrementRefCount(extraInfo->OptGrpConverter[i]);
+        }
+    }
+
+    newLMBCS->cnv.extraInfo = &newLMBCS->lmbcs;
+    newLMBCS->cnv.isExtraLocal = TRUE;
+    return &newLMBCS->cnv;
+}
+
+static void
+_LMBCSGetUnicodeSet(const UConverter *cnv,
+                   USet *set,
+                   UConverterUnicodeSet which,
+                   UErrorCode *pErrorCode) {
+    /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
+    uset_addRange(set, 0, 0xf5ff);
+    uset_addRange(set, 0xf700, 0x10ffff);
+}
 
 /* 
    Here's the basic helper function that we use when converting from
@@ -627,33 +690,21 @@
 )   
 {
    ulmbcs_byte_t  * pLMBCS = pStartLMBCS;
-   UConverter * xcnv = extraInfo->OptGrpConverter[group];
+   UConverterSharedData * xcnv = extraInfo->OptGrpConverter[group];
 
    int bytesConverted;
    uint32_t value;
    ulmbcs_byte_t firstByte;
 
-   MyAssert(xcnv);
-   MyAssert(group<ULMBCS_GRP_UNICODE);
+   U_ASSERT(xcnv);
+   U_ASSERT(group<ULMBCS_GRP_UNICODE);
 
-   bytesConverted = _MBCSFromUChar32(xcnv->sharedData, *pUniChar, &value, FALSE);
+   bytesConverted = _MBCSFromUChar32(xcnv, *pUniChar, &value, FALSE);
 
    /* get the first result byte */
-   switch(bytesConverted)
-   {
-   case 4:
-      firstByte = (ulmbcs_byte_t)(value >> 24);
-      break;
-   case 3:
-      firstByte = (ulmbcs_byte_t)(value >> 16);
-      break;
-   case 2:
-      firstByte = (ulmbcs_byte_t)(value >> 8);
-      break;
-   case 1:
-      firstByte = (ulmbcs_byte_t)value;
-      break;
-   default:
+   if(bytesConverted > 0) {
+      firstByte = (ulmbcs_byte_t)(value >> ((bytesConverted - 1) * 8));
+   } else {
       /* most common failure mode is an unassigned character */
       groups_tried[group] = TRUE;
       return 0;
@@ -664,7 +715,7 @@
    /* All initial byte values in lower ascii range should have been caught by now,
       except with the exception group.
     */
-   MyAssert((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
+   U_ASSERT((firstByte <= ULMBCS_C0END) || (firstByte >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT));
    
    /* use converted data: first write 0, 1 or two group bytes */
    if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group)
@@ -826,7 +877,7 @@
          }
          if (!bytes_written)    /* the ambiguous group cases  (Strategy 3) */
          {
-            memset(groups_tried, 0, sizeof(groups_tried));
+            uprv_memset(groups_tried, 0, sizeof(groups_tried));
 
          /* check for non-default optimization group (Strategy 3A )*/
             if (extraInfo->OptGroup != 1 
@@ -930,16 +981,6 @@
 /* Now, the Unicode from LMBCS section */
 
 
-/*
-    Special codes for the getNextUnicodeWorker -- usually as the result of 
-    special error-callback behavior:
-    ULMBCS_SKIP     To control skipping over LMBCS sequences
-    ULMBCS_MULTI    To indicate that a single LMBCS char translates to 
-                    multiple uniChars 
-*/
-#define ULMBCS_SKIP     U_ERROR_LIMIT
-#define ULMBCS_MULTI    ULMBCS_SKIP+1
- 
 /* A function to call when we are looking at the Unicode group byte in LMBCS */
 static UChar
 GetUniFromLMBCSUni(char const ** ppLMBCSin)  /* Called with LMBCS-style Unicode byte stream */
@@ -958,26 +999,22 @@
 
 
 /* CHECK_SOURCE_LIMIT: Helper macro to verify that there are at least'index' 
-   bytes left in source up to  sourceLimit.Errors appropriately if not 
+   bytes left in source up to sourceLimit. Errors appropriately if not.
+   If we reach the limit, then update the source pointer to there to consume
+   all input as required by ICU converter semantics.
 */
 
 #define CHECK_SOURCE_LIMIT(index) \
      if (args->source+index > args->sourceLimit){\
          *err = U_TRUNCATED_CHAR_FOUND;\
-         args->source = saveSource;\
+         args->source = args->sourceLimit;\
          return 0xffff;}
 
-/* Return the Unicode representation for the current LMBCS character
-
-   This worker function is used by both ucnv_getNextUChar() and ucnv_ToUnicode().  
-   The last parameter says whether the return value should be treated as UTF-16 or
-   UTF-32. The only difference is in surrogate handling
-*/
+/* Return the Unicode representation for the current LMBCS character */
 
 static UChar32 
 _LMBCSGetNextUCharWorker(UConverterToUnicodeArgs*   args,
-                         UErrorCode*   err,
-                         UBool         returnUTF32)
+                         UErrorCode*   err)
 {
      UChar32 uniChar = 0;    /* an output UNICODE char */
      ulmbcs_byte_t   CurByte; /* A byte from the input stream */
@@ -1015,7 +1052,7 @@
     {
         UConverterDataLMBCS * extraInfo;
         ulmbcs_byte_t group; 
-        UConverter* cnv; 
+        UConverterSharedData *cnv; 
         
         if (CurByte == ULMBCS_GRP_CTRL)  /* Control character group - no opt group update */
         {
@@ -1027,27 +1064,16 @@
         else 
         if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BigEndian UTF16 */
         {
-            UChar second;
             CHECK_SOURCE_LIMIT(2);
      
-            uniChar = GetUniFromLMBCSUni(&(args->source));
-        
-            /* at this point we are usually done, but we need to make sure we are not in 
-             a situation where we can successfully put together a surrogate pair */
-
-            if(returnUTF32 && UTF_IS_FIRST_SURROGATE(uniChar) && (args->source+3 <= args->sourceLimit)
-             && *(args->source)++ == ULMBCS_GRP_UNICODE
-             && UTF_IS_SECOND_SURROGATE(second = GetUniFromLMBCSUni(&(args->source))))
-            {
-                uniChar = UTF16_GET_PAIR_VALUE(uniChar, second);
-            }
+            /* don't check for error indicators fffe/ffff below */
+            return GetUniFromLMBCSUni(&(args->source));
         }
         else if (CurByte <= ULMBCS_CTRLOFFSET)  
         {
             group = CurByte;                   /* group byte is in the source */
             extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
-            cnv = extraInfo->OptGrpConverter[group];
-            if (!cnv)
+            if (group > ULMBCS_GRP_LAST || (cnv = extraInfo->OptGrpConverter[group]) == NULL)
             {
                 /* this is not a valid group byte - no converter*/
                 *err = U_INVALID_CHAR_FOUND;
@@ -1061,12 +1087,12 @@
                 if (*args->source == group) {
                     /* single byte */
                     ++args->source;
-                    uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
+                    uniChar = _MBCSSimpleGetNextUChar(cnv, args->source, 1, FALSE);
+                    ++args->source;
                 } else {
                     /* double byte */
-                    const char *newLimit = args->source + 2;
-                    uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, newLimit, FALSE);
-                    args->source = newLimit; /* set the correct limit even in case of an error */
+                    uniChar = _MBCSSimpleGetNextUChar(cnv, args->source, 2, FALSE);
+                    args->source += 2;
                 }
             }
             else {                                  /* single byte conversion */
@@ -1075,14 +1101,13 @@
         
                 if (CurByte >= ULMBCS_C1START)
                 {
-                    uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv->sharedData, CurByte);
+                    uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
                 }
                 else
                 {
                     /* The non-optimizable oddballs where there is an explicit byte 
                     * AND the second byte is not in the upper ascii range
                     */
-                    const char *s;
                     char bytes[2];
 
                     extraInfo = (UConverterDataLMBCS *) args->converter->extraInfo;
@@ -1091,8 +1116,7 @@
                     /* Lookup value must include opt group */
                     bytes[0] = group;
                     bytes[1] = CurByte;
-                    s = bytes;
-                    uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &s, bytes + 2, FALSE);
+                    uniChar = _MBCSSimpleGetNextUChar(cnv, bytes, 2, FALSE);
                 }
             }
         }
@@ -1103,92 +1127,31 @@
             cnv = extraInfo->OptGrpConverter[group];
             if (group >= ULMBCS_DOUBLEOPTGROUP_START)    /* double byte conversion */
             {
-                if (!_MBCSIsLeadByte(cnv->sharedData, CurByte))
+                if (!_MBCSIsLeadByte(cnv, CurByte))
                 {
                     CHECK_SOURCE_LIMIT(0);
 
                     /* let the MBCS conversion consume CurByte again */
-                    --args->source;
-                    uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 1, FALSE);
+                    uniChar = _MBCSSimpleGetNextUChar(cnv, args->source - 1, 1, FALSE);
                 }
                 else
                 {
                     CHECK_SOURCE_LIMIT(1);
                     /* let the MBCS conversion consume CurByte again */
-                    --args->source;
-                    /* since we know that we start at a lead byte, args->source _will_ be incremented by 2 */
-                    uniChar = _MBCSSimpleGetNextUChar(cnv->sharedData, &args->source, args->source + 2, FALSE);
+                    uniChar = _MBCSSimpleGetNextUChar(cnv, args->source - 1, 2, FALSE);
+                    ++args->source;
                 }
             }
             else                                   /* single byte conversion */
             {
-                uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv->sharedData, CurByte);
+                uniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(cnv, CurByte);
             }
         }
     }
-    if (((uint32_t)uniChar - 0xfffe) <= 1) /* 0xfffe<=uniChar<=0xffff */
-    {
-        UConverterToUnicodeArgs cbArgs = *args;
-        UConverterCallbackReason reason;
-        UChar UCh;
-
-        if (uniChar == 0xfffe)
-        {
-            reason = UCNV_UNASSIGNED;
-            *err = U_INVALID_CHAR_FOUND;
-        }
-        else
-        {
-            reason = UCNV_ILLEGAL;
-            *err = U_ILLEGAL_CHAR_FOUND;
-        }
-
-        cbArgs.target = &UCh;
-        cbArgs.targetLimit = &UCh + 1;
-        cbArgs.converter->fromCharErrorBehaviour(cbArgs.converter->toUContext,
-                                &cbArgs,
-                                saveSource,
-                                args->source - saveSource, 
-                                reason,
-                                err);
-
-        if (cbArgs.target != &UCh)
-        {
-            uniChar = (UChar32) UCh;
-        }
-          /* Did error functor skip */
-        if (U_SUCCESS(*err) && cbArgs.target == &UCh)    
-        {   
-            *err = ULMBCS_SKIP;
-        }
-        /* Did error functor try to write multiple UChars? */
-        else if (*err == U_BUFFER_OVERFLOW_ERROR)
-        {
-            *err = ULMBCS_MULTI;
-        }
-    }
     return uniChar;
 }
 
 
-/* The exported function that gets one UTF32 character from a LMBCS stream
-*/
-static UChar32 
-_LMBCSGetNextUChar(UConverterToUnicodeArgs*   args,
-                   UErrorCode*   err)
-{
-    UChar32 nextUChar;
-    do {
-        nextUChar = _LMBCSGetNextUCharWorker(args, err, TRUE);
-    }   while (*err == ULMBCS_SKIP);
-        
-    if (*err == ULMBCS_MULTI)
-    {
-        *err = U_ZERO_ERROR;  
-    }
-    return nextUChar;
-}
-
 /* The exported function that converts lmbcs to one or more
    UChars - currently UTF-16
 */
@@ -1196,50 +1159,44 @@
 _LMBCSToUnicodeWithOffsets(UConverterToUnicodeArgs*    args,
                      UErrorCode*    err)
 {
+   char LMBCS [ULMBCS_CHARSIZE_MAX];
    UChar uniChar;    /* one output UNICODE char */
-   const char * saveSource = args->source; /* beginning of current code point */
+   const char * saveSource; /* beginning of current code point */
    const char * pStartLMBCS = args->source;  /* beginning of whole string */
+   const char * errSource = NULL; /* pointer to actual input in case an error occurs */
+   int8_t savebytes = 0;
 
-   if (args->targetLimit == args->target)         /* error check may belong in common code */
-   {
-      *err = U_BUFFER_OVERFLOW_ERROR;
-      return;
-   }
-   
    /* Process from source to limit, or until error */
-   while (!*err && args->sourceLimit > args->source && args->targetLimit > args->target)
+   while (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit > args->target)
    {
       saveSource = args->source; /* beginning of current code point */
 
-      if (args->converter->invalidCharLength) /* reassemble char from previous call */
+      if (args->converter->toULength) /* reassemble char from previous call */
       {
-        char LMBCS [ULMBCS_CHARSIZE_MAX];
-        const char *pLMBCS = LMBCS, *saveSourceLimit; 
-        size_t size_old = args->converter->invalidCharLength;
+        const char *saveSourceLimit; 
+        size_t size_old = args->converter->toULength;
 
-         /* limit from source is either reminder of temp buffer, or user limit on source */
+         /* limit from source is either remainder of temp buffer, or user limit on source */
         size_t size_new_maybe_1 = sizeof(LMBCS) - size_old;
         size_t size_new_maybe_2 = args->sourceLimit - args->source;
         size_t size_new = (size_new_maybe_1 < size_new_maybe_2) ? size_new_maybe_1 : size_new_maybe_2;
          
       
-        uprv_memcpy(LMBCS, args->converter->invalidCharBuffer, size_old);
+        uprv_memcpy(LMBCS, args->converter->toUBytes, size_old);
         uprv_memcpy(LMBCS + size_old, args->source, size_new);
         saveSourceLimit = args->sourceLimit;
-        args->source = pLMBCS;
-        args->sourceLimit = pLMBCS+size_old+size_new;
-        uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
-        pLMBCS = args->source;
-        args->source =saveSource;
+        args->source = errSource = LMBCS;
+        args->sourceLimit = LMBCS+size_old+size_new;
+        savebytes = (int8_t)(size_old+size_new);
+        uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
+        args->source = saveSource + ((args->source - LMBCS) - size_old);
         args->sourceLimit = saveSourceLimit;
-        args->source += (pLMBCS - LMBCS - size_old);
 
-        if (*err == U_TRUNCATED_CHAR_FOUND && !args->flush)
+        if (*err == U_TRUNCATED_CHAR_FOUND)
         {
             /* evil special case: source buffers so small a char spans more than 2 buffers */
-            int8_t savebytes = (int8_t)(size_old+size_new);
-            args->converter->invalidCharLength = savebytes;
-            uprv_memcpy(args->converter->invalidCharBuffer, LMBCS, savebytes);
+            args->converter->toULength = savebytes;
+            uprv_memcpy(args->converter->toUBytes, LMBCS, savebytes);
             args->source = args->sourceLimit;
             *err = U_ZERO_ERROR;
             return;
@@ -1247,12 +1204,14 @@
          else
          {
             /* clear the partial-char marker */
-            args->converter->invalidCharLength = 0;
+            args->converter->toULength = 0;
          }
       }
       else
       {
-         uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err, FALSE);
+         errSource = saveSource;
+         uniChar = (UChar) _LMBCSGetNextUCharWorker(args, err);
+         savebytes = (int8_t)(args->source - saveSource);
       }
       if (U_SUCCESS(*err))
       {
@@ -1273,53 +1232,22 @@
             *err = U_ILLEGAL_CHAR_FOUND;
          }
       }
-      else if (*err == ULMBCS_MULTI)
-      {
-          UChar * pUChar = args->converter->UCharErrorBuffer; 
-          int8_t BufferLength = args->converter->UCharErrorBufferLength;
-
-          *err = U_ZERO_ERROR;
-          do
-          { /* error functor wants to write multiple UniChars */
-            *(args->target)++ = uniChar;
-            if(args->offsets)
-            {
-               *(args->offsets)++ = saveSource - pStartLMBCS;
-            }
-            uniChar = *pUChar++;
-          }
-          while(BufferLength-- && args->targetLimit > args->target);
-
-          if (++BufferLength > 0)
-          {     /* fix up remaining UChars that can't fit in caller's buffer */
-              uprv_memmove( args->converter->UCharErrorBuffer, 
-                            args->converter->UCharErrorBuffer + args->converter->UCharErrorBufferLength - BufferLength,
-                            sizeof(UChar) * BufferLength);
-          }
-          args->converter->UCharErrorBufferLength = BufferLength;
-      }
-      else if (*err == ULMBCS_SKIP)
-      {
-          *err = U_ZERO_ERROR; /* and just go around again..*/
-      }
    }
    /* if target ran out before source, return U_BUFFER_OVERFLOW_ERROR */
    if (U_SUCCESS(*err) && args->sourceLimit > args->source && args->targetLimit <= args->target)
    {
       *err = U_BUFFER_OVERFLOW_ERROR;
    }
-
-   /* If character incomplete, store away partial char if more to come */
-   if (*err == U_TRUNCATED_CHAR_FOUND) 
+   else if (U_FAILURE(*err)) 
    {
-         args->source = args->sourceLimit;
-         if (!args->flush )
-         {
-            int8_t savebytes = (int8_t)(args->sourceLimit - saveSource);
-            args->converter->invalidCharLength = (int8_t)savebytes;
-            uprv_memcpy(args->converter->invalidCharBuffer, saveSource, savebytes);
-            *err = U_ZERO_ERROR;
-         }
+      /* If character incomplete or unmappable/illegal, store it in toUBytes[] */
+      args->converter->toULength = savebytes;
+      if (savebytes > 0) {
+         uprv_memcpy(args->converter->toUBytes, errSource, savebytes);
+      }
+      if (*err == U_TRUNCATED_CHAR_FOUND) {
+         *err = U_ZERO_ERROR;
+      }
    }
 }
 

Index: ucnv_u16.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_u16.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ucnv_u16.c	10 Sep 2003 02:42:03 -0000	1.1
+++ ucnv_u16.c	6 Apr 2004 10:08:02 -0000	1.2
@@ -21,405 +21,512 @@
 #include "ucnv_cnv.h"
 #include "cmemory.h"
 
-/* UTF-16 Platform Endian --------------------------------------------------- */
+/* UTF-16BE ----------------------------------------------------------------- */
+
+#if U_IS_BIG_ENDIAN
+#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
+#else
+#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
+#endif
[...1404 lines suppressed...]
+
+    cnv->mode=state;
 }
 
 static UChar32
@@ -740,11 +1292,11 @@
                    UErrorCode *pErrorCode) {
     switch(pArgs->converter->mode) {
     case 8:
-        return T_UConverter_getNextUChar_UTF16_BE(pArgs, pErrorCode);
+        return _UTF16BEGetNextUChar(pArgs, pErrorCode);
     case 9:
-        return T_UConverter_getNextUChar_UTF16_LE(pArgs, pErrorCode);
+        return _UTF16LEGetNextUChar(pArgs, pErrorCode);
     default:
-        return ucnv_getNextUCharFromToUImpl(pArgs, _UTF16ToUnicodeWithOffsets, TRUE, pErrorCode);
+        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
     }
 }
 

Index: ucnv_u32.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_u32.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ucnv_u32.c	10 Sep 2003 02:42:03 -0000	1.1
+++ ucnv_u32.c	6 Apr 2004 10:08:02 -0000	1.2
@@ -23,74 +23,15 @@
 
 #define MAXIMUM_UCS2            0x0000FFFF
 #define MAXIMUM_UTF             0x0010FFFF
-#define MAXIMUM_UCS4            0x7FFFFFFF
 #define HALF_SHIFT              10
 #define HALF_BASE               0x0010000
 #define HALF_MASK               0x3FF
 #define SURROGATE_HIGH_START    0xD800
-#define SURROGATE_HIGH_END      0xDBFF
 #define SURROGATE_LOW_START     0xDC00
-#define SURROGATE_LOW_END       0xDFFF
[...964 lines suppressed...]
             break;
         }
-        cnv->mode=0; /* reset */
-    } else {
-        cnv->mode=state;
     }
+
+    cnv->mode=state;
 }
 
 static UChar32
@@ -1270,7 +1122,7 @@
     case 9:
         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
     default:
-        return ucnv_getNextUCharFromToUImpl(pArgs, _UTF32ToUnicodeWithOffsets, FALSE, pErrorCode);
+        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
     }
 }
 

Index: ucnv_u7.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_u7.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ucnv_u7.c	10 Sep 2003 02:42:03 -0000	1.1
+++ ucnv_u7.c	6 Apr 2004 10:08:02 -0000	1.2
@@ -22,7 +22,6 @@
 
 /* UTF-7 -------------------------------------------------------------------- */
 
-/* ### TODO: in user guide, document version option (=1 for escaping set O characters) */
 /*
  * UTF-7 is a stateful encoding of Unicode.
  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
@@ -247,7 +246,6 @@
     sourceIndex=byteIndex==0 ? 0 : -1;
     nextSourceIndex=0;
 
-loop:
     if(inDirectMode) {
 directMode:
         /*
@@ -270,8 +268,8 @@
                 /* illegal */
                 bytes[0]=b;
                 byteIndex=1;
-                nextSourceIndex=sourceIndex+1;
-                goto callback;
+                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                break;
             } else if(b!=PLUS) {
                 /* write directly encoded character */
                 *target++=b;
@@ -312,7 +310,8 @@
                 if(b>=126) {
                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
                     inDirectMode=TRUE;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    break;
                 } else if((base64Value=fromBase64[b])>=0) {
                     /* collect base64 bytes into UChars */
                     switch(base64Counter) {
@@ -377,7 +376,8 @@
                         /* absorb the minus and leave the Unicode Mode */
                         if(bits!=0) {
                             /* bits are illegally left over, a UChar is incomplete */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            break;
                         }
                     }
                     sourceIndex=nextSourceIndex;
@@ -392,7 +392,8 @@
                         bytes[0]=PLUS;
                         bytes[1]=b;
                         byteIndex=2;
-                        goto callback;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        break;
                     } else if(bits==0) {
                         /* un-read the character in case it is a plus sign */
                         --source;
@@ -400,12 +401,14 @@
                         goto directMode;
                     } else {
                         /* bits are illegally left over, a UChar is incomplete */
-                        goto callback;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        break;
                     }
                 } else /* base64Value==-3 for illegal characters */ {
                     /* illegal */
                     inDirectMode=TRUE;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    break;
                 }
             } else {
                 /* target is full */
@@ -414,91 +417,26 @@
             }
         }
     }
-endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(!inDirectMode && bits!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
-        cnv->toULength=0;
-    } else {
-        /* set the converter state back into UConverter */
-        cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
-        cnv->toULength=byteIndex;
+    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
+        /*
+         * if we are in Unicode mode, then the byteIndex might not be 0,
+         * but that is ok if bits==0
+         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
+         * (not true for IMAP-mailbox-name where we must end in direct mode)
+         */
+        byteIndex=0;
     }
 
-finish:
+    /* set the converter state back into UConverter */
+    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+    cnv->toULength=byteIndex;
+
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     pArgs->offsets=offsets;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-    pArgs->offsets=offsets;
-
-    /* copy the current bytes to invalidCharBuffer */
-    for(b=0; b<(uint8_t)byteIndex; ++b) {
-        cnv->invalidCharBuffer[b]=(char)bytes[b];
-    }
-    cnv->invalidCharLength=byteIndex;
-
-    /* set the converter state in UConverter to deal with the next character */
-    cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
-    cnv->toULength=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    {
-        uint32_t status=cnv->toUnicodeStatus;
-        inDirectMode=(UBool)((status>>24)&1);
-        base64Counter=(int8_t)(status>>16);
-        bits=(uint16_t)status;
-    }
-    byteIndex=cnv->toULength;
-
-    /* update target and deal with offsets if necessary */
-    offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
-    target=pArgs->target;
-
-    /* update the source pointer and index */
-    sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
-        cnv->toULength=0;
-        goto finish;
-    } else {
-        goto loop;
-    }
-}
-
-static UChar32
-_UTF7GetNextUChar(UConverterToUnicodeArgs *pArgs,
-                  UErrorCode *pErrorCode) {
-    return ucnv_getNextUCharFromToUImpl(pArgs, pArgs->converter->sharedData->impl->toUnicode, TRUE, pErrorCode);
 }
 
 static void
@@ -788,7 +726,7 @@
     _UTF7ToUnicodeWithOffsets,
     _UTF7FromUnicodeWithOffsets,
     _UTF7FromUnicodeWithOffsets,
-    _UTF7GetNextUChar,
+    NULL,
 
     NULL,
     _UTF7GetName,
@@ -967,7 +905,6 @@
     sourceIndex=byteIndex==0 ? 0 : -1;
     nextSourceIndex=0;
 
-loop:
     if(inDirectMode) {
 directMode:
         /*
@@ -989,8 +926,8 @@
                 /* illegal */
                 bytes[0]=b;
                 byteIndex=1;
-                nextSourceIndex=sourceIndex+1;
-                goto callback;
+                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                break;
             } else if(b!=AMPERSAND) {
                 /* write directly encoded character */
                 *target++=b;
@@ -1032,7 +969,8 @@
                 if(b>0x7e) {
                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
                     inDirectMode=TRUE;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    break;
                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
                     /* collect base64 bytes into UChars */
                     switch(base64Counter) {
@@ -1053,7 +991,8 @@
                         if(isLegalIMAP(c)) {
                             /* illegal */
                             inDirectMode=TRUE;
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                         *target++=c;
                         if(offsets!=NULL) {
@@ -1070,7 +1009,8 @@
                         if(isLegalIMAP(c)) {
                             /* illegal */
                             inDirectMode=TRUE;
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                         *target++=c;
                         if(offsets!=NULL) {
@@ -1087,7 +1027,8 @@
                         if(isLegalIMAP(c)) {
                             /* illegal */
                             inDirectMode=TRUE;
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                         *target++=c;
                         if(offsets!=NULL) {
@@ -1116,7 +1057,8 @@
                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
                             /* bits are illegally left over, a UChar is incomplete */
                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            break;
                         }
                     }
                     sourceIndex=nextSourceIndex;
@@ -1134,7 +1076,8 @@
                     /* base64Value==-3 for illegal characters */
                     /* illegal */
                     inDirectMode=TRUE;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    break;
                 }
             } else {
                 /* target is full */
@@ -1145,83 +1088,41 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(!inDirectMode && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete - IMAP must end in ASCII/direct mode */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+    /*
+     * the end of the input stream and detection of truncated input
+     * are handled by the framework, but here we must check if we are in Unicode
+     * mode and byteIndex==0 because we must end in direct mode
+     *
+     * conditions:
+     *   successful
+     *   in Unicode mode and byteIndex==0
+     *   end of input and no truncated input
+     */
+    if( U_SUCCESS(*pErrorCode) &&
+        !inDirectMode && byteIndex==0 &&
+        pArgs->flush && source>=sourceLimit
+    ) {
+        if(base64Counter==-1) {
+            /* & at the very end of the input */
+            /* make the ampersand the reported sequence */
+            bytes[0]=AMPERSAND;
+            byteIndex=1;
         }
-        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
-        cnv->toULength=0;
-    } else {
-        /* set the converter state back into UConverter */
-        cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
-        cnv->toULength=byteIndex;
+        /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
+
+        inDirectMode=TRUE; /* avoid looping */
+        *pErrorCode=U_TRUNCATED_CHAR_FOUND;
     }
 
-finish:
+    /* set the converter state back into UConverter */
+    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
+    cnv->toULength=byteIndex;
+
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     pArgs->offsets=offsets;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-    pArgs->offsets=offsets;
-
-    /* copy the current bytes to invalidCharBuffer */
-    for(b=0; b<(uint8_t)byteIndex; ++b) {
-        cnv->invalidCharBuffer[b]=(char)bytes[b];
-    }
-    cnv->invalidCharLength=byteIndex;
-
-    /* set the converter state in UConverter to deal with the next character */
-    cnv->toUnicodeStatus=(uint32_t)inDirectMode<<24;
-    cnv->toULength=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    {
-        uint32_t status=cnv->toUnicodeStatus;
-        inDirectMode=(UBool)((status>>24)&1);
-        base64Counter=(int8_t)(status>>16);
-        bits=(uint16_t)status;
-    }
-    byteIndex=cnv->toULength;
-
-    /* update target and deal with offsets if necessary */
-    offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
-    target=pArgs->target;
-
-    /* update the source pointer and index */
-    sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
-        cnv->toULength=0;
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 static void
@@ -1525,7 +1426,7 @@
     _IMAPToUnicodeWithOffsets,
     _IMAPFromUnicodeWithOffsets,
     _IMAPFromUnicodeWithOffsets,
-    _UTF7GetNextUChar,
+    NULL,
 
     NULL,
     NULL,
@@ -1537,7 +1438,7 @@
 static const UConverterStaticData _IMAPStaticData={
     sizeof(UConverterStaticData),
     "IMAP-mailbox-name",
-    0, /* TODO CCSID for UTF-7 */
+    0, /* TODO CCSID for IMAP-mailbox-name */
     UCNV_IBM, UCNV_IMAP_MAILBOX,
     1, 4,
     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */

Index: ucnv_u8.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnv_u8.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ucnv_u8.c	10 Sep 2003 02:42:03 -0000	1.1
+++ ucnv_u8.c	6 Apr 2004 10:08:02 -0000	1.2
@@ -29,16 +29,14 @@
 
 /* Keep these here to make finicky compilers happy */
 
-U_CFUNC void T_UConverter_toUnicode_UTF8(UConverterToUnicodeArgs *args,
+/*U_CFUNC void T_UConverter_toUnicode_UTF8(UConverterToUnicodeArgs *args,
                                          UErrorCode *err);
 U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
-                                                       UErrorCode *err);
+                                                       UErrorCode *err);*/
 U_CFUNC void T_UConverter_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
                                            UErrorCode *err);
 U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
                                                         UErrorCode *err);
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
-                                               UErrorCode *err);
 
 
 /* UTF-8 -------------------------------------------------------------------- */
@@ -88,64 +86,7 @@
 static const uint32_t
 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
 
-/**
- * Calls invalid char callback when an invalid character sequence is encountered.
- * It presumes that the converter has a callback to call.
- *
- * @returns true when callback fails
- */
-static UBool
-T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
-                                            UConverterCallbackReason reason,
-                                            UErrorCode *err)
-{
-    UConverter *converter = args->converter;
-
-    if (U_SUCCESS(*err))
-    {
-        if (reason == UCNV_ILLEGAL) {
-            *err = U_ILLEGAL_CHAR_FOUND;
-        } else {
-            *err = U_INVALID_CHAR_FOUND;
-        }
-    }
-
-    /* copy the toUBytes to the invalidCharBuffer */
-    uprv_memcpy(converter->invalidCharBuffer,
-                converter->toUBytes,
-                converter->toULength);
-    converter->invalidCharLength = converter->toULength;
-
-    /* Call the ErrorFunction */
-    args->converter->fromCharErrorBehaviour(converter->toUContext,
-                                            args,
-                                            converter->invalidCharBuffer,
-                                            converter->invalidCharLength,
-                                            reason,
-                                            err);
-
-    return (UBool)U_FAILURE(*err);
-}
-
-static UBool
-T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
-                                                  int32_t currentOffset,
-                                                  UConverterCallbackReason reason,
-                                                  UErrorCode *err)
-{
-    int32_t *saveOffsets = args->offsets;
-    UBool result;
-    
-    result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
-
-    while (saveOffsets < args->offsets)
-    {
-        *(saveOffsets++) = currentOffset;
-    }
-    return result;
-}
-
-U_CFUNC void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
+static void T_UConverter_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                   UErrorCode * err)
 {
     const unsigned char *mySource = (unsigned char *) args->source;
@@ -158,7 +99,6 @@
     int32_t i, inBytes;
   
     /* Restore size of current sequence */
-start:
     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
     {
         inBytes = args->converter->mode;            /* restore # of bytes to consume */
@@ -200,19 +140,10 @@
                 }
                 else
                 {
-                    if (args->flush)
-                    {
-                        if (U_SUCCESS(*err))
-                        {
-                            *err = U_TRUNCATED_CHAR_FOUND;
-                        }
-                    }
-                    else
-                    {    /* stores a partially calculated target*/
-                        args->converter->toUnicodeStatus = ch;
-                        args->converter->mode = inBytes;
-                        args->converter->toULength = (int8_t) i;
-                    }
+                    /* stores a partially calculated target*/
+                    args->converter->toUnicodeStatus = ch;
+                    args->converter->mode = inBytes;
+                    args->converter->toULength = (int8_t) i;
                     goto donefornow;
                 }
             }
@@ -236,6 +167,7 @@
                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
             {
                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+                args->converter->toULength = 0;
                 if (ch <= MAXIMUM_UCS2) 
                 {
                     /* fits in 16 bits */
@@ -263,22 +195,9 @@
             }
             else
             {
-                args->source = (const char *) mySource;
-                args->target = myTarget;
-
                 args->converter->toULength = (int8_t)i;
-                if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
-                {
-                    /* Stop if the error wasn't handled */
-                    /* args and err should already be set properly */
-                    return;
-                }
-
-                mySource = (unsigned char *) args->source;
-                myTarget = args->target;
-
-                /* goto the start to handle state left behind by the callback */
-                goto start;
+                *err = U_ILLEGAL_CHAR_FOUND;
+                break;
             }
         }
     }
@@ -294,7 +213,7 @@
     args->source = (const char *) mySource;
 }
 
-U_CFUNC void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
+static void T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                 UErrorCode * err)
 {
     const unsigned char *mySource = (unsigned char *) args->source;
@@ -309,7 +228,6 @@
     int32_t i, inBytes;
 
     /* Restore size of current sequence */
-start:
     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
     {
         inBytes = args->converter->mode;            /* restore # of bytes to consume */
@@ -350,20 +268,9 @@
                 }
                 else
                 {
-                    if (args->flush)
-                    {
-                        if (U_SUCCESS(*err)) 
-                        {
-                            *err = U_TRUNCATED_CHAR_FOUND;
-                            args->converter->toUnicodeStatus = 0;
-                        }
-                    }
-                    else
-                    {
-                        args->converter->toUnicodeStatus = ch;
-                        args->converter->mode = inBytes;
-                        args->converter->toULength = (int8_t)i;
-                    }
+                    args->converter->toUnicodeStatus = ch;
+                    args->converter->mode = inBytes;
+                    args->converter->toULength = (int8_t)i;
                     goto donefornow;
                 }
             }
@@ -387,6 +294,7 @@
                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
             {
                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+                args->converter->toULength = 0;
                 if (ch <= MAXIMUM_UCS2) 
                 {
                     /* fits in 16 bits */
@@ -416,26 +324,9 @@
             }
             else
             {
-                args->source = (const char *) mySource;
-                args->target = myTarget;
-                args->offsets = myOffsets;
-
                 args->converter->toULength = (int8_t)i;
-                if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
-                    offsetNum, UCNV_ILLEGAL, err))
-                {
-                    /* Stop if the error wasn't handled */
-                    /* args and err should already be set properly */
-                    return;
-                }
-
-                offsetNum += i + ((unsigned char *) args->source - mySource);
-                mySource = (unsigned char *) args->source;
-                myTarget = args->target;
-                myOffsets = args->offsets;
-
-                /* goto the start to handle state left behind by the callback */
-                goto start;
+                *err = U_ILLEGAL_CHAR_FOUND;
+                break;
             }
         }
     }
@@ -460,14 +351,14 @@
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
-    uint32_t ch, ch2;
+    UChar32 ch, ch2;
     int16_t indexToWrite;
     char temp[4];
 
-    if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+    if (cnv->fromUChar32 && myTarget < targetLimit)
     {
-        ch = cnv->fromUSurrogateLead;
-        cnv->fromUSurrogateLead = 0;
+        ch = cnv->fromUChar32;
+        cnv->fromUChar32 = 0;
         goto lowsurrogate;
     }
 
@@ -511,63 +402,21 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            ch2 = ch;
+                            cnv->fromUChar32 = ch;
+                            *err = U_ILLEGAL_CHAR_FOUND;
+                            break;
                         }
                     } else {
                         /* no more input */
-                        cnv->fromUSurrogateLead = (UChar)ch;
+                        cnv->fromUChar32 = ch;
                         break;
                     }
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    ch2 = ch;
-                }
-
-                if(ch2 != 0) {
-                    /* call the callback function with all the preparations and post-processing */
+                    cnv->fromUChar32 = ch;
                     *err = U_ILLEGAL_CHAR_FOUND;
-
-                    /* update the arguments structure */
-                    args->source=mySource;
-                    args->target=(char *)myTarget;
-
-                    /* write the code point as code units */
-                    cnv->invalidUCharBuffer[0] = (UChar)ch2;
-                    cnv->invalidUCharLength = 1;
-
-                    /* call the callback function */
-                    cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
-                    /* get the converter state from UConverter */
-                    ch = cnv->fromUSurrogateLead;
-                    cnv->fromUSurrogateLead = 0;
-
-                    myTarget=(uint8_t *)args->target;
-                    mySource=args->source;
-
-                    /*
-                     * If the callback overflowed the target, then we need to
-                     * stop here with an overflow indication.
-                     */
-                    if(*err==U_BUFFER_OVERFLOW_ERROR) {
-                        break;
-                    } else if(U_FAILURE(*err)) {
-                        /* break on error */
-                        break;
-                    } else if(cnv->charErrorBufferLength>0) {
-                        /* target is full */
-                        *err=U_BUFFER_OVERFLOW_ERROR;
-                        break;
-                        /*
-                         * } else if(ch != 0) { ...
-                         * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
-                         * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
-                         * We would have to check myTarget<targetLimit and goto lowsurrogate?!
-                         */
-                    }
-
-                    continue;
+                    break;
                 }
             }
 
@@ -604,11 +453,6 @@
     {
         *err = U_BUFFER_OVERFLOW_ERROR;
     }
-    if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
-        /* a Unicode code point remains incomplete (only a first surrogate) */
-        *err = U_TRUNCATED_CHAR_FOUND;
-        cnv->fromUSurrogateLead = 0;
-    }
 
     args->target = (char *) myTarget;
     args->source = mySource;
@@ -624,15 +468,15 @@
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
-    uint32_t ch, ch2;
+    UChar32 ch, ch2;
     int32_t offsetNum, nextSourceIndex;
     int16_t indexToWrite;
     char temp[4];
 
-    if (cnv->fromUSurrogateLead && myTarget < targetLimit)
+    if (cnv->fromUChar32 && myTarget < targetLimit)
     {
-        ch = cnv->fromUSurrogateLead;
-        cnv->fromUSurrogateLead = 0;
+        ch = cnv->fromUChar32;
+        cnv->fromUChar32 = 0;
         offsetNum = -1;
         nextSourceIndex = 0;
         goto lowsurrogate;
@@ -686,69 +530,21 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            ch2 = ch;
+                            cnv->fromUChar32 = ch;
+                            *err = U_ILLEGAL_CHAR_FOUND;
+                            break;
                         }
                     } else {
                         /* no more input */
-                        cnv->fromUSurrogateLead = (UChar)ch;
+                        cnv->fromUChar32 = ch;
                         break;
                     }
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    ch2 = ch;
-                }
-
-                if(ch2 != 0) {
-                    /* call the callback function with all the preparations and post-processing */
+                    cnv->fromUChar32 = ch;
                     *err = U_ILLEGAL_CHAR_FOUND;
-
-                    /* update the arguments structure */
-                    args->source=mySource;
-                    args->target=(char *)myTarget;
-                    args->offsets=myOffsets;
-
-                    /* write the code point as code units */
-                    cnv->invalidUCharBuffer[0] = (UChar)ch2;
-                    cnv->invalidUCharLength = 1;
-
-                    /* call the callback function */
-                    cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
-
-                    /* get the converter state from UConverter */
-                    ch = cnv->fromUSurrogateLead;
-                    cnv->fromUSurrogateLead = 0;
-
-                    /* update target and deal with offsets if necessary */
-                    myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
-                    myTarget=(uint8_t *)args->target;
-
-                    /* update the source pointer and index */
-                    offsetNum=nextSourceIndex+(args->source-mySource);
-                    mySource=args->source;
-
-                    /*
-                     * If the callback overflowed the target, then we need to
-                     * stop here with an overflow indication.
-                     */
-                    if(*err==U_BUFFER_OVERFLOW_ERROR) {
-                        break;
-                    } else if(U_FAILURE(*err)) {
-                        /* break on error */
-                        break;
-                    } else if(cnv->charErrorBufferLength>0) {
-                        /* target is full */
-                        *err=U_BUFFER_OVERFLOW_ERROR;
-                        break;
-                        /*
-                         * } else if(ch != 0) { ...
-                         * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
-                         * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
-                         * We would have to check myTarget<targetLimit and goto lowsurrogate?!
-                         */
-                    }
-
-                    continue;
+                    break;
                 }
             }
 
@@ -787,172 +583,148 @@
     {
         *err = U_BUFFER_OVERFLOW_ERROR;
     }
-    if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
-        /* a Unicode code point remains incomplete (only a first surrogate) */
-        *err = U_TRUNCATED_CHAR_FOUND;
-        cnv->fromUSurrogateLead = 0;
-    }
 
     args->target = (char *) myTarget;
     args->source = mySource;
     args->offsets = myOffsets;
 }
 
-U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
+static UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
                                                UErrorCode *err) {
-    UChar buffer[2];
-    const char *sourceInitial;
+    UConverter *cnv;
+    const uint8_t *sourceInitial;
     const uint8_t *source;
-    UChar* myUCharPtr;
     uint16_t extraBytesToWrite;
     uint8_t myByte;
     UChar32 ch;
-    int8_t isLegalSequence;
-    UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
+    int8_t i, isLegalSequence;
 
-    while (args->source < args->sourceLimit)
+    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
+
+    cnv = args->converter;
+    sourceInitial = source = (const uint8_t *)args->source;
+    if (source >= (const uint8_t *)args->sourceLimit)
     {
-        sourceInitial = args->source;
-        myByte = (uint8_t)*(args->source++);
-        if (myByte < 0x80)
-        {
-            return (UChar32)myByte;
-        }
+        /* no input */
+        *err = U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
 
-        extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
-        if (extraBytesToWrite == 0) {
-            isLegalSequence = FALSE;
-            ch = 0;
-            goto CALL_ERROR_FUNCTION;
-        }
+    myByte = (uint8_t)*(source++);
+    if (myByte < 0x80)
+    {
+        args->source = (const char *)source;
+        return (UChar32)myByte;
+    }
 
-        /*The byte sequence is longer than the buffer area passed*/
-        source = (const uint8_t *)args->source;
-        if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
-        {
-            *err = U_TRUNCATED_CHAR_FOUND;
-            return 0xffff;
-        }
-        else
-        {
-            isLegalSequence = 1;
-            ch = myByte << 6;
-            switch(extraBytesToWrite)
-            {     
-              /* note: code falls through cases! (sic)*/ 
-            case 6:
-                ch += (myByte = *source++);
-                ch <<= 6;
-                if (!UTF8_IS_TRAIL(myByte))
-                {
-                    isLegalSequence = 0;
-                    break;
-                }
-            case 5:
-                ch += (myByte = *source++);
-                ch <<= 6;
-                if (!UTF8_IS_TRAIL(myByte))
-                {
-                    isLegalSequence = 0;
-                    break;
-                }
-            case 4:
-                ch += (myByte = *source++);
-                ch <<= 6;
-                if (!UTF8_IS_TRAIL(myByte))
-                {
-                    isLegalSequence = 0;
-                    break;
-                }
-            case 3:
-                ch += (myByte = *source++);
-                ch <<= 6;
-                if (!UTF8_IS_TRAIL(myByte))
-                {
-                    isLegalSequence = 0;
-                    break;
-                }
-            case 2:
-                ch += (myByte = *source++);
-                if (!UTF8_IS_TRAIL(myByte))
-                {
-                    isLegalSequence = 0;
-                }
-            };
-        }
-        ch -= offsetsFromUTF8[extraBytesToWrite];
+    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
+    if (extraBytesToWrite == 0) {
+        cnv->toUBytes[0] = myByte;
+        cnv->toULength = 1;
+        *err = U_ILLEGAL_CHAR_FOUND;
         args->source = (const char *)source;
+        return 0xffff;
+    }
 
-        /*
-         * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
-         * - use only trail bytes after a lead byte (checked above)
-         * - use the right number of trail bytes for a given lead byte
-         * - encode a code point <= U+10ffff
-         * - use the fewest possible number of bytes for their code points
-         * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
-         *
-         * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
-         * There are no irregular sequences any more.
-         * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
-         */
-        if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite]) {
-            if(isCESU8) {
-                if(extraBytesToWrite <= 3) {
-                    if( UTF_IS_FIRST_SURROGATE(ch) &&
-                        (const char *)(source + 3) <= args->sourceLimit &&
-                        source[0] == 0xed && (source[1] & 0xf0) == 0xb0 && (source[2] & 0xc0) == 0x80
-                    ) {
-                        /* ch is a lead surrogate followed by a trail surrogate */
-                        ch = (ch << 10) +
-                             ((source[1] & 0xf) << 6) + (source[2] & 0x3f) -
-                             ((0xd800 << 10) - 0x10000);
-                        args->source = (const char *)(source + 3);
-                    }
-                    return ch; /* return the code point */
-                }
-                /* illegal CESU-8 */
+    /*The byte sequence is longer than the buffer area passed*/
+    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
+    {
+        /* check if all of the remaining bytes are trail bytes */
+        cnv->toUBytes[0] = myByte;
+        i = 1;
+        *err = U_TRUNCATED_CHAR_FOUND;
+        while(source < (const uint8_t *)args->sourceLimit) {
+            if(U8_IS_TRAIL(myByte = *source)) {
+                cnv->toUBytes[i++] = myByte;
+                ++source;
             } else {
-                if(!UTF_IS_SURROGATE(ch)) {
-                    return ch; /* return the code point */
-                }
-                /* illegal UTF-8 */
+                /* error even before we run out of input */
+                *err = U_ILLEGAL_CHAR_FOUND;
+                break;
             }
         }
+        cnv->toULength = i;
+        args->source = (const char *)source;
+        return 0xffff;
+    }
 
-CALL_ERROR_FUNCTION:
-        extraBytesToWrite = (uint16_t)(args->source - sourceInitial);
-        args->converter->invalidCharLength = (uint8_t)extraBytesToWrite;
-        uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
-
-        myUCharPtr = buffer;
-        *err = U_ILLEGAL_CHAR_FOUND;
-        args->target = myUCharPtr;
-        args->targetLimit = buffer + 2;
-        args->converter->fromCharErrorBehaviour(args->converter->toUContext,
-                                        args,
-                                        sourceInitial,
-                                        extraBytesToWrite,
-                                        UCNV_ILLEGAL,
-                                        err);
-
-        if(U_SUCCESS(*err)) {
-            extraBytesToWrite = (uint16_t)(args->target - buffer);
-            if(extraBytesToWrite > 0) {
-                return ucnv_getUChar32KeepOverflow(args->converter, buffer, extraBytesToWrite);
-            }
-            /* else (callback did not write anything) continue */
-        } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
-            *err = U_ZERO_ERROR;
-            return ucnv_getUChar32KeepOverflow(args->converter, buffer, 2);
-        } else {
-            /* break on error */
-            /* ### what if a callback set an error but _also_ generated output?! */
-            return 0xffff;
+    isLegalSequence = 1;
+    ch = myByte << 6;
+    switch(extraBytesToWrite)
+    {     
+      /* note: code falls through cases! (sic)*/ 
+    case 6:
+        ch += (myByte = *source);
+        ch <<= 6;
+        if (!UTF8_IS_TRAIL(myByte))
+        {
+            isLegalSequence = 0;
+            break;
         }
+        ++source;
+    case 5:
+        ch += (myByte = *source);
+        ch <<= 6;
+        if (!UTF8_IS_TRAIL(myByte))
+        {
+            isLegalSequence = 0;
+            break;
+        }
+        ++source;
+    case 4:
+        ch += (myByte = *source);
+        ch <<= 6;
+        if (!UTF8_IS_TRAIL(myByte))
+        {
+            isLegalSequence = 0;
+            break;
+        }
+        ++source;
+    case 3:
+        ch += (myByte = *source);
+        ch <<= 6;
+        if (!UTF8_IS_TRAIL(myByte))
+        {
+            isLegalSequence = 0;
+            break;
+        }
+        ++source;
+    case 2:
+        ch += (myByte = *source);
+        if (!UTF8_IS_TRAIL(myByte))
+        {
+            isLegalSequence = 0;
+            break;
+        }
+        ++source;
+    };
+    ch -= offsetsFromUTF8[extraBytesToWrite];
+    args->source = (const char *)source;
+
+    /*
+     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
+     * - use only trail bytes after a lead byte (checked above)
+     * - use the right number of trail bytes for a given lead byte
+     * - encode a code point <= U+10ffff
+     * - use the fewest possible number of bytes for their code points
+     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
+     *
+     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
+     * There are no irregular sequences any more.
+     */
+    if (isLegalSequence &&
+        (uint32_t)ch <= MAXIMUM_UTF &&
+        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
+        !U_IS_SURROGATE(ch)
+    ) {
+        return ch; /* return the code point */
     }
 
-    /* no input or only skipping callback calls */
-    *err = U_INDEX_OUTOFBOUNDS_ERROR;
+    for(i = 0; sourceInitial < source; ++i) {
+        cnv->toUBytes[i] = *sourceInitial++;
+    }
+    cnv->toULength = i;
+    *err = U_ILLEGAL_CHAR_FOUND;
     return 0xffff;
 } 
 
@@ -985,7 +757,8 @@
 static const UConverterStaticData _UTF8StaticData={
     sizeof(UConverterStaticData),
     "UTF-8",
-    1208, UCNV_IBM, UCNV_UTF8, 1, 4,
+    1208, UCNV_IBM, UCNV_UTF8,
+    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
     0,
     0,
@@ -1001,6 +774,29 @@
 
 /* CESU-8 converter data ---------------------------------------------------- */
 
+static const UConverterImpl _CESU8Impl={
+    UCNV_CESU8,
+
+    NULL,
+    NULL,
+
+    NULL,
+    NULL,
+    NULL,
+
+    T_UConverter_toUnicode_UTF8,
+    T_UConverter_toUnicode_UTF8_OFFSETS_LOGIC,
+    T_UConverter_fromUnicode_UTF8,
+    T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC,
+    NULL,
+
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    ucnv_getCompleteUnicodeSet
+};
+
 static const UConverterStaticData _CESU8StaticData={
     sizeof(UConverterStaticData),
     "CESU-8",
@@ -1014,6 +810,6 @@
 
 const UConverterSharedData _CESU8Data={
     sizeof(UConverterSharedData), ~((uint32_t) 0),
-    NULL, NULL, &_CESU8StaticData, FALSE, &_UTF8Impl,
+    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
     0
 };

Index: ucnvbocu.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvbocu.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ucnvbocu.c	10 Sep 2003 02:42:03 -0000	1.1
+++ ucnvbocu.c	6 Apr 2004 10:08:03 -0000	1.2
@@ -14,7 +14,7 @@
 *   created by: Markus W. Scherer
 *
 *   This is an implementation of the Binary Ordered Compression for Unicode,
-*   in its MIME-friendly form as defined in ### TODO http://... 1. doc/papers 2. design
+*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
 */
 
 #include "unicode/utypes.h"
@@ -402,7 +402,7 @@
     offsets=pArgs->offsets;
 
     /* get the converter state from UConverter */
-    c=cnv->fromUSurrogateLead;
+    c=cnv->fromUChar32;
     prev=(int32_t)cnv->fromUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
@@ -424,47 +424,25 @@
     if(targetCapacity>diff) {
         targetCapacity=diff;
     }
-    /* ### TODO if WithOffsets is never used without offsets, then remove all offsets==NULL branches and checks */
-    if(offsets==NULL) {
-        while(targetCapacity>0 && (c=*source)<0x3000) {
-            if(c<=0x20) {
-                if(c!=0x20) {
-                    prev=BOCU1_ASCII_PREV;
-                }
-                *target++=(uint8_t)c;
-            } else {
-                diff=c-prev;
-                if(DIFF_IS_SINGLE(diff)) {
-                    prev=BOCU1_SIMPLE_PREV(c);
-                    *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
-                } else {
-                    break;
-                }
+    while(targetCapacity>0 && (c=*source)<0x3000) {
+        if(c<=0x20) {
+            if(c!=0x20) {
+                prev=BOCU1_ASCII_PREV;
             }
+            *target++=(uint8_t)c;
+            *offsets++=nextSourceIndex++;
             ++source;
             --targetCapacity;
-        }
-    } else {
-        while(targetCapacity>0 && (c=*source)<0x3000) {
-            if(c<=0x20) {
-                if(c!=0x20) {
-                    prev=BOCU1_ASCII_PREV;
-                }
-                *target++=(uint8_t)c;
+        } else {
+            diff=c-prev;
+            if(DIFF_IS_SINGLE(diff)) {
+                prev=BOCU1_SIMPLE_PREV(c);
+                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                 *offsets++=nextSourceIndex++;
                 ++source;
                 --targetCapacity;
             } else {
-                diff=c-prev;
-                if(DIFF_IS_SINGLE(diff)) {
-                    prev=BOCU1_SIMPLE_PREV(c);
-                    *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
-                    *offsets++=nextSourceIndex++;
-                    ++source;
-                    --targetCapacity;
-                } else {
-                    break;
-                }
+                break;
             }
         }
     }
@@ -488,9 +466,7 @@
                     prev=BOCU1_ASCII_PREV;
                 }
                 *target++=(uint8_t)c;
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
                 --targetCapacity;
 
                 sourceIndex=nextSourceIndex;
@@ -527,9 +503,7 @@
             prev=BOCU1_PREV(c);
             if(DIFF_IS_SINGLE(diff)) {
                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
                 --targetCapacity;
                 sourceIndex=nextSourceIndex;
                 if(c<0x3000) {
@@ -551,10 +525,8 @@
                 }
                 *target++=(uint8_t)diff;
                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
+                *offsets++=sourceIndex;
                 targetCapacity-=2;
                 sourceIndex=nextSourceIndex;
             } else {
@@ -566,40 +538,23 @@
                 /* write the output character bytes from diff and length */
                 /* from the first if in the loop we know that targetCapacity>0 */
                 if(length<=targetCapacity) {
-                    if(offsets==NULL) {
-                        switch(length) {
-                            /* each branch falls through to the next one */
-                        case 4:
-                            *target++=(uint8_t)(diff>>24);
-                        case 3:
-                            *target++=(uint8_t)(diff>>16);
-                        /* case 2: handled above */
-                            *target++=(uint8_t)(diff>>8);
-                        /* case 1: handled above */
-                            *target++=(uint8_t)diff;
-                        default:
-                            /* will never occur */
-                            break;
-                        }
-                    } else {
-                        switch(length) {
-                            /* each branch falls through to the next one */
-                        case 4:
-                            *target++=(uint8_t)(diff>>24);
-                            *offsets++=sourceIndex;
-                        case 3:
-                            *target++=(uint8_t)(diff>>16);
-                            *offsets++=sourceIndex;
-                        case 2:
-                            *target++=(uint8_t)(diff>>8);
-                            *offsets++=sourceIndex;
-                        /* case 1: handled above */
-                            *target++=(uint8_t)diff;
-                            *offsets++=sourceIndex;
-                        default:
-                            /* will never occur */
-                            break;
-                        }
+                    switch(length) {
+                        /* each branch falls through to the next one */
+                    case 4:
+                        *target++=(uint8_t)(diff>>24);
+                        *offsets++=sourceIndex;
+                    case 3:
+                        *target++=(uint8_t)(diff>>16);
+                        *offsets++=sourceIndex;
+                    case 2:
+                        *target++=(uint8_t)(diff>>8);
+                        *offsets++=sourceIndex;
+                    /* case 1: handled above */
+                        *target++=(uint8_t)diff;
+                        *offsets++=sourceIndex;
+                    default:
+                        /* will never occur */
+                        break;
                     }
                     targetCapacity-=length;
                     sourceIndex=nextSourceIndex;
@@ -635,19 +590,13 @@
                         /* each branch falls through to the next one */
                     case 3:
                         *target++=(uint8_t)(diff>>16);
-                        if(offsets!=NULL) {
-                            *offsets++=sourceIndex;
-                        }
+                        *offsets++=sourceIndex;
                     case 2:
                         *target++=(uint8_t)(diff>>8);
-                        if(offsets!=NULL) {
-                            *offsets++=sourceIndex;
-                        }
+                        *offsets++=sourceIndex;
                     case 1:
                         *target++=(uint8_t)diff;
-                        if(offsets!=NULL) {
-                            *offsets++=sourceIndex;
-                        }
+                        *offsets++=sourceIndex;
                     default:
                         /* will never occur */
                         break;
@@ -666,19 +615,9 @@
         }
     }
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(c<0 && U_SUCCESS(*pErrorCode)) {
-            /* a Unicode code point remains incomplete (only a first surrogate) */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        cnv->fromUSurrogateLead=0;
-        cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
-    } else {
-        /* set the converter state back into UConverter */
-        cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
-        cnv->fromUnicodeStatus=(uint32_t)prev;
-    }
+    /* set the converter state back into UConverter */
+    cnv->fromUChar32= c<0 ? -c : 0;
+    cnv->fromUnicodeStatus=(uint32_t)prev;
 
     /* write back the updated pointers */
     pArgs->source=source;
@@ -711,7 +650,7 @@
     targetCapacity=pArgs->targetLimit-pArgs->target;
 
     /* get the converter state from UConverter */
-    c=cnv->fromUSurrogateLead;
+    c=cnv->fromUChar32;
     prev=(int32_t)cnv->fromUnicodeStatus;
     if(prev==0) {
         prev=BOCU1_ASCII_PREV;
@@ -897,19 +836,9 @@
         }
     }
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(c<0 && U_SUCCESS(*pErrorCode)) {
-            /* a Unicode code point remains incomplete (only a first surrogate) */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        cnv->fromUSurrogateLead=0;
-        cnv->fromUnicodeStatus=BOCU1_ASCII_PREV;
-    } else {
-        /* set the converter state back into UConverter */
-        cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
-        cnv->fromUnicodeStatus=(uint32_t)prev;
-    }
+    /* set the converter state back into UConverter */
+    cnv->fromUChar32= c<0 ? -c : 0;
+    cnv->fromUnicodeStatus=(uint32_t)prev;
 
     /* write back the updated pointers */
     pArgs->source=source;
@@ -1039,7 +968,6 @@
     nextSourceIndex=0;
 
     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
-loop:
     if(count>0 && byteIndex>0 && target<targetLimit) {
         goto getTrail;
     }
@@ -1052,53 +980,29 @@
     if(count>diff) {
         count=diff;
     }
-    if(offsets==NULL) {
-        while(count>0) {
-            if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
-                c=prev+(c-BOCU1_MIDDLE);
-                if(c<0x3000) {
-                    *target++=(UChar)c;
-                    prev=BOCU1_SIMPLE_PREV(c);
-                } else {
-                    break;
-                }
-            } else if(c<=0x20) {
-                if(c!=0x20) {
-                    prev=BOCU1_ASCII_PREV;
-                }
-                *target++=(UChar)c;
-            } else {
-                break;
-            }
-            ++source;
-            --count;
-        }
-        /* sourceIndex and nextSourceIndex are wrong but does not matter */
-    } else {
-        while(count>0) {
-            if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
-                c=prev+(c-BOCU1_MIDDLE);
-                if(c<0x3000) {
-                    *target++=(UChar)c;
-                    *offsets++=nextSourceIndex++;
-                    prev=BOCU1_SIMPLE_PREV(c);
-                } else {
-                    break;
-                }
-            } else if(c<=0x20) {
-                if(c!=0x20) {
-                    prev=BOCU1_ASCII_PREV;
-                }
+    while(count>0) {
+        if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
+            c=prev+(c-BOCU1_MIDDLE);
+            if(c<0x3000) {
                 *target++=(UChar)c;
                 *offsets++=nextSourceIndex++;
+                prev=BOCU1_SIMPLE_PREV(c);
             } else {
                 break;
             }
-            ++source;
-            --count;
+        } else if(c<=0x20) {
+            if(c!=0x20) {
+                prev=BOCU1_ASCII_PREV;
+            }
+            *target++=(UChar)c;
+            *offsets++=nextSourceIndex++;
+        } else {
+            break;
         }
-        sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
+        ++source;
+        --count;
     }
+    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
 
     /* decode a sequence of single and lead bytes */
     while(source<sourceLimit) {
@@ -1115,9 +1019,7 @@
             c=prev+(c-BOCU1_MIDDLE);
             if(c<0x3000) {
                 *target++=(UChar)c;
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
                 prev=BOCU1_SIMPLE_PREV(c);
                 sourceIndex=nextSourceIndex;
                 goto fastSingle;
@@ -1131,9 +1033,7 @@
                 prev=BOCU1_ASCII_PREV;
             }
             *target++=(UChar)c;
-            if(offsets!=NULL) {
-                *offsets++=sourceIndex;
-            }
+            *offsets++=sourceIndex;
             sourceIndex=nextSourceIndex;
             continue;
         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
@@ -1151,7 +1051,8 @@
                 bytes[0]=source[-2];
                 bytes[1]=source[-1];
                 byteIndex=2;
-                goto callback;
+                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                break;
             }
         } else if(c==BOCU1_RESET) {
             /* only reset the state, no code point */
@@ -1181,7 +1082,8 @@
                 /* trail byte in any position */
                 c=decodeBocu1TrailByte(count, c);
                 if(c<0) {
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 diff+=c;
@@ -1190,7 +1092,8 @@
                     byteIndex=0;
                     c=prev+diff;
                     if((uint32_t)c>0x10ffff) {
-                        goto callback;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        goto endloop;
                     }
                     break;
                 }
@@ -1201,23 +1104,17 @@
         prev=BOCU1_PREV(c);
         if(c<=0xffff) {
             *target++=(UChar)c;
-            if(offsets!=NULL) {
-                *offsets++=sourceIndex;
-            }
+            *offsets++=sourceIndex;
         } else {
             /* output surrogate pair */
             *target++=UTF16_LEAD(c);
             if(target<targetLimit) {
                 *target++=UTF16_TRAIL(c);
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
+                *offsets++=sourceIndex;
             } else {
                 /* target overflow */
-                if(offsets!=NULL) {
-                    *offsets++=sourceIndex;
-                }
+                *offsets++=sourceIndex;
                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
                 cnv->UCharErrorBufferLength=1;
                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@@ -1228,90 +1125,22 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
+    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
+        /* set the converter state in UConverter to deal with the next character */
         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
         cnv->mode=0;
-        cnv->toULength=0;
     } else {
         /* set the converter state back into UConverter */
         cnv->toUnicodeStatus=(uint32_t)prev;
         cnv->mode=(diff<<2)|count;
-        cnv->toULength=byteIndex;
     }
+    cnv->toULength=byteIndex;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     pArgs->offsets=offsets;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-    pArgs->offsets=offsets;
-
-    /* copy the current bytes to invalidCharBuffer */
-    cnv->invalidCharBuffer[0]=bytes[0];
-    cnv->invalidCharBuffer[1]=bytes[1];
-    cnv->invalidCharBuffer[2]=bytes[2];
-    cnv->invalidCharBuffer[3]=bytes[3];
-    cnv->invalidCharLength=(int8_t)byteIndex;
-
-    /* set the converter state in UConverter to deal with the next character */
-    cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
-    cnv->mode=0;
-    cnv->toULength=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, (const char *)bytes, byteIndex, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    prev=(int32_t)cnv->toUnicodeStatus;
-    if(prev==0) {
-        prev=BOCU1_ASCII_PREV;
-    }
-    diff=cnv->mode;
-    count=diff&3;
-    diff>>=2;
-
-    byteIndex=cnv->toULength;
-
-    /* update target and deal with offsets if necessary */
-    offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
-    target=pArgs->target;
-
-    /* update the source pointer and index */
-    sourceIndex=nextSourceIndex+((const uint8_t *)pArgs->source-source);
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* reset and break on error */
-        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
-        cnv->mode=0;
-        cnv->toULength=0;
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 /*
@@ -1356,7 +1185,6 @@
     bytes=cnv->toUBytes;
 
     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
-loop:
     if(count>0 && byteIndex>0 && target<targetLimit) {
         goto getTrail;
     }
@@ -1431,7 +1259,8 @@
                 bytes[0]=source[-2];
                 bytes[1]=source[-1];
                 byteIndex=2;
-                goto callback;
+                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                break;
             }
         } else if(c==BOCU1_RESET) {
             /* only reset the state, no code point */
@@ -1459,7 +1288,8 @@
                 /* trail byte in any position */
                 c=decodeBocu1TrailByte(count, c);
                 if(c<0) {
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 diff+=c;
@@ -1468,7 +1298,8 @@
                     byteIndex=0;
                     c=prev+diff;
                     if((uint32_t)c>0x10ffff) {
-                        goto callback;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                        goto endloop;
                     }
                     break;
                 }
@@ -1495,85 +1326,21 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(byteIndex>0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
+    if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
+        /* set the converter state in UConverter to deal with the next character */
         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
         cnv->mode=0;
-        cnv->toULength=0;
     } else {
         /* set the converter state back into UConverter */
         cnv->toUnicodeStatus=(uint32_t)prev;
         cnv->mode=(diff<<2)|count;
-        cnv->toULength=byteIndex;
     }
+    cnv->toULength=byteIndex;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-
-    /* copy the current bytes to invalidCharBuffer */
-    cnv->invalidCharBuffer[0]=bytes[0];
-    cnv->invalidCharBuffer[1]=bytes[1];
-    cnv->invalidCharBuffer[2]=bytes[2];
-    cnv->invalidCharBuffer[3]=bytes[3];
-    cnv->invalidCharLength=(int8_t)byteIndex;
-
-    /* set the converter state in UConverter to deal with the next character */
-    cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
-    cnv->mode=0;
-    cnv->toULength=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, (const char *)bytes, byteIndex, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    prev=(int32_t)cnv->toUnicodeStatus;
-    if(prev==0) {
-        prev=BOCU1_ASCII_PREV;
-    }
-    diff=cnv->mode;
-    count=diff&3;
-    diff>>=2;
-
-    byteIndex=cnv->toULength;
-
-    target=pArgs->target;
-
-    /* update the source pointer and index */
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* reset and break on error */
-        cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
-        cnv->mode=0;
-        cnv->toULength=0;
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 /* miscellaneous ------------------------------------------------------------ */

Index: ucnvhz.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvhz.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnvhz.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnvhz.c	6 Apr 2004 10:08:03 -0000	1.5
@@ -69,7 +69,7 @@
     cnv->toUnicodeStatus = 0;
     cnv->fromUnicodeStatus= 0;
     cnv->mode=0;
-    cnv->fromUSurrogateLead=0x0000;
+    cnv->fromUChar32=0x0000;
     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
     if(cnv->extraInfo != NULL){
         ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
@@ -108,7 +108,7 @@
     }
     if(choice!=UCNV_RESET_TO_UNICODE) {
         cnv->fromUnicodeStatus= 0;
-        cnv->fromUSurrogateLead=0x0000; 
+        cnv->fromUChar32=0x0000; 
         if(cnv->extraInfo != NULL){
             ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
             ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
@@ -142,22 +142,20 @@
 static void 
 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                                             UErrorCode* err){
-    char tempBuf[3];
-    const char* pBuf;
+    char tempBuf[2];
     const char *mySource = ( char *) args->source;
     UChar *myTarget = args->target;
-    char *tempLimit = &tempBuf[3]; 
     const char *mySourceLimit = args->sourceLimit;
     UChar32 targetUniChar = 0x0000;
     UChar mySourceChar = 0x0000;
     UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
        
-    if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
+    if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
         *err = U_ILLEGAL_ARGUMENT_ERROR;
         return;
     }
     
-    while(mySource< args->sourceLimit){
+    while(mySource< mySourceLimit){
         
         if(myTarget < args->targetLimit){
             
@@ -234,19 +232,14 @@
                     tempBuf[1] = (char) (mySourceChar+0x80);
                     mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
                     args->converter->toUnicodeStatus =0x00;
-                    pBuf = &tempBuf[0];
-                    tempLimit = &tempBuf[2]+1;
                     targetUniChar = _MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
-                        &pBuf,tempLimit,args->converter->useFallback);
+                        tempBuf, 2, args->converter->useFallback);
                 }
             }
             else{
                 if(args->converter->fromUnicodeStatus == 0x00){
-                    tempBuf[0] = (char) mySourceChar;
-                    pBuf = &tempBuf[0];
-                    tempLimit = &tempBuf[1];
                     targetUniChar = _MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
-                        &pBuf,tempLimit,args->converter->useFallback);
+                        mySource - 1, 1, args->converter->useFallback);
                 }
                 else{
                     goto SAVE_STATE;
@@ -262,62 +255,22 @@
             }
             else if(targetUniChar>=0xfffe){
 SAVE_STATE:
-                {
-                   const char *saveSource = args->source;
-                    UChar *saveTarget = args->target; 
-                    int32_t *saveOffsets = args->offsets;
-                    
-                    UConverterCallbackReason reason;
-                    int32_t currentOffset ;
-                    int32_t saveIndex = (int32_t)(myTarget - args->target);
-
-                    args->converter->invalidCharLength=0;
-                   
-                    if(targetUniChar == 0xfffe){
-                        reason = UCNV_UNASSIGNED;
-                        *err = U_INVALID_CHAR_FOUND;
-                    }
-                    else{
-                        reason = UCNV_ILLEGAL;
-                        *err = U_ILLEGAL_CHAR_FOUND;
-                    }
-                    if(myData->isStateDBCS){
-
-                        args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[0]-0x80);
-                        args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)(tempBuf[1]-0x80);
-                        currentOffset= (int32_t)(mySource - args->source -2);
-                    
-                    }
-                    else{
-                        args->converter->invalidCharBuffer[args->converter->invalidCharLength++] = (char)mySourceChar;
-                        currentOffset= (int32_t)(mySource - args->source -1);
-                    }
-                    args->offsets = args->offsets?args->offsets+(myTarget - args->target):0;
-                    args->target = myTarget;
-                    args->source = mySource;
-                    myTarget = saveTarget;
-                    args->converter->fromCharErrorBehaviour ( 
-                         args->converter->toUContext, 
-                         args, 
-                         args->converter->invalidCharBuffer, 
-                         args->converter->invalidCharLength, 
-                         reason, 
-                         err); 
-
-                    if(args->offsets){
-                        args->offsets = saveOffsets; 
-
-                        for (;saveIndex < (args->target - myTarget);saveIndex++) {
-                          args->offsets[saveIndex] += currentOffset;
-                        } 
-                    }
-                    args->source  = saveSource;
-                    myTarget = args->target;
-                    args->target  = saveTarget;
-                    args->offsets = saveOffsets;
-                    if(U_FAILURE(*err))
-                        break;
+                if(targetUniChar == 0xfffe){
+                    *err = U_INVALID_CHAR_FOUND;
                 }
+                else{
+                    *err = U_ILLEGAL_CHAR_FOUND;
+                }
+                if(myData->isStateDBCS){
+                    args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
+                    args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
+                    args->converter->toULength=2;
+                }
+                else{
+                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+                    args->converter->toULength=1;
+                }
+                break;
             }
         }
         else{
@@ -325,18 +278,6 @@
             break;
         }
     }
-    if((args->flush==TRUE)
-        && (mySource == mySourceLimit) 
-        && ( args->converter->toUnicodeStatus !=0x00)){
-            *err = U_TRUNCATED_CHAR_FOUND;
-            args->converter->toUnicodeStatus = 0x00;
-    }
-    /* Reset the state of converter if we consumed 
-     * the source and flush is true
-     */
-    if( (mySource == mySourceLimit) && args->flush){
-         _HZReset(args->converter, UCNV_RESET_TO_UNICODE);
-    }
 
     args->target = myTarget;
     args->source = mySource;
@@ -347,11 +288,11 @@
 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
                                                       UErrorCode * err){
     const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    char *myTarget = args->target;
     int32_t* offsets = args->offsets;
     int32_t mySourceIndex = 0;
     int32_t myTargetIndex = 0;
-    int32_t targetLength = (int32_t)(args->targetLimit - args->target);
+    int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
     int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
     int32_t length=0;
     uint32_t targetUniChar = 0x0000;
@@ -359,16 +300,15 @@
     UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
     UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
     UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
-    UConverterCallbackReason reason;
     UBool isEscapeAppended =FALSE;
     int len =0;
     const char* escSeq=NULL;
     
-    if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
+    if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
         *err = U_ILLEGAL_ARGUMENT_ERROR;
         return;
     }
-    if(args->converter->fromUSurrogateLead!=0 && myTargetIndex < targetLength) {
+    if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
         goto getTrail;
     }
     /*writing the char to the output stream */
@@ -376,7 +316,7 @@
         targetUniChar = missingCharMarker;
         if (myTargetIndex < targetLength){
             
-            c=mySourceChar = (UChar) args->source[mySourceIndex++];
+            c=mySourceChar = (UChar) mySource[mySourceIndex++];
             
 
             oldIsTargetUCharDBCS = isTargetUCharDBCS;
@@ -418,12 +358,12 @@
             
                 if(isTargetUCharDBCS){
                     if( myTargetIndex <targetLength){
-                        args->target[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
+                        myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
                         if(offsets){
                             *(offsets++) = mySourceIndex-1;
                         }
                         if(myTargetIndex < targetLength){
-                            args->target[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
+                            myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
                             if(offsets){
                                 *(offsets++) = mySourceIndex-1;
                             }
@@ -439,7 +379,7 @@
 
                 }else{
                     if( myTargetIndex <targetLength){
-                        args->target[myTargetIndex++] = (char) (targetUniChar );
+                        myTarget[myTargetIndex++] = (char) (targetUniChar );
                         if(offsets){
                             *(offsets++) = mySourceIndex-1;
                         }
@@ -452,16 +392,12 @@
 
             }
             else{
-                /* oops.. the code point is unassingned
-                 * set the error and reason
-                 */
-                reason =UCNV_UNASSIGNED;
-                *err =U_INVALID_CHAR_FOUND;
+                /* oops.. the code point is unassigned */
                 /*Handle surrogates */
                 /*check if the char is a First surrogate*/
                 if(UTF_IS_SURROGATE(mySourceChar)) {
                     if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
-                        args->converter->fromUSurrogateLead=(UChar)mySourceChar;
+                        args->converter->fromUChar32=mySourceChar;
 getTrail:
                         /*look ahead to find the trail surrogate*/
                         if(mySourceIndex <  mySourceLength) {
@@ -469,87 +405,32 @@
                             UChar trail=(UChar) args->source[mySourceIndex];
                             if(UTF_IS_SECOND_SURROGATE(trail)) {
                                 ++mySourceIndex;
-                                mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
-                                args->converter->fromUSurrogateLead=0x00;
+                                mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
+                                args->converter->fromUChar32=0x00;
                                 /* there are no surrogates in GB2312*/
                                 *err = U_INVALID_CHAR_FOUND;
-                                reason=UCNV_UNASSIGNED;
                                 /* exit this condition tree */
                             } else {
                                 /* this is an unmatched lead code unit (1st surrogate) */
                                 /* callback(illegal) */
-                                reason=UCNV_ILLEGAL;
                                 *err=U_ILLEGAL_CHAR_FOUND;
                             }
                         } else {
                             /* no more input */
                             *err = U_ZERO_ERROR;
-                            break;
                         }
                     } else {
                         /* this is an unmatched trail code unit (2nd surrogate) */
                         /* callback(illegal) */
-                        reason=UCNV_ILLEGAL;
                         *err=U_ILLEGAL_CHAR_FOUND;
                     }
+                } else {
+                    /* callback(unassigned) for a BMP code point */
+                    *err = U_INVALID_CHAR_FOUND;
                 }
 
-                {
-                    int32_t saveIndex=0;
-                    int32_t currentOffset = (args->offsets) ? *(offsets-1)+1:0;
-                    char * saveTarget = args->target;
-                    const UChar* saveSource = args->source;
-                    int32_t *saveOffsets = args->offsets;
-
-                    args->converter->invalidUCharLength = 0;
-
-                    if(mySourceChar>0xffff){
-                        args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)>>10)+0xd7c0);
-                        args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)&0x3ff)|0xdc00);
-                    }
-                    else{
-                        args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)mySourceChar;
-                    }
-                
-                    myConverterData->isTargetUCharDBCS = (UBool)isTargetUCharDBCS;
-                    args->target += myTargetIndex;
-                    args->source += mySourceIndex;
-                    args->offsets = args->offsets?offsets:0;
-                    
-
-                    saveIndex = myTargetIndex; 
-                    /*copies current values for the ErrorFunctor to update */ 
-                    /*Calls the ErrorFunctor */ 
-                    args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext, 
-                                  args, 
-                                  args->converter->invalidUCharBuffer, 
-                                  args->converter->invalidUCharLength, 
-                                 (UChar32) (mySourceChar), 
-                                  reason, 
-                                  err);
-                    /*Update the local Indexes so that the conversion 
-                    *can restart at the right points 
-                    */ 
-                    myTargetIndex = (int32_t)(args->target - (char*)myTarget);
-                    mySourceIndex = (int32_t)(args->source - mySource);
-                    args->offsets = saveOffsets; 
-                    saveIndex = myTargetIndex - saveIndex;
-                    if(args->offsets){
-                        args->offsets = saveOffsets; 
-                        while(saveIndex-->0){
-                             *offsets = currentOffset;
-                              offsets++;
-                        }
-                    }
-                    isTargetUCharDBCS=myConverterData->isTargetUCharDBCS;
-                    args->source = saveSource;
-                    args->target = saveTarget;
-                    args->offsets = saveOffsets;
-                    args->converter->fromUSurrogateLead=0x00;
-                    if (U_FAILURE (*err))
-                        break;
-
-                }
+                args->converter->fromUChar32=mySourceChar;
+                break;
             }
         }
         else{
@@ -558,19 +439,6 @@
         }
         targetUniChar=missingCharMarker;
     }
-    /*If at the end of conversion we are still carrying state information
-     *flush is TRUE, we can deduce that the input stream is truncated
-     */
-    if (args->converter->fromUSurrogateLead !=0 && (mySourceIndex == mySourceLength) && args->flush){
-        *err = U_TRUNCATED_CHAR_FOUND;
-        args->converter->toUnicodeStatus = 0x00;
-    }
-    /* Reset the state of converter if we consumed 
-     * the source and flush is true
-     */
-    if( (mySourceIndex == mySourceLength) && args->flush){
-        _HZReset(args->converter, UCNV_RESET_FROM_UNICODE);
-    }
 
     args->target += myTargetIndex;
     args->source += mySourceIndex;
@@ -628,7 +496,6 @@
 
     localClone = (struct cloneStruct *)stackBuffer;
     uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
-    localClone->cnv.isCopyLocal = TRUE;
 
     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
     localClone->cnv.extraInfo = &localClone->mydata;

Index: ucnvisci.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvisci.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ucnvisci.c	10 Sep 2003 02:42:03 -0000	1.4
+++ ucnvisci.c	6 Apr 2004 10:08:03 -0000	1.5
@@ -116,7 +116,7 @@
     MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
     MaskEnum currentMaskToUnicode;   /* mask for current state in toUnicode */
     MaskEnum defMaskToUnicode;       /* mask for default state in toUnicode */
-    UBool isFirstBuffer;
+    UBool isFirstBuffer;             /* boolean for fromUnicode to see if we need to announce the first script */
     char name[30];
 }UConverterDataISCII; 
 
@@ -197,13 +197,12 @@
         data->contextCharToUnicode=NO_CHAR_MARKER;
     }
     if(choice!=UCNV_RESET_TO_UNICODE) {
-        cnv->fromUSurrogateLead=0x0000; 
+        cnv->fromUChar32=0x0000; 
         data->contextCharFromUnicode=0x00;
         data->currentMaskFromUnicode=data->defDeltaToUnicode;
         data->currentDeltaFromUnicode=data->defDeltaToUnicode;
+        data->isFirstBuffer=TRUE;
     }
-    data->isFirstBuffer=TRUE;
-
 }
 
 /** 
@@ -811,7 +810,6 @@
     int32_t* offsets = args->offsets;
     uint32_t targetByteUnit = 0x0000;
     UChar32 sourceChar = 0x0000;
-    UConverterCallbackReason reason;
     UBool useFallback;
     UConverterDataISCII *converterData;
     uint16_t newDelta=0;
@@ -828,7 +826,7 @@
     newDelta=converterData->currentDeltaFromUnicode;
     range = (uint16_t)(newDelta/DELTA);
     
-    if(args->converter->fromUSurrogateLead!=0 && target <targetLimit) {
+    if((sourceChar = args->converter->fromUChar32)!=0) {
         goto getTrail;
     }
 
@@ -946,16 +944,10 @@
              }
         }
         else{
-            /* oops.. the code point is unassingned
-             * set the error and reason
-             */
-            reason =UCNV_UNASSIGNED;
-            *err =U_INVALID_CHAR_FOUND;
-
+            /* oops.. the code point is unassigned */
             /*check if the char is a First surrogate*/
             if(UTF_IS_SURROGATE(sourceChar)) {
                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
-                    args->converter->fromUSurrogateLead=(UChar)sourceChar;
 getTrail:
                     /*look ahead to find the trail surrogate*/
                     if(source <  sourceLimit) {
@@ -963,111 +955,34 @@
                         UChar trail= (*source);
                         if(UTF_IS_SECOND_SURROGATE(trail)) {
                             source++;
-                            sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
-                            args->converter->fromUSurrogateLead=0x00;
-                            reason =UCNV_UNASSIGNED;
+                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                             *err =U_INVALID_CHAR_FOUND;
                             /* convert this surrogate code point */
                             /* exit this condition tree */
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            sourceChar =  args->converter->fromUSurrogateLead;
-                            reason=UCNV_ILLEGAL;
                             *err=U_ILLEGAL_CHAR_FOUND;
                         }
                     } else {
                         /* no more input */
                         *err = U_ZERO_ERROR;
-                        break;
                     }
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    reason=UCNV_ILLEGAL;
                     *err=U_ILLEGAL_CHAR_FOUND;
                 }
+            } else {
+                /* callback(unassigned) for a BMP code point */
+                *err = U_INVALID_CHAR_FOUND;
             }
-            {
-                /*variables for callback */
-                const UChar* saveSource =NULL;
-                char* saveTarget =NULL;
-                int32_t* saveOffsets =NULL;
-                int currentOffset =0;
-                int32_t saveIndex =0;
-
-                args->converter->invalidUCharLength = 0;
-
-                if(sourceChar>0xffff){
-                    /* we have got a surrogate pair... dissable and populate the invalidUCharBuffer */
-                    args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] 
-                        =(uint16_t)(((sourceChar)>>10)+0xd7c0);
-                    args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] 
-                        =(uint16_t)(((sourceChar)&0x3ff)|0xdc00);
-                }
-                else{
-                    args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] 
-                        =(UChar)sourceChar;
-                }
-                
-                if(offsets){
-                    currentOffset = *(offsets-1)+1;
-                }
-                saveSource = args->source;
-                saveTarget = args->target;
-                saveOffsets = args->offsets;
-                args->target = (char*)target;
-                args->source = source;
-                args->offsets = offsets;
-
-                /*copies current values for the ErrorFunctor to update */
-                /*Calls the ErrorFunctor */
-                args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext, 
-                              args, 
-                              args->converter->invalidUCharBuffer, 
-                              args->converter->invalidUCharLength, 
-                             (UChar32) (sourceChar), 
-                              reason, 
-                              err);
-
-                saveIndex = (int32_t)(args->target - (char*)target);
-                if(args->offsets){
-                    args->offsets = saveOffsets;
-                    while(saveIndex-->0){
-                         *offsets = currentOffset;
-                          offsets++;
-                    }
-                }
-                target = (unsigned char*)args->target;
-                args->source=saveSource;
-                args->target=saveTarget;
-                args->offsets=saveOffsets;
-                args->converter->fromUSurrogateLead=0x00;
 
-                if (U_FAILURE (*err)){
-                    break;
-                }
-            }
+            args->converter->fromUChar32=sourceChar;
+            break;
         }
-
-
     }/* end while(mySourceIndex<mySourceLength) */
 
-
-    /*If at the end of conversion we are still carrying state information
-     *flush is TRUE, we can deduce that the input stream is truncated
-     */
-    if (args->converter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
-        *err = U_TRUNCATED_CHAR_FOUND;
-    }
-    /* Reset the state of converter if we consumed 
-     * the source and flush is true
-     */
-    if( (source == sourceLimit) && args->flush){
-       /*reset converter*/
-        _ISCIIReset(args->converter,UCNV_RESET_FROM_UNICODE);
-    }
-
     /*save the state and return */
     args->source = source;
     args->target = (char*)target;
@@ -1154,7 +1069,6 @@
     uint32_t targetUniChar = 0x0000;
     uint8_t sourceChar = 0x0000;
     UConverterDataISCII* data;
-    UConverterCallbackReason reason;
     UChar32* toUnicodeStatus=NULL;
     UChar* contextCharToUnicode = NULL;
 
@@ -1193,17 +1107,14 @@
                     data->currentDeltaToUnicode = data->defDeltaToUnicode;
                     data->currentMaskToUnicode = data->defMaskToUnicode;
                 }else{
-                    
                     if((sourceChar >= 0x21 && sourceChar <= 0x3F)){
                         /* these are display codes consume and continue */
                     }else{
                         *err =U_ILLEGAL_CHAR_FOUND;
                         /* reset */
                         *contextCharToUnicode=NO_CHAR_MARKER;
-                        reason = UCNV_ILLEGAL;
                         goto CALLBACK;
                     }
-
                 }
 
                 /* reset */
@@ -1233,11 +1144,9 @@
                     /* byte unit is unassigned */
                     targetUniChar = missingCharMarker;
                     *err= U_INVALID_CHAR_FOUND;
-                    reason = UCNV_UNASSIGNED;
                 }else{
                     /* only 0xA1 - 0xEE are legal after EXT char */
                     *contextCharToUnicode= NO_CHAR_MARKER;
-                    reason= UCNV_ILLEGAL;
                     *err = U_ILLEGAL_CHAR_FOUND;
                 }
                 goto CALLBACK;
@@ -1345,49 +1254,11 @@
                 /* we reach here only if targetUniChar == missingCharMarker 
                  * so assign codes to reason and err
                  */
-                reason = UCNV_UNASSIGNED;
                 *err = U_INVALID_CHAR_FOUND;
 CALLBACK:
-                 {
-                    const char *saveSource = args->source;
-                    UChar *saveTarget = args->target;
-                    int32_t *saveOffsets = NULL;
-                    int32_t currentOffset = (int32_t)(source - args->source -1);
-                    int32_t saveIndex = (int32_t)(target - args->target);
-
-                    args->converter->invalidCharLength=0;
-
-                    args->converter->invalidCharBuffer[args->converter->invalidCharLength++] =
-                        (char) sourceChar;
-
-                    if(args->offsets){
-                        saveOffsets=args->offsets;
-                        args->offsets = args->offsets+(target - args->target);
-                    }
-
-                    args->target =target;
-                    target =saveTarget;
-                    args->source = source;
-
-                    args->converter->fromCharErrorBehaviour ( 
-                         args->converter->toUContext, 
-                         args, 
-                         args->converter->invalidCharBuffer, 
-                         args->converter->invalidCharLength, 
-                         reason, 
-                         err);
-
-                    if(args->offsets){
-                        args->offsets = saveOffsets;
-
-                        for (;saveIndex < (args->target - target);saveIndex++) {
-                          *(args->offsets)++ = currentOffset;
-                        }
-                    }
-                    target=args->target;
-                    args->source  = saveSource;
-                    args->target  = saveTarget;
-                }
+                args->converter->toUBytes[0] = (uint8_t) sourceChar;
+                args->converter->toULength = 1;
+                break;
             }
 
         }
@@ -1396,26 +1267,30 @@
             break;
         }
     }
-    if((args->flush==TRUE)
-            && (source == sourceLimit) 
-            && data->contextCharToUnicode != NO_CHAR_MARKER){
-        /* if we have ATR in context it is an error */
-        if(data->contextCharToUnicode==ATR || data->contextCharToUnicode==EXT || *toUnicodeStatus == missingCharMarker){
-            *err = U_TRUNCATED_CHAR_FOUND;
+
+    if(U_SUCCESS(*err) && args->flush && source == sourceLimit) {
+        /* end of the input stream */
+        UConverter *cnv = args->converter;
+
+        if(*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV){
+            /* set toUBytes[] */
+            cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode;
+            cnv->toULength = 1;
+
+            /* avoid looping on truncated sequences */
+            *contextCharToUnicode = NO_CHAR_MARKER;
         }else{
+            cnv->toULength = 0;
+        }
+
+        if(*toUnicodeStatus != missingCharMarker) {
+            /* output a remaining target character */
             WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),
                             *toUnicodeStatus,data->currentDeltaToUnicode,err);
-           *toUnicodeStatus = missingCharMarker;
+            *toUnicodeStatus = missingCharMarker;
         }
-
-    }
-    /* Reset the state of converter if we consumed 
-     * the source and flush is true
-     */
-    if( (source == sourceLimit) && args->flush){
-        /*reset converter*/
-        _ISCIIReset(args->converter,UCNV_RESET_TO_UNICODE);
     }
+
     args->target = target;
     args->source = source;
 }
@@ -1448,7 +1323,6 @@
 
     localClone = (struct cloneStruct *)stackBuffer;
     uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
-    localClone->cnv.isCopyLocal = TRUE;
 
     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII));
     localClone->cnv.extraInfo = &localClone->mydata;

Index: ucnvlat1.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvlat1.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnvlat1.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnvlat1.c	6 Apr 2004 10:08:03 -0000	1.4
@@ -146,23 +146,21 @@
 _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                               UErrorCode *pErrorCode) {
     UConverter *cnv;
-    const UChar *source, *sourceLimit, *lastSource;
-    uint8_t *target;
+    const UChar *source, *sourceLimit;
+    uint8_t *target, *oldTarget;
     int32_t targetCapacity, length;
     int32_t *offsets;
 
-    UChar32 c, max;
+    UChar32 cp;
+    UChar c, max;
 
     int32_t sourceIndex;
 
-    UConverterCallbackReason reason;
-    int32_t i;
-
     /* set up the local pointers */
     cnv=pArgs->converter;
     source=pArgs->source;
     sourceLimit=pArgs->sourceLimit;
-    target=(uint8_t *)pArgs->target;
+    target=oldTarget=(uint8_t *)pArgs->target;
     targetCapacity=pArgs->targetLimit-pArgs->target;
     offsets=pArgs->offsets;
 
@@ -173,11 +171,10 @@
     }
 
     /* get the converter state from UConverter */
-    c=cnv->fromUSurrogateLead;
+    cp=cnv->fromUChar32;
 
     /* sourceIndex=-1 if the current character began in the previous buffer */
-    sourceIndex= c==0 ? 0 : -1;
-    lastSource=source;
+    sourceIndex= cp==0 ? 0 : -1;
 
     /*
      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
@@ -189,13 +186,12 @@
     }
 
     /* conversion loop */
-    if(c!=0 && targetCapacity>0) {
+    if(cp!=0 && targetCapacity>0) {
         goto getTrail;
     }
 
 #if LATIN1_UNROLL_FROM_UNICODE
     /* unroll the loop with the most common case */
-unrolled:
     if(targetCapacity>=16) {
         int32_t count, loops;
         UChar u, oredChars;
@@ -247,7 +243,7 @@
         targetCapacity-=16*count;
 
         if(offsets!=NULL) {
-            lastSource+=16*count;
+            oldTarget+=16*count;
             while(count>0) {
                 *offsets++=sourceIndex++;
                 *offsets++=sourceIndex++;
@@ -268,156 +264,62 @@
                 --count;
             }
         }
-
-        c=0;
     }
 #endif
 
-    while(targetCapacity>0) {
-        /*
-         * Get a correct Unicode code point:
-         * a single UChar for a BMP code point or
-         * a matched surrogate pair for a "surrogate code point".
-         */
-        c=*source++;
-        if(c<=max) {
-            /* convert the Unicode code point */
-            *target++=(uint8_t)c;
-            --targetCapacity;
+    /* conversion loop */
+    c=0;
+    while(targetCapacity>0 && (c=*source++)<=max) {
+        /* convert the Unicode code point */
+        *target++=(uint8_t)c;
+        --targetCapacity;
+    }
 
-            /* normal end of conversion: prepare for a new character */
-            c=0;
-        } else {
-            if(!UTF_IS_SURROGATE(c)) {
-                /* callback(unassigned) */
-                reason=UCNV_UNASSIGNED;
-                *pErrorCode=U_INVALID_CHAR_FOUND;
-            } else if(UTF_IS_SURROGATE_FIRST(c)) {
+    if(c>max) {
+        cp=c;
+        if(!U_IS_SURROGATE(cp)) {
+            /* callback(unassigned) */
+        } else if(U_IS_SURROGATE_LEAD(cp)) {
 getTrail:
-                if(source<sourceLimit) {
-                    /* test the following code unit */
-                    UChar trail=*source;
-                    if(UTF_IS_SECOND_SURROGATE(trail)) {
-                        ++source;
-                        c=UTF16_GET_PAIR_VALUE(c, trail);
-                        /* this codepage does not map supplementary code points */
-                        /* callback(unassigned) */
-                        reason=UCNV_UNASSIGNED;
-                        *pErrorCode=U_INVALID_CHAR_FOUND;
-                    } else {
-                        /* this is an unmatched lead code unit (1st surrogate) */
-                        /* callback(illegal) */
-                        reason=UCNV_ILLEGAL;
-                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-                    }
+            if(source<sourceLimit) {
+                /* test the following code unit */
+                UChar trail=*source;
+                if(U16_IS_TRAIL(trail)) {
+                    ++source;
+                    cp=U16_GET_SUPPLEMENTARY(cp, trail);
+                    /* this codepage does not map supplementary code points */
+                    /* callback(unassigned) */
                 } else {
-                    /* no more input */
-                    break;
+                    /* this is an unmatched lead code unit (1st surrogate) */
+                    /* callback(illegal) */
                 }
             } else {
-                /* this is an unmatched trail code unit (2nd surrogate) */
-                /* callback(illegal) */
-                reason=UCNV_ILLEGAL;
-                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-            }
-
-            /* call the callback function with all the preparations and post-processing */
-            /* get the number of code units for c to correctly advance sourceIndex after the callback call */
-            length=UTF_CHAR_LENGTH(c);
-
-            /* set offsets since the start or the last callback */
-            if(offsets!=NULL) {
-                int32_t count=(int32_t)(source-lastSource);
-
-                /* do not set the offset for the callback-causing character */
-                count-=length;
-
-                while(count>0) {
-                    *offsets++=sourceIndex++;
-                    --count;
-                }
-                /* offset and sourceIndex are now set for the current character */
-            }
-
-            /* update the arguments structure */
-            pArgs->source=source;
-            pArgs->target=(char *)target;
-            pArgs->offsets=offsets;
-
-            /* set the converter state in UConverter to deal with the next character */
-            cnv->fromUSurrogateLead=0;
-
-            /* write the code point as code units */
-            i=0;
-            UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
-            cnv->invalidUCharLength=(int8_t)i;
-            /* i==length */
-
-            /* call the callback function */
-            cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
-
-            /* get the converter state from UConverter */
-            c=cnv->fromUSurrogateLead;
-
-            /* update target and deal with offsets if necessary */
-            offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
-            target=(uint8_t *)pArgs->target;
-
-            /* update the source pointer and index */
-            sourceIndex+=length+(pArgs->source-source);
-            source=lastSource=pArgs->source;
-            targetCapacity=(uint8_t *)pArgs->targetLimit-target;
-            length=sourceLimit-source;
-            if(length<targetCapacity) {
-                targetCapacity=length;
-            }
-
-            /*
-             * If the callback overflowed the target, then we need to
-             * stop here with an overflow indication.
-             */
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                break;
-            } else if(U_FAILURE(*pErrorCode)) {
-                /* break on error */
-                c=0;
-                break;
-            } else if(cnv->charErrorBufferLength>0) {
-                /* target is full */
-                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                break;
+                /* no more input */
+                cnv->fromUChar32=cp;
+                goto noMoreInput;
             }
-
-#if LATIN1_UNROLL_FROM_UNICODE
-            goto unrolled;
-#endif
+        } else {
+            /* this is an unmatched trail code unit (2nd surrogate) */
+            /* callback(illegal) */
         }
-    }
 
-    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
+        cnv->fromUChar32=cp;
     }
+noMoreInput:
 
-    /* set offsets since the start or the last callback */
+    /* set offsets since the start */
     if(offsets!=NULL) {
-        size_t count=source-lastSource;
+        size_t count=target-oldTarget;
         while(count>0) {
             *offsets++=sourceIndex++;
             --count;
         }
     }
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a Unicode code point remains incomplete (only a first surrogate) */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        cnv->fromUSurrogateLead=0;
-    } else {
-        /* set the converter state back into UConverter */
-        cnv->fromUSurrogateLead=(UChar)c;
+    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
+        /* target is full */
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     }
 
     /* write back the updated pointers */
@@ -479,23 +381,24 @@
 static void
 _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
-    const uint8_t *source, *sourceLimit, *lastSource;
-    UChar *target;
+    const uint8_t *source, *sourceLimit;
+    UChar *target, *oldTarget;
     int32_t targetCapacity, length;
     int32_t *offsets;
 
     int32_t sourceIndex;
 
+    uint8_t c;
+
     /* set up the local pointers */
     source=(const uint8_t *)pArgs->source;
     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
-    target=pArgs->target;
+    target=oldTarget=pArgs->target;
     targetCapacity=pArgs->targetLimit-pArgs->target;
     offsets=pArgs->offsets;
 
     /* sourceIndex=-1 if the current character began in the previous buffer */
     sourceIndex=0;
-    lastSource=source;
 
     /*
      * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
@@ -508,7 +411,6 @@
 
 #if ASCII_UNROLL_TO_UNICODE
     /* unroll the loop with the most common case */
-unrolled:
     if(targetCapacity>=16) {
         int32_t count, loops;
         UChar oredChars;
@@ -544,7 +446,7 @@
         targetCapacity-=16*count;
 
         if(offsets!=NULL) {
-            lastSource+=16*count;
+            oldTarget+=16*count;
             while(count>0) {
                 *offsets++=sourceIndex++;
                 *offsets++=sourceIndex++;
@@ -569,86 +471,26 @@
 #endif
 
     /* conversion loop */
-    while(targetCapacity>0) {
-        if((*target++=*source++)<=0x7f) {
-            --targetCapacity;
-        } else {
-            UConverter *cnv;
-
-            /* back out the illegal character */
-            --target;
-
-            /* call the callback function with all the preparations and post-processing */
-            cnv=pArgs->converter;
-
-            /* callback(illegal) */
-            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-
-            /* set offsets since the start or the last callback */
-            if(offsets!=NULL) {
-                int32_t count=(int32_t)(source-lastSource);
-
-                /* predecrement: do not set the offset for the callback-causing character */
-                while(--count>0) {
-                    *offsets++=sourceIndex++;
-                }
-                /* offset and sourceIndex are now set for the current character */
-            }
-
-            /* update the arguments structure */
-            pArgs->source=(const char *)source;
-            pArgs->target=target;
-            pArgs->offsets=offsets;
-
-            /* copy the current bytes to invalidCharBuffer */
-            cnv->invalidCharBuffer[0]=*(source-1);
-            cnv->invalidCharLength=1;
-
-            /* call the callback function */
-            cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
-
-            /* update target and deal with offsets if necessary */
-            offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
-            target=pArgs->target;
-
-            /* update the source pointer and index */
-            sourceIndex+=1+((const uint8_t *)pArgs->source-source);
-            source=lastSource=(const uint8_t *)pArgs->source;
-            targetCapacity=pArgs->targetLimit-target;
-            length=sourceLimit-source;
-            if(length<targetCapacity) {
-                targetCapacity=length;
-            }
-
-            /*
-             * If the callback overflowed the target, then we need to
-             * stop here with an overflow indication.
-             */
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                break;
-            } else if(U_FAILURE(*pErrorCode)) {
-                /* break on error */
-                break;
-            } else if(cnv->UCharErrorBufferLength>0) {
-                /* target is full */
-                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                break;
-            }
-
-#if ASCII_UNROLL_TO_UNICODE
-            goto unrolled;
-#endif
-        }
+    c=0;
+    while(targetCapacity>0 && (c=*source++)<=0x7f) {
+        *target++=c;
+        --targetCapacity;
     }
 
-    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
+    if(c>0x7f) {
+        /* callback(illegal); copy the current bytes to toUBytes[] */
+        UConverter *cnv=pArgs->converter;
+        cnv->toUBytes[0]=c;
+        cnv->toULength=1;
+        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+    } else if(source<sourceLimit && target>=pArgs->targetLimit) {
         /* target is full */
         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     }
 
-    /* set offsets since the start or the last callback */
+    /* set offsets since the start */
     if(offsets!=NULL) {
-        size_t count=source-lastSource;
+        size_t count=target-oldTarget;
         while(count>0) {
             *offsets++=sourceIndex++;
             --count;
@@ -665,62 +507,25 @@
 static UChar32
 _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
                    UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
     const uint8_t *source;
     uint8_t b;
 
-    /* set up the local pointers */
     source=(const uint8_t *)pArgs->source;
-
-    /* conversion loop */
-    while(source<(const uint8_t *)pArgs->sourceLimit) {
+    if(source<(const uint8_t *)pArgs->sourceLimit) {
         b=*source++;
         pArgs->source=(const char *)source;
         if(b<=0x7f) {
             return b;
         } else {
-            /* call the callback function with all the preparations and post-processing */
             UConverter *cnv=pArgs->converter;
-
-            /* callback(illegal) */
+            cnv->toUBytes[0]=b;
+            cnv->toULength=1;
             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-
-            /* update the arguments structure */
-            pArgs->target=buffer;
-            pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-
-            /* copy the current byte to invalidCharBuffer */
-            cnv->invalidCharBuffer[0]=(char)b;
-            cnv->invalidCharLength=1;
-
-            /* call the callback function */
-            cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
-
-            /* update the source pointer */
-            source=(const uint8_t *)pArgs->source;
-
-            /*
-             * return the first character if the callback wrote some
-             * we do not need to goto finish because the converter state is already set
-             */
-            if(U_SUCCESS(*pErrorCode)) {
-                int32_t length=pArgs->target-buffer;
-                if(length>0) {
-                    return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
-                }
-                /* else (callback did not write anything) continue */
-            } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-                return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
-            } else {
-                /* break on error */
-                /* ### what if a callback set an error but _also_ generated output?! */
-                return 0xffff;
-            }
+            return 0xffff;
         }
     }
 
-    /* no output because of empty input or only skipping callbacks */
+    /* no output because of empty input */
     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     return 0xffff;
 }

Index: ucnvmbcs.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvmbcs.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnvmbcs.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnvmbcs.c	6 Apr 2004 10:08:03 -0000	1.4
@@ -21,7 +21,7 @@
 *   - efficient distinction of unassigned vs. illegal byte sequences
 *   - it is possible in fromUnicode() to directly deal with simple
 *     stateful encodings (used for EBCDIC_STATEFUL)
-*   - it is possible to convert Unicode code points other than U+0000
+*   - it is possible to convert Unicode code points
 *     to a single zero byte (but not as a fallback except for SBCS)
 *
 *   Remaining limitations in fromUnicode:
@@ -29,6 +29,10 @@
 *   - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
 *   - limitation to up to 4 bytes per character
[...4574 lines suppressed...]
-                /* write the result as UChars and output */
-                i=0;
-                UTF_APPEND_CHAR_UNSAFE(u, i, linear);
-                ucnv_cbToUWriteUChars(pArgs, u, i, 0, pErrorCode);
-                return;
-            }
-        }
-    }
-
-    /* copy the current bytes to invalidCharBuffer */
-    for(i=0; i<length; ++i) {
-        cnv->invalidCharBuffer[i]=codeUnits[i];
-    }
-    cnv->invalidCharLength=(int8_t)length;
-
-    /* call the normal callback function */
-    cnv->fromCharErrorBehaviour(context, pArgs, codeUnits, length, reason, pErrorCode);
-}
 
 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

Index: ucnvmbcs.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvmbcs.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnvmbcs.h	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnvmbcs.h	6 Apr 2004 10:08:03 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2000-2001, International Business Machines
+*   Copyright (C) 2000-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -19,10 +19,117 @@
 
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
-#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+
+/**
+ * ICU conversion (.cnv) data file structure, following the usual UDataInfo
+ * header.
+ *
+ * Format version: 6.2
+ *
+ * struct UConverterStaticData -- struct containing the converter name, IBM CCSID,
+ *                                min/max bytes per character, etc.
+ *                                see ucnv_bld.h
+ *
+ * --------------------
+ *
+ * The static data is followed by conversionType-specific data structures.
+ * At the moment, there are only variations of MBCS converters. They all have
+ * the same toUnicode structures, while the fromUnicode structures for SBCS
+ * differ from those for other MBCS-style converters.
+ *
+ * _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
+ * If it is present, then an ICU version reading header versions 4.0 or 4.1
+ * will be able to use the base table and ignore the extension.
+ *
+ * The unicodeMask in the static data is part of the base table data structure.
+ * In particular, the UCNV_HAS_SUPPLEMENTARY flag determines the length of the
+ * fromUnicode stage 1 array.
+ * The static data unicodeMask refers only to the base table's properties if
+ * a base table is included.
+ * In an extension-only file, the static data unicodeMask is 0.
+ * The extension data indexes have a separate field with the unicodeMask flags.
+ *
+ * MBCS-style data structure following the static data.
+ * Offsets are counted in bytes from the beginning of the MBCS header structure.
+ * Details about usage in comments in ucnvmbcs.c.
+ *
+ * struct _MBCSHeader (see the definition in this header file below)
+ * contains 32-bit fields as follows:
+ * 8 values:
+ *  0   uint8_t[4]  MBCS version in UVersionInfo format (currently 4.2.0.0)
+ *  1   uint32_t    countStates
+ *  2   uint32_t    countToUFallbacks
+ *  3   uint32_t    offsetToUCodeUnits
+ *  4   uint32_t    offsetFromUTable
+ *  5   uint32_t    offsetFromUBytes
+ *  6   uint32_t    flags, bits:
+ *                      31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
+ *                                                0 for older versions and if
+ *                                                there is no extension structure
+ *                       7.. 0 outputType
+ *  7   uint32_t    fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
+ *                  counts bytes in fromUBytes[]
+ *
+ * if(outputType==MBCS_OUTPUT_EXT_ONLY) {
+ *     -- base table name for extension-only table
+ *     char baseTableName[variable]; -- with NUL plus padding for 4-alignment
+ *
+ *     -- all _MBCSHeader fields except for version and flags are 0
+ * } else {
+ *     -- normal base table with optional extension
+ *
+ *     int32_t stateTable[countStates][256];
+ *    
+ *     struct _MBCSToUFallback { (fallbacks are sorted by offset)
+ *         uint32_t offset;
+ *         UChar32 codePoint;
+ *     } toUFallbacks[countToUFallbacks];
+ *    
+ *     uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
+ *                  (padded to an even number of units)
+ *    
+ *     -- stage 1 tables
+ *     if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
+ *         -- stage 1 table for all of Unicode
+ *         uint16_t fromUTable[0x440]; (32-bit-aligned)
+ *     } else {
+ *         -- BMP-only tables have a smaller stage 1 table
+ *         uint16_t fromUTable[0x40]; (32-bit-aligned)
+ *     }
+ *    
+ *     -- stage 2 tables
+ *        length determined by top of stage 1 and bottom of stage 3 tables
+ *     if(outputType==MBCS_OUTPUT_1) {
+ *         -- SBCS: pure indexes
+ *         uint16_t stage 2 indexes[?];
+ *     } else {
+ *         -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
+ *         uint32_t stage 2 flags and indexes[?];
+ *     }
+ *    
+ *     -- stage 3 tables with byte results
+ *     if(outputType==MBCS_OUTPUT_1) {
+ *         -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
+ *         uint16_t fromUBytes[fromUBytesLength/2];
+ *     } else {
+ *         -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
+ *         uint8_t fromUBytes[fromUBytesLength]; or
+ *         uint16_t fromUBytes[fromUBytesLength/2]; or
+ *         uint32_t fromUBytes[fromUBytesLength/4];
+ *     }
+ * }
+ *
+ * -- extension table; for details see ucnv_ext.h
+ * int32_t indexes[>=32]; ...
+ */
 
 /* MBCS converter data and state -------------------------------------------- */
 
+enum {
+    MBCS_MAX_STATE_COUNT=128
+};
+
 /**
  * MBCS action codes for conversions to Unicode.
  * These values are in bits 23..20 of the state table entries.
@@ -98,7 +205,13 @@
     MBCS_OUTPUT_4_EUC,      /* 9 */
 
     MBCS_OUTPUT_2_SISO=12,  /* c */
-    MBCS_OUTPUT_2_HZ        /* d */
+    MBCS_OUTPUT_2_HZ,       /* d */
+
+    MBCS_OUTPUT_EXT_ONLY,   /* e */
+
+    MBCS_OUTPUT_COUNT,
+
+    MBCS_OUTPUT_DBCS_ONLY=0xdb  /* runtime-only type for DBCS-only handling of SISO tables */
 };
 
 /**
@@ -116,7 +229,7 @@
  */
 typedef struct UConverterMBCSTable {
     /* toUnicode */
-    uint8_t countStates;
+    uint8_t countStates, dbcsOnlyState, stateTableOwned;
     uint32_t countToUFallbacks;
 
     const int32_t (*stateTable)/*[countStates]*/[256];
@@ -133,35 +246,14 @@
 
     /* converter name for swaplfnl */
     char *swapLFNLName;
+
+    /* extension data */
+    struct UConverterSharedData *baseSharedData;
+    const int32_t *extIndexes;
 } UConverterMBCSTable;
 
 /**
- * MBCS data structure as part of a .cnv file:
- *
- * uint32_t [8]; -- 8 values:
- *  0   MBCS version in UVersionInfo format (1.0.0.0)
- *  1   countStates
- *  2   countToUFallbacks
- *  3   offsetToUCodeUnits (offsets are counted from the beginning of this header structure)
- *  4   offsetFromUTable
- *  5   offsetFromUBytes
- *  6   flags, bits:
- *          31.. 8 reserved
- *           7.. 0 outputType
- *  7   fromUBytesLength -- header.version 4.1 (ICU 2.4) and higher
- *
- * stateTable[countStates][256];
- *
- * struct { (fallbacks are sorted by offset)
- *     uint32_t offset;
- *     UChar32 codePoint;
- * } toUFallbacks[countToUFallbacks];
- *
- * uint16_t unicodeCodeUnits[?]; (even number of units or padded)
- *
- * uint16_t fromUTable[0x440+?]; (32-bit-aligned)
- *
- * uint8_t fromUBytes[?];
+ * MBCS data header. See data format description above.
  */
 typedef struct {
     UVersionInfo version;
@@ -174,11 +266,13 @@
              fromUBytesLength;
 } _MBCSHeader;
 
-/**
+/*
  * This is a simple version of _MBCSGetNextUChar() that is used
  * by other converter implementations.
+ * It only returns an "assigned" result if it consumes the entire input.
  * It does not use state from the converter, nor error codes.
  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
+ * It handles conversion extensions but not GB 18030.
  *
  * Return value:
  * U+fffe   unassigned
@@ -187,12 +281,13 @@
  */
 U_CFUNC UChar32
 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
-                        const char **pSource, const char *sourceLimit,
+                        const char *source, int32_t length,
                         UBool useFallback);
 
 /**
  * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
+ * It does not handle conversion extensions (_extToU()).
  */
 U_CFUNC UChar32
 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
@@ -205,7 +300,7 @@
  * returns fallback values.
  */
 #define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
-    (UChar)MBCS_ENTRY_FINAL_VALUE_16((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)])
+    (UChar)MBCS_ENTRY_FINAL_VALUE_16((sharedData)->mbcs.stateTable[0][(uint8_t)(b)])
 
 /**
  * This is an internal function that allows other converter implementations
@@ -216,13 +311,14 @@
 
 /** This is a macro version of _MBCSIsLeadByte(). */
 #define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
-    (UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)])
+    (UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->mbcs.stateTable[0][(uint8_t)(byte)])
 
-/**
+/*
  * This is another simple conversion function for internal use by other
  * conversion implementations.
  * It does not use the converter state nor call callbacks.
  * It does not handle the EBCDIC swaplfnl option (set in UConverter).
+ * It handles conversion extensions but not GB 18030.
  *
  * It converts one single Unicode code point into codepage bytes, encoded
  * as one 32-bit value. The function returns the number of bytes in *pValue:
@@ -264,5 +360,33 @@
 _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode);
 
+/*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+ * In the future, if we add support for reverse-fallback sets, this function
+ * needs to be updated, and called for each initial state.
+ * Does not currently handle extensions.
+ * Does not empty the set first.
+ */
+U_CFUNC void
+_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
+                           USet *set,
+                           UConverterUnicodeSet which,
+                           uint8_t state, int32_t lowByte, int32_t highByte,
+                           UErrorCode *pErrorCode);
+
+/*
+ * Internal function returning a UnicodeSet for toUnicode() conversion.
+ * Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
+ * In the future, if we add support for fallback sets, this function
+ * needs to be updated.
+ * Handles extensions.
+ * Does not empty the set first.
+ */
+U_CFUNC void
+_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
+                             USet *set,
+                             UConverterUnicodeSet which,
+                             UErrorCode *pErrorCode);
 
 #endif

Index: ucnvscsu.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ucnvscsu.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- ucnvscsu.c	10 Sep 2003 02:42:03 -0000	1.3
+++ ucnvscsu.c	6 Apr 2004 10:08:03 -0000	1.4
@@ -181,7 +181,7 @@
             break;
         }
 
-        cnv->fromUSurrogateLead=0;
+        cnv->fromUChar32=0;
     }
 }
 
@@ -216,8 +216,6 @@
 
 /* SCSU-to-Unicode conversion functions ------------------------------------- */
 
-/* ### TODO check operator precedence | << + < */
-
 static void
 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
@@ -272,11 +270,9 @@
      * The end of the input or output buffer is also handled by the slower loop.
      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
      *
-     * The callback handling is done by jumping (goto) to the callback section at the end
-     * of the function. From there, it either jumps to here to continue or to
-     * the endloop section to clean up and return.
+     * The callback handling is done by returning with an error code.
+     * The conversion framework actually calls the callback function.
      */
-loop:
     if(isSingleByteMode) {
         /* fast path for single-byte mode */
         if(state==readCommand) {
@@ -367,13 +363,20 @@
                     goto fastUnicode;
                 } else /* Srs */ {
                     /* callback(illegal) */
-                    cnv->invalidCharBuffer[0]=b;
-                    cnv->invalidCharLength=1;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
+                    goto endloop;
                 }
+
+                /* store the first byte of a multibyte sequence in toUBytes[] */
+                cnv->toUBytes[0]=b;
+                cnv->toULength=1;
                 break;
             case quotePairOne:
                 byteOne=b;
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=quotePairTwo;
                 break;
             case quotePairTwo:
@@ -426,6 +429,8 @@
             case definePairOne:
                 dynamicWindow=(int8_t)((b>>5)&7);
                 byteOne=(uint8_t)(b&0x1f);
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=definePairTwo;
                 break;
             case definePairTwo:
@@ -436,10 +441,9 @@
             case defineOne:
                 if(b==0) {
                     /* callback(illegal): Reserved window offset value 0 */
-                    cnv->invalidCharBuffer[0]=(char)(SD0+dynamicWindow);
-                    cnv->invalidCharBuffer[1]=b;
-                    cnv->invalidCharLength=2;
-                    goto callback;
+                    cnv->toUBytes[1]=b;
+                    cnv->toULength=2;
+                    goto endloop;
                 } else if(b<gapThreshold) {
                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
@@ -448,10 +452,9 @@
                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
                 } else {
                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
-                    cnv->invalidCharBuffer[0]=(char)(SD0+dynamicWindow);
-                    cnv->invalidCharBuffer[1]=b;
-                    cnv->invalidCharLength=2;
-                    goto callback;
+                    cnv->toUBytes[1]=b;
+                    cnv->toULength=2;
+                    goto endloop;
                 }
                 sourceIndex=nextSourceIndex;
                 state=readCommand;
@@ -487,6 +490,8 @@
             case readCommand:
                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
                     byteOne=b;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=quotePairTwo;
                 } else if(/* UC0<=b && */ b<=UC7) {
                     dynamicWindow=(int8_t)(b-UC0);
@@ -496,23 +501,32 @@
                 } else if(/* UD0<=b && */ b<=UD7) {
                     dynamicWindow=(int8_t)(b-UD0);
                     isSingleByteMode=TRUE;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=defineOne;
                     goto singleByteMode;
                 } else if(b==UDX) {
                     isSingleByteMode=TRUE;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=definePairOne;
                     goto singleByteMode;
                 } else if(b==UQU) {
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=quotePairOne;
                 } else /* Urs */ {
                     /* callback(illegal) */
-                    cnv->invalidCharBuffer[0]=b;
-                    cnv->invalidCharLength=1;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
+                    goto endloop;
                 }
                 break;
             case quotePairOne:
                 byteOne=b;
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=quotePairTwo;
                 break;
             case quotePairTwo:
@@ -528,80 +542,25 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(state!=readCommand && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        _SCSUReset(cnv, UCNV_RESET_TO_UNICODE);
-    } else {
-        /* set the converter state back into UConverter */
-        scsu->toUIsSingleByteMode=isSingleByteMode;
-        scsu->toUState=state;
-        scsu->toUQuoteWindow=quoteWindow;
-        scsu->toUDynamicWindow=dynamicWindow;
-        scsu->toUByteOne=byteOne;
+    /* set the converter state back into UConverter */
+    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
+        /* reset to deal with the next character */
+        state=readCommand;
+    } else if(state==readCommand) {
+        /* not in a multi-byte sequence, reset toULength */
+        cnv->toULength=0;
     }
+    scsu->toUIsSingleByteMode=isSingleByteMode;
+    scsu->toUState=state;
+    scsu->toUQuoteWindow=quoteWindow;
+    scsu->toUDynamicWindow=dynamicWindow;
+    scsu->toUByteOne=byteOne;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     pArgs->offsets=offsets;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-    pArgs->offsets=offsets;
-    /* the current bytes were copied to invalidCharBuffer before the goto callback jump */
-
-    /* set the converter state in UConverter to deal with the next character */
-    scsu->toUIsSingleByteMode=isSingleByteMode;
-    scsu->toUState=readCommand;
-    scsu->toUQuoteWindow=quoteWindow;
-    scsu->toUDynamicWindow=dynamicWindow;
-    scsu->toUByteOne=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    isSingleByteMode=scsu->toUIsSingleByteMode;
-    state=scsu->toUState;
-    quoteWindow=scsu->toUQuoteWindow;
-    dynamicWindow=scsu->toUDynamicWindow;
-    byteOne=scsu->toUByteOne;
-
-    /* update target and deal with offsets if necessary */
-    offsets=ucnv_updateCallbackOffsets(offsets, (int32_t)(pArgs->target-target), sourceIndex);
-    target=pArgs->target;
-
-    /* update the source pointer and index */
-    sourceIndex=(int32_t)(nextSourceIndex+((const uint8_t *)pArgs->source-source));
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        _SCSUReset(cnv, UCNV_RESET_TO_UNICODE);
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 /*
@@ -619,7 +578,6 @@
     const uint8_t *source, *sourceLimit;
     UChar *target;
     const UChar *targetLimit;
-
     UBool isSingleByteMode;
     uint8_t state, byteOne;
     int8_t quoteWindow, dynamicWindow;
@@ -658,11 +616,9 @@
      * The end of the input or output buffer is also handled by the slower loop.
      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
      *
-     * The callback handling is done by jumping (goto) to the callback section at the end
-     * of the function. From there, it either jumps to here to continue or to
-     * the endloop section to clean up and return.
+     * The callback handling is done by returning with an error code.
+     * The conversion framework actually calls the callback function.
      */
-loop:
     if(isSingleByteMode) {
         /* fast path for single-byte mode */
         if(state==readCommand) {
@@ -731,13 +687,20 @@
                     goto fastUnicode;
                 } else /* Srs */ {
                     /* callback(illegal) */
-                    cnv->invalidCharBuffer[0]=b;
-                    cnv->invalidCharLength=1;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
+                    goto endloop;
                 }
+
+                /* store the first byte of a multibyte sequence in toUBytes[] */
+                cnv->toUBytes[0]=b;
+                cnv->toULength=1;
                 break;
             case quotePairOne:
                 byteOne=b;
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=quotePairTwo;
                 break;
             case quotePairTwo:
@@ -772,6 +735,8 @@
             case definePairOne:
                 dynamicWindow=(int8_t)((b>>5)&7);
                 byteOne=(uint8_t)(b&0x1f);
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=definePairTwo;
                 break;
             case definePairTwo:
@@ -781,10 +746,9 @@
             case defineOne:
                 if(b==0) {
                     /* callback(illegal): Reserved window offset value 0 */
-                    cnv->invalidCharBuffer[0]=(char)(SD0+dynamicWindow);
-                    cnv->invalidCharBuffer[1]=b;
-                    cnv->invalidCharLength=2;
-                    goto callback;
+                    cnv->toUBytes[1]=b;
+                    cnv->toULength=2;
+                    goto endloop;
                 } else if(b<gapThreshold) {
                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
@@ -793,10 +757,9 @@
                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
                 } else {
                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
-                    cnv->invalidCharBuffer[0]=(char)(SD0+dynamicWindow);
-                    cnv->invalidCharBuffer[1]=b;
-                    cnv->invalidCharLength=2;
-                    goto callback;
+                    cnv->toUBytes[1]=b;
+                    cnv->toULength=2;
+                    goto endloop;
                 }
                 state=readCommand;
                 goto fastSingle;
@@ -825,6 +788,8 @@
             case readCommand:
                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
                     byteOne=b;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=quotePairTwo;
                 } else if(/* UC0<=b && */ b<=UC7) {
                     dynamicWindow=(int8_t)(b-UC0);
@@ -833,23 +798,32 @@
                 } else if(/* UD0<=b && */ b<=UD7) {
                     dynamicWindow=(int8_t)(b-UD0);
                     isSingleByteMode=TRUE;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=defineOne;
                     goto singleByteMode;
                 } else if(b==UDX) {
                     isSingleByteMode=TRUE;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=definePairOne;
                     goto singleByteMode;
                 } else if(b==UQU) {
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
                     state=quotePairOne;
                 } else /* Urs */ {
                     /* callback(illegal) */
-                    cnv->invalidCharBuffer[0]=b;
-                    cnv->invalidCharLength=1;
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    cnv->toUBytes[0]=b;
+                    cnv->toULength=1;
+                    goto endloop;
                 }
                 break;
             case quotePairOne:
                 byteOne=b;
+                cnv->toUBytes[1]=b;
+                cnv->toULength=2;
                 state=quotePairTwo;
                 break;
             case quotePairTwo:
@@ -861,80 +835,24 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(state!=readCommand && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        _SCSUReset(cnv, UCNV_RESET_TO_UNICODE);
-    } else {
-        /* set the converter state back into UConverter */
-        scsu->toUIsSingleByteMode=isSingleByteMode;
-        scsu->toUState=state;
-        scsu->toUQuoteWindow=quoteWindow;
-        scsu->toUDynamicWindow=dynamicWindow;
-        scsu->toUByteOne=byteOne;
+    /* set the converter state back into UConverter */
+    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
+        /* reset to deal with the next character */
+        state=readCommand;
+    } else if(state==readCommand) {
+        /* not in a multi-byte sequence, reset toULength */
+        cnv->toULength=0;
     }
+    scsu->toUIsSingleByteMode=isSingleByteMode;
+    scsu->toUState=state;
+    scsu->toUQuoteWindow=quoteWindow;
+    scsu->toUDynamicWindow=dynamicWindow;
+    scsu->toUByteOne=byteOne;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=(const char *)source;
     pArgs->target=target;
     return;
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=(const char *)source;
-    pArgs->target=target;
-    /* the current bytes were copied to invalidCharBuffer before the goto callback jump */
-
-    /* set the converter state in UConverter to deal with the next character */
-    scsu->toUIsSingleByteMode=isSingleByteMode;
-    scsu->toUState=readCommand;
-    scsu->toUQuoteWindow=quoteWindow;
-    scsu->toUDynamicWindow=dynamicWindow;
-    scsu->toUByteOne=0;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, cnv->invalidCharLength, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    isSingleByteMode=scsu->toUIsSingleByteMode;
-    state=scsu->toUState;
-    quoteWindow=scsu->toUQuoteWindow;
-    dynamicWindow=scsu->toUDynamicWindow;
-    byteOne=scsu->toUByteOne;
-
-    target=pArgs->target;
-
-    source=(const uint8_t *)pArgs->source;
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->UCharErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        _SCSUReset(cnv, UCNV_RESET_TO_UNICODE);
-        goto finish;
-    } else {
-        goto loop;
-    }
-}
-
-static UChar32
-_SCSUGetNextUChar(UConverterToUnicodeArgs *pArgs,
-                  UErrorCode *pErrorCode) {
-    return ucnv_getNextUCharFromToUImpl(pArgs, _SCSUToUnicode, TRUE, pErrorCode);
 }
 
 /* SCSU-from-Unicode conversion functions ----------------------------------- */
@@ -1095,7 +1013,6 @@
 
     int32_t sourceIndex, nextSourceIndex;
 
-    uint32_t i;
     int32_t length;
 
     /* variables for compression heuristics */
@@ -1120,7 +1037,7 @@
     dynamicWindow=scsu->fromUDynamicWindow;
     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
 
-    c=cnv->fromUSurrogateLead;
+    c=cnv->fromUChar32;
 
     /* sourceIndex=-1 if the current character began in the previous buffer */
     sourceIndex= c==0 ? 0 : -1;
@@ -1188,7 +1105,8 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                     } else {
                         /* no more input */
@@ -1197,7 +1115,8 @@
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 /* compress supplementary character U+10000..U+10ffff */
@@ -1383,7 +1302,8 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                     } else {
                         /* no more input */
@@ -1392,7 +1312,8 @@
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 /* compress supplementary character */
@@ -1443,22 +1364,12 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        _SCSUReset(cnv, UCNV_RESET_FROM_UNICODE);
-    } else {
-        /* set the converter state back into UConverter */
-        scsu->fromUIsSingleByteMode=isSingleByteMode;
-        scsu->fromUDynamicWindow=dynamicWindow;
+    /* set the converter state back into UConverter */
+    scsu->fromUIsSingleByteMode=isSingleByteMode;
+    scsu->fromUDynamicWindow=dynamicWindow;
 
-        cnv->fromUSurrogateLead=(UChar)c;
-    }
+    cnv->fromUChar32=c;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=source;
     pArgs->target=(char *)target;
@@ -1566,59 +1477,6 @@
         c=0;
         goto endloop;
     }
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=source;
-    pArgs->target=(char *)target;
-    pArgs->offsets=offsets;
-    /* set the converter state in UConverter to deal with the next character */
-    scsu->fromUIsSingleByteMode=isSingleByteMode;
-    scsu->fromUDynamicWindow=dynamicWindow;
-    cnv->fromUSurrogateLead=0;
-
-    /* write the code point as code units */
-    i=0;
-    UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
-    cnv->invalidUCharLength=(int8_t)i;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    isSingleByteMode=scsu->fromUIsSingleByteMode;
-    dynamicWindow=scsu->fromUDynamicWindow;
-    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
-    c=cnv->fromUSurrogateLead;
-
-    /* update target and deal with offsets if necessary */
-    offsets=ucnv_updateCallbackOffsets(offsets, (int32_t)(((uint8_t *)pArgs->target)-target), sourceIndex);
-    target=(uint8_t *)pArgs->target;
-
-    /* update the source pointer and index */
-    sourceIndex=(int32_t)(nextSourceIndex+(pArgs->source-source));
-    source=pArgs->source;
-    targetCapacity=(int32_t)((uint8_t *)pArgs->targetLimit-target);
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->charErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        _SCSUReset(cnv, UCNV_RESET_FROM_UNICODE);
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 /*
@@ -1643,7 +1501,6 @@
 
     uint32_t c, delta;
 
-    uint32_t i;
     int32_t length;
 
     /* variables for compression heuristics */
@@ -1667,7 +1524,7 @@
     dynamicWindow=scsu->fromUDynamicWindow;
     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
 
-    c=cnv->fromUSurrogateLead;
+    c=cnv->fromUChar32;
 
     /* similar conversion "loop" as in toUnicode */
 loop:
@@ -1720,7 +1577,8 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                     } else {
                         /* no more input */
@@ -1729,7 +1587,8 @@
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 /* compress supplementary character U+10000..U+10ffff */
@@ -1902,7 +1761,8 @@
                         } else {
                             /* this is an unmatched lead code unit (1st surrogate) */
                             /* callback(illegal) */
-                            goto callback;
+                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                            goto endloop;
                         }
                     } else {
                         /* no more input */
@@ -1911,7 +1771,8 @@
                 } else {
                     /* this is an unmatched trail code unit (2nd surrogate) */
                     /* callback(illegal) */
-                    goto callback;
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    goto endloop;
                 }
 
                 /* compress supplementary character */
@@ -1961,22 +1822,12 @@
     }
 endloop:
 
-    if(pArgs->flush && source>=sourceLimit) {
-        /* reset the state for the next conversion */
-        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
-            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-        }
-        _SCSUReset(cnv, UCNV_RESET_FROM_UNICODE);
-    } else {
-        /* set the converter state back into UConverter */
-        scsu->fromUIsSingleByteMode=isSingleByteMode;
-        scsu->fromUDynamicWindow=dynamicWindow;
+    /* set the converter state back into UConverter */
+    scsu->fromUIsSingleByteMode=isSingleByteMode;
+    scsu->fromUDynamicWindow=dynamicWindow;
 
-        cnv->fromUSurrogateLead=(UChar)c;
-    }
+    cnv->fromUChar32=c;
 
-finish:
     /* write back the updated pointers */
     pArgs->source=source;
     pArgs->target=(char *)target;
@@ -2052,54 +1903,6 @@
         c=0;
         goto endloop;
     }
-
-callback:
-    /* call the callback function with all the preparations and post-processing */
-    /* update the arguments structure */
-    pArgs->source=source;
-    pArgs->target=(char *)target;
-    /* set the converter state in UConverter to deal with the next character */
-    scsu->fromUIsSingleByteMode=isSingleByteMode;
-    scsu->fromUDynamicWindow=dynamicWindow;
-    cnv->fromUSurrogateLead=0;
-
-    /* write the code point as code units */
-    i=0;
-    UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
-    cnv->invalidUCharLength=(int8_t)i;
-
-    /* call the callback function */
-    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
-    cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, UCNV_ILLEGAL, pErrorCode);
-
-    /* get the converter state from UConverter */
-    isSingleByteMode=scsu->fromUIsSingleByteMode;
-    dynamicWindow=scsu->fromUDynamicWindow;
-    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
-    c=cnv->fromUSurrogateLead;
-
-    target=(uint8_t *)pArgs->target;
-
-    source=pArgs->source;
-    targetCapacity=(int32_t)((uint8_t *)pArgs->targetLimit-target);
-
-    /*
-     * If the callback overflowed the target, then we need to
-     * stop here with an overflow indication.
-     */
-    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-        goto endloop;
-    } else if(cnv->charErrorBufferLength>0) {
-        /* target is full */
-        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-        goto endloop;
-    } else if(U_FAILURE(*pErrorCode)) {
-        /* break on error */
-        _SCSUReset(cnv, UCNV_RESET_FROM_UNICODE);
-        goto finish;
-    } else {
-        goto loop;
-    }
 }
 
 /* miscellaneous ------------------------------------------------------------ */
@@ -2166,8 +1969,7 @@
     }
 
     localClone = (struct cloneStruct *)stackBuffer;
-    uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
-    localClone->cnv.isCopyLocal = TRUE;
+    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
 
     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
     localClone->cnv.extraInfo = &localClone->mydata;
@@ -2177,9 +1979,6 @@
 }
 
 
-
-
-
 static const UConverterImpl _SCSUImpl={
     UCNV_SCSU,
 
@@ -2194,7 +1993,7 @@
     _SCSUToUnicodeWithOffsets,
     _SCSUFromUnicode,
     _SCSUFromUnicodeWithOffsets,
-    _SCSUGetNextUChar,
+    NULL,
 
     NULL,
     _SCSUGetName,
@@ -2209,7 +2008,13 @@
     0, /* CCSID for SCSU */
     UCNV_IBM, UCNV_SCSU,
     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
-    { 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
+    /*
+     * ### TODO the subchar really must be written by an SCSU function
+     * however, currently SCSU's fromUnicode() never causes errors, therefore
+     * no callbacks will be called and no subchars written
+     * See Jitterbug 2837 - RFE: forbid converting surrogate code points in all charsets
+     */
+    { 0x0e, 0xff, 0xfd, 0 }, 3,
     FALSE, FALSE,
     0,
     0,
@@ -2221,5 +2026,3 @@
     NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
     0
 };
-
-/* ### clarify: if an error occurs, does a converter reset itself? or is it in a defined or undefined state? */

Index: udata.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/udata.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- udata.c	10 Sep 2003 02:42:03 -0000	1.5
+++ udata.c	6 Apr 2004 10:08:04 -0000	1.6
@@ -252,6 +252,7 @@
     int32_t           nameLen;
     UHashtable       *htable;
     UDataMemory      *oldValue = NULL;
+    UErrorCode        subErr = U_ZERO_ERROR;
 
     if (U_FAILURE(*pErr)) {
         return NULL;
@@ -286,22 +287,24 @@
     umtx_lock(NULL);
     oldValue = uhash_get(htable, path);
     if (oldValue != NULL) {
-        *pErr = U_USING_DEFAULT_WARNING; }
+        subErr = U_USING_DEFAULT_WARNING;
+    }
     else {
         uhash_put(
             htable,
             newElement->name,               /* Key   */
             newElement,                     /* Value */
-            pErr);
+            &subErr);
     }
     umtx_unlock(NULL);
 
 #ifdef UDATA_DEBUG
-    fprintf(stderr, "Cache: [%s] <<< %p : %s\n", newElement->name, 
-            newElement->item, u_errorName(*pErr));
+    fprintf(stderr, "Cache: [%s] <<< %p : %s. vFunc=%p\n", newElement->name, 
+    newElement->item, u_errorName(subErr), newElement->item->vFuncs);
 #endif
 
-    if (*pErr == U_USING_DEFAULT_WARNING || U_FAILURE(*pErr)) {
+    if (subErr == U_USING_DEFAULT_WARNING || U_FAILURE(subErr)) {
+        *pErr = subErr; /* copy subErr into the caller's error code ONLY if something happens. */
         uprv_free(newElement->name);
         uprv_free(newElement->item);
         uprv_free(newElement);
@@ -313,6 +316,61 @@
 
 
 
+/*-------------------------------------------------------------------------------
+ *
+ *   TinyString   -  a small set of really simple string functions, for 
+ *                   the purpose of consolidating buffer overflow code in one place
+ *
+ *                   Use wherever you would otherwise declare a fixed sized  char[xx] buffer.
+ *                   Do non-growing ops by accessing fields of struct directly
+ *                   Grow using the append function to automatically extend buffer
+ *                   as needed.
+ *
+ *-------------------------------------------------------------------------------*/
+typedef struct TinyString {
+    char      *s;
+    int32_t    length;
+    char       fStaticBuf[100];
+    int32_t    fCapacity;
+} TinyString;
+
+static void TinyString_init(TinyString *This) {
+    This->s = This->fStaticBuf;
+    *This->s = 0;
+    This->length = 0;
+    This->fCapacity = sizeof(This->fStaticBuf)-1;
+}
+
+static void TinyString_append(TinyString *This, const char *what) {
+    int32_t  newLen;
+    newLen = This->length + uprv_strlen(what); 
+    if (newLen >= This->fCapacity) { 
+        int32_t newCapacity = newLen * 2; 
+        char *newBuf = (char *)uprv_malloc(newCapacity+1); 
+        if (newBuf != NULL) { 
+            uprv_strcpy(newBuf, This->s); 
+            if (This->s != This->fStaticBuf) { 
+                uprv_free(This->s);
+            } 
+            This->s = newBuf; 
+            This->fCapacity = newCapacity; 
+        } 
+    }
+    if (newLen < This->fCapacity) { 
+        uprv_strcat(This->s, what);
+        This->length = newLen;
+    } 
+}
+    
+static void TinyString_dt(TinyString *This) {
+    if (This->s != This->fStaticBuf) { 
+        uprv_free(This->s); 
+    }
+    TinyString_init(This);
+}
+
+
+
 
 /*----------------------------------------------------------------------*==============
  *                                                                      *
@@ -321,7 +379,8 @@
  *                                                                      *
  *----------------------------------------------------------------------*/
 
-#define U_DATA_PATHITER_BUFSIZ  1024   /* paths can't be longer than this */
+#define U_DATA_PATHITER_BUFSIZ  128        /* Size of local buffer for paths         */
+                                           /*   Overflow causes malloc of larger buf */
 
 typedef struct 
 {
@@ -330,12 +389,17 @@
     const char *basename;                          /* item's basename (icudt22e_mt.res)*/
     const char *suffix;                            /* item suffix (can be null) */
 
-    uint32_t     basenameLen;                      /* length of basename */
-    char        itemPath[U_DATA_PATHITER_BUFSIZ];  /* path passed in with item name */
+    uint32_t    basenameLen;                       /* length of basename */
 
-    char        pathBuffer[U_DATA_PATHITER_BUFSIZ];  /* output path for this it'ion */
+    char       *itemPath;                          /* path passed in with item name */
+    char        itemPathBuf[U_DATA_PATHITER_BUFSIZ];
 
-    UBool       checkLastFour;                       /* if TRUE then allow paths such as '/foo/myapp.dat'  to match, checks last 4 chars of suffix with last 4 of path, then previous chars. */
+    char       *pathBuffer;                        /* output path for this it'ion */
+    char        pathBufferA[U_DATA_PATHITER_BUFSIZ];
+
+    UBool       checkLastFour;                     /* if TRUE then allow paths such as '/foo/myapp.dat'
+                                                    * to match, checks last 4 chars of suffix with
+                                                    * last 4 of path, then previous chars. */
     
 }  UDataPathIterator;
 
@@ -346,9 +410,12 @@
  * @param iter  The iterator to be initialized. Its current state does not matter. 
  * @param path  The full pathname to be iterated over.  If NULL, defaults to U_ICUDATA_NAME 
  * @param item  Item to be searched for.  Can include full path, such as /a/b/foo.dat 
- * @param suffix  Optional item suffix, if not-null (ex. ".dat") then 'path' can contain 'item' explicitly. Ex:   'stuff.dat' would be found in '/a/foo:/tmp/stuff.dat:/bar/baz' as item #2.   '/blarg/stuff.dat' would also be found.
+ * @param suffix  Optional item suffix, if not-null (ex. ".dat") then 'path' can contain 'item' explicitly.
+ *               Ex:   'stuff.dat' would be found in '/a/foo:/tmp/stuff.dat:/bar/baz' as item #2.   
+ *                     '/blarg/stuff.dat' would also be found.
  */
-static void udata_pathiter_init(UDataPathIterator *iter, const char *path, const char *item, const char *suffix, UBool doCheckLastFour)
+static void udata_pathiter_init(UDataPathIterator *iter, const char *path,
+                                const char *item, const char *suffix, UBool doCheckLastFour)
 {
 #ifdef UDATA_DEBUG
         fprintf(stderr, "SUFFIX1=%s [%p]\n", suffix, suffix);
@@ -370,26 +437,53 @@
     }
 
     /** Item path **/
+    iter->itemPath   = iter->itemPathBuf;
     if(iter->basename == item) {
         iter->itemPath[0] = 0;
         iter->nextPath = iter->path;
     } else { 
-        uprv_strncpy(iter->itemPath, item, iter->basename - item);
-        iter->itemPath[iter->basename-item]=0;
+        int32_t  itemPathLen = iter->basename-item;
+        if (itemPathLen >= U_DATA_PATHITER_BUFSIZ) {
+            char *t = (char *)uprv_malloc(itemPathLen+1);
+            if (t != NULL) {
+                iter->itemPath = t;
+            } else {
+                /* Malloc failed.  Ignore the itemPath. */
+                itemPathLen = 0;
+            }
+        }
+        uprv_strncpy(iter->itemPath, item, itemPathLen);
+        iter->itemPath[itemPathLen]=0;
         iter->nextPath = iter->itemPath;
     }
 #ifdef UDATA_DEBUG
-        fprintf(stderr, "SUFFIX=%s [%p]\n", suffix, suffix);
+    fprintf(stderr, "SUFFIX=%s [%p]\n", suffix, suffix);
 #endif
     
+    /** Suffix  **/
     if(suffix != NULL) {
         iter->suffix = suffix;
     } else {
         iter->suffix = "";
     }
-
+    
     iter->checkLastFour = doCheckLastFour;
     
+    /* pathBuffer will hold the output path strings returned by this iterator.
+     *   Get an upper bound of possible string size, and make sure that the buffer
+     *   is big enough (sum of length of each piece, 2 extra delimiters, + trailing NUL) */
+    {
+        int32_t  maxPathLen = uprv_strlen(iter->path) + uprv_strlen(item) + uprv_strlen(iter->suffix) + 2;  
+        iter->pathBuffer = iter->pathBufferA;
+        if (maxPathLen >= U_DATA_PATHITER_BUFSIZ) {
+            iter->pathBuffer = (char *)uprv_malloc(maxPathLen);
+            if (iter->pathBuffer == NULL) {
+                iter->pathBuffer = iter->pathBufferA;
+                iter->path = "";
+            }
+        }
+    }
+
 #ifdef UDATA_DEBUG
     fprintf(stderr, "%p: init %s -> [path=%s], [base=%s], [suff=%s], [itempath=%s], [nextpath=%s], [checklast4=%s]\n",
             iter,
@@ -531,6 +625,20 @@
 }
 
 
+/*
+ *   Path Iterator Destructor.  Clean up any allocated storage
+ */
+static void udata_pathiter_dt(UDataPathIterator *iter) {
+     if (iter->itemPath != iter->itemPathBuf) {
+         uprv_free(iter->itemPath);
+         iter->itemPath = NULL;
+     }
+     if (iter->pathBuffer != iter->pathBufferA) {
+         uprv_free(iter->pathBuffer);
+         iter->pathBuffer = NULL;
+     }
+}
+
 /* ==================================================================================*/
 
 
@@ -562,7 +670,6 @@
     UDataMemory tData;
     UDataPathIterator iter;
     const char *pathBuffer;
-    int32_t pathLen;
     const char *inBasename;
 
     if (U_FAILURE(*pErrorCode)) {
@@ -621,7 +728,7 @@
     udata_pathiter_init(&iter, u_getDataDirectory(), path, ".dat", TRUE);
 
     while((UDataMemory_isLoaded(&tData)==FALSE) && 
-          (pathBuffer = udata_pathiter_next(&iter, &pathLen)) != NULL)
+          (pathBuffer = udata_pathiter_next(&iter, NULL)) != NULL)
     {
 #ifdef UDATA_DEBUG
         fprintf(stderr, "ocd: trying path %s - ", pathBuffer);
@@ -631,6 +738,7 @@
         fprintf(stderr, "%s\n", UDataMemory_isLoaded(&tData)?"LOADED":"not loaded");
 #endif
     }
+    udata_pathiter_dt(&iter);    /* Note:  this call may invalidate "pathBuffer" */
 
 #if defined(OS390_STUBDATA) && defined(OS390BATCH)
     if (!UDataMemory_isLoaded(&tData)) {
@@ -824,9 +932,8 @@
 
     if(pHeader->dataHeader.magic1==0xda &&
         pHeader->dataHeader.magic2==0x27 &&
-        pHeader->info.isBigEndian==U_IS_BIG_ENDIAN &&
         (isAcceptable==NULL || isAcceptable(context, type, name, &pHeader->info))
-        ) {
+    ) {
         rDataMem=UDataMemory_createNewInstance(fatalErr);
         if (U_FAILURE(*fatalErr)) {
             return NULL;
@@ -881,16 +988,18 @@
              UDataMemoryIsAcceptable *isAcceptable, void *context,
              UErrorCode *pErrorCode)
 {
-    UDataPathIterator iter;
-    const char *pathBuffer;
-    int32_t pathLen;
+    UDataMemory         *retVal = NULL;
 
-    char                tocEntryName[100];
-    char                oldStylePath[1024];
-    char                oldStylePathBasename[100];
+    UDataPathIterator   iter;
+    const char         *pathBuffer;
+
+    TinyString          tocEntryName;
+    TinyString          oldStylePath;
+    TinyString          oldStylePathBasename;
     const char         *dataPath;
 
     const char         *tocEntrySuffix;
+    int32_t             tocEntrySuffixIndex;
     UDataMemory         dataMemory;
     UDataMemory        *pCommonData;
     UDataMemory        *pEntryData;
@@ -898,25 +1007,30 @@
     const char         *inBasename;
     UErrorCode          errorCode=U_ZERO_ERROR;
     UBool               isICUData= (UBool)(path==NULL);
+
+    TinyString_init(&tocEntryName);
+    TinyString_init(&oldStylePath);
+    TinyString_init(&oldStylePathBasename);
+
     /* Make up a full mame by appending the type to the supplied
      *  name, assuming that a type was supplied.
      */
 
     /* prepend the package */
-    uprv_strcpy(tocEntryName, packageNameFromPath(path));
-
-    tocEntrySuffix = tocEntryName+uprv_strlen(tocEntryName); /* suffix starts here */
+    TinyString_append(&tocEntryName, packageNameFromPath(path));
 
-    uprv_strcat(tocEntryName, "_");
+    tocEntrySuffixIndex = tocEntryName.length;
 
-    uprv_strcat(tocEntryName, name);
+    TinyString_append(&tocEntryName, "_");
+    TinyString_append(&tocEntryName, name);
     if(type!=NULL && *type!=0) {
-        uprv_strcat(tocEntryName, ".");
-        uprv_strcat(tocEntryName, type);
+        TinyString_append(&tocEntryName, ".");
+        TinyString_append(&tocEntryName, type);
     }
+    tocEntrySuffix = tocEntryName.s+tocEntrySuffixIndex; /* suffix starts here */
 
 #ifdef UDATA_DEBUG
-    fprintf(stderr, " tocEntryName = %s\n", tocEntryName);
+    fprintf(stderr, " tocEntryName = %s\n", tocEntryName.s);
 #endif    
 
 
@@ -946,19 +1060,23 @@
         */
 
         char *rightSlash;
-        uprv_strcpy(oldStylePath, path);
-        oldStylePath[uprv_strlen(path)-1]=0; /* chop off trailing slash */
+        TinyString_append(&oldStylePath, path);
+        /* chop off trailing slash */
+        oldStylePath.length--;
+        oldStylePath.s[oldStylePath.length] = 0;
         
-        rightSlash = (char*)uprv_strrchr(oldStylePath, U_FILE_SEP_CHAR);
+        rightSlash = (char*)uprv_strrchr(oldStylePath.s, U_FILE_SEP_CHAR);
         if(rightSlash != NULL) {
             rightSlash++;
-            inBasename = uprv_strcpy(oldStylePathBasename, rightSlash);
-            uprv_strcat(oldStylePath, U_FILE_SEP_STRING);
-            uprv_strcat(oldStylePath, inBasename);  /* one more time, for the base name */
-            path = oldStylePath;
+            TinyString_append(&oldStylePathBasename, rightSlash);
+            inBasename = oldStylePathBasename.s;
+            TinyString_append(&oldStylePath, U_FILE_SEP_STRING);
+            TinyString_append(&oldStylePath, inBasename);  /* one more time, for the base name */
+            path = oldStylePath.s;
         } else {
             *pErrorCode = U_FILE_ACCESS_ERROR;  /* hopelessly bad case */
-            return NULL;
+            retVal = NULL;
+            goto commonReturn;
         }
     }
     /* End of dealing with a null basename */
@@ -969,7 +1087,7 @@
     /* init path iterator for individual files */
     udata_pathiter_init(&iter, dataPath, path, tocEntrySuffix, FALSE);
     
-    while((pathBuffer = udata_pathiter_next(&iter, &pathLen)))
+    while((pathBuffer = udata_pathiter_next(&iter, NULL)))
     {
 #ifdef UDATA_DEBUG
         fprintf(stderr, "UDATA: trying individual file %s\n", pathBuffer);
@@ -988,7 +1106,9 @@
 #ifdef UDATA_DEBUG
                 fprintf(stderr, "** Mapped file: %s\n", pathBuffer);
 #endif
-                return pEntryData;
+                udata_pathiter_dt(&iter);
+                retVal = pEntryData;
+                goto commonReturn;
             }
             
             /* the data is not acceptable, or some error occured.  Either way, unmap the memory */
@@ -996,7 +1116,9 @@
             
             /* If we had a nasty error, bail out completely.  */
             if (U_FAILURE(*pErrorCode)) {
-                return NULL;
+                udata_pathiter_dt(&iter);
+                retVal = NULL;
+                goto commonReturn;
             }
             
             /* Otherwise remember that we found data but didn't like it for some reason  */
@@ -1006,6 +1128,7 @@
         fprintf(stderr, "%s\n", UDataMemory_isLoaded(&dataMemory)?"LOADED":"not loaded");
 #endif
     }
+    udata_pathiter_dt(&iter);
 
     /* #2 */
 
@@ -1022,8 +1145,10 @@
         pCommonData=openCommonData(path, isICUData, &errorCode); /** search for pkg **/
 
         if(U_SUCCESS(errorCode)) {
+            int32_t length;
+
             /* look up the data piece in the common data */
-            pHeader=pCommonData->vFuncs->Lookup(pCommonData, tocEntryName, &errorCode);
+            pHeader=pCommonData->vFuncs->Lookup(pCommonData, tocEntryName.s, &length, &errorCode);
 #ifdef UDATA_DEBUG
             fprintf(stderr, "pHeader=%p\n", pHeader);
 #endif
@@ -1033,10 +1158,13 @@
             fprintf(stderr, "pEntryData=%p\n", pEntryData);
 #endif
                 if (U_FAILURE(*pErrorCode)) {
-                    return NULL;
+                    retVal = NULL;
+                    goto commonReturn;
                 }
                 if (pEntryData != NULL) {
-                    return pEntryData;
+                    pEntryData->length = length;
+                    retVal =  pEntryData;
+                    goto commonReturn;
                 }
             }
         }
@@ -1058,7 +1186,10 @@
             *pErrorCode=errorCode;
         }
     }
-    return NULL;
+
+commonReturn:
+    TinyString_dt(&tocEntryName);
+    return retVal;
 }
 
 
@@ -1090,7 +1221,7 @@
                  UDataMemoryIsAcceptable *isAcceptable, void *context,
                  UErrorCode *pErrorCode) {
 #ifdef UDATA_DEBUG
-  fprintf(stderr, "udata_openChoice(): Opening: %s . %s\n", name, type);fflush(stderr);
+    fprintf(stderr, "udata_openChoice(): Opening: %s . %s\n", name, type);fflush(stderr);
 #endif
 
     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
@@ -1110,10 +1241,16 @@
     if(pInfo!=NULL) {
         if(pData!=NULL && pData->pHeader!=NULL) {
             const UDataInfo *info=&pData->pHeader->info;
-            if(pInfo->size>info->size) {
-                pInfo->size=info->size;
+            uint16_t dataInfoSize=udata_getInfoSize(info);
+            if(pInfo->size>dataInfoSize) {
+                pInfo->size=dataInfoSize;
+            }
+            uprv_memcpy((uint16_t *)pInfo+1, (const uint16_t *)info+1, pInfo->size-2);
+            if(info->isBigEndian!=U_IS_BIG_ENDIAN) {
+                /* opposite endianness */
+                uint16_t x=info->reservedWord;
+                pInfo->reservedWord=(uint16_t)((x<<8)|(x>>8));
             }
-            uprv_memcpy((uint16_t *)pInfo+1, (uint16_t *)info+1, pInfo->size-2);
         } else {
             pInfo->size=0;
         }

Index: udatamem.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/udatamem.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- udatamem.c	10 Sep 2003 02:42:03 -0000	1.4
+++ udatamem.c	6 Apr 2004 10:08:04 -0000	1.5
@@ -25,6 +25,7 @@
 
 void UDataMemory_init(UDataMemory *This) {
     uprv_memset(This, 0, sizeof(UDataMemory));
+    This->length=-1;
 }
 
 
@@ -100,12 +101,58 @@
 U_CAPI const void * U_EXPORT2
 udata_getMemory(UDataMemory *pData) {
     if(pData!=NULL && pData->pHeader!=NULL) {
-        return (char *)(pData->pHeader)+pData->pHeader->dataHeader.headerSize;
+        return (char *)(pData->pHeader)+udata_getHeaderSize(pData->pHeader);
     } else {
         return NULL;
     }
 }
 
+/**
+ * Get the length of the data item if possible.
+ * The length may be up to 15 bytes larger than the actual data.
+ *
+ * TODO Consider making this function public.
+ * It would have to return the actual length in more cases.
+ * For example, the length of the last item in a .dat package could be
+ * computed from the size of the whole .dat package minus the offset of the
+ * last item.
+ * The size of a file that was directly memory-mapped could be determined
+ * using some system API.
+ *
+ * In order to get perfect values for all data items, we may have to add a
+ * length field to UDataInfo, but that complicates data generation
+ * and may be overkill.
+ *
+ * @param pData The data item.
+ * @return the length of the data item, or -1 if not known
+ * @internal Currently used only in cintltst/udatatst.c
+ */
+U_CAPI int32_t U_EXPORT2
+udata_getLength(UDataMemory *pData) {
+    if(pData!=NULL && pData->pHeader!=NULL && pData->length>=0) {
+        /*
+         * subtract the header size,
+         * return only the size of the actual data starting at udata_getMemory()
+         */
+        return pData->length-udata_getHeaderSize(pData->pHeader);
+    } else {
+        return -1;
+    }
+}
+
+/**
+ * Get the memory including the data header.
+ * Used in cintltst/udatatst.c
+ * @internal
+ */
+U_CAPI const void * U_EXPORT2
+udata_getRawMemory(UDataMemory *pData) {
+    if(pData!=NULL && pData->pHeader!=NULL) {
+        return pData->pHeader;
+    } else {
+        return NULL;
+    }
+}
 
 UBool  UDataMemory_isLoaded(UDataMemory *This) {
     return This->pHeader != NULL;

Index: udatamem.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/udatamem.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- udatamem.h	10 Sep 2003 02:42:03 -0000	1.3
+++ udatamem.h	6 Apr 2004 10:08:04 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************/
@@ -39,6 +39,7 @@
                                    /* Only non-null if a close operation should unmap */
                                    /*  the associated data, and additional info       */
                                    /*   beyond the mapAddr is needed to do that.      */
+    int32_t           length;      /* Length of the data in bytes; -1 if unknown.     */
 };
 
 UDataMemory     *UDataMemory_createNewInstance(UErrorCode *pErr);
@@ -49,5 +50,12 @@
 
 
 const DataHeader *UDataMemory_normalizeDataPointer(const void *p);
+
+U_CAPI int32_t U_EXPORT2
+udata_getLength(UDataMemory *pData);
+
+U_CAPI const void * U_EXPORT2
+udata_getRawMemory(UDataMemory *pData);
+
 #endif
 

Index: uhash.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uhash.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- uhash.c	10 Sep 2003 02:42:03 -0000	1.5
+++ uhash.c	6 Apr 2004 10:08:04 -0000	1.6
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ******************************************************************************
 *   Date        Name        Description
@@ -273,6 +273,14 @@
     return _uhash_find(hash, keyholder, hash->keyHasher(keyholder))->value.integer;
 }
 
+U_CAPI int32_t U_EXPORT2
+uhash_igeti(const UHashtable *hash,
+           int32_t key) {
+    UHashTok keyholder;
+    keyholder.integer = key;
+    return _uhash_find(hash, keyholder, hash->keyHasher(keyholder))->value.integer;
+}
+
 U_CAPI void* U_EXPORT2
 uhash_put(UHashtable *hash,
           void* key,
@@ -309,6 +317,20 @@
     valueholder.integer = value;
     return _uhash_put(hash, keyholder, valueholder,
                       HINT_KEY_POINTER,
+                      status).integer;
+}
+
+
+U_CAPI int32_t U_EXPORT2
+uhash_iputi(UHashtable *hash,
+           int32_t key,
+           int32_t value,
+           UErrorCode *status) {
+    UHashTok keyholder, valueholder;
+    keyholder.integer = key;
+    valueholder.integer = value;
+    return _uhash_put(hash, keyholder, valueholder,
+                      0, /* neither is a ptr */
                       status).integer;
 }
 

Index: uhash.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uhash.h,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- uhash.h	10 Sep 2003 02:42:03 -0000	1.5
+++ uhash.h	6 Apr 2004 10:08:04 -0000	1.6
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ******************************************************************************
 *   Date        Name        Description
@@ -342,6 +342,24 @@
            UErrorCode *status);
 
 /**
+ * Put a (key=integer, value=integer) item in a UHashtable.  If the
+ * keyDeleter is non-NULL, then the hashtable owns 'key' after this
+ * call.  valueDeleter must be NULL.  Storing a 0 value is the same as
+ * calling uhash_remove().
+ * @param hash The target UHashtable.
+ * @param key The key to store.
+ * @param value The integer value to store.
+ * @param status A pointer to an UErrorCode to receive any errors.
+ * @return The previous value, or 0 if none.
+ * @see uhash_get
+ */
+U_CAPI int32_t U_EXPORT2 
+uhash_iputi(UHashtable *hash,
+           int32_t key,
+           int32_t value,
+           UErrorCode *status);
+
+/**
  * Retrieve a pointer value from a UHashtable using a pointer key,
  * as previously stored by uhash_put().
  * @param hash The target UHashtable.
@@ -373,6 +391,16 @@
 U_CAPI int32_t U_EXPORT2 
 uhash_geti(const UHashtable *hash,
            const void* key);
+/**
+ * Retrieve an integer value from a UHashtable using an integer key,
+ * as previously stored by uhash_iputi().
+ * @param hash The target UHashtable.
+ * @param key An integer key stored in a hashtable
+ * @return The requested item, or 0 if not found.
+ */
+U_CAPI int32_t U_EXPORT2 
+uhash_igeti(const UHashtable *hash,
+           int32_t key);
 
 /**
  * Remove an item from a UHashtable stored by uhash_put().

Index: uidna.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uidna.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uidna.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ uidna.cpp	6 Apr 2004 10:08:04 -0000	1.2
@@ -20,7 +20,7 @@
 
 #include "unicode/uidna.h"
 #include "unicode/ustring.h"
-#include "strprep.h"
+#include "unicode/usprep.h"
 #include "punycode.h"
 #include "ustr_imp.h"
 #include "cmemory.h"
@@ -40,6 +40,7 @@
 #define CAPITAL_Z        0x005A
 #define LOWER_CASE_DELTA 0x0020
 #define FULL_STOP        0x002E
+#define DATA_FILE_NAME   "uidna"
 
 inline static UChar 
 toASCIILower(UChar ch){
@@ -120,20 +121,87 @@
 }
 
 
-U_CAPI int32_t U_EXPORT2
-uidna_toASCII(const UChar* src, int32_t srcLength, 
-              UChar* dest, int32_t destCapacity,
-              int32_t options,
-              UParseError* parseError,
-              UErrorCode* status){
-    
-    if(status == NULL || U_FAILURE(*status)){
-        return 0;
+/**
+ * Ascertain if the given code point is a label separator as 
+ * defined by the IDNA RFC
+ * 
+ * @param ch The code point to be ascertained
+ * @return true if the char is a label separator
+ * @draft ICU 2.8
+ */
+static inline UBool isLabelSeparator(UChar ch){
+    switch(ch){
+        case 0x002e:
+        case 0x3002:
+        case 0xFF0E:
+        case 0xFF61:
+            return TRUE;
+        default:
+            return FALSE;           
     }
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
-        *status = U_ILLEGAL_ARGUMENT_ERROR;
-        return 0;
+}
+
+// returns the length of the label excluding the separator
+// if *limit == separator then the length returned does not include 
+// the separator.
+static inline int32_t
+getNextSeparator(UChar *src,int32_t srcLength,
+                 UChar **limit,
+                 UBool *done){
+    if(srcLength == -1){
+        int32_t i;
+        for(i=0 ; ;i++){
+            if(src[i] == 0){
+                *limit = src + i; // point to null
+                *done = TRUE;
+                return i;
+            }
+            if(isLabelSeparator(src[i])){
+                *limit = src + (i+1); // go past the delimiter
+                return i;
+                
+            }
+        }
+    }else{
+        int32_t i;
+        for(i=0;i<srcLength;i++){
+            if(isLabelSeparator(src[i])){
+                *limit = src + (i+1); // go past the delimiter
+                return i;
+            }
+        }
+        // we have not found the delimiter
+        // if(i==srcLength)
+        *limit = src+srcLength;
+        *done = TRUE;
+
+        return i;
     }
+}
+static inline UBool isLDHChar(UChar ch){
+    // fast path: nothing above 'z' (U+007A) is an LDH character
+    if(ch>0x007A){
+        return FALSE;
+    }
+    //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
+    if( (ch==0x002D) || 
+        (0x0030 <= ch && ch <= 0x0039) ||
+        (0x0041 <= ch && ch <= 0x005A) ||
+        (0x0061 <= ch && ch <= 0x007A)
+      ){
+        return TRUE;
+    }
+    return FALSE;
+}
+
+static int32_t 
+_internal_toASCII(const UChar* src, int32_t srcLength, 
+                  UChar* dest, int32_t destCapacity,
+                  int32_t options,
+                  UStringPrepProfile* nameprep,
+                  UParseError* parseError,
+                  UErrorCode* status){
+
     UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE];
     //initialize pointers to stack buffers
     UChar  *b1 = b1Stack, *b2 = b2Stack;
@@ -142,7 +210,7 @@
             b2Capacity = MAX_LABEL_BUFFER_SIZE ,
             reqLength=0;
 
-
+    int32_t namePrepOptions = ((options & UIDNA_ALLOW_UNASSIGNED) != 0) ? USPREP_ALLOW_UNASSIGNED: 0;
     UBool* caseFlags = NULL;
     
     // the source contains all ascii codepoints
@@ -153,18 +221,12 @@
     int32_t j=0;
 
     //get the options
-    UBool allowUnassigned   = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);
     UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);
-    
+
     int32_t failPos = -1;
-    // step 2
-    StringPrep* prep = StringPrep::createNameprepInstance(*status);
 
-    if(U_FAILURE(*status)){
-        goto CLEANUP;
-    }
-    
-    b1Len = prep->process(src,srcLength,b1, b1Capacity,allowUnassigned, parseError, *status);
+    // step 2    
+    b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Capacity, namePrepOptions, parseError, status);
     
     if(*status == U_BUFFER_OVERFLOW_ERROR){
         // redo processing of string
@@ -177,7 +239,7 @@
 
         *status = U_ZERO_ERROR; // reset error
         
-        b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned, parseError, *status);
+        b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Len, namePrepOptions, parseError, status);
     }
     // error bail out
     if(U_FAILURE(*status)){
@@ -192,7 +254,7 @@
         // here we do not assemble surrogates
         // since we know that LDH code points
         // are in the ASCII range only
-        if(prep->isLDHChar(b1[j])==FALSE){
+        if(isLDHChar(b1[j])==FALSE){
             srcIsLDH = FALSE;
             failPos = j;
         }
@@ -292,30 +354,20 @@
     }
     uprv_free(caseFlags);
     
-    delete prep;
-
     return u_terminateUChars(dest, destCapacity, reqLength, status);
 }
 
-
-U_CAPI int32_t U_EXPORT2
-uidna_toUnicode(const UChar* src, int32_t srcLength,
-                UChar* dest, int32_t destCapacity,
-                int32_t options,
-                UParseError* parseError,
-                UErrorCode* status){
-
-    if(status == NULL || U_FAILURE(*status)){
-        return 0;
-    }
-    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
-        *status = U_ILLEGAL_ARGUMENT_ERROR;
-        return 0;
-    }
+static int32_t
+_internal_toUnicode(const UChar* src, int32_t srcLength,
+                    UChar* dest, int32_t destCapacity,
+                    int32_t options,
+                    UStringPrepProfile* nameprep,
+                    UParseError* parseError,
+                    UErrorCode* status){
 
     //get the options
-    UBool allowUnassigned   = (UBool)((options & UIDNA_ALLOW_UNASSIGNED) != 0);
     UBool useSTD3ASCIIRules = (UBool)((options & UIDNA_USE_STD3_RULES) != 0);
+    int32_t namePrepOptions = ((options & UIDNA_ALLOW_UNASSIGNED) != 0) ? USPREP_ALLOW_UNASSIGNED: 0; 
     
     UChar b1Stack[MAX_LABEL_BUFFER_SIZE], b2Stack[MAX_LABEL_BUFFER_SIZE], b3Stack[MAX_LABEL_BUFFER_SIZE];
 
@@ -326,8 +378,7 @@
             b2Capacity = MAX_LABEL_BUFFER_SIZE,
             b3Capacity = MAX_LABEL_BUFFER_SIZE,
             reqLength=0;
-    
-    StringPrep* prep = StringPrep::createNameprepInstance(*status);
+
     b1Len = 0;
     UBool* caseFlags = NULL;
 
@@ -335,10 +386,6 @@
     UBool srcIsLDH = TRUE;
     int32_t failPos =0;
 
-    if(U_FAILURE(*status)){
-        goto CLEANUP;
-    }
-
     // step 1: find out if all the codepoints in src are ASCII  
     if(srcLength==-1){
         srcLength = 0;
@@ -349,7 +396,7 @@
             // here we do not assemble surrogates
             // since we know that LDH code points
             // are in the ASCII range only
-            if(prep->isLDHChar(src[srcLength])==FALSE){
+            if(isLDHChar(src[srcLength])==FALSE){
                 srcIsLDH = FALSE;
                 failPos = srcLength;
             }
@@ -363,7 +410,7 @@
             // here we do not assemble surrogates
             // since we know that LDH code points
             // are in the ASCII range only
-            if(prep->isLDHChar(src[j])==FALSE){
+            if(isLDHChar(src[j])==FALSE){
                 srcIsLDH = FALSE;
                 failPos = j;
             }
@@ -372,7 +419,7 @@
 
     if(srcIsASCII == FALSE){
         // step 2: process the string
-        b1Len = prep->process(src,srcLength,b1,b1Capacity,allowUnassigned, parseError, *status);
+        b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Capacity, namePrepOptions, parseError, status);
         if(*status == U_BUFFER_OVERFLOW_ERROR){
             // redo processing of string
             /* we do not have enough room so grow the buffer*/
@@ -384,7 +431,7 @@
 
             *status = U_ZERO_ERROR; // reset error
             
-            b1Len = prep->process(src,srcLength,b1, b1Len,allowUnassigned,  parseError, *status);
+            b1Len = usprep_prepare(nameprep, src, srcLength, b1, b1Len, namePrepOptions, parseError, status);
         }
         //bail out on error
         if(U_FAILURE(*status)){
@@ -495,8 +542,7 @@
     }
     uprv_free(caseFlags);
     
-    delete prep;
-    
+   
     // The RFC states that 
     // <quote>
     // ToUnicode never fails. If any step fails, then the original input
@@ -518,46 +564,65 @@
     return u_terminateUChars(dest, destCapacity, reqLength, status);
 }
 
-// returns the length of the label excluding the separator
-// if *limit == separator then the length returned does not include 
-// the separtor.
-static int32_t
-getNextSeparator(UChar *src,int32_t srcLength,StringPrep* prep,
-                 UChar **limit,
-                 UBool *done,
-                 UErrorCode *status){
-    if(srcLength == -1){
-        int32_t i;
-        for(i=0 ; ;i++){
-            if(src[i] == 0){
-                *limit = src + i; // point to null
-                *done = TRUE;
-                return i;
-            }
-            if(prep->isLabelSeparator(src[i],*status)){
-                *limit = src + (i+1); // go past the delimiter
-                return i;
-                
-            }
-        }
-    }else{
-        int32_t i;
-        for(i=0;i<srcLength;i++){
-            if(prep->isLabelSeparator(src[i],*status)){
-                *limit = src + (i+1); // go past the delimiter
-                return i;
-            }
-        }
-        // we have not found the delimiter
-        // if(i==srcLength)
-        *limit = src+srcLength;
-        *done = TRUE;
+U_CAPI int32_t U_EXPORT2
+uidna_toASCII(const UChar* src, int32_t srcLength, 
+              UChar* dest, int32_t destCapacity,
+              int32_t options,
+              UParseError* parseError,
+              UErrorCode* status){
+    
+    if(status == NULL || U_FAILURE(*status)){
+        return 0;
+    }
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
 
-        return i;
+    UStringPrepProfile* nameprep = usprep_open(NULL,DATA_FILE_NAME, status);
+    
+    if(U_FAILURE(*status)){
+        return -1;
     }
+    
+    int32_t retLen = _internal_toASCII(src, srcLength, dest, destCapacity, options, nameprep, parseError, status);
+    
+    /* close the profile*/
+    usprep_close(nameprep);
+    
+    return retLen;
 }
 
 U_CAPI int32_t U_EXPORT2
+uidna_toUnicode(const UChar* src, int32_t srcLength,
+                UChar* dest, int32_t destCapacity,
+                int32_t options,
+                UParseError* parseError,
+                UErrorCode* status){
+
+    if(status == NULL || U_FAILURE(*status)){
+        return 0;
+    }
+    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }  
+    
+    UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status);
+    
+    if(U_FAILURE(*status)){
+        return -1;
+    }
+    
+    int32_t retLen = _internal_toUnicode(src, srcLength, dest, destCapacity, options, nameprep, parseError, status);
+
+    usprep_close(nameprep);
+    
+    return retLen;
+}
+
+
+U_CAPI int32_t U_EXPORT2
 uidna_IDNToASCII(  const UChar *src, int32_t srcLength,
                    UChar* dest, int32_t destCapacity,
                    int32_t options,
@@ -574,7 +639,7 @@
 
     int32_t reqLength = 0;
 
-    StringPrep* prep = StringPrep::createNameprepInstance(*status);
+    UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status);
     
     if(U_FAILURE(*status)){
         return 0;
@@ -592,11 +657,12 @@
 
     for(;;){
 
-        labelLen = getNextSeparator(labelStart,remainingLen, prep, &delimiter,&done, status);
+        labelLen = getNextSeparator(labelStart,remainingLen, &delimiter,&done);
         
-        labelReqLength = uidna_toASCII( labelStart, labelLen, 
-                                        currentDest, remainingDestCapacity, 
-                                        options, parseError, status);
+        labelReqLength = _internal_toASCII( labelStart, labelLen, 
+                                            currentDest, remainingDestCapacity, 
+                                            options, nameprep, 
+                                            parseError, status);
 
         if(*status == U_BUFFER_OVERFLOW_ERROR){
             
@@ -636,7 +702,7 @@
 
     }
    
-    delete prep;
+    usprep_close(nameprep);
     
     return u_terminateUChars(dest, destCapacity, reqLength, status);
 }
@@ -658,7 +724,7 @@
 
     int32_t reqLength = 0;
 
-    StringPrep* prep = StringPrep::createNameprepInstance(*status);
+    UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status);
     
     if(U_FAILURE(*status)){
         return 0;
@@ -676,11 +742,12 @@
 
     for(;;){
 
-        labelLen = getNextSeparator(labelStart,remainingLen, prep, &delimiter,&done, status);
+        labelLen = getNextSeparator(labelStart,remainingLen, &delimiter,&done);
         
-        labelReqLength = uidna_toUnicode(labelStart, labelLen, 
-                                         currentDest, remainingDestCapacity, 
-                                         options, parseError, status);
+        labelReqLength = _internal_toUnicode(labelStart, labelLen, 
+                                             currentDest, remainingDestCapacity, 
+                                             options, nameprep, 
+                                             parseError, status);
 
         if(*status == U_BUFFER_OVERFLOW_ERROR){
             
@@ -721,7 +788,7 @@
 
     }
    
-    delete prep;
+    usprep_close(nameprep);
     
     return u_terminateUChars(dest, destCapacity, reqLength, status);
 }

Index: uiter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uiter.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uiter.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ uiter.cpp	6 Apr 2004 10:08:04 -0000	1.2
@@ -232,13 +232,19 @@
  * except that UChars are assembled from byte pairs.
  */
 
+/* internal helper: assemble the big-endian UTF-16 code unit at 'index' from its byte pair */
+static inline UChar32
+utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
+    const uint8_t *p=(const uint8_t *)iter->context;
+    return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
+}
+
 static UChar32 U_CALLCONV
 utf16BEIteratorCurrent(UCharIterator *iter) {
     int32_t index;
 
     if((index=iter->index)<iter->limit) {
-        const uint8_t *p=(const uint8_t *)iter->context;
-        return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
+        return utf16BEIteratorGet(iter, index);
     } else {
         return U_SENTINEL;
     }
@@ -249,9 +255,8 @@
     int32_t index;
 
     if((index=iter->index)<iter->limit) {
-        const uint8_t *p=(const uint8_t *)iter->context;
         iter->index=index+1;
-        return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
+        return utf16BEIteratorGet(iter, index);
     } else {
         return U_SENTINEL;
     }
@@ -262,9 +267,8 @@
     int32_t index;
 
     if((index=iter->index)>iter->start) {
-        const uint8_t *p=(const uint8_t *)iter->context;
         iter->index=--index;
-        return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
+        return utf16BEIteratorGet(iter, index);
     } else {
         return U_SENTINEL;
     }
@@ -758,7 +762,7 @@
             iter->index=iter->length; /* may or may not be <0 (unknown) */
             iter->start=iter->limit;
             iter->reservedField=0;
-            return iter->index>=0 ? iter->index : UITER_UNKNOWN_INDEX;
+            return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
         }
     }
 
@@ -840,7 +844,7 @@
 
 static UBool U_CALLCONV
 utf8IteratorHasNext(UCharIterator *iter) {
-    return iter->reservedField!=0 || iter->start<iter->limit;
+    return iter->start<iter->limit || iter->reservedField!=0;
 }
 
 static UBool U_CALLCONV

Index: uloc.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uloc.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uloc.c	10 Sep 2003 02:42:03 -0000	1.4
+++ uloc.c	6 Apr 2004 10:08:04 -0000	1.5
@@ -37,6 +37,10 @@
 #include "cstring.h"
 #include "cmemory.h"
 #include "ucln_cmn.h"
+#include "locmap.h"
+#include "uarrsort.h"
+#include "uenumimp.h"
+
 
 /****************************************************************************
   Global variable and type definitions
@@ -46,28 +50,33 @@
[...1725 lines suppressed...]
+                  while(nextSeparator[i - 1] == ' ') {
+                      i--;
+                  }
+                  uprv_strncpy(buffer, nextSeparator, i);
+                  result = u_terminateChars(buffer, bufferCapacity, i, status);
+              } else {
+                  /* give a bigger buffer, please */
+                  *status = U_BUFFER_OVERFLOW_ERROR;
+                  if(startSearchHere) {
+                      result = startSearchHere - nextSeparator;
+                  } else {
+                      result = uprv_strlen(nextSeparator); 
+                  }
+              }
+              return result;
+          }
+      }
+    }
+    return 0;
 }

Index: umapfile.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/umapfile.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- umapfile.c	10 Sep 2003 02:42:03 -0000	1.5
+++ umapfile.c	6 Apr 2004 10:08:05 -0000	1.6
@@ -46,7 +46,7 @@
 #   define MAP_IMPLEMENTATION MAP_WIN32
 
 /* ### Todo: properly auto detect mmap(). Until then, just add your platform here. */
-#elif U_HAVE_MMAP || defined(AIX) || defined(HPUX) || defined(OS390) || defined(PTX)
+#elif U_HAVE_MMAP || defined(U_AIX) || defined(U_HPUX) || defined(OS390) || defined(PTX)
     typedef size_t MemoryMap;
 
 #   define IS_MAP(map) ((map)!=0)
@@ -177,7 +177,7 @@
         }
 
         /* get a view of the mapping */
-#ifndef HPUX
+#ifndef U_HPUX
         data=mmap(0, length, PROT_READ, MAP_SHARED,  fd, 0);
 #else
         data=mmap(0, length, PROT_READ, MAP_PRIVATE, fd, 0);

Index: umutex.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/umutex.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- umutex.c	10 Sep 2003 02:42:03 -0000	1.4
+++ umutex.c	6 Apr 2004 10:08:05 -0000	1.5
@@ -32,28 +32,14 @@
 #endif
 
 
-/* Check our settings... */
 #include "unicode/utypes.h"
 #include "uassert.h"
+#include "ucln_cmn.h"
 
 
 #if defined(POSIX) && (ICU_USE_THREADS==1)
-  /* Usage: uncomment the following, and breakpoint WeAreDeadlocked to
-     find reentrant issues. */
-/* # define POSIX_DEBUG_REENTRANCY 1 */
 # include <pthread.h> /* must be first, so that we get the multithread versions of things. */
 
-# ifdef POSIX_DEBUG_REENTRANCY
- pthread_t      gLastThread;
- UBool         gInMutex;
-
- U_EXPORT void WeAreDeadlocked();
-
- void WeAreDeadlocked()
- {
-    puts("ARGH!! We're deadlocked.. break on WeAreDeadlocked() next time.");
- }
-# endif /* POSIX_DEBUG_REENTRANCY */
 #endif /* POSIX && (ICU_USE_THREADS==1) */
 
 #ifdef WIN32
@@ -69,191 +55,271 @@
 #include "umutex.h"
 #include "cmemory.h"
 
-#if (ICU_USE_THREADS == 1)
+/*
+ * A note on ICU Mutex Initialization and ICU startup:
+ *
+ *   ICU mutexes, as used through the rest of the ICU code, are self-initializing.
+ *   To make this work, ICU uses the _ICU Global Mutex_ to synchronize the lazy init
+ *   of other ICU mutexes.  For the global mutex itself, we need some other mechanism
+ *   to safely initialize it on first use.  This becomes important if two or more
+ *   threads were more or less simultaneously the first to use ICU in a process, and
+ *   were racing into the mutex initialization code.
+ *
+ *   The solution for the global mutex init is platform dependent.
+ *   On POSIX systems, C-style init can be used on a mutex, with the 
+ *   macro PTHREAD_MUTEX_INITIALIZER.  The mutex is then ready for use, without
+ *   first calling pthread_mutex_init().
+ *
+ *   Windows has no equivalent statically initialized mutex or CRITICAL_SECTION.
+ *   InitializeCriticalSection() must be called.  If the global mutex does not
+ *   appear to be initialized, a thread will create and initialize a new
+ *   CRITICAL_SECTION, then use a Windows InterlockedCompareExchange to
+ *   avoid problems with race conditions.
+ *
+ *   If an application has overridden the ICU mutex implementation
+ *   by calling u_setMutexFunctions(), the user supplied init function must
+ *   be safe in the event that multiple threads concurrently attempt to init
+ *   the same mutex.  The first thread should do the init, and the others should
+ *   have no effect.
+ *
+ */ 
 
-/* the global mutex. Use it proudly and wash it often. */
-static UMTX    gGlobalMutex = NULL;
-# ifdef _DEBUG
-static int32_t gRecursionCount = 0;       /* Detect Recursive entries.  For debugging only. */
-# endif
+/* The global ICU mutex.   */
+#if defined(WIN32) 
+static UMTX              gGlobalMutex      = NULL;
 
-#if defined(WIN32)
-static CRITICAL_SECTION gPlatformMutex;
+#elif defined(POSIX) 
+#if (ICU_USE_THREADS == 1)
+static pthread_mutex_t   gGlobalPosixMutex = PTHREAD_MUTEX_INITIALIZER;
+static UMTX              gGlobalMutex      = &gGlobalPosixMutex;
+static UMTX              gIncDecMutex      = NULL;
+#else
+static UMTX              gGlobalMutex      = NULL;
+static UMTX              gIncDecMutex      = NULL;
+#endif  
 
-#elif defined(POSIX)
-static pthread_mutex_t gPlatformMutex;    /* The global ICU mutex   */
-static pthread_mutex_t gIncDecMutex;      /* For use by atomic inc/dec, on Unixes only */    
+#else 
+/* Unknown platform.  OK so long as ICU_USE_THREADS is not set.  
+                      Note that user can still set mutex functions at run time,
+                      and that the global mutex variable is still needed in that case. */
+static UMTX              gGlobalMutex      = NULL;
+#if (ICU_USE_THREADS == 1)
+#error no ICU mutex implementation for this platform
+#endif
+#endif
 
+/* Detect Recursive locking of the global mutex.  For debugging only. */
+#if defined(WIN32) && defined(_DEBUG) && (ICU_USE_THREADS==1)
+static int32_t gRecursionCount = 0;       
 #endif
-#endif /* ICU_USE_THREADS==1 */
 
 
+/*
+ *  User mutex implementation functions.  If non-null, call back to these rather than
+ *  directly using the system (Posix or Windows) APIs.
+ *    (declarations are in uclean.h)
+ */
+static UMtxInitFn    *pMutexInitFn    = NULL;
+static UMtxFn        *pMutexDestroyFn = NULL;
+static UMtxFn        *pMutexLockFn    = NULL;
+static UMtxFn        *pMutexUnlockFn  = NULL;
+static const void    *gMutexContext   = NULL;
 
-U_CAPI UBool U_EXPORT2
-umtx_isInitialized(UMTX *mutex)
-{
-#if (ICU_USE_THREADS == 1)
-    if (mutex == NULL)
-    {
-        return (UBool)(gGlobalMutex != NULL);
-    } else {
-        UBool isInited;
-        umtx_lock(NULL);
-        isInited = (*mutex != NULL);
-        umtx_unlock(NULL);
-        return isInited;
-    }
-#else
-    return TRUE;    /* Since we don't use threads, it's considered initialized. */
-#endif /* ICU_USE_THREADS==1 */
-}
 
+
+/*
+ *   umtx_lock
+ */
 U_CAPI void  U_EXPORT2
 umtx_lock(UMTX *mutex)
 {
-#if (ICU_USE_THREADS == 1)
-    if (mutex == NULL)
-    {
+    if (mutex == NULL) {
         mutex = &gGlobalMutex;
     }
 
-    if (*mutex == NULL)
-    {
-        /* Lazy init of a non-global mutexes on first lock is NOT safe on processors
-         *  that reorder memory operations.  */
-        /* U_ASSERT(FALSE);    TODO:  Turn this back on */
-        if (mutex != &gGlobalMutex) {
-            umtx_init(mutex);
-        } else {
-            umtx_init(NULL);  /* initialize the global mutex - only get 
-                                 here if C++ static init is NOT working,
-                                 and u_init() hasn't been called.
-                                 
-                                 Not thread-safe if this call is contended! */
-        }
+    if (*mutex == NULL) {
+        /* Lock of an uninitialized mutex.  Initialize it before proceeding.   */
+        umtx_init(mutex);    
     }
 
-#if defined(WIN32)
+    if (pMutexLockFn != NULL) {
+        (*pMutexLockFn)(gMutexContext, mutex);
+    } else {
 
-    EnterCriticalSection((CRITICAL_SECTION*) *mutex);
-    #ifdef _DEBUG
-    if (mutex == &gGlobalMutex) {
-        gRecursionCount++;
-        U_ASSERT(gRecursionCount == 1);
+#if (ICU_USE_THREADS == 1)
+#if defined(WIN32)
+        EnterCriticalSection((CRITICAL_SECTION*) *mutex);
+#elif defined(POSIX)
+        pthread_mutex_lock((pthread_mutex_t*) *mutex);
+#endif   /* cascade of platforms */
+#endif /* ICU_USE_THREADS==1 */
     }
-    #endif /*_DEBUG*/
 
-#elif defined(POSIX)
+#if defined(WIN32) && defined(_DEBUG) && (ICU_USE_THREADS==1)
+        if (mutex == &gGlobalMutex) {         /* Detect Reentrant locking of the global mutex.      */
+            gRecursionCount++;                /* Recursion causes deadlocks on Unixes.              */
+            U_ASSERT(gRecursionCount == 1);   /* Detection works on Windows.  Debug problems there. */
+        }
+#endif /*_DEBUG*/
+}
 
-#  ifdef POSIX_DEBUG_REENTRANCY
-    if (gInMutex == TRUE && mutex == &gGlobalMutex) /* in the mutex -- possible deadlock*/
-        if(pthread_equal(gLastThread, pthread_self()))
-            WeAreDeadlocked();
-#  endif
-    pthread_mutex_lock((pthread_mutex_t*) *mutex);
 
-#  ifdef POSIX_DEBUG_REENTRANCY
-    if (mutex == &gGlobalMutex) {
-        gLastThread = pthread_self();
-        gInMutex = TRUE;
-    }
-#  endif
-#endif
-#endif /* ICU_USE_THREADS==1 */
-}
 
+/*
+ * umtx_unlock
+ */
 U_CAPI void  U_EXPORT2
 umtx_unlock(UMTX* mutex)
 {
-#if (ICU_USE_THREADS==1)
-    if(mutex == NULL)
-    {
+    if(mutex == NULL) {
         mutex = &gGlobalMutex;
     }
 
-    if(*mutex == NULL)
-    {
-        return; /* jitterbug 135, fix for multiprocessor machines */
+    if(*mutex == NULL)    {
+        U_ASSERT(FALSE);  /* This mutex is not initialized.     */
+        return; 
     }
 
-#if defined (WIN32)
-    #ifdef _DEBUG
+#if defined (WIN32) && defined (_DEBUG) && (ICU_USE_THREADS==1)
     if (mutex == &gGlobalMutex) {
         gRecursionCount--;
-        U_ASSERT(gRecursionCount == 0);
-    }
-    #endif /*_DEBUG*/
-    LeaveCriticalSection((CRITICAL_SECTION*)*mutex);
-
-#elif defined (POSIX)
-    pthread_mutex_unlock((pthread_mutex_t*)*mutex);
-
-#ifdef POSIX_DEBUG_REENTRANCY
-    if (mutex == &gGlobalMutex) {
-        gInMutex = FALSE;
+        U_ASSERT(gRecursionCount == 0);  /* Detect unlock of an already unlocked mutex */
     }
 #endif
 
-#endif
+    if (pMutexUnlockFn) {
+        (*pMutexUnlockFn)(gMutexContext, mutex);
+    } else {
+#if (ICU_USE_THREADS==1)
+#if defined (WIN32)
+        LeaveCriticalSection((CRITICAL_SECTION*)*mutex);
+#elif defined (POSIX)
+        pthread_mutex_unlock((pthread_mutex_t*)*mutex);
+#endif  /* cascade of platforms */
 #endif /* ICU_USE_THREADS == 1 */
+    }
 }
 
 
 
 /*
  *   umtx_raw_init    Do the platform specific mutex allocation and initialization
+ *                    for all ICU mutexes _except_ the ICU global mutex.
  */
+static void umtx_raw_init(UMTX *mutex) {
+    if (pMutexInitFn != NULL) {
+        UErrorCode status = U_ZERO_ERROR;
+        (*pMutexInitFn)(gMutexContext, mutex, &status);
+        if (U_FAILURE(status)) {
+            /* TODO:  how should errors here be handled? */
+            return;
+        }
+    } else {
+
 #if (ICU_USE_THREADS == 1)
-static UMTX umtx_raw_init(void  *mem) {
     #if defined (WIN32)
-        if (mem == NULL) {
-            mem = uprv_malloc(sizeof(CRITICAL_SECTION));
-            if (mem == NULL) {return NULL;}
+        CRITICAL_SECTION *cs = uprv_malloc(sizeof(CRITICAL_SECTION));
+        if (cs == NULL) {
+            return;
         }
-        InitializeCriticalSection((CRITICAL_SECTION*)mem);
+        InitializeCriticalSection(cs);
+        *mutex = cs;
     #elif defined( POSIX )
-        if (mem == NULL) {
-            mem = uprv_malloc(sizeof(pthread_mutex_t));
-            if (mem == NULL) {return NULL;}
+        pthread_mutex_t *m = (pthread_mutex_t *)uprv_malloc(sizeof(pthread_mutex_t));
+        if (m == NULL) {
+            return;
         }
         # if defined (HPUX_CMA)
-            pthread_mutex_init((pthread_mutex_t*)mem, pthread_mutexattr_default);
+            pthread_mutex_init(m, pthread_mutexattr_default);
         # else
-            pthread_mutex_init((pthread_mutex_t*)mem, NULL);
+            pthread_mutex_init(m, NULL);
         # endif
-    #endif
-    return (UMTX *)mem;
+        *mutex = m;
+    #endif /* cascade of platforms */
+#else  /* ICU_USE_THREADS */
+        *mutex = mutex;      /* With no threads, we must still set the mutex to
+                              * some non-null value to make the rest of the
+                              *   (not ifdefed) mutex code think that it is initialized.
+                              */
+#endif /* ICU_USE_THREADS */
+    }
 }
-#endif  /* ICU_USE_THREADS */
 
 
-U_CAPI void  U_EXPORT2
-umtx_init(UMTX *mutex)
-{
-#if (ICU_USE_THREADS == 1)
 
-    if (mutex == NULL) /* initialize the global mutex */
+/*
+ *   initGlobalMutex    Do the platform specific initialization of the ICU global mutex.
+ *                      Separated out from the other mutexes because it is different:
+ *                      Mutex storage is static for POSIX, init must be thread safe 
+ *                      without the use of another mutex.
+ */
+static void initGlobalMutex() {
+    /*
+     * Call user mutex init function if one has been specified and the global mutex
+     *  is not already initialized.  
+     */
+    if (pMutexInitFn != NULL) {
+        if (gGlobalMutex==NULL) {
+            UErrorCode status = U_ZERO_ERROR;
+            (*pMutexInitFn)(gMutexContext, &gGlobalMutex, &status);
+            if (U_FAILURE(status)) {
+                /* TODO:  how should errors here be handled? */
+                return;
+            }
+        }
+        return;
+    }
+
+    /* No user override of mutex functions.
+     *   Use default ICU mutex implementations.
+     */
+#if (ICU_USE_THREADS == 1)
+    #if defined (WIN32)
     {
-        /* Note:  The initialization of the global mutex is NOT thread safe.   */
-        if (gGlobalMutex != NULL) {
-            return;
+        void              *t;
+        CRITICAL_SECTION  *ourCritSec = uprv_malloc(sizeof(CRITICAL_SECTION)); 
+        InitializeCriticalSection(ourCritSec);
+#if defined (InterlockedCompareExchangePointer) || defined (_WIN64)
+        t = InterlockedCompareExchangePointer(&gGlobalMutex, ourCritSec, NULL);
+#else
+        /* Note that the headers from Microsoft's Windows SDK define InterlockedCompareExchangePointer
+         * for all platforms, but the old headers included with MSVC 6 do not.
+         */
+        t = (void *)InterlockedCompareExchange(&gGlobalMutex, ourCritSec, NULL);
+#endif
+        if (t != NULL) {
+            /* Some other thread stored into gGlobalMutex first.  Discard the critical
+             *  section we just created; the system will go with the other one.
+             */
+            DeleteCriticalSection(ourCritSec);
+            uprv_free(ourCritSec);
         }
-        gGlobalMutex = umtx_raw_init(&gPlatformMutex);
+    }
+    #elif defined( POSIX )
+        /*  No Action Required.  Global mutex set up with C static initialization. */
+        U_ASSERT(gGlobalMutex == &gGlobalPosixMutex);
+    #endif /* cascade of platforms */
+#else  /* ICU_USE_THREADS */
+        gGlobalMutex = &gGlobalMutex;  /* With no threads, we must still set the mutex to
+                                        * some non-null value to make the rest of the
+                                        *   (not ifdefed) mutex code think that it is initialized.
+                                        */
+#endif /* ICU_USE_THREADS */
+}
+
 
-       # ifdef POSIX_DEBUG_REENTRANCY
-           gInMutex = FALSE;
-       # endif
-       #ifdef _DEBUG
-           gRecursionCount = 0;
-       #endif
 
-       #ifdef POSIX
-       umtx_raw_init(&gIncDecMutex);
-       #endif
 
+
+U_CAPI void  U_EXPORT2
+umtx_init(UMTX *mutex)
+{
+    if (mutex == NULL || mutex == &gGlobalMutex) {
+        initGlobalMutex();
     } else {
-        /* Not the global mutex.
-         *  Thread safe initialization, using the global mutex.
+        /* 
+         *  Thread safe initialization of mutexes other than the global one,
+         *  using the global mutex.
          */  
         UBool isInitialized; 
         UMTX tMutex = NULL;
@@ -265,7 +331,7 @@
             return;
         }
 
-        tMutex = umtx_raw_init(NULL);
+        umtx_raw_init(&tMutex);
 
         umtx_lock(NULL);
         if (*mutex == NULL) {
@@ -274,121 +340,212 @@
         }
         umtx_unlock(NULL);
         
-        umtx_destroy(&tMutex);  /* NOP if (tmutex == NULL)  */
+        if (tMutex != NULL) {
+            umtx_destroy(&tMutex); 
+        }
     }
-#endif /* ICU_USE_THREADS==1 */
 }
 
+
+/*
+ *  umtx_destroy.    Un-initialize a mutex, releasing any underlying resources
+ *                   that it may be holding.  Destroying an already destroyed
+ *                   mutex has no effect.  Unlike umtx_init(), this function
+ *                   is not thread safe;  two threads must not concurrently try to
+ *                   destroy the same mutex.
+ */                  
 U_CAPI void  U_EXPORT2
 umtx_destroy(UMTX *mutex) {
-#if (ICU_USE_THREADS == 1)
-    if (mutex == NULL) /* destroy the global mutex */
-    {
+    if (mutex == NULL) {  /* destroy the global mutex */
         mutex = &gGlobalMutex;
     }
-
-    if (*mutex == NULL) /* someone already did it. */
+    
+    if (*mutex == NULL) {  /* someone already did it. */
         return;
+    }
 
-#if defined (WIN32)
-    DeleteCriticalSection((CRITICAL_SECTION*)*mutex);
-
-#elif defined (POSIX)
-    pthread_mutex_destroy((pthread_mutex_t*)*mutex);
-
+#if defined (POSIX)
+    /*  The life of the inc/dec mutex for POSIX is tied to that of the global mutex. */
+    if (mutex == &gGlobalMutex) {
+        umtx_destroy(&gIncDecMutex);
+    }
 #endif
 
-    if (*mutex != gGlobalMutex)
-    {
+    if (pMutexDestroyFn != NULL) {
+        (*pMutexDestroyFn)(gMutexContext, mutex);
+        *mutex = NULL;
+    } else {
+#if (ICU_USE_THREADS == 1)
+#if defined (WIN32)
+        DeleteCriticalSection((CRITICAL_SECTION*)*mutex);
         uprv_free(*mutex);
+        *mutex = NULL;
+#elif defined (POSIX)
+        if (*mutex != &gGlobalPosixMutex) {
+            /* Only POSIX mutexes other than the ICU global mutex get destroyed. */
+            pthread_mutex_destroy((pthread_mutex_t*)*mutex);
+            uprv_free(*mutex);
+            *mutex = NULL;
+        } 
+#endif /* chain of platforms */
+#else  /* ICU_USE_THREADS==1 */
+           /* NO ICU Threads.  We still need to zero out the mutex pointer, so that
+            * it appears to be uninitialized     */
+        *mutex = NULL;
+#endif   /* ICU_USE_THREADS */
+        
     }
 
-    *mutex = NULL;
-#endif /* ICU_USE_THREADS==1 */
 }
 
 
-#if (ICU_USE_THREADS == 1) 
 
+U_CAPI void U_EXPORT2 
+u_setMutexFunctions(const void *context, UMtxInitFn *i, UMtxFn *d, UMtxFn *l, UMtxFn *u,
+                    UErrorCode *status) {
+    if (U_FAILURE(*status)) {
+        return;
+    }
 
-/*
- *  umtx_atomic_inc
- *  umtx_atomic_dec
- */
+    /* Can not set a mutex function to a NULL value  */
+    if (i==NULL || d==NULL || l==NULL || u==NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
 
-#if defined (WIN32)
-/*
- * Win32 - use the Windows API functions for atomic increment and decrement.
- */
-U_CAPI int32_t U_EXPORT2
-umtx_atomic_inc(int32_t *p)
-{
-    return InterlockedIncrement(p);
-}
+    /* If ICU is not in an initial state, disallow this operation. */
+    if (cmemory_inUse()) {
+        *status = U_INVALID_STATE_ERROR;
+        return;
+    }
 
-U_CAPI int32_t U_EXPORT2
-umtx_atomic_dec(int32_t *p)
-{
-    return InterlockedDecrement(p); 
+    /* Swap in the mutex function pointers.  */
+    pMutexInitFn    = i;
+    pMutexDestroyFn = d;
+    pMutexLockFn    = l;
+    pMutexUnlockFn  = u;
+    gMutexContext   = context;
+    gGlobalMutex    = NULL;         /* For POSIX, the global mutex will be pre-initialized */
+                                    /*   Undo that, force re-initialization when u_init()  */
+                                    /*   happens.                                          */
 }
 
-#elif defined (POSIX)
-/*
- * POSIX platforms without specific atomic operations.  Use a posix mutex
- *   to protect the increment and decrement.
- *   The IncDecMutex is in static storage so we don't have to come back and delete it
- *   when the process exits.
- */
 
-U_CAPI int32_t U_EXPORT2
-umtx_atomic_inc(int32_t *p)
-{
-    int32_t    retVal;
 
-    pthread_mutex_lock(&gIncDecMutex);
-    retVal = ++(*p);
-    pthread_mutex_unlock(&gIncDecMutex);
-    return retVal;
-}
+/*-----------------------------------------------------------------
+ *
+ *  Atomic Increment and Decrement
+ *     umtx_atomic_inc
+ *     umtx_atomic_dec
+ *
+ *----------------------------------------------------------------*/
+
+/* Pointers to user-supplied inc/dec functions.  Null if no funcs have been set.  */
+static UMtxAtomicFn  *pIncFn = NULL;
+static UMtxAtomicFn  *pDecFn = NULL;
+static void *gIncDecContext  = NULL;
 
 
 U_CAPI int32_t U_EXPORT2
-umtx_atomic_dec(int32_t *p)
-{
-    int32_t    retVal;
+umtx_atomic_inc(int32_t *p)  {
+    int32_t retVal;
+    if (pIncFn) {
+        retVal = (*pIncFn)(gIncDecContext, p);
+    } else {
+        #if defined (WIN32) && ICU_USE_THREADS == 1
+            retVal = InterlockedIncrement(p);
+        #elif defined (POSIX) && ICU_USE_THREADS == 1
+            umtx_lock(&gIncDecMutex);
+            retVal = ++(*p);
+            umtx_unlock(&gIncDecMutex);
+        #else
+            /* Unknown Platform, or ICU thread support compiled out. */
+            retVal = ++(*p);
+        #endif
+    }
+    return retVal;
+}
 
-    pthread_mutex_lock(&gIncDecMutex);
-    retVal = --(*p);
-    pthread_mutex_unlock(&gIncDecMutex);
+U_CAPI int32_t U_EXPORT2
+umtx_atomic_dec(int32_t *p) {
+    int32_t retVal;
+    if (pDecFn) {
+        retVal = (*pDecFn)(gIncDecContext, p);
+    } else {
+        #if defined (WIN32) && ICU_USE_THREADS == 1
+            retVal = InterlockedDecrement(p);
+        #elif defined (POSIX) && ICU_USE_THREADS == 1
+            umtx_lock(&gIncDecMutex);
+            retVal = --(*p);
+            umtx_unlock(&gIncDecMutex);
+        #else
+            /* Unknown Platform, or ICU thread support compiled out. */
+            retVal = --(*p);
+        #endif
+    }
     return retVal;
 }
 
+/* TODO:  Some POSIXy platforms have atomic inc/dec functions available.  Use them. */
 
-#else 
-   
-/* No recognized platform.  */
-#error  No atomic increment and decrement defined for this platform. \
-        Either use the --disable-threads configure option, or define those functions in this file.
 
-#endif   /* Platform selection for atomic_inc and dec. */
 
 
-#else  /* (ICU_USE_THREADS == 0) */
 
-/* Threads disabled here */
+U_CAPI void U_EXPORT2
+u_setAtomicIncDecFunctions(const void *context, UMtxAtomicFn *ip, UMtxAtomicFn *dp,
+                                UErrorCode *status) {
+    int32_t   testInt;
+    if (U_FAILURE(*status)) {
+        return;
+    }
+    /* Can not set an atomic increment/decrement function to a NULL value  */
+    if (ip==NULL || dp==NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    /* If ICU is not in an initial state, disallow this operation. */
+    if (cmemory_inUse()) {
+        *status = U_INVALID_STATE_ERROR;
+        return;
+    }
 
-U_CAPI int32_t U_EXPORT2
-umtx_atomic_inc(int32_t *p) {
-    return ++(*p);
-}
+    pIncFn = ip;
+    pDecFn = dp;
 
-U_CAPI int32_t U_EXPORT2
-umtx_atomic_dec(int32_t *p) {
-    return --(*p);
+    testInt = 0;
+    U_ASSERT(umtx_atomic_inc(&testInt) == 1);     /* Sanity Check.    Do the functions work at all? */
+    U_ASSERT(testInt == 1);
+    U_ASSERT(umtx_atomic_dec(&testInt) == 0);
+    U_ASSERT(testInt == 0);
 }
 
-#endif /* (ICU_USE_THREADS == 1) */
 
 
+/*
+ *  Mutex Cleanup Function
+ *
+ *      Destroy the global mutex(es), and reset the mutex function callback pointers.
+ */
+U_CFUNC UBool umtx_cleanup(void) {
+    umtx_destroy(NULL);
+    pMutexInitFn    = NULL;
+    pMutexDestroyFn = NULL;
+    pMutexLockFn    = NULL;
+    pMutexUnlockFn  = NULL;
+    gMutexContext   = NULL;
+
+    gGlobalMutex    = NULL;
+#if defined (POSIX)
+    gIncDecMutex    = NULL;
+#if (ICU_USE_THREADS == 1)
+    gGlobalMutex    = &gGlobalPosixMutex;
+#endif
+#endif
+    pIncFn         = NULL;
+    pDecFn         = NULL;
+
+    return TRUE;
+}
 
 

Index: umutex.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/umutex.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- umutex.h	10 Sep 2003 02:42:03 -0000	1.3
+++ umutex.h	6 Apr 2004 10:08:05 -0000	1.4
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1997-2001, International Business Machines
+*   Copyright (C) 1997-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
@@ -19,80 +19,79 @@
 #define UMUTEX_H
 
 #include "unicode/utypes.h"
+#include "unicode/uclean.h"  
 
-/**
- * Mutex data type.
- * @internal
- */
-typedef void *UMTX;
 
 /* APP_NO_THREADS is an old symbol. We'll honour it if present. */
 #ifdef APP_NO_THREADS
 # define ICU_USE_THREADS 0
 #endif
 
-/* Default: use threads. */
+/* ICU_USE_THREADS
+ *
+ *   Allows thread support (use of mutexes) to be compiled out of ICU.
+ *   Default: use threads.
+ *   Even with thread support compiled out, applications may override the
+ *   (empty) mutex implementation with the u_setMutexFunctions() functions.
+ */ 
 #ifndef ICU_USE_THREADS
 # define ICU_USE_THREADS 1
 #endif
 
 /*
- * Code within this library which accesses protected data should
- * instantiate a Mutex object while doing so.  Notice that there is
- * only one coarse-grained lock which applies to this entire library,
- * so keep locking short and sweet.
+ * Code within ICU that accesses shared static or global data should
+ * instantiate a Mutex object while doing so.  The unnamed global mutex
+ * is used throughout ICU, so keep locking short and sweet.
  *
  * For example:
  *
  * void Function(int arg1, int arg2)
  * {
- *   static Object* foo; // Shared read-write object
- *   Mutex mutex;
+ *   static Object* foo;     // Shared read-write object
+ *   umtx_lock(NULL);        // Lock the ICU global mutex
  *   foo->Method();
- *   // When 'mutex' goes out of scope and gets destroyed here
- *   // the lock is released
+ *   umtx_unlock(NULL);
  * }
  *
- * Note: Do NOT use the form 'Mutex mutex();' as that merely
- * forward-declares a function returning a Mutex. This is a common
- * mistake which silently slips through the compiler!!  */
-
+ * An alternative C++ mutex API is defined in the file common/mutex.h
+ */
 
-/* Lock a mutex. Pass in NULL if you want the (ick) Single Global
-   Mutex. 
- * @param mutex The given mutex to be locked
+/* Lock a mutex. 
+ * @param mutex The given mutex to be locked.  Pass NULL to specify
+ *              the global ICU mutex.  Recursive locks are an error
+ *              and may cause a deadlock on some platforms.
  */
 U_CAPI void U_EXPORT2 umtx_lock   ( UMTX* mutex ); 
 
 /* Unlock a mutex. Pass in NULL if you want the single global
    mutex. 
- * @param mutex The given mutex to be unlocked
+ * @param mutex The given mutex to be unlocked.  Pass NULL to specify
+ *              the global ICU mutex.
  */
 U_CAPI void U_EXPORT2 umtx_unlock ( UMTX* mutex );
 
 /* Initialize a mutex. Use it this way:
    umtx_init( &aMutex ); 
- * ICU Mutexes, aside from the global mutex, must be explicitly initialized
- * before use.
+ * ICU Mutexes do not need explicit initialization before use.  Use of this
+ *   function is not necessary.
+ * Initialization of an already initialized mutex has no effect, and is safe to do.
+ * Initialization of mutexes is thread safe.  Two threads can concurrently 
+ *   initialize the same mutex without causing problems.
  * @param mutex The given mutex to be initialized
  */
 U_CAPI void U_EXPORT2 umtx_init   ( UMTX* mutex );
 
 /* Destroy a mutex. This will free the resources of a mutex.
-   Use it this way:
-   umtx_destroy( &aMutex ); 
- * @param mutex The given mutex to be destroyed
+ * Use it this way:
+ *   umtx_destroy( &aMutex ); 
+ * Destroying an already destroyed mutex has no effect, and causes no problems.
+ * This function is not thread safe.  Two threads must not attempt to concurrently
+ *   destroy the same mutex.
+ * @param mutex The given mutex to be destroyed.
  */
 U_CAPI void U_EXPORT2 umtx_destroy( UMTX *mutex );
 
-/* Is a mutex initialized? 
-   Use it this way:
-      umtx_isInitialized( &aMutex ); 
-   This function is not normally needed.  It is more efficient to 
-   unconditionally call umtx_init(&aMutex) than it is to check first. 
- * @param mutex The given mutex to be tested
-*/
-U_CAPI UBool U_EXPORT2 umtx_isInitialized( UMTX *mutex );
+
 
 /*
  * Atomic Increment and Decrement of an int32_t value.

Index: unames.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unames.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- unames.c	10 Sep 2003 02:42:03 -0000	1.5
+++ unames.c	6 Apr 2004 10:08:05 -0000	1.6
@@ -1,4 +1,3 @@
-
 /*
 ******************************************************************************
 *
@@ -30,6 +29,7 @@
 #include "cstring.h"
 #include "ucln_cmn.h"
 #include "uprops.h"
+#include "udataswp.h"
 
 /* prototypes ------------------------------------------------------------- */
[...1622 lines suppressed...]
+                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
+                        --stringsCount;
+                    }
+                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
+                }
+                break;
+            default:
+                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
+                                 inRange->type, i);
+                *pErrorCode=U_UNSUPPORTED_ERROR;
+                return 0;
+            }
+        }
+    }
+
+    return headerSize+(int32_t)offset;
+}
 
 /*
  * Hey, Emacs, please set the following:

Index: unifilt.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unifilt.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- unifilt.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ unifilt.cpp	6 Apr 2004 10:08:05 -0000	1.2
@@ -1,5 +1,7 @@
 /*
-* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+* Copyright (c) 2001-2003, International Business Machines
+* Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/18/01    aliu        Creation.
@@ -11,7 +13,11 @@
 
 U_NAMESPACE_BEGIN
 
-const char UnicodeFilter::fgClassID=0;
+/* Define this here due to the lack of another file.
+   It can't be defined in the header */
+UnicodeMatcher::~UnicodeMatcher() {}
+
+UnicodeFilter::~UnicodeFilter() {}
 
 /**
  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
@@ -20,6 +26,8 @@
 UnicodeMatcher* UnicodeFilter::toMatcher() const {
     return (UnicodeMatcher*) this;
 }
+
+void UnicodeFilter::setData(const TransliterationRuleData*) {}
 
 /**
  * Default implementation of UnicodeMatcher::matches() for Unicode

Index: unifunct.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unifunct.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- unifunct.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ unifunct.cpp	6 Apr 2004 10:08:05 -0000	1.2
@@ -1,19 +1,15 @@
 /*
 **********************************************************************
-* Copyright (c) 2002, International Business Machines
+* Copyright (c) 2002-2003, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
-* $Source$ 
-* $Date$ 
-* $Revision$
-**********************************************************************
 */
 
 #include "unicode/unifunct.h"
 
 U_NAMESPACE_BEGIN
 
-const char UnicodeFunctor::fgClassID = 0;
+UnicodeFunctor::~UnicodeFunctor() {}
 
 UnicodeMatcher* UnicodeFunctor::toMatcher() const {
     return 0;

Index: uniset.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uniset.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uniset.cpp	10 Sep 2003 02:42:03 -0000	1.1
+++ uniset.cpp	6 Apr 2004 10:08:05 -0000	1.2
@@ -12,7 +12,8 @@
 #include "unicode/parsepos.h"
 #include "unicode/uchar.h"
 #include "unicode/uscript.h"
-#include "symtable.h"
+#include "unicode/symtable.h"
+#include "ruleiter.h"
 #include "cmemory.h"
 #include "uhash.h"
 #include "util.h"
@@ -55,11 +56,12 @@
 #define UPPER_N         ((UChar)78)     /*N*/
[...1120 lines suppressed...]
+        // syntaxError(chars, "Invalid property pattern");
+        ec = U_MALFORMED_SET;
+        return;
+    }
+    chars.jumpahead(pos.getIndex());
+    rebuiltPat.append(pattern, 0, pos.getIndex());
+}
+
 //----------------------------------------------------------------
 // Inclusions list
 //----------------------------------------------------------------
@@ -3540,7 +3513,7 @@
     int32_t x;
     do {
         x = (low + high) >> 1;
-        UChar ch = CASE_PAIRS[x << 1];
+        UChar ch = CASE_PAIRS[(uint32_t)(x << 1)];
         if (folded < ch) {
             high = x - 1;
         } else if (folded > ch) {

Index: unistr.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unistr.cpp,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- unistr.cpp	10 Sep 2003 02:42:04 -0000	1.5
+++ unistr.cpp	6 Apr 2004 10:08:05 -0000	1.6
@@ -101,7 +101,19 @@
 
 U_NAMESPACE_BEGIN
 
-const char UnicodeString::fgClassID=0;
+/* The Replaceable virtual destructor can't be defined in the header
+   due to how AIX works with multiple definitions of virtual functions.
+*/
+Replaceable::~Replaceable() {}
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
+
+UnicodeString
+operator+ (const UnicodeString &s1, const UnicodeString &s2) {
+    return
+        UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
+            append(s1).
+                append(s2);
+}
 
 //========================================
 // Reference Counting functions, put at top of file so that optimizing compilers
@@ -272,7 +284,7 @@
     fCapacity = US_STACKBUF_SIZE;
     fArray = fStackBuffer;
     fFlags = kShortString;
-  } else if(buffLength < -1 || buffLength > buffCapacity) {
+  } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
     setToBogus();
   } else if(buffLength == -1) {
     // fLength = u_strlen(buff); but do not look beyond buffCapacity
@@ -1041,9 +1053,16 @@
     return *this;
   }
 
-  if(buffLength < 0 || buffLength > buffCapacity) {
+  if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
     setToBogus();
     return *this;
+  } else if(buffLength == -1) {
+    // buffLength = u_strlen(buff); but do not look beyond buffCapacity
+    const UChar *p = buffer, *limit = buffer + buffCapacity;
+    while(p != limit && *p != 0) {
+      ++p;
+    }
+    buffLength = (int32_t)(p - buffer);
   }
 
   releaseArray();
@@ -1161,18 +1180,19 @@
   UBreakIterator *cTitleIter = 0;
 
   if(toWhichCase == TO_TITLE) {
+    errorCode = U_ZERO_ERROR;
     if(titleIter != 0) {
       cTitleIter = (UBreakIterator *)titleIter;
+      ubrk_setText(cTitleIter, oldArray, oldLength, &errorCode);
     } else {
-      errorCode = U_ZERO_ERROR;
       cTitleIter = ubrk_open(UBRK_WORD, locale.getName(),
                              oldArray, oldLength,
                              &errorCode);
-      if(U_FAILURE(errorCode)) {
-        uprv_free(bufferToDelete);
-        setToBogus();
-        return *this;
-      }
+    }
+    if(U_FAILURE(errorCode)) {
+      uprv_free(bufferToDelete);
+      setToBogus();
+      return *this;
     }
   }
 #endif
@@ -1657,8 +1677,11 @@
                 const char *codepage)
 {
   // if there's nothing to convert, do nothing
-  if(codepageData == 0 || dataLength <= 0) {
+  if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
     return;
+  }
+  if(dataLength == -1) {
+    dataLength = uprv_strlen(codepageData);
   }
 
   UErrorCode status = U_ZERO_ERROR;

Index: unorm.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unorm.cpp,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- unorm.cpp	10 Sep 2003 02:42:04 -0000	1.5
+++ unorm.cpp	6 Apr 2004 10:08:05 -0000	1.6
@@ -41,6 +41,7 @@
 #include "umutex.h"
 #include "utrie.h"
 #include "unicode/uset.h"
+#include "udataswp.h"
 
 /*
  * Status of tailored normalization
@@ -148,23 +149,6 @@
     return norm32<_NORM_JAMO_V_TOP;
 }
 
[...1761 lines suppressed...]
+            offset+=count;
+        }
+
+        /* swap the aux UTrie */
+        count=indexes[_NORM_INDEX_AUX_TRIE_SIZE];
+        if(count!=0) {
+            utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+            offset+=count;
+        }
+
+        /* swap the uint16_t combiningTable[] */
+        count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
+        ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+        offset+=count;
+    }
+
+    return headerSize+size;
 }
 
 #endif /* #if !UCONFIG_NO_NORMALIZATION */

Index: unormimp.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/unormimp.h,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- unormimp.h	10 Sep 2003 02:42:04 -0000	1.5
+++ unormimp.h	6 Apr 2004 10:08:06 -0000	1.6
@@ -26,6 +26,7 @@
 #include "unicode/uset.h"
 #include "utrie.h"
 #include "ustr_imp.h"
+#include "udataswp.h"
 
 /*
  * This new implementation of the normalization code loads its data from
@@ -373,6 +374,15 @@
 unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode);
 
 /**
+ * Swap unorm.icu. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+unorm_swap(const UDataSwapper *ds,
+           const void *inData, int32_t length, void *outData,
+           UErrorCode *pErrorCode);
+
+/**
  * Description of the format of unorm.dat version 2.2.
  *
  * Main change from version 1 to version 2:
@@ -391,7 +401,7 @@
  * unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c.
  * After that there are the following structures:
  *
- * uint16_t indexes[_NORM_INDEX_TOP];           -- _NORM_INDEX_TOP=32, see enum in this file
+ * int32_t indexes[_NORM_INDEX_TOP];            -- _NORM_INDEX_TOP=32, see enum in this file
  *
  * UTrie normTrie;                              -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
  * 

Index: uobject.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uobject.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uobject.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ uobject.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2002, International Business Machines
+*   Copyright (C) 2002-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -15,13 +15,12 @@
 */
 
 #include "unicode/uobject.h"
-
-#if U_OVERRIDE_CXX_ALLOCATION
-
 #include "cmemory.h"
 
 U_NAMESPACE_BEGIN
 
+#if U_OVERRIDE_CXX_ALLOCATION
+
 /*
  * Default implementation of UMemory::new/delete
  * using uprv_malloc() and uprv_free().
@@ -77,7 +76,10 @@
     }
 }
 
+#endif
+
+UObject::~UObject() {}
+
 U_NAMESPACE_END
 
-#endif
 

Index: uprops.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uprops.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uprops.c	10 Sep 2003 02:42:04 -0000	1.1
+++ uprops.c	6 Apr 2004 10:08:06 -0000	1.2
@@ -26,51 +26,118 @@
 
 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
 
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+/**
+ * Get the next non-ignorable ASCII character from a property name
+ * and lowercase it.
+ * @return ((advance count for the name)<<8)|character
+ */
+static U_INLINE int32_t
+getASCIIPropertyNameChar(const char *name) {
+    int32_t i;
+    char c;
+
+    /* Ignore delimiters '-', '_', and ASCII White_Space */
+    for(i=0;
+        (c=name[i++])==0x2d || c==0x5f ||
+        c==0x20 || (0x09<=c && c<=0x0d);
+    ) {}
+
+    if(c!=0) {
+        return (i<<8)|(uint8_t)uprv_asciitolower((char)c);
+    } else {
+        return i<<8;
+    }
+}
+
+/**
+ * Get the next non-ignorable EBCDIC character from a property name
+ * and lowercase it.
+ * @return ((advance count for the name)<<8)|character
+ */
+static U_INLINE int32_t
+getEBCDICPropertyNameChar(const char *name) {
+    int32_t i;
+    char c;
+
+    /* Ignore delimiters '-', '_', and EBCDIC White_Space */
+    for(i=0;
+        (c=name[i++])==0x60 || c==0x6d ||
+        c==0x40 || c==0x05 || c==0x15 || c==0x25 || c==0x0b || c==0x0c || c==0x0d;
+    ) {}
+
+    if(c!=0) {
+        return (i<<8)|(uint8_t)uprv_ebcdictolower((char)c);
+    } else {
+        return i<<8;
+    }
+}
+
 /**
  * Unicode property names and property value names are compared
  * "loosely". Property[Value]Aliases.txt say:
  *   "With loose matching of property names, the case distinctions, whitespace,
  *    and '_' are ignored."
  *
- * This function does just that, for ASCII (char *) name strings.
+ * This function does just that, for (char *) name strings.
  * It is almost identical to ucnv_compareNames() but also ignores
- * ASCII White_Space characters (U+0009..U+000d).
+ * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
  *
  * @internal
  */
+
 U_CAPI int32_t U_EXPORT2
-uprv_comparePropertyNames(const char *name1, const char *name2) {
-    int32_t rc;
-    unsigned char c1, c2;
+uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
+    int32_t rc, r1, r2;
 
     for(;;) {
-        /* Ignore delimiters '-', '_', and ASCII White_Space */
-        while((c1=(unsigned char)*name1)=='-' || c1=='_' ||
-              c1==' ' || c1=='\t' || c1=='\n' || c1=='\v' || c1=='\f' || c1=='\r'
-        ) {
-            ++name1;
+        r1=getASCIIPropertyNameChar(name1);
+        r2=getASCIIPropertyNameChar(name2);
+
+        /* If we reach the ends of both strings then they match */
+        if(((r1|r2)&0xff)==0) {
+            return 0;
         }
-        while((c2=(unsigned char)*name2)=='-' || c2=='_' ||
-              c2==' ' || c2=='\t' || c2=='\n' || c2=='\v' || c2=='\f' || c2=='\r'
-        ) {
-            ++name2;
+        
+        /* Compare the lowercased characters */
+        if(r1!=r2) {
+            rc=(r1&0xff)-(r2&0xff);
+            if(rc!=0) {
+                return rc;
+            }
         }
 
+        name1+=r1>>8;
+        name2+=r2>>8;
+    }
+}
+
+U_CAPI int32_t U_EXPORT2
+uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
+    int32_t rc, r1, r2;
+
+    for(;;) {
+        r1=getEBCDICPropertyNameChar(name1);
+        r2=getEBCDICPropertyNameChar(name2);
+
         /* If we reach the ends of both strings then they match */
-        if((c1|c2)==0) {
+        if(((r1|r2)&0xff)==0) {
             return 0;
         }
         
-        /* Case-insensitive comparison */
-        if(c1!=c2) {
-            rc=(int32_t)(unsigned char)uprv_tolower(c1)-(int32_t)(unsigned char)uprv_tolower(c2);
+        /* Compare the lowercased characters */
+        if(r1!=r2) {
+            rc=(r1&0xff)-(r2&0xff);
             if(rc!=0) {
                 return rc;
             }
         }
 
-        ++name1;
-        ++name2;
+        name1+=r1>>8;
+        name2+=r2>>8;
     }
 }
 
@@ -190,10 +257,14 @@
 
 U_CAPI UBool U_EXPORT2
 uprv_isRuleWhiteSpace(UChar32 c) {
-    /* "white space" in the sense of ICU rule parsers: Cf+White_Space */
-    return
-        u_charType(c)==U_FORMAT_CHAR ||
-        u_hasBinaryProperty(c, UCHAR_WHITE_SPACE);
+    /* "white space" in the sense of ICU rule parsers
+       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
+       See UTR #31: http://www.unicode.org/reports/tr31/.
+       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
+    */
+    return (c >= 0x0009 && c <= 0x2029 &&
+            (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
+             c == 0x200E || c == 0x200F || c >= 0x2028));
 }
 
 static const UChar _PATTERN[] = {
@@ -429,6 +500,20 @@
  * Do not use a UnicodeSet pattern because that causes infinite recursion;
  * UnicodeSet depends on the inclusions set.
  */
+#ifdef DEBUG 
+static uint32_t 
+strrch(const char* source,uint32_t sourceLen,char find){
+    const char* tSourceEnd =source + (sourceLen-1);
+    while(tSourceEnd>= source){
+        if(*tSourceEnd==find){
+            return (uint32_t)(tSourceEnd-source);
+        }
+        tSourceEnd--;
+    }
+    return (uint32_t)(tSourceEnd-source);
+}
+#endif
+
 U_CAPI void U_EXPORT2
 uprv_getInclusions(USet* set, UErrorCode *pErrorCode) {
     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
@@ -441,4 +526,64 @@
     unorm_addPropertyStarts(set, pErrorCode);
 #endif
     uchar_addPropertyStarts(set, pErrorCode);
+
+#ifdef DEBUG
+    {
+        UChar* result=NULL;
+        int32_t resultCapacity=0;
+        int32_t bufLen = uset_toPattern(set,result,resultCapacity,TRUE,pErrorCode);
+        char* resultChars = NULL;
+        if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR){
+            uint32_t len = 0, add=0;
+            char *buf=NULL, *current = NULL;
+            *pErrorCode = U_ZERO_ERROR;
+            resultCapacity = bufLen;
+            result = (UChar*) uprv_malloc(resultCapacity * U_SIZEOF_UCHAR);
+            bufLen = uset_toPattern(set,result,resultCapacity,TRUE,pErrorCode);
+            resultChars = (char*) uprv_malloc(len+1);
+            u_UCharsToChars(result,resultChars,bufLen);
+            resultChars[bufLen] = 0;
+            buf = resultChars;
+            /*printf(resultChars);*/
+             while(len < bufLen){
+                    add = 70-5/* for ", +\n */;
+                    current = buf +len;
+                    if (add < (bufLen-len)) {
+                        uint32_t index = strrch(current,add,'\\');
+                        if (index > add) {
+                            index = add;
+                        } else {
+                            int32_t num =index-1;
+                            uint32_t seqLen;
+                            while(num>0){
+                                if(current[num]=='\\'){
+                                    num--;
+                                }else{
+                                    break;
+                                }
+                            }
+                            if ((index-num)%2==0) {
+                                index--;
+                            }
+                            seqLen = (current[index+1]=='u') ? 6 : 2;
+                            if ((add-index) < seqLen) {
+                                add = index + seqLen;
+                            }
+                        }
+                    }
+                    fwrite("\"",1,1,stdout);
+                    if(len+add<bufLen){
+                        fwrite(current,1,add,stdout);
+                        fwrite("\" +\n",1,4,stdout);
+                    }else{
+                        fwrite(current,1,bufLen-len,stdout);
+                    }
+                    len+=add;
+                }
+
+        }
+        uprv_free(result);
+        uprv_free(resultChars);
+    }
+#endif
 }

Index: uprops.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uprops.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uprops.h	10 Sep 2003 02:42:04 -0000	1.1
+++ uprops.h	6 Apr 2004 10:08:06 -0000	1.2
@@ -22,6 +22,7 @@
 
 #include "unicode/utypes.h"
 #include "unicode/uset.h"
+#include "udataswp.h"
 
 /* indexes[] entries */
 enum {
@@ -201,19 +202,32 @@
 uprv_getMaxValues(int32_t column);
 
 /**
+ * \var uprv_comparePropertyNames
  * Unicode property names and property value names are compared
  * "loosely". Property[Value]Aliases.txt say:
  *   "With loose matching of property names, the case distinctions, whitespace,
  *    and '_' are ignored."
  *
- * This function does just that, for ASCII (char *) name strings.
+ * This function does just that, for (char *) name strings.
  * It is almost identical to ucnv_compareNames() but also ignores
- * ASCII White_Space characters (U+0009..U+000d).
+ * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
  *
  * @internal
  */
+
 U_CAPI int32_t U_EXPORT2
-uprv_comparePropertyNames(const char *name1, const char *name2);
+uprv_compareASCIIPropertyNames(const char *name1, const char *name2);
+
+U_CAPI int32_t U_EXPORT2
+uprv_compareEBCDICPropertyNames(const char *name1, const char *name2);
+
+#if U_CHARSET_FAMILY==U_ASCII_FAMILY
+#   define uprv_comparePropertyNames uprv_compareASCIIPropertyNames
+#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
+#   define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames
+#else
+#   error U_CHARSET_FAMILY is not valid
+#endif
 
 /** Turn a bit index into a bit flag. @internal */
 #define FLAG(n) ((uint32_t)1<<(n))
@@ -353,5 +367,23 @@
  */
 U_CAPI void U_EXPORT2
 uprv_getInclusions(USet* set, UErrorCode *pErrorCode);
+
+/**
+ * Swap the ICU Unicode properties file. See uchar.c.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+uprops_swap(const UDataSwapper *ds,
+            const void *inData, int32_t length, void *outData,
+            UErrorCode *pErrorCode);
+
+/**
+ * Swap the ICU Unicode character names file. See uchar.c.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+uchar_swapNames(const UDataSwapper *ds,
+                const void *inData, int32_t length, void *outData,
+                UErrorCode *pErrorCode);
 
 #endif

Index: uresbund.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uresbund.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uresbund.c	10 Sep 2003 02:42:04 -0000	1.4
+++ uresbund.c	6 Apr 2004 10:08:06 -0000	1.5
@@ -20,8 +20,9 @@
 */
 
 #include "unicode/ustring.h"
-
+#include "unicode/ucnv.h"
 #include "uresimp.h"
+#include "ustr_imp.h"
 #include "cwchar.h"
 #include "ucln_cmn.h"
 #include "cmemory.h"
@@ -216,9 +217,11 @@
         if (cache != NULL && uhash_count(cache) == 0) {
             uhash_close(cache);
             cache = NULL;
-            umtx_destroy(&resbMutex);
         }
     }
+    if (cache == NULL && resbMutex != NULL) {
+        umtx_destroy(&resbMutex);
+    }
     return (cache == NULL);
 }
 
@@ -569,21 +572,54 @@
         const UChar *alias = res_getAlias(rdata, r, &len);   
         if(len > 0) {
           /* we have an alias, now let's cut it up */
+          char stackAlias[200];
           char *chAlias = NULL, *path = NULL, *locale = NULL, *keyPath = NULL;
-          chAlias = (char *)uprv_malloc((len+1)*sizeof(char));
-          /* test for NULL */
-          if(chAlias == NULL) {
-            *status = U_MEMORY_ALLOCATION_ERROR;
-            return NULL;
+          int32_t capacity;
+
+          /*
+           * Allocate enough space for both the char * version
+           * of the alias and parent->fResPath.
+           *
+           * We do this so that res_findResource() can modify the path,
+           * which allows us to remove redundant _res_findResource() variants
+           * in uresdata.c.
+           * res_findResource() now NUL-terminates each segment so that table keys
+           * can always be compared with strcmp() instead of strncmp().
+           * Saves code there and simplifies testing and code coverage.
+           *
+           * markus 2003oct17
+           */
+          ++len; /* count the terminating NUL */
+          if(parent != NULL && parent->fResPath != NULL) {
+            capacity = uprv_strlen(parent->fResPath) + 1;
+          } else {
+            capacity = 0;
+          }
+          if(capacity < len) {
+            capacity = len;
+          }
+          if(capacity <= sizeof(stackAlias)) {
+            capacity = sizeof(stackAlias);
+            chAlias = stackAlias;
+          } else {
+            chAlias = (char *)uprv_malloc(capacity);
+            /* test for NULL */
+            if(chAlias == NULL) {
+              *status = U_MEMORY_ALLOCATION_ERROR;
+              return NULL;
             }
+          }
           u_UCharsToChars(alias, chAlias, len);
-          chAlias[len] = 0;
 
           if(*chAlias == RES_PATH_SEPARATOR) {
             /* there is a path included */
             locale = uprv_strchr(chAlias+1, RES_PATH_SEPARATOR);
-            *locale = 0;
-            locale++;
+            if(locale == NULL) {
+                locale = uprv_strchr(chAlias, 0); /* avoid locale == NULL to make code below work */
+            } else {
+                *locale = 0;
+                locale++;
+            }
             path = chAlias+1;
             if(uprv_strcmp(path, "ICUDATA") == 0) { /* want ICU data */
               path = NULL;
@@ -614,23 +650,40 @@
                 /* first, we are going to get a corresponding parent 
                  * resource to the one we are searching.
                  */
-                const char* aKey = parent->fResPath;
+                char *aKey = parent->fResPath;
                 if(aKey) {
+                  uprv_strcpy(chAlias, aKey); /* allocated large enough above */
+                  aKey = chAlias;
                   r = res_findResource(&(mainRes->fResData), mainRes->fRes, &aKey, &temp);
                 } else {
                   r = mainRes->fRes;
                 }
                 if(key) {
-                /* we need to make keyPath from parents fResPath and 
-                 * current key, if there is a key associated
-                 */
-                  aKey = key;
+                  /* we need to make keyPath from parent's fResPath and
+                   * current key, if there is a key associated
+                   */
+                  len = uprv_strlen(key) + 1;
+                  if(len > capacity) {
+                    capacity = len;
+                    if(chAlias == stackAlias) {
+                      chAlias = (char *)uprv_malloc(capacity);
+                    } else {
+                      chAlias = (char *)uprv_realloc(chAlias, capacity);
+                    }
+                    if(chAlias == NULL) {
+                      ures_close(mainRes);
+                      *status = U_MEMORY_ALLOCATION_ERROR;
+                      return NULL;
+                    }
+                  }
+                  uprv_memcpy(chAlias, key, len);
+                  aKey = chAlias;
                   r = res_findResource(&(mainRes->fResData), r, &aKey, &temp);
                 } else if(index != -1) {
                 /* if there is no key, but there is an index, try to get by the index */
                 /* here we have either a table or an array, so get the element */
-                  if(RES_GET_TYPE(r) == URES_TABLE) {
-                    r = res_getTableItemByIndex(&(mainRes->fResData), r, index, &aKey);
+                  if(RES_GET_TYPE(r) == URES_TABLE || RES_GET_TYPE(r) == URES_TABLE32) {
+                    r = res_getTableItemByIndex(&(mainRes->fResData), r, index, (const char **)&aKey);
                   } else { /* array */
                     r = res_getArrayItem(&(mainRes->fResData), r, index);
                   }
@@ -651,7 +704,7 @@
                  */
                 result = mainRes;
                 while(*keyPath && U_SUCCESS(*status)) {
-                  r = res_findResource(&(result->fResData), result->fRes, (const char**)&keyPath, &temp);
+                  r = res_findResource(&(result->fResData), result->fRes, &keyPath, &temp);
                   if(r == RES_BOGUS) {
                     *status = U_MISSING_RESOURCE_ERROR;
                     result = resB;
@@ -664,8 +717,12 @@
             } else { /* we failed to open the resource we're aliasing to */
               *status = intStatus;
             }
-            uprv_free(chAlias);
-            ures_close(mainRes);
+            if(chAlias != stackAlias) {
+              uprv_free(chAlias);
+            }
+            if(mainRes != result) {
+              ures_close(mainRes);
+            }
             return result;
           }
         } else {
@@ -687,6 +744,7 @@
         }
         ures_setIsStackObject(resB, FALSE);
         resB->fResPath = NULL;
+        resB->fResPathLen = 0;
     } else {
         if(resB->fData != NULL) {
             entryClose(resB->fData);
@@ -697,6 +755,7 @@
         if(ures_isStackObject(resB) != FALSE) {
             ures_initStackObject(resB);
         }
+        ures_freeResPath(resB);
     }
     resB->fData = realData;
     entryIncrease(resB->fData);
@@ -704,7 +763,8 @@
     resB->fIsTopLevel = FALSE;
     resB->fIndex = -1;
     resB->fKey = key;
-    ures_freeResPath(resB);
+    resB->fParentRes = parent;
+    resB->fTopLevelData = parent->fTopLevelData;
     if(parent->fResPath) {
       ures_appendResPath(resB, parent->fResPath, parent->fResPathLen);
     }
@@ -759,6 +819,7 @@
         }
         uprv_memcpy(r, original, sizeof(UResourceBundle));
         r->fResPath = NULL;
+        r->fResPathLen = 0;
         if(original->fResPath) {
           ures_appendResPath(r, original->fResPath, original->fResPathLen);
         }
@@ -794,6 +855,7 @@
         case URES_BINARY:
         case URES_ARRAY:
         case URES_TABLE:
+        case URES_TABLE32:
         default:
             *status = U_RESOURCE_TYPE_MISMATCH;
     }
@@ -818,6 +880,7 @@
   case URES_INT_VECTOR:
   case URES_ARRAY:
   case URES_TABLE:
+  case URES_TABLE32:
   default:
     *status = U_RESOURCE_TYPE_MISMATCH;
   }
@@ -842,6 +905,7 @@
   case URES_ARRAY:
   case URES_BINARY:
   case URES_TABLE:
+  case URES_TABLE32:
   default:
     *status = U_RESOURCE_TYPE_MISMATCH;
   }
@@ -883,10 +947,13 @@
 
 
 U_CAPI UResType U_EXPORT2 ures_getType(UResourceBundle *resB) {
+  UResType type;
+
   if(resB == NULL) {
     return URES_NONE;
   }
-  return (UResType) (RES_GET_TYPE(resB->fRes));
+  type = (UResType) RES_GET_TYPE(resB->fRes);
+  return type == URES_TABLE32 ? URES_TABLE : type;
 }
 
 U_CAPI const char * U_EXPORT2 ures_getKey(UResourceBundle *resB) {
@@ -952,6 +1019,7 @@
     case URES_STRING:
       return res_getString(&(resB->fResData), resB->fRes, len); 
     case URES_TABLE:
+    case URES_TABLE32:
       r = res_getTableItemByIndex(&(resB->fResData), resB->fRes, resB->fIndex, key);
       if(r == RES_BOGUS && resB->fHasFallback) {
         /* TODO: do the fallback */
@@ -1000,6 +1068,7 @@
         case URES_STRING:
             return ures_copyResb(fillIn, resB, status);
         case URES_TABLE:
+        case URES_TABLE32:
             r = res_getTableItemByIndex(&(resB->fResData), resB->fRes, resB->fIndex, &key);
             if(r == RES_BOGUS && resB->fHasFallback) {
                 /* TODO: do the fallback */
@@ -1042,6 +1111,7 @@
         case URES_STRING:
             return ures_copyResb(fillIn, resB, status);
         case URES_TABLE:
+        case URES_TABLE32:
             r = res_getTableItemByIndex(&(resB->fResData), resB->fRes, indexR, &key);
             if(r == RES_BOGUS && resB->fHasFallback) {
                 /* TODO: do the fallback */
@@ -1084,6 +1154,7 @@
         case URES_STRING:
             return res_getString(&(resB->fResData), resB->fRes, len);
         case URES_TABLE:
+        case URES_TABLE32:
             r = res_getTableItemByIndex(&(resB->fResData), resB->fRes, indexS, &key);
             if(r == RES_BOGUS && resB->fHasFallback) {
                 /* TODO: do the fallback */
@@ -1120,16 +1191,21 @@
   char *packageName = NULL;
   char *pathToResource = NULL;
   char *locale = NULL, *localeEnd = NULL;
+  int32_t length;
+
   if(status == NULL || U_FAILURE(*status)) {
     return result;
   }
-  pathToResource = (char *)uprv_malloc((uprv_strlen(path)+1)*sizeof(char));
+
+  length = uprv_strlen(path)+1;
+  pathToResource = (char *)uprv_malloc(length*sizeof(char));
   /* test for NULL */
   if(pathToResource == NULL) {
     *status = U_MEMORY_ALLOCATION_ERROR;
     return result;
-    }
-  uprv_strcpy(pathToResource, path);
+  }
+  uprv_memcpy(pathToResource, path, length);
+
   locale = pathToResource;
   if(*pathToResource == RES_PATH_SEPARATOR) { /* there is a path specification */
     pathToResource++;
@@ -1143,7 +1219,7 @@
     }
   }
 
-  localeEnd = strchr(locale, RES_PATH_SEPARATOR);
+  localeEnd = uprv_strchr(locale, RES_PATH_SEPARATOR);
   if(localeEnd != NULL) {
     *localeEnd = 0;
   }
@@ -1163,11 +1239,10 @@
 }
 
 U_CAPI UResourceBundle* U_EXPORT2
-ures_findSubResource(const UResourceBundle *resB, const char* path, UResourceBundle *fillIn, UErrorCode *status) 
+ures_findSubResource(const UResourceBundle *resB, char* path, UResourceBundle *fillIn, UErrorCode *status) 
 {
   Resource res = RES_BOGUS;
   UResourceBundle *result = fillIn;
-  const char *pathToResource = path;
   const char *key;
 
   if(status == NULL || U_FAILURE(*status)) {
@@ -1176,7 +1251,7 @@
 
   /* here we do looping and circular alias checking */
 
-  res = res_findResource(&(resB->fResData), resB->fRes, &pathToResource, &key); 
+  res = res_findResource(&(resB->fResData), resB->fRes, &path, &key); 
 
   if(res != RES_BOGUS) {
     result = init_resb_result(&(resB->fResData), res, key, -1, resB->fData, resB, 0, fillIn, status);
@@ -1187,6 +1262,61 @@
   return result;
 }
 
+U_CAPI UResourceBundle* U_EXPORT2 
+ures_getByKeyWithFallback(const UResourceBundle *resB, 
+                          const char* inKey, 
+                          UResourceBundle *fillIn, 
+                          UErrorCode *status) {
+    Resource res = RES_BOGUS;
+    /*UResourceDataEntry *realData = NULL;*/
+    const char *key = inKey;
+
+    if (status==NULL || U_FAILURE(*status)) {
+        return fillIn;
+    }
+    if(resB == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return fillIn;
+    }
+
+    if(RES_GET_TYPE(resB->fRes) == URES_TABLE || RES_GET_TYPE(resB->fRes) == URES_TABLE32) {
+        int32_t t;
+        res = res_getTableItemByKey(&(resB->fResData), resB->fRes, &t, &key);
+        if(res == RES_BOGUS) {
+            int32_t i = 0;
+            UResourceDataEntry *dataEntry = resB->fData;
+            char path[256];
+            char* myPath = path;
+
+            while(res == RES_BOGUS && dataEntry->fParent != NULL) { /* Otherwise, we'll look in parents */
+                dataEntry = dataEntry->fParent;
+                if(dataEntry->fBogus == U_ZERO_ERROR) {
+                  uprv_strncpy(path, resB->fResPath, resB->fResPathLen);
+                  uprv_strcpy(path+resB->fResPathLen, inKey);
+                  myPath = path;
+                  key = inKey;
+                  i++;
+                  res = res_findResource(&(dataEntry->fData), dataEntry->fData.rootRes, &myPath, &key); 
+                  /*res = res_getTableItemByKey(&(resB->fData), resB->fData.rootRes, &indexR, resTag);*/
+                }
+            }
+            /*const ResourceData *rd = getFallbackData(resB, &key, &realData, &res, status);*/
+            if(res != RES_BOGUS) {
+              /* check if resB->fResPath gives the right name here */
+                return init_resb_result(&(dataEntry->fData), res, key, -1, dataEntry, resB, 0, fillIn, status);
+            } else {
+                *status = U_MISSING_RESOURCE_ERROR;
+            }
+        } else {
+            return init_resb_result(&(resB->fResData), res, key, -1, resB->fData, resB, 0, fillIn, status);
+        }
+    } 
+    else {
+        *status = U_RESOURCE_TYPE_MISMATCH;
+    }
+    return fillIn;
+}
+
 
 U_CAPI UResourceBundle* U_EXPORT2 ures_getByKey(const UResourceBundle *resB, const char* inKey, UResourceBundle *fillIn, UErrorCode *status) {
     Resource res = RES_BOGUS;
@@ -1201,7 +1331,7 @@
         return fillIn;
     }
 
-    if(RES_GET_TYPE(resB->fRes) == URES_TABLE) {
+    if(RES_GET_TYPE(resB->fRes) == URES_TABLE || RES_GET_TYPE(resB->fRes) == URES_TABLE32) {
         int32_t t;
         res = res_getTableItemByKey(&(resB->fResData), resB->fRes, &t, &key);
         if(res == RES_BOGUS) {
@@ -1253,7 +1383,7 @@
         return NULL;
     }
 
-    if(RES_GET_TYPE(resB->fRes) == URES_TABLE) {
+    if(RES_GET_TYPE(resB->fRes) == URES_TABLE || RES_GET_TYPE(resB->fRes) == URES_TABLE32) {
         int32_t t=0;
 
         res = res_getTableItemByKey(&(resB->fResData), resB->fRes, &t, &key);
@@ -1266,6 +1396,7 @@
                     switch (RES_GET_TYPE(res)) {
                     case URES_STRING:
                     case URES_TABLE:
+                    case URES_TABLE32:
                     case URES_ARRAY:
                         return res_getString(rd, res, len);
                     case URES_ALIAS:
@@ -1289,6 +1420,7 @@
             switch (RES_GET_TYPE(res)) {
             case URES_STRING:
             case URES_TABLE:
+            case URES_TABLE32:
             case URES_ARRAY:
                 return res_getString(&(resB->fResData), res, len);
             case URES_ALIAS:
@@ -1344,6 +1476,35 @@
     }
 }
 
+U_CAPI const char* U_EXPORT2 
+ures_getLocaleByType(const UResourceBundle* resourceBundle, 
+                     ULocDataLocaleType type, 
+                     UErrorCode* status) {
+    if (status==NULL || U_FAILURE(*status)) {
+        return NULL;
+    }
+    if (!resourceBundle) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    } else {
+        switch(type) {
+        case ULOC_ACTUAL_LOCALE:
+            return resourceBundle->fData->fName;
+            break;
+        case ULOC_VALID_LOCALE:
+            return resourceBundle->fTopLevelData->fName;
+            break;
+        case ULOC_REQUESTED_LOCALE:
+            return NULL;
+            break;
+        default:
+            *status = U_ILLEGAL_ARGUMENT_ERROR;
+            return NULL;
+        }
+    }
+}
+
+
 /*
 U_CFUNC void ures_setResPath(UResourceBundle *resB, const char* toAdd) {
   if(resB->fResPath == NULL) {
@@ -1388,6 +1549,7 @@
   resB->fResPathLen = 0;
 }
 
+
 U_CFUNC const char* ures_getName(const UResourceBundle* resB) {
   if(resB == NULL) {
     return NULL;
@@ -1442,18 +1604,13 @@
         r->fSize = res_countArrayItems(&(r->fResData), r->fRes);
         /*r->fParent = RES_BOGUS;*/
         /*r->fResPath = NULL;*/
+        r->fParentRes = NULL;
+        r->fTopLevelData = r->fData;
+
         ures_freeResPath(r);
-        /*
-        if(r->fData->fPath != NULL) {
-          ures_setResPath(r, r->fData->fPath);
-          ures_appendResPath(r, RES_PATH_PACKAGE_S);
-          ures_appendResPath(r, r->fData->fName);
-        } else {
-          ures_setResPath(r, r->fData->fName);
-        }
-        */
     }
 }
+
 U_CAPI UResourceBundle*  U_EXPORT2
 ures_open(const char* path,
                     const char* localeID,
@@ -1469,7 +1626,7 @@
     }
 
     /* first "canonicalize" the locale ID */
-    length = uloc_getName(localeID, canonLocaleID, sizeof(canonLocaleID), status);
+    length = uloc_getBaseName(localeID, canonLocaleID, sizeof(canonLocaleID), status);
     if(U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) {
         *status = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
@@ -1492,6 +1649,8 @@
         uprv_free(r);
         return NULL;
     }
+    r->fParentRes = NULL;
+    r->fTopLevelData = r->fData;
 
     hasData = r->fData;
     while(hasData->fBogus != U_ZERO_ERROR) {
@@ -1513,6 +1672,7 @@
     /*r->fParent = RES_BOGUS;*/
     r->fSize = res_countArrayItems(&(r->fResData), r->fRes);
     r->fResPath = NULL;
+
     /*
     if(r->fData->fPath != NULL) {
       ures_setResPath(r, r->fData->fPath);
@@ -1531,25 +1691,35 @@
                   const char* localeID, 
                   UErrorCode* status)
 {
-    UResourceBundle *r;
-    int32_t pathSize = u_strlen(myPath) + 1;
-    char *path = (char *)uprv_malloc(pathSize);
-    /* test for NULL */
-    if(path == NULL) {
-        *status = U_MEMORY_ALLOCATION_ERROR;
+    char path[2048];
+    UConverter *cnv;
+    int32_t length;
+
+    if(status==NULL || U_FAILURE(*status)) {
+        return NULL;
+    }
+    if(myPath==NULL) {
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
-    u_UCharsToChars(myPath, path, pathSize);
-
-    r = ures_open(path, localeID, status);
-    uprv_free(path);
+    cnv=u_getDefaultConverter(status);
+    if(U_FAILURE(*status)) {
+        return NULL;
+    }
 
-    if (U_FAILURE(*status)) {
+    length=ucnv_fromUChars(cnv, path, sizeof(path), myPath, -1, status);
+    u_releaseDefaultConverter(cnv);
+    if(U_FAILURE(*status)) {
+        return NULL;
+    }
+    if(length>=sizeof(path)) {
+        /* not NUL-terminated - path too long */
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
-    return r;
+    return ures_open(path, localeID, status);
 }
 
 /**
@@ -1597,15 +1767,9 @@
     /*r->fParent = RES_BOGUS;*/
     r->fSize = res_countArrayItems(&(r->fResData), r->fRes);
     r->fResPath = NULL;
-    /*
-    if(r->fData->fPath != NULL) {
-      ures_setResPath(r, r->fData->fPath);
-      ures_appendResPath(r, RES_PATH_PACKAGE_S);
-      ures_appendResPath(r, r->fData->fName);
-    } else {
-      ures_setResPath(r, r->fData->fName);
-    }
-    */
+    r->fParentRes = NULL;
+    r->fTopLevelData = r->fData;
+
     return r;
 }
 
@@ -1625,7 +1789,7 @@
 
 
 U_CFUNC void ures_initStackObject(UResourceBundle* resB) {
-  memset(resB, 0, sizeof(UResourceBundle));
+  uprv_memset(resB, 0, sizeof(UResourceBundle));
   ures_setIsStackObject(resB, TRUE);
 }
 

Index: uresdata.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uresdata.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uresdata.c	10 Sep 2003 02:42:04 -0000	1.4
+++ uresdata.c	6 Apr 2004 10:08:06 -0000	1.5
@@ -20,18 +20,24 @@
 */
 
 #include "unicode/utypes.h"
-#include "cstring.h"
 #include "unicode/udata.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uarrsort.h"
+#include "udataswp.h"
+#include "ucol_swp.h"
 #include "uresdata.h"
[...1027 lines suppressed...]
+            tempTable.resort=(int32_t *)(tempTable.rows+maxTableLength);
+        }
+
+        /* swap the resources */
+        ures_swapResource(ds, inBundle, outBundle, rootRes, URES_NO_SPECIAL_TYPE, &tempTable, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            udata_printError(ds, "ures_swapResource(root res=%08x) failed - %s\n",
+                             rootRes, u_errorName(*pErrorCode));
+        }
+
+        if(tempTable.rows!=rows) {
+            uprv_free(tempTable.rows);
+        }
+
+        /* swap the root resource and indexes */
+        ds->swapArray32(ds, inBundle, stringsBottom*4, outBundle, pErrorCode);
+    }
+
+    return headerSize+4*top;
 }

Index: uresdata.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uresdata.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- uresdata.h	10 Sep 2003 02:42:04 -0000	1.3
+++ uresdata.h	6 Apr 2004 10:08:06 -0000	1.4
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *                                                                            *
-* Copyright (C) 1999-2002, International Business Machines                   *
+* Copyright (C) 1999-2003, International Business Machines                   *
 *                Corporation and others. All Rights Reserved.                *
 *                                                                            *
 ******************************************************************************
@@ -20,6 +20,7 @@
 
 #include "unicode/utypes.h"
 #include "unicode/udata.h"
+#include "udataswp.h"
 
 /*
  * A Resource is a 32-bit value that has 2 bit fields:
@@ -38,16 +39,38 @@
 #define RES_GET_INT(res) (((int32_t)((res)<<4L))>>4L)
 #define RES_GET_UINT(res) ((res)&0x0fffffff)
 
+/* indexes[] value names; indexes are generally 32-bit (Resource) indexes */
+enum {
+    URES_INDEX_LENGTH,          /* [0] contains URES_INDEX_TOP==the length of indexes[] */
+    URES_INDEX_STRINGS_TOP,     /* [1] contains the top of the strings, */
+                                /*     same as the bottom of resources, rounded up */
+    URES_INDEX_RESOURCES_TOP,   /* [2] contains the top of all resources */
+    URES_INDEX_BUNDLE_TOP,      /* [3] contains the top of the bundle, */
+                                /*     in case it were ever different from [2] */
+    URES_INDEX_MAX_TABLE_LENGTH,/* [4] max. length of any table */
+    URES_INDEX_TOP
+};
+
+/* number of bytes at the beginning of the bundle before the strings start */
+enum {
+    URES_STRINGS_BOTTOM=(1+URES_INDEX_TOP)*4
+};
+
 /*
- * File format for .res resource bundle files (formatVersion=1)
+ * File format for .res resource bundle files (formatVersion=1.1)
  *
  * An ICU4C resource bundle file (.res) is a binary, memory-mappable file
  * with nested, hierarchical data structures.
  * It physically contains the following:
  *
  *   Resource root; -- 32-bit Resource item, root item for this bundle's tree;
- *                     currently, the root item must be a table resource item
- *   char keys[]; -- up to 65k of characters for key strings,
+ *                     currently, the root item must be a table or table32 resource item
+ *   int32_t indexes[indexes[0]]; -- array of indexes for friendly
+ *                                   reading and swapping; see URES_INDEX_* above
+ *                                   new in formatVersion 1.1
+ *   char keys[]; -- characters for key strings
+ *                   (formatVersion 1.0: up to 65k of characters; 1.1: <2G)
+ *                   (minus the space for root and indexes[]),
  *                   which consist of invariant characters (ASCII/EBCDIC) and are NUL-terminated;
  *                   padded to multiple of 4 bytes for 4-alignment of the following data
  *   data; -- data directly and indirectly indexed by the root item;
@@ -110,6 +133,8 @@
  *                      - this value should be 32-aligned -
  * 2  Table:            uint16_t count, uint16_t keyStringOffsets[count], (uint16_t padding), Resource[count]
  * 3  Alias:            (physically same value layout as string, new in ICU 2.4)
+ * 4  Table32:          int32_t count, int32_t keyStringOffsets[count], Resource[count]
+ *                      (new in formatVersion 1.1/ICU 2.8)
  *
  * 7  Integer:          (28-bit offset is integer value)
  * 8  Array:            int32_t count, Resource[count]
@@ -171,11 +196,23 @@
 U_CFUNC int32_t
 res_countArrayItems(const ResourceData *pResData, const Resource res);
 
-U_CFUNC int32_t res_getTableSize(const ResourceData *pResData, Resource table);
-
 U_CFUNC Resource res_getArrayItem(const ResourceData *pResData, Resource array, const int32_t indexS);
 U_CFUNC Resource res_getTableItemByIndex(const ResourceData *pResData, Resource table, int32_t indexS, const char ** key);
 U_CFUNC Resource res_getTableItemByKey(const ResourceData *pResData, Resource table, int32_t *indexS, const char* * key);
-U_CFUNC Resource res_findResource(const ResourceData *pResData, Resource r, const char** path, const char** key);
+
+/*
+ * Modifies the contents of *path (replacing separators with NULs),
+ * and also moves *path forward while it finds items.
+ */
+U_CFUNC Resource res_findResource(const ResourceData *pResData, Resource r, char** path, const char** key);
+
+/**
+ * Swap an ICU resource bundle. See udataswp.h.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+ures_swap(const UDataSwapper *ds,
+          const void *inData, int32_t length, void *outData,
+          UErrorCode *pErrorCode);
 
 #endif

Index: uresimp.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uresimp.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- uresimp.h	10 Sep 2003 02:42:04 -0000	1.3
+++ uresimp.h	6 Apr 2004 10:08:06 -0000	1.4
@@ -53,7 +53,7 @@
     int32_t fHashKey; /* for faster access in the hashtable */
 };
 
-#define RES_BUFSIZE 256
+#define RES_BUFSIZE 64
 #define RES_PATH_SEPARATOR   '/'
 #define RES_PATH_SEPARATOR_S   "/"
 
@@ -73,14 +73,12 @@
     ResourceData fResData;
     Resource fRes;
 
-    /* parent of this resource - 
-     * lives in the same data entry 
-     */
-    /* This cannot be done right now - need support in genrb */
-    /*Resource fParent; */
+    UResourceDataEntry *fTopLevelData; /* for getting the valid locale */
+    const UResourceBundle *fParentRes; /* needed to get the actual locale for a child resource */
+
 };
 
-U_CFUNC void ures_initStackObject(UResourceBundle* resB);
+U_CAPI void U_EXPORT2 ures_initStackObject(UResourceBundle* resB);
 U_CFUNC void ures_setIsStackObject( UResourceBundle* resB, UBool state);
 U_CFUNC UBool ures_isStackObject( UResourceBundle* resB);
 
@@ -90,6 +88,7 @@
 U_CFUNC void ures_appendResPath(UResourceBundle *resB, const char* toAdd, int32_t lenToAdd);
 /*U_CFUNC void ures_setResPath(UResourceBundle *resB, const char* toAdd);*/
 U_CFUNC void ures_freeResPath(UResourceBundle *resB);
+U_CFUNC void ures_freeRequestedLocale(UResourceBundle *resB);
 
 /* Candidates for export */
 U_CFUNC UResourceBundle *ures_copyResb(UResourceBundle *r, const UResourceBundle *original, UErrorCode *status);
@@ -129,7 +128,14 @@
  */
 U_CAPI UResourceBundle* U_EXPORT2
 ures_findSubResource(const UResourceBundle *resB, 
-                     const char* pathToResource, 
+                     char* pathToResource, 
                      UResourceBundle *fillIn, UErrorCode *status);
+
+U_CAPI UResourceBundle* U_EXPORT2 
+ures_getByKeyWithFallback(const UResourceBundle *resB, 
+                          const char* inKey, 
+                          UResourceBundle *fillIn, 
+                          UErrorCode *status);
+
 
 #endif /*URESIMP_H*/

Index: uset.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uset.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uset.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ uset.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -27,6 +27,7 @@
 #include "unicode/uniset.h"
 #include "cmemory.h"
 #include "unicode/ustring.h"
+#include "unicode/parsepos.h"
 
 U_CAPI USet* U_EXPORT2
 uset_open(UChar32 start, UChar32 end) {
@@ -58,7 +59,7 @@
                  UErrorCode* ec)
 {
     UnicodeString pat(patternLength==-1, pattern, patternLength);
-    UnicodeSet* set = new UnicodeSet(pat, options, *ec);
+    UnicodeSet* set = new UnicodeSet(pat, options, NULL, *ec);
     /* test for NULL */
     if(set == 0) {
         *ec = U_MEMORY_ALLOCATION_ERROR;
@@ -76,6 +77,35 @@
 U_CAPI void U_EXPORT2
 uset_close(USet* set) {
     delete (UnicodeSet*) set;
+}
+
+U_CAPI int32_t U_EXPORT2 
+uset_applyPattern(USet *set,
+                  const UChar *pattern, int32_t patternLength,
+                  uint32_t options,
+                  UErrorCode *status){
+
+    // status code needs to be checked since we 
+    // dereference it
+    if(status == NULL || U_FAILURE(*status)){
+        return 0;
+    }
+
+    // check only the set parameter;
+    // if pattern is NULL or NUL-terminated, the
+    // UnicodeString constructor takes care of it
+    if(set == NULL){
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    UnicodeString pat(pattern, patternLength);
+
+    ParsePosition pos;
+   
+    ((UnicodeSet*) set)->applyPattern(pat, pos, options, NULL, *status);
+    
+    return pos.getIndex();
 }
 
 U_CAPI int32_t U_EXPORT2

Index: usetiter.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/usetiter.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- usetiter.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ usetiter.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -1,10 +1,8 @@
 /*
 **********************************************************************
-* Copyright (c) 2002, International Business Machines
+* Copyright (c) 2002-2003, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
-* $Source$ 
-**********************************************************************
 */
 #include "unicode/usetiter.h"
 #include "unicode/uniset.h"
@@ -13,7 +11,7 @@
 
 U_NAMESPACE_BEGIN
 
-const char UnicodeSetIterator::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSetIterator)
 
 /**
  * Create an iterator

Index: ustr_imp.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ustr_imp.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ustr_imp.h	10 Sep 2003 02:42:04 -0000	1.4
+++ ustr_imp.h	6 Apr 2004 10:08:06 -0000	1.5
@@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 1999-2001, International Business Machines
+*   Copyright (C) 1999-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ustr_imp.h
@@ -57,10 +57,22 @@
  * Are the Unicode properties loaded?
  * This must be used before internal functions are called that do
  * not perform this check.
+ * Generate a debug assertion failure if data is not loaded, to flag the fact
+ *   that u_init() wasn't called first, before trying to access character properties.
  * @internal
  */
 U_CFUNC UBool
 uprv_haveProperties(UErrorCode *pErrorCode);
+
+/**
+  * Load the Unicode property data.
+  * Intended primarily for use from u_init().
+  * Has no effect if property data is already loaded.
+  * NOT thread safe.
+  * @internal
+  */
+U_CFUNC int8_t
+uprv_loadPropsData(UErrorCode *errorCode);
 
 /**
  * Type of a function that may be passed to the internal case mapping functions

Index: ustrenum.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ustrenum.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- ustrenum.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ ustrenum.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (c) 2002, International Business Machines
+* Copyright (c) 2002-2003, International Business Machines
 * Corporation and others.  All Rights Reserved.
 **********************************************************************
 * Author: Alan Liu
@@ -14,6 +14,102 @@
 #include "ustrenum.h"
 #include "cstring.h"
 #include "cmemory.h"
+
+// StringEnumeration implementation ---------------------------------------- ***
+
+StringEnumeration::StringEnumeration()
+    : chars(charsBuffer), charsCapacity(sizeof(charsBuffer)) {  // chars starts out pointing at charsBuffer
+}
+
+StringEnumeration::~StringEnumeration() {
+    if (chars != NULL && chars != charsBuffer) {  // chars was replaced by a uprv_malloc() buffer in ensureCharsCapacity()
+        uprv_free(chars);
+    }
+}
+
+// Default clone() of the StringEnumeration base class: does not clone, always returns NULL
+StringEnumeration *
+StringEnumeration::clone() const {
+  return NULL;
+}
+
+const char *
+StringEnumeration::next(int32_t *resultLength, UErrorCode &status) {
+    const UnicodeString *s=snext(status);  // this default next() is layered on snext()
+    if(s!=NULL) {
+        unistr=*s;  // copy the element into unistr
+        ensureCharsCapacity(unistr.length()+1, status);  // length()+1: presumably room for a terminating NUL
+        if(U_SUCCESS(status)) {
+            if(resultLength!=NULL) {  // resultLength is optional
+                *resultLength=unistr.length();
+            }
+            unistr.extract(0, INT32_MAX, chars, charsCapacity, "");  // NOTE(review): "" codepage presumably selects invariant-character conversion -- confirm
+            return chars;  // the returned pointer is the chars buffer itself
+        }
+    }
+
+    return NULL;  // snext() returned NULL, or status is a failure
+}
+
+const UChar *
+StringEnumeration::unext(int32_t *resultLength, UErrorCode &status) {
+    const UnicodeString *s=snext(status);  // this default unext() is layered on snext()
+    if(s!=NULL) {
+        unistr=*s;  // copy the element into unistr
+        if(U_SUCCESS(status)) {
+            if(resultLength!=NULL) {  // resultLength is optional
+                *resultLength=unistr.length();
+            }
+            return unistr.getTerminatedBuffer();  // result comes from unistr, which later next()/unext() calls reassign
+        }
+    }
+
+    return NULL;  // snext() returned NULL, or status is a failure
+}
+
+void
+StringEnumeration::ensureCharsCapacity(int32_t capacity, UErrorCode &status) {
+    if(U_SUCCESS(status) && capacity>charsCapacity) {  // nothing to do if status already failed or the buffer is big enough
+        if(capacity<(charsCapacity+charsCapacity/2)) {
+            // grow by at least 50% to avoid allocation thrashing
+            capacity=charsCapacity+charsCapacity/2;
+        }
+        if(chars!=charsBuffer) {  // old contents are not preserved: the previous buffer is freed, not copied
+            uprv_free(chars);
+        }
+        chars=(char *)uprv_malloc(capacity);
+        if(chars==NULL) {  // allocation failed: fall back to charsBuffer and report the error
+            chars=charsBuffer;
+            charsCapacity=sizeof(charsBuffer);
+            status=U_MEMORY_ALLOCATION_ERROR;
+        } else {
+            charsCapacity=capacity;
+        }
+    }
+}
+
+UnicodeString *
+StringEnumeration::setChars(const char *s, int32_t length, UErrorCode &status) {
+    if(U_SUCCESS(status) && s!=NULL) {
+        if(length<0) {  // negative length: measure s with uprv_strlen()
+            length=uprv_strlen(s);
+        }
+
+        UChar *buffer=unistr.getBuffer(length+1);  // +1 leaves room for the terminator stored below
+        if(buffer!=NULL) {
+            u_charsToUChars(s, buffer, length);
+            buffer[length]=0;
+            unistr.releaseBuffer(length);
+            return &unistr;  // address of unistr, not a new object
+        } else {
+            status=U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
+
+    return NULL;  // failed status on entry, s==NULL, or getBuffer() returned NULL
+}
+
+// C wrapper --------------------------------------------------------------- ***
 
 #define THIS(en) ((StringEnumeration*)(en->context))
 

Index: ustring.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ustring.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ustring.c	10 Sep 2003 02:42:04 -0000	1.4
+++ ustring.c	6 Apr 2004 10:08:06 -0000	1.5
@@ -1478,6 +1478,24 @@
             }
             ++(*offset);
         }
+        if (result < 0 || result >= 0x110000) {
+            goto err;
+        }
+        /* If an escape sequence specifies a lead surrogate, see if
+         * there is a trail surrogate after it, either as an escape or
+         * as a literal.  If so, join them up into a supplementary.
+         */
+        if (*offset < length && U16_IS_LEAD(result)) {
+            int32_t ahead = *offset + 1;
+            c = charAt(*offset, context);
+            if (c == 0x5C /*'\\'*/ && ahead < length) {
+                c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
+            }
+            if (U16_IS_TRAIL(c)) {
+                *offset = ahead;
+                result = U16_GET_SUPPLEMENTARY(result, c);
+            }
+        }
         return result;
     }
 

Index: ustrtrns.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/ustrtrns.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- ustrtrns.c	10 Sep 2003 02:42:04 -0000	1.4
+++ ustrtrns.c	6 Apr 2004 10:08:06 -0000	1.5
@@ -57,6 +57,7 @@
 }
 
 #define _STACK_BUFFER_CAPACITY 1000
+#define _BUFFER_CAPACITY_MULTIPLIER 2
 
 U_CAPI UChar* U_EXPORT2 
 u_strFromUTF32(UChar   *dest,
@@ -77,7 +78,7 @@
         return NULL;
     }
     
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -163,7 +164,7 @@
     }
     
     
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -234,7 +235,7 @@
         return NULL;
     }
         
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -332,7 +333,7 @@
         return NULL;
     }
         
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -487,7 +488,7 @@
             
             /* we dont have enough room on the stack grow the buffer */
             if(!u_growAnyBufferFromStatic(stackBuffer,(void**) &tempBuf, &tempBufCapacity, 
-                (2*(pSrcLimit-pSrc)+100), count,sizeof(char))){
+                (_BUFFER_CAPACITY_MULTIPLIER * (srcLength)), count,sizeof(char))){
                 goto cleanup;
             }
           
@@ -522,7 +523,7 @@
      * here we assume that every char requires 
      * no more than 2 wchar_ts
      */
-    intTargetCapacity =  (count*2+1) /*for null termination */;
+    intTargetCapacity =  (count * _BUFFER_CAPACITY_MULTIPLIER + 1) /*for null termination */;
     intTarget = (wchar_t*)uprv_malloc( intTargetCapacity * sizeof(wchar_t) );
 
     if(intTarget){
@@ -547,7 +548,7 @@
                 int numWritten = (pIntTarget-intTarget);
                 u_growAnyBufferFromStatic(NULL,(void**) &intTarget,
                                           &intTargetCapacity,
-                                          intTargetCapacity*2,
+                                          intTargetCapacity * _BUFFER_CAPACITY_MULTIPLIER,
                                           numWritten,
                                           sizeof(wchar_t));
                 pIntTarget = intTarget;
@@ -559,12 +560,14 @@
                 }
 
             }else{
+                int32_t nulVal;
                 /*scan for nulls */
                 /* we donot check for limit since tempBuf is null terminated */
                 while(tempBuf[nulLen++] != 0){
                 }
-                pIntTarget = pIntTarget + retVal+1;
-                remaining -=(retVal+1);
+                nulVal = (nulLen < srcLength) ? 1 : 0; 
+                pIntTarget = pIntTarget + retVal+nulVal;
+                remaining -=(retVal+nulVal);
             
                 /* check if we have reached the source limit*/
                 if(nulLen>=(count)){
@@ -614,7 +617,7 @@
         return NULL;
     }
         
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -645,6 +648,7 @@
     return _strToWCS(dest,destCapacity,pDestLength,src,srcLength, pErrorCode);
     
 #endif
+
 }
 
 #if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
@@ -693,7 +697,7 @@
             }else if(retVal == cStackCap){
                 /* Should rarely occur */
                 u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
-                    cStackCap*2,0,sizeof(char));
+                    cStackCap * _BUFFER_CAPACITY_MULTIPLIER, 0, sizeof(char));
                 pCSave = pCSrc;
             }else{
                 /* converted every thing */
@@ -726,7 +730,7 @@
                     pCSrc = pCSave;
                     /* we do not have enough room so grow the buffer*/
                     u_growAnyBufferFromStatic(cStack,(void**)&pCSrc,&cStackCap,
-                           2*cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));
+                           _BUFFER_CAPACITY_MULTIPLIER*cStackCap+(nulLen*MB_CUR_MAX),len,sizeof(char));
 
                     pCSave = pCSrc;
                     pCSrc = pCSave+len;
@@ -788,7 +792,7 @@
                 /* convert to chars */
                 retVal = uprv_wcstombs(pCSrc,pWStack,remaining);
             
-                pCSrc += retVal +1;
+                pCSrc += retVal;
                 pSrc  += nulLen;
                 srcLength-=nulLen; /* decrement the srcLength */
                 break;
@@ -866,7 +870,7 @@
         return NULL;
     }
         
-    if((srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }

Index: util.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/util.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- util.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ util.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (c) 2001, International Business Machines
+*   Copyright (c) 2001-2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
@@ -68,7 +68,7 @@
  * Return true if the character is NOT printable ASCII.
  */
 UBool ICU_Utility::isUnprintable(UChar32 c) {
-    return !(c == 0x0A || (c >= 0x20 && c <= 0x7E));
+    return !(c >= 0x20 && c <= 0x7E);
 }
 
 /**

Index: utrie.c
===================================================================
RCS file: /cvs/core/icu-sword/source/common/utrie.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- utrie.c	10 Sep 2003 02:42:04 -0000	1.1
+++ utrie.c	6 Apr 2004 10:08:06 -0000	1.2
@@ -23,6 +23,7 @@
 #endif
 
 #include "unicode/utypes.h"
+#include "udataswp.h"
 #include "cmemory.h"
 #include "utrie.h"
 
@@ -34,7 +35,8 @@
 U_CAPI UNewTrie * U_EXPORT2
 utrie_open(UNewTrie *fillIn,
            uint32_t *aliasData, int32_t maxDataLength,
-           uint32_t initialValue, UBool latin1Linear) {
+           uint32_t initialValue, uint32_t leadUnitValue,
+           UBool latin1Linear) {
     UNewTrie *trie;
     int32_t i, j;
 
@@ -89,6 +91,7 @@
         trie->data[--j]=initialValue;
     }
 
+    trie->leadUnitValue=leadUnitValue;
     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
     trie->dataCapacity=maxDataLength;
     trie->isLatin1Linear=latin1Linear;
@@ -118,7 +121,9 @@
         isDataAllocated=TRUE;
     }
 
-    trie=utrie_open(fillIn, aliasData, aliasDataCapacity, other->data[0], other->isLatin1Linear);
+    trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
+                    other->data[0], other->leadUnitValue,
+                    other->isLatin1Linear);
     if(trie==NULL) {
         uprv_free(aliasData);
     } else {
@@ -154,6 +159,20 @@
     return trie->data;
 }
 
+static int32_t
+utrie_allocDataBlock(UNewTrie *trie) {
+    int32_t newBlock, newTop;
+
+    newBlock=trie->dataLength; /* the new block starts at the current end of the data */
+    newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
+    if(newTop>trie->dataCapacity) {
+        /* no room left in the data array for another block; dataLength stays unchanged */
+        return -1;
+    }
+    trie->dataLength=newTop;
+    return newBlock; /* start index of the newly reserved block */
+}
+
 /**
  * No error checking for illegal arguments.
  *
@@ -162,7 +181,7 @@
  */
 static int32_t
 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
-    int32_t indexValue, newBlock, newTop;
+    int32_t indexValue, newBlock;
 
     c>>=UTRIE_SHIFT;
     indexValue=trie->index[c];
@@ -171,13 +190,11 @@
     }
 
     /* allocate a new data block */
-    newBlock=trie->dataLength;
-    newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
-    if(newTop>trie->dataCapacity) {
+    newBlock=utrie_allocDataBlock(trie);
+    if(newBlock<0) {
         /* out of memory in the data array */
         return -1;
     }
-    trie->dataLength=newTop;
     trie->index[c]=newBlock;
 
     /* copy-on-write for a block from a setRange() */
@@ -385,15 +402,30 @@
     uprv_memcpy(leadIndexes, index+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
 
     /*
-     * to protect the copied lead surrogate values,
-     * mark all their indexes as repeat blocks
-     * (causes copy-on-write)
+     * set all values for lead surrogate code *units* to leadUnitValue
+     * so that, by default, runtime lookups will find no data for associated
+     * supplementary code points, unless there is data for such code points
+     * which will result in a non-zero folding value below that is set for
+     * the respective lead units
+     *
+     * the above saved the indexes for surrogate code *points*
+     * fill the indexes with simplified code from utrie_setRange32()
      */
-    for(c=0xd800; c<=0xdbff; ++c) {
-        block=index[c>>UTRIE_SHIFT];
-        if(block>0) {
-            index[c>>UTRIE_SHIFT]=-block;
+    if(trie->leadUnitValue==trie->data[0]) {
+        block=0; /* leadUnitValue==initialValue, use all-initial-value block */
+    } else {
+        /* create and fill the repeatBlock */
+        block=utrie_allocDataBlock(trie);
+        if(block<0) {
+            /* data table overflow */
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
         }
+        utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
+        block=-block; /* negative block number to indicate that it is a repeat block */
+    }
+    for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
+        trie->index[c]=block;
     }
 
     /*
@@ -418,10 +450,14 @@
             /* is there an identical index block? */
             block=_findSameIndexBlock(index, indexLength, c>>UTRIE_SHIFT);
 
-            /* get a folded value for [c..c+0x400[ and, if 0, set it for the lead surrogate */
+            /*
+             * get a folded value for [c..c+0x400[ and,
+             * if different from the value for the lead surrogate code point,
+             * set it for the lead surrogate code unit
+             */
             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
-            if(value!=0) {
-                if(!utrie_set32(trie, 0xd7c0+(c>>10), value)) {
+            if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
+                if(!utrie_set32(trie, U16_LEAD(c), value)) {
                     /* data table overflow */
                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                     return;
@@ -670,7 +706,7 @@
      */
     uint32_t options;
 
-    /** indexLength is a multiple of 1024>>UTRIE_SHIFT */
+    /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
     int32_t indexLength;
 
     /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
@@ -870,6 +906,79 @@
     }
 }
 
+/* swapping ----------------------------------------------------------------- */
+
+U_CAPI int32_t U_EXPORT2
+utrie_swap(const UDataSwapper *ds,
+           const void *inData, int32_t length, void *outData,
+           UErrorCode *pErrorCode) {
+    const UTrieHeader *inTrie;
+    UTrieHeader trie;
+    int32_t size;
+    UBool dataIs32;
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) { /* outData is only required when length>=0 */
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /* when a length is given (>=0) it must cover at least the header */
+    if(length>=0 && length<sizeof(UTrieHeader)) {
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    inTrie=(const UTrieHeader *)inData;
+    trie.signature=ds->readUInt32(inTrie->signature); /* header fields are read through ds into the local copy */
+    trie.options=ds->readUInt32(inTrie->options);
+    trie.indexLength=udata_readInt32(ds, inTrie->indexLength);
+    trie.dataLength=udata_readInt32(ds, inTrie->dataLength);
+
+    if( trie.signature!=0x54726965 || /* bytes 'T','r','i','e' */
+        (trie.options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
+        ((trie.options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT ||
+        trie.indexLength<UTRIE_BMP_INDEX_LENGTH ||
+        (trie.indexLength&(UTRIE_SURROGATE_BLOCK_COUNT-1))!=0 ||
+        trie.dataLength<UTRIE_DATA_BLOCK_LENGTH ||
+        (trie.dataLength&(UTRIE_DATA_GRANULARITY-1))!=0 ||
+        ((trie.options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0 && trie.dataLength<(UTRIE_DATA_BLOCK_LENGTH+0x100))
+    ) {
+        *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */
+        return 0;
+    }
+
+    dataIs32=(UBool)((trie.options&UTRIE_OPTIONS_DATA_IS_32_BIT)!=0);
+    size=sizeof(UTrieHeader)+trie.indexLength*2+trie.dataLength*(dataIs32?4:2); /* header + 2 bytes per index entry + 2 or 4 bytes per data entry */
+
+    if(length>=0) { /* with length<0 nothing is written; only size is returned */
+        UTrieHeader *outTrie;
+
+        if(length<size) { /* input is shorter than the size computed from its header */
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        outTrie=(UTrieHeader *)outData;
+
+        /* swap the header (as 32-bit units) */
+        ds->swapArray32(ds, inTrie, sizeof(UTrieHeader), outTrie, pErrorCode);
+
+        /* swap the index (16-bit units) and the data that follows it */
+        if(dataIs32) {
+            ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode);
+            ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, trie.dataLength*4,
+                                     (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode);
+        } else {
+            ds->swapArray16(ds, inTrie+1, (trie.indexLength+trie.dataLength)*2, outTrie+1, pErrorCode); /* index and data are both 16-bit: one call covers both */
+        }
+    }
+
+    return size;
+}
+
 /* enumeration -------------------------------------------------------------- */
 
 /* default UTrieEnumValue() returns the input value itself */
@@ -963,23 +1072,25 @@
     for(l=0xd800; l<0xdc00;) {
         /* lead surrogate access */
         offset=index[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
-        if(data32!=NULL) {
-            if(offset==0) {
-                /* no entries for a whole block of lead surrogates */
-                l+=UTRIE_DATA_BLOCK_LENGTH;
-                c+=UTRIE_DATA_BLOCK_LENGTH<<10;
-                continue;
-            }
-            value=data32[offset+(l&UTRIE_MASK)];
-        } else {
-            if(offset==trie->indexLength) {
-                /* no entries for a whole block of lead surrogates */
-                l+=UTRIE_DATA_BLOCK_LENGTH;
-                c+=UTRIE_DATA_BLOCK_LENGTH<<10;
-                continue;
+        if(offset==(data32!=NULL ? 0 : trie->indexLength)) {
+            /* no entries for a whole block of lead surrogates */
+            if(prevValue!=initialValue) {
+                if(prev<c) {
+                    if(!enumRange(context, prev, c, prevValue)) {
+                        return;
+                    }
+                }
+                prevBlock=0;
+                prev=c;
+                prevValue=initialValue;
             }
-            value=index[offset+(l&UTRIE_MASK)];
+
+            l+=UTRIE_DATA_BLOCK_LENGTH;
+            c+=UTRIE_DATA_BLOCK_LENGTH<<10;
+            continue;
         }
+
+        value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : index[offset+(l&UTRIE_MASK)];
 
         /* enumerate trail surrogates for this lead surrogate */
         offset=trie->getFoldingOffset(value);

Index: utrie.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/utrie.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- utrie.h	10 Sep 2003 02:42:04 -0000	1.1
+++ utrie.h	6 Apr 2004 10:08:06 -0000	1.2
@@ -18,6 +18,7 @@
 #define __UTRIE_H__
 
 #include "unicode/utypes.h"
+#include "udataswp.h"
 
 U_CDECL_BEGIN
 
@@ -80,7 +81,7 @@
 
     /**
      * Number of index (stage 1) entries per lead surrogate.
-     * Same as number of indexe entries for 1024 trail surrogates,
+     * Same as number of index entries for 1024 trail surrogates,
      * ==0x400>>UTRIE_SHIFT
      */
     UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
@@ -492,6 +493,7 @@
     int32_t index[UTRIE_MAX_INDEX_LENGTH];
     uint32_t *data;
 
+    uint32_t leadUnitValue;
     int32_t indexLength, dataCapacity, dataLength;
     UBool isAllocated, isDataAllocated;
     UBool isLatin1Linear, isCompacted;
@@ -546,6 +548,8 @@
  * @param maxDataLength the capacity of aliasData (if not NULL) or
  *                      the length of the data array to be allocated
  * @param initialValue the initial value that is set for all code points
+ * @param leadUnitValue the value for lead surrogate code _units_ that do not
+ *                      have associated supplementary data
  * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
  *                     kept in a linear, contiguous part of the data array
  * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
@@ -553,7 +557,8 @@
 U_CAPI UNewTrie * U_EXPORT2
 utrie_open(UNewTrie *fillIn,
            uint32_t *aliasData, int32_t maxDataLength,
-           uint32_t initialValue, UBool latin1Linear);
+           uint32_t initialValue, uint32_t leadUnitValue,
+           UBool latin1Linear);
 
 /**
  * Clone a build-time trie structure with all entries.
@@ -658,6 +663,15 @@
                 UNewTrieGetFoldedValue *getFoldedValue,
                 UBool reduceTo16Bits,
                 UErrorCode *pErrorCode);
+
+/**
+ * Swap a serialized UTrie. Returns the serialized size in bytes, or 0 if the arguments or the header are rejected; with length<0 nothing is written.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+utrie_swap(const UDataSwapper *ds,
+           const void *inData, int32_t length, void *outData,
+           UErrorCode *pErrorCode);
 
 U_CDECL_END
 

Index: uvector.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uvector.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uvector.cpp	10 Sep 2003 02:42:04 -0000	1.4
+++ uvector.cpp	6 Apr 2004 10:08:06 -0000	1.5
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1999-2001, International Business Machines Corporation and   *
+* Copyright (C) 1999-2003, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 *   Date        Name        Description
@@ -23,7 +23,7 @@
 #define HINT_KEY_POINTER   (1)
 #define HINT_KEY_INTEGER   (0)
  
-const char UVector::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UVector)
 
 UVector::UVector(UErrorCode &status) :
     count(0),
@@ -464,8 +464,6 @@
         ++count;
     }
 }
-
-const char UStack::fgClassID=0;
 
 UStack::UStack(UErrorCode &status) :
     UVector(status)

Index: uvector.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uvector.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- uvector.h	10 Sep 2003 02:42:04 -0000	1.4
+++ uvector.h	6 Apr 2004 10:08:06 -0000	1.5
@@ -250,14 +250,14 @@
      *
      * @draft ICU 2.2
      */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+    static UClassID getStaticClassID();
 
     /**
      * ICU "poor man's RTTI", returns a UClassID for the actual class.
      *
      * @draft ICU 2.2
      */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
+    virtual UClassID getDynamicClassID() const;
 
 private:
     void _init(int32_t initialCapacity, UErrorCode &status);
@@ -272,11 +272,6 @@
     // Disallow
     UVector& operator=(const UVector&);
 
-    /**
-     * The address of this static class variable serves as this class's ID
-     * for ICU "poor man's RTTI".
-     */
-    static const char fgClassID;
 };
 
 
@@ -330,14 +325,14 @@
      *
      * @draft ICU 2.2
      */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+    static UClassID getStaticClassID();
 
     /**
      * ICU "poor man's RTTI", returns a UClassID for the actual class.
      *
      * @draft ICU 2.2
      */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
+    virtual UClassID getDynamicClassID() const;
 
 private:
     // Disallow
@@ -345,12 +340,6 @@
 
     // Disallow
     UStack& operator=(const UStack&);
-
-    /**
-     * The address of this static class variable serves as this class's ID
-     * for ICU "poor man's RTTI".
-     */
-    static const char fgClassID;
 };
 
 

Index: uvectr32.cpp
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uvectr32.cpp,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uvectr32.cpp	10 Sep 2003 02:42:04 -0000	1.1
+++ uvectr32.cpp	6 Apr 2004 10:08:06 -0000	1.2
@@ -21,7 +21,7 @@
  * token is assumed to be an integer. This is needed for iSeries
  */
  
-const char UVector32::fgClassID=0;
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UVector32)
 
 UVector32::UVector32(UErrorCode &status) :
     count(0),

Index: uvectr32.h
===================================================================
RCS file: /cvs/core/icu-sword/source/common/uvectr32.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- uvectr32.h	10 Sep 2003 02:42:04 -0000	1.1
+++ uvectr32.h	6 Apr 2004 10:08:06 -0000	1.2
@@ -105,9 +105,9 @@
 
     int32_t lastElementi(void) const;
 
-    int32_t indexOf(int32_t obj, int32_t startIndex = 0) const;
+    int32_t indexOf(int32_t elem, int32_t startIndex = 0) const;
 
-    UBool contains(int32_t obj) const;
+    UBool contains(int32_t elem) const;
 
     UBool containsAll(const UVector32& other) const;
 
@@ -154,7 +154,7 @@
      * Insert the given integer into this vector at its sorted position.
      * The current elements are assumed to be sorted already.
      */
-    void sortedInsert(int32_t obj, UErrorCode& ec);
+    void sortedInsert(int32_t elem, UErrorCode& ec);
 
     /**
      * Returns a pointer to the internal array holding the vector.
@@ -166,14 +166,14 @@
      *
      * @draft ICU 2.2
      */
-    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
+    static UClassID getStaticClassID();
 
     /**
      * ICU "poor man's RTTI", returns a UClassID for the actual class.
      *
      * @draft ICU 2.2
      */
-    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
+    virtual UClassID getDynamicClassID() const;
 
 private:
     void _init(int32_t initialCapacity, UErrorCode &status);
@@ -184,18 +184,12 @@
     // Disallow
     UVector32& operator=(const UVector32&);
 
-    /**
-     * The address of this static class variable serves as this class's ID
-     * for ICU "poor man's RTTI".
-     */
-    static const char fgClassID;
-
 
     //  API Functions for Stack operations.
     //  In the original UVector, these were in a separate derived class, UStack.
     //  Here in UVector32, they are all together.
 public:
-    UBool empty(void) const;
+    UBool empty(void) const;   // TODO:  redundant, presumably same as isEmpty() (confirm).  Remove it?
 
     int32_t peeki(void) const;
     

--- digitlst.cpp DELETED ---

--- digitlst.h DELETED ---

--- mutex.cpp DELETED ---

--- nameprep.cpp DELETED ---

--- nameprep.h DELETED ---

--- strprep.cpp DELETED ---

--- strprep.h DELETED ---

--- symtable.h DELETED ---