Tue, 9 Sep 2003 19:43:42 -0700

Update of /usr/local/cvsroot/icu-sword/source/i18n
In directory www:/tmp/cvs-serv19862/source/i18n

Added Files:
	.cvsignore Makefile.in anytrans.cpp anytrans.h bocsu.c bocsu.h 
	buddhcal.cpp buddhcal.h calendar.cpp choicfmt.cpp coleitr.cpp 
	coll.cpp cpdtrans.cpp cpdtrans.h datefmt.cpp dcfmtsym.cpp 
	decimfmt.cpp dtfmtsym.cpp esctrn.cpp esctrn.h fmtable.cpp 
	format.cpp funcrepl.cpp funcrepl.h gregocal.cpp hextouni.cpp 
	hextouni.h i18n.dsp i18n.rc i18n.vcproj japancal.cpp 
	japancal.h msgfmt.cpp name2uni.cpp name2uni.h nfrlist.h 
	nfrs.cpp nfrs.h nfrule.cpp nfrule.h nfsubs.cpp nfsubs.h 
	nortrans.cpp nortrans.h nultrans.cpp nultrans.h numfmt.cpp 
	quant.cpp quant.h rbnf.cpp rbt.cpp rbt.h rbt_data.cpp 
	rbt_data.h rbt_pars.cpp rbt_pars.h rbt_rule.cpp rbt_rule.h 
	rbt_set.cpp rbt_set.h regexcmp.cpp regexcmp.h regexcst.h 
	regexcst.pl regexcst.txt regeximp.h regexst.cpp regexst.h 
	rematch.cpp remtrans.cpp remtrans.h repattrn.cpp search.cpp 
	simpletz.cpp smpdtfmt.cpp sortkey.cpp strmatch.cpp strmatch.h 
	strrepl.cpp strrepl.h stsearch.cpp tblcoll.cpp timezone.cpp 
	titletrn.cpp titletrn.h tolowtrn.cpp tolowtrn.h toupptrn.cpp 
	toupptrn.h translit.cpp transreg.cpp transreg.h tridpars.cpp 
	tridpars.h tzdat.h ucal.cpp ucln_in.c ucln_in.h ucol.cpp 
	ucol_bld.cpp ucol_bld.h ucol_cnt.cpp ucol_cnt.h ucol_elm.cpp 
	ucol_elm.h ucol_imp.h ucol_tok.cpp ucol_tok.h ucol_wgt.c 
	ucol_wgt.h ucoleitr.cpp ucurr.cpp udat.cpp umsg.cpp umsg_imp.h 
	unesctrn.cpp unesctrn.h uni2name.cpp uni2name.h unifltlg.cpp 
	unitohex.cpp unitohex.h unum.cpp usearch.cpp usrchimp.h 
	utrans.cpp 
Log Message:
ICU 2.6 commit

--- NEW FILE: anytrans.cpp ---
/*
*****************************************************************
* Copyright (c) 2002-2003, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/uscript.h"
#include "nultrans.h"
#include "anytrans.h"
#include "uvector.h"
#include "tridpars.h"
#include "hash.h"

//------------------------------------------------------------
// Constants

static const UChar TARGET_SEP = 45; // '-'
static const UChar VARIANT_SEP = 47; // '/'
static const UChar ANY[] = {65,110,121,0}; // "Any"
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"

//------------------------------------------------------------

U_CDECL_BEGIN
/**
 * Deleter function for Transliterator*.
 */
static void U_CALLCONV
_deleteTransliterator(void *obj) {
    delete (Transliterator*) obj;    
}
U_CDECL_END

//------------------------------------------------------------

U_NAMESPACE_BEGIN

//------------------------------------------------------------
// ScriptRunIterator

/**
 * Returns a series of ranges corresponding to scripts. They will be
 * of the form:
 *
 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 * |            |          - first run (start, limit)
 *          |           |  - second run (start, limit)
 *
 * That is, the runs will overlap. The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 */
class ScriptRunIterator : public UMemory {
private:
    const Replaceable& text;
    int32_t textStart;
    int32_t textLimit;

public:
    /**
     * The code of the current run, valid after next() returns.  May
     * be USCRIPT_INVALID_CODE if and only if the entire text is
     * COMMON/INHERITED.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Returns TRUE if there are any more runs.  TRUE is always
     * returned at least once.  Upon return, the caller should
     * examine scriptCode, start, and limit.
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta.  A positive delta means the limit has increased.
     */
    void adjustLimit(int32_t delta);

private:
    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};

ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t myStart, int32_t myLimit) :
    text(theText)
{
    textStart = myStart;
    textLimit = myLimit;
    limit = myStart;
}

UBool ScriptRunIterator::next() {
    UChar32 ch;
    UScriptCode s;
    UErrorCode ec = U_ZERO_ERROR;

    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    start = limit;

    // Are we done?
    if (start == textLimit) {
        return FALSE;
    }

    // Move start back to include adjacent COMMON or INHERITED
    // characters
    while (start > textStart) {
        ch = text.char32At(start - 1); // look back
        s = uscript_getScript(ch, &ec);
        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
            --start;
        } else {
            break;
        }
    }

    // Move limit ahead to include COMMON, INHERITED, and characters
    // of the current script.
    while (limit < textLimit) {
        ch = text.char32At(limit); // look ahead
        s = uscript_getScript(ch, &ec);
        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
            if (scriptCode == USCRIPT_INVALID_CODE) {
                scriptCode = s;
            } else if (s != scriptCode) {
                break;
            }
        }
        ++limit;
    }

    // Return TRUE even if the entire text is COMMON / INHERITED, in
    // which case scriptCode will be USCRIPT_INVALID_CODE.
    return TRUE;
}

void ScriptRunIterator::adjustLimit(int32_t delta) {
    limit += delta;
    textLimit += delta;
}

//------------------------------------------------------------
// AnyTransliterator

const char AnyTransliterator::fgClassID=0;

AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript) 
{
    cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
    uhash_setValueDeleter(cache, _deleteTransliterator);

    target = theTarget;
    if (theVariant.length() > 0) {
        target.append(VARIANT_SEP).append(theVariant);
    }
}

AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}

/**
 * Copy constructor.
 */
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    Transliterator(o),
    target(o.target),
    targetScript(o.targetScript)
{
    // Don't copy the cache contents
    UErrorCode ec = U_ZERO_ERROR;
    cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
    uhash_setValueDeleter(cache, _deleteTransliterator);
}

/**
 * Transliterator API.
 */
Transliterator* AnyTransliterator::clone() const {
    return new AnyTransliterator(*this);
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                            UBool isIncremental) const {
    int32_t allStart = pos.start;
    int32_t allLimit = pos.limit;

    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);

    while (it.next()) {
        // Ignore runs in the ante context
        if (it.limit <= allStart) continue;

        // Try to instantiate transliterator from it.scriptCode to
        // our target or target/variant
        Transliterator* t = getTransliterator(it.scriptCode);

        if (t == NULL) {
            // We have no transliterator.  Do nothing, but keep
            // pos.start up to date.
            pos.start = it.limit;
            continue;
        }

        // If the run end is before the transliteration limit, do
        // a non-incremental transliteration.  Otherwise do an
        // incremental one.
        UBool incremental = isIncremental && (it.limit >= allLimit);

        pos.start = uprv_max(allStart, it.start);
        pos.limit = uprv_min(allLimit, it.limit);
        int32_t limit = pos.limit;
        t->filteredTransliterate(text, pos, incremental);
        int32_t delta = pos.limit - limit;
        allLimit += delta;
        it.adjustLimit(delta);

        // We're done if we enter the post context
        if (it.limit >= allLimit) break;
    }

    // Restore limit.  pos.start is fine where the last transliterator
    // left it, or at the end of the last run.
    pos.limit = allLimit;
}

Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getName(source), "");
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);

        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;

            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT).append(target);
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            uhash_iput(cache, (int32_t) source, t, &ec);
        }
    }

    return t;
}

/**
 * Return the script code for a given name, or -1 if not found.
 */
UScriptCode AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
    char buf[128];
    UScriptCode code;
    UErrorCode ec = U_ZERO_ERROR;

    name.extract(0, 128, buf, 128, "");
    if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
        U_FAILURE(ec)) {
        code = USCRIPT_INVALID_CODE;
    }
    return code;
}

/**
 * Registers standard transliterators with the system.  Called by
 * Transliterator during initialization.  Scan all current targets and
 * register those that are scripts T as Any-T/V.
 */
void AnyTransliterator::registerIDs() {

    UErrorCode ec;
    Hashtable seen(TRUE);

    int32_t sourceCount = Transliterator::_countAvailableSources();
    for (int32_t s=0; s<sourceCount; ++s) {
        UnicodeString source;
        Transliterator::_getAvailableSource(s, source);

        // Ignore the "Any" source
        if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;

        int32_t targetCount = Transliterator::_countAvailableTargets(source);
        for (int32_t t=0; t<targetCount; ++t) {
            UnicodeString target;
            Transliterator::_getAvailableTarget(t, source, target);

            // Only process each target once
            if (seen.geti(target) != 0) continue;
            ec = U_ZERO_ERROR;
            seen.puti(target, 1, ec);

            // Get the script code for the target.  If not a script, ignore.
            UScriptCode targetScript = scriptNameToCode(target);
            if (targetScript == USCRIPT_INVALID_CODE) continue;

            int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
            // assert(variantCount >= 1);
            for (int32_t v=0; v<variantCount; ++v) {
                UnicodeString variant;
                Transliterator::_getAvailableVariant(v, source, target, variant);

                UnicodeString id;
                TransliteratorIDParser::STVtoID(ANY, target, variant, id);
                ec = U_ZERO_ERROR;
                AnyTransliterator* t = new AnyTransliterator(id, target, variant,
                                                             targetScript, ec);
                if (U_FAILURE(ec)) {
                    delete t;
                } else {
                    Transliterator::_registerInstance(t);
                    Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
                }
            }
        }
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof

--- NEW FILE: anytrans.h ---
/*
*****************************************************************
* Copyright (c) 2002, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* $Source: /usr/local/cvsroot/icu-sword/source/i18n/anytrans.h,v $ 
* $Revision: 1.1 $
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/
#ifndef _ANYTRANS_H_
#define _ANYTRANS_H_

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"
#include "unicode/uscript.h"
#include "uhash.h"

U_NAMESPACE_BEGIN

/**
 * A transliterator named Any-T or Any-T/V, where T is the target
 * script and V is the optional variant, that uses multiple
 * transliterators, all going to T or T/V, all with script sources.
 * The target must be a script.  It partitions text into runs of the
 * same script, and then based on the script of each run,
 * transliterates from that script to the given target or
 * target/variant.  Adjacent COMMON or INHERITED script characters are
 * included in each run.
 *
 * @author Alan Liu
 */
class U_I18N_API AnyTransliterator : public Transliterator {

    /**
     * Cache mapping UScriptCode values to Transliterator*.
     */
    UHashtable* cache;

    /**
     * The target or target/variant string.
     */
    UnicodeString target;

    /**
     * The target script code.  Never USCRIPT_INVALID_CODE.
     */
    UScriptCode targetScript;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;

public:

    /**
     * Destructor.
     */
    virtual ~AnyTransliterator();

    /**
     * Copy constructor.
     */
    AnyTransliterator(const AnyTransliterator&);

    /**
     * Transliterator API.
     */
    Transliterator* clone() const;

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& index,
                                     UBool incremental) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

private:

    /**
     * Private constructor
     * @param id the ID of the form S-T or S-T/V, where T is theTarget
     * and V is theVariant.  Must not be empty.
     * @param theTarget the target name.  Must not be empty, and must
     * name a script corresponding to theTargetScript.
     * @param theVariant the variant name, or the empty string if
     * there is no variant
     * @param theTargetScript the script code corresponding to
     * theTarget.
     * @param ec error code, fails if the internal hashtable cannot be
     * allocated
     */
    AnyTransliterator(const UnicodeString& id,
                      const UnicodeString& theTarget,
                      const UnicodeString& theVariant,
                      UScriptCode theTargetScript,
                      UErrorCode& ec);

    /**
     * Returns a transliterator from the given source to our target or
     * target/variant.  Returns NULL if the source is the same as our
     * target script, or if the source is USCRIPT_INVALID_CODE.
     * Caches the result and returns the same transliterator the next
     * time.  The caller does NOT own the result and must not delete
     * it.
     */
    Transliterator* getTransliterator(UScriptCode source) const;

    /**
     * Registers standard transliterators with the system.  Called by
     * Transliterator during initialization.
     */
    static void registerIDs();

    friend class Transliterator; // for registerIDs()

    /**
     * Return the script code for a given name, or
     * USCRIPT_INVALID_CODE if not found.
     */
    static UScriptCode scriptNameToCode(const UnicodeString& name);
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: buddhcal.cpp ---
/*
*******************************************************************************
* Copyright (C) 2003, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* File BUDDHCAL.CPP
*
* Modification History:
*  05/13/2003    srl     copied from gregocal.cpp
*
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "buddhcal.h"
#include "unicode/gregocal.h"
#include "mutex.h"
#include <float.h>

U_NAMESPACE_BEGIN

const char BuddhistCalendar::fgClassID = 0; // Value is irrelevant

static const int32_t kMaxEra = 0; // only 1 era

static const int32_t kBuddhistEraStart = -543;  // 544 BC (Gregorian)

static const int32_t kGregorianEpoch = 1970; 

BuddhistCalendar::BuddhistCalendar(const Locale& aLocale, UErrorCode& success)
  :   GregorianCalendar(aLocale, success)
{
    setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly.
}

BuddhistCalendar::~BuddhistCalendar()
{
}

BuddhistCalendar::BuddhistCalendar(const BuddhistCalendar& source)
  : GregorianCalendar(source)
{
}

BuddhistCalendar& BuddhistCalendar::operator= ( const BuddhistCalendar& right)
{
  GregorianCalendar::operator=(right);
  return *this;
}

Calendar* BuddhistCalendar::clone(void) const
{
  return new BuddhistCalendar(*this);
}

const char *BuddhistCalendar::getType() const
{
  return "buddhist";
}

int32_t
BuddhistCalendar::getMaximum(UCalendarDateFields field) const
{
  if(field == UCAL_ERA) {
    return kMaxEra;
  } else {
    return GregorianCalendar::getMaximum(field);
  }
}

int32_t
BuddhistCalendar::getLeastMaximum(UCalendarDateFields field) const
{
  if(field == UCAL_ERA) {
    return kMaxEra;
  } else {
    return GregorianCalendar::getLeastMaximum(field);
  }
}

int32_t
BuddhistCalendar::monthLength(int32_t month, int32_t year) const
{
  return GregorianCalendar::monthLength(month,year);
}

int32_t
BuddhistCalendar::monthLength(int32_t month) const
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t year = internalGet(UCAL_YEAR);
    // ignore era
    return GregorianCalendar::monthLength(month, getGregorianYear(status));
}

int32_t BuddhistCalendar::internalGetEra() const
{
    return isSet(UCAL_ERA) ? internalGet(UCAL_ERA) : BE;  
}

int32_t
BuddhistCalendar::getGregorianYear(UErrorCode &status)  const
{
  int32_t year = (fStamp[UCAL_YEAR] != kUnset) ? internalGet(UCAL_YEAR) : kGregorianEpoch+kBuddhistEraStart;
  int32_t era = BE;
  if (fStamp[UCAL_ERA] != kUnset) {
    era = internalGet(UCAL_ERA);
    if (era != BE) {
      status = U_ILLEGAL_ARGUMENT_ERROR;
      return kGregorianEpoch + kBuddhistEraStart;
    }
  }
  return year + kBuddhistEraStart;
}

void BuddhistCalendar::timeToFields(UDate theTime, UBool quick, UErrorCode& status)
{
  GregorianCalendar::timeToFields(theTime, quick, status);

  int32_t era = internalGet(UCAL_ERA);
  int32_t year = internalGet(UCAL_YEAR);

  if(era == GregorianCalendar::BC) {
    year = 1-year;
    era = BuddhistCalendar::BE;
  } else if(era == GregorianCalendar::AD) {
    era = BuddhistCalendar::BE;
  } else {
    status = U_INTERNAL_PROGRAM_ERROR;
  }

  year = year - kBuddhistEraStart;

  internalSet(UCAL_ERA, era);
  internalSet(UCAL_YEAR, year);
}

void BuddhistCalendar::add(UCalendarDateFields field, int32_t amount, UErrorCode& status)
{
    if (U_FAILURE(status)) 
        return;

    if (amount == 0) 
        return;   // Do nothing!

    if(field == UCAL_YEAR /* || field == UCAL_YEAR_WOY */) {
        int32_t year = internalGet(field);
        int32_t era = internalGetEra();

        year += amount;

        set(field,year);
        pinDayOfMonth();
    } else {
      GregorianCalendar::add(field,amount,status);
    }
}

// default century
const UDate     BuddhistCalendar::fgSystemDefaultCentury        = DBL_MIN;
const int32_t   BuddhistCalendar::fgSystemDefaultCenturyYear    = -1;

UDate           BuddhistCalendar::fgSystemDefaultCenturyStart       = DBL_MIN;
int32_t         BuddhistCalendar::fgSystemDefaultCenturyStartYear   = -1;

UBool BuddhistCalendar::haveDefaultCentury() const
{
  return TRUE;
}

UDate BuddhistCalendar::defaultCenturyStart() const
{
  return internalGetDefaultCenturyStart();
}

int32_t BuddhistCalendar::defaultCenturyStartYear() const
{
  return internalGetDefaultCenturyStartYear();
}

UDate
BuddhistCalendar::internalGetDefaultCenturyStart() const
{
  // lazy-evaluate systemDefaultCenturyStart
  UBool needsUpdate;
  { 
    Mutex m;
    needsUpdate = (fgSystemDefaultCenturyStart == fgSystemDefaultCentury);
  }

  if (needsUpdate) {
    initializeSystemDefaultCentury();
  }

  // use defaultCenturyStart unless it's the flag value;
  // then use systemDefaultCenturyStart

  return fgSystemDefaultCenturyStart;
}

int32_t
BuddhistCalendar::internalGetDefaultCenturyStartYear() const
{
  // lazy-evaluate systemDefaultCenturyStartYear
  UBool needsUpdate;
  { 
    Mutex m;
    needsUpdate = (fgSystemDefaultCenturyStart == fgSystemDefaultCentury);
  }

  if (needsUpdate) {
    initializeSystemDefaultCentury();
  }

  // use defaultCenturyStart unless it's the flag value;
  // then use systemDefaultCenturyStartYear

  return    fgSystemDefaultCenturyStartYear;
}

void
BuddhistCalendar::initializeSystemDefaultCentury()
{
  // initialize systemDefaultCentury and systemDefaultCenturyYear based
  // on the current time.  They'll be set to 80 years before
  // the current time.
  // No point in locking as it should be idempotent.
  if (fgSystemDefaultCenturyStart == fgSystemDefaultCentury)
  {
    UErrorCode status = U_ZERO_ERROR;
    Calendar *calendar = new BuddhistCalendar(Locale("th_TH_TRADITIONAL"),status);
    if (calendar != NULL && U_SUCCESS(status))
    {
      calendar->setTime(Calendar::getNow(), status);
      calendar->add(UCAL_YEAR, -80, status);
      UDate    newStart =  calendar->getTime(status);
      int32_t  newYear  =  calendar->get(UCAL_YEAR, status);
      {
        Mutex m;
        fgSystemDefaultCenturyStart = newStart;
        fgSystemDefaultCenturyStartYear = newYear;
      }
      delete calendar;
    }
    // We have no recourse upon failure unless we want to propagate the failure
    // out.
  }
}

U_NAMESPACE_END

#endif

--- NEW FILE: buddhcal.h ---
/*
* Copyright (C) 2003, International Business Machines Corporation and others. All Rights Reserved.
********************************************************************************
*
* File BUDDHCAL.H
*
* Modification History:
*
*   Date        Name        Description
*   05/13/2003  srl          copied from gregocal.h
********************************************************************************
*/

#ifndef BUDDHCAL_H
#define BUDDHCAL_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "unicode/calendar.h"
#include "unicode/gregocal.h"

U_NAMESPACE_BEGIN

/**
 * Concrete class which provides the Buddhist calendar.
 * 
 * <code>BuddhistCalendar</code> is a subclass of <code>GregorianCalendar</code>
 * that numbers years since the birth of the Buddha. This is the civil calendar
 * in some predominantly Buddhist countries such as Thailand, and it is used for
 * religious purposes elsewhere.
 * 
 * The Buddhist calendar is identical to the Gregorian calendar in all respects
 * except for the year and era. Years are numbered since the birth of the
 * Buddha in 543 BC (Gregorian), so that 1 AD (Gregorian) is equivalent to 544
 * BE (Buddhist Era) and 1998 AD is 2541 BE.
 * 
 * The Buddhist Calendar has only one allowable era: <code>BE</code>. If the
 * calendar is not in lenient mode (see <code>setLenient</code>), dates before
 * 1/1/1 BE are rejected as an illegal argument.
 * 
 * @internal
 */
class U_I18N_API BuddhistCalendar : public GregorianCalendar {
public:

    /**
     * Useful constants for BuddhistCalendar.  Only one Era.
     * @internal
     */
    enum EEras {
       BE
    };

    /**
     * Constructs a BuddhistCalendar based on the current time in the default time zone
     * with the given locale.
     *
     * @param aLocale  The given locale.
     * @param success  Indicates the status of BuddhistCalendar object construction.
     *                 Returns U_ZERO_ERROR if constructed successfully.
     * @stable ICU 2.0
     */
    BuddhistCalendar(const Locale& aLocale, UErrorCode& success);

    /**
     * Destructor
     * @internal
     */
    virtual ~BuddhistCalendar();

    /**
     * Copy constructor
     * @param source    the object to be copied.
     * @internal
     */
    BuddhistCalendar(const BuddhistCalendar& source);

    /**
     * Default assignment operator
     * @param right    the object to be copied.
     * @internal
     */
    BuddhistCalendar& operator=(const BuddhistCalendar& right);

    /**
     * Create and return a polymorphic copy of this calendar.
     * @return    return a polymorphic copy of this calendar.
     * @internal
     */
    virtual Calendar* clone(void) const;

public:

    /**
     * Override Calendar Returns a unique class ID POLYMORPHICALLY. Pure virtual
     * override. This method is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI. Polymorphic operator==() and clone() methods call
     * this method.
     *
     * @return   The class ID for this object. All objects of a given class have the
     *           same class ID. Objects of other classes have different class IDs.
     * @internal
     */
    virtual UClassID getDynamicClassID(void) const;

    /**
     * Return the class ID for this class. This is useful only for comparing to a return
     * value from getDynamicClassID(). For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return   The class ID for all objects of this class.
     * @internal
     */
    static inline UClassID getStaticClassID(void);

    /**
     * return the calendar type, "buddhist".
     *
     * @return calendar type
     * @internal
     */
    virtual const char * getType() const;

    /**
     * (Overrides Calendar) UDate Arithmetic function. Adds the specified (signed) amount
     * of time to the given time field, based on the calendar's rules.  For more
     * information, see the documentation for Calendar::add().
     *
     * @param field   The time field.
     * @param amount  The amount of date or time to be added to the field.
     * @param status  Output param set to success/failure code on exit. If any value
     *                previously set in the time field is invalid, this will be set to
     *                an error status.
     * @draft ICU 2.6.
     */
    virtual void add(UCalendarDateFields field, int32_t amount, UErrorCode& status);

    /**
     * API overrides
     * @private
     */
    int32_t getMaximum(UCalendarDateFields field) const;
    int32_t getLeastMaximum(UCalendarDateFields field) const;
    inline virtual int32_t getMaximum(EDateFields field) const { return getMaximum((UCalendarDateFields)field); }
    inline virtual int32_t getLeastMaximum(EDateFields field) const { return getLeastMaximum((UCalendarDateFields)field); }
    inline virtual void add(EDateFields field, int32_t amount, UErrorCode& status) { add((UCalendarDateFields)field, amount, status); }

private:
    BuddhistCalendar(); // default constructor not implemented

    static const char fgClassID;

 protected:
    virtual int32_t monthLength(int32_t month) const; 
    virtual int32_t monthLength(int32_t month, int32_t year) const; 
    int32_t getGregorianYear(UErrorCode& status) const;

    virtual int32_t internalGetEra() const;
    virtual void timeToFields(UDate theTime, UBool quick, UErrorCode& status);
    virtual UBool haveDefaultCentury() const;
    virtual UDate defaultCenturyStart() const;
    virtual int32_t defaultCenturyStartYear() const;

 private: // default century stuff.
    /**
     * The system maintains a static default century start date.  This is initialized
     * the first time it is used.  Before then, it is set to SYSTEM_DEFAULT_CENTURY to
     * indicate an uninitialized state.  Once the system default century date and year
     * are set, they do not change.
     */
    static UDate         fgSystemDefaultCenturyStart;

    /**
     * See documentation for systemDefaultCenturyStart.
     */
    static int32_t          fgSystemDefaultCenturyStartYear;

    /**
     * Default value that indicates the defaultCenturyStartYear is unitialized
     */
    static const int32_t    fgSystemDefaultCenturyYear;

    static const UDate        fgSystemDefaultCentury;

    /**
     * Returns the beginning date of the 100-year window that dates with 2-digit years
     * are considered to fall within.
     * @return    the beginning date of the 100-year window that dates with 2-digit years
     *            are considered to fall within.
     */
    UDate         internalGetDefaultCenturyStart(void) const;

    /**
     * Returns the first year of the 100-year window that dates with 2-digit years
     * are considered to fall within.
     * @return    the first year of the 100-year window that dates with 2-digit years
     *            are considered to fall within.
     */
    int32_t          internalGetDefaultCenturyStartYear(void) const;

    /**
     * Initializes the 100-year window that dates with 2-digit years are considered
     * to fall within so that its start date is 80 years before the current time.
     */
    static void  initializeSystemDefaultCentury(void);
};

inline UClassID
BuddhistCalendar::getStaticClassID(void)
{ return (UClassID)&fgClassID; }

inline UClassID
BuddhistCalendar::getDynamicClassID(void) const
{ return BuddhistCalendar::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif // _GREGOCAL
//eof

--- NEW FILE: cpdtrans.h ---
/*
**********************************************************************
*   Copyright (C) 1999-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef CPDTRANS_H
#define CPDTRANS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"

U_NAMESPACE_BEGIN

class U_I18N_API UVector;
class TransliteratorRegistry;

/**
 * A transliterator that is composed of two or more other
 * transliterator objects linked together. For example, if one
 * transliterator transliterates from script A to script B, and
 * another transliterates from script B to script C, the two may be
 * combined to form a new transliterator from A to C.
 *
 * Composed transliterators may not behave as expected. For
 * example, inverses may not combine to form the identity
 * transliterator. See the class documentation for {@link
 * Transliterator} for details.
 *
 * @author Alan Liu
 * @internal Use transliterator factory methods instead since this class will be removed in that release.
 */
class U_I18N_API CompoundTransliterator : public Transliterator {

    Transliterator** trans;

    int32_t count;

    /**
     * For compound RBTs (those with an ::id block before and/or after
     * the main rule block) we record the index of the RBT here.
     * Otherwise, this should have a value of -1.  We need this
     * information to implement toRules().
     */
    int32_t compoundRBTIndex;

public:

    /**
     * Constructs a new compound transliterator given an array of
     * transliterators.  The array of transliterators may be of any
     * length, including zero or one, however, useful compound
     * transliterators have at least two components.
     * @param transliterators array of <code>Transliterator</code>
     * objects
     * @param transliteratorCount The number of
     * <code>Transliterator</code> objects in transliterators.
     * @param adoptedFilter the filter.  Any character for which
     * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    CompoundTransliterator(Transliterator* const transliterators[],
                           int32_t transliteratorCount,
                           UnicodeFilter* adoptedFilter = 0);

    /**
     * Constructs a new compound transliterator.
     * @param id compound ID
     * @param dir either UTRANS_FORWARD or UTRANS_REVERSE
     * @param adoptedFilter a global filter for this compound transliterator
     * or NULL
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    CompoundTransliterator(const UnicodeString& id,
                           UTransDirection dir,
                           UnicodeFilter* adoptedFilter,
                           UParseError& parseError,
                           UErrorCode& status);

    /**
     * Constructs a new compound transliterator in the FORWARD
     * direction with a NULL filter.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    CompoundTransliterator(const UnicodeString& id,
                           UParseError& parseError,
                           UErrorCode& status);
    /**
     * Destructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual ~CompoundTransliterator();

    /**
     * Copy constructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    CompoundTransliterator(const CompoundTransliterator&);

    /**
     * Assignment operator.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    CompoundTransliterator& operator=(const CompoundTransliterator&);

    /**
     * Transliterator API.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    Transliterator* clone(void) const;

    /**
     * Returns the number of transliterators in this chain.
     * @return number of transliterators in this chain.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual int32_t getCount(void) const;

    /**
     * Returns the transliterator at the given index in this chain.
     * @param index index into chain, from 0 to <code>getCount() - 1</code>
     * @return transliterator at the given index
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual const Transliterator& getTransliterator(int32_t index) const;

    /**
     * Sets the transliterators.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    void setTransliterators(Transliterator* const transliterators[],
                            int32_t count);

    /**
     * Adopts the transliterators.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    void adoptTransliterators(Transliterator* adoptedTransliterators[],
                              int32_t count);

    /**
     * Override Transliterator:
     * Create a rule string that can be passed to createFromRules()
     * to recreate this transliterator.
     * @param result the string to receive the rules.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \uxxxx or
     * \Uxxxxxxxx.  Unprintable characters are those other than
     * U+000A, U+0020..U+007E.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual UnicodeString& toRules(UnicodeString& result,
                                   UBool escapeUnprintable) const;

 protected:
    /**
     * Implement Transliterator framework
     */
    virtual void handleGetSourceSet(UnicodeSet& result) const;

 public:
    /**
     * Override Transliterator framework
     */
    virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;

protected:
    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& index,
                                     UBool incremental) const;

public:

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

private:

    friend class Transliterator;
    friend class TransliteratorAlias; // to access private ct

    /**
     * Private constructor for compound RBTs.  Construct a compound
     * transliterator using the given idBlock, with the adoptedTrans
     * inserted at the idSplitPoint.
     */
    CompoundTransliterator(const UnicodeString& ID,
                           const UnicodeString& idBlock,
                           int32_t idSplitPoint,
                           Transliterator *adoptedTrans,
                           UErrorCode& status);

    /**
     * Private constructor for Transliterator.
     */
    CompoundTransliterator(UVector& list,
                           UParseError& parseError,
                           UErrorCode& status);

    void init(const UnicodeString& id,
              UTransDirection direction,
              int32_t idSplitPoint,
              Transliterator *adoptedRbt,
              UBool fixReverseID,
              UErrorCode& status);

    void init(UVector& list,
              UTransDirection direction,
              UBool fixReverseID,
              UErrorCode& status);

    /**
     * Return the IDs of the given list of transliterators, concatenated
     * with ';' delimiting them.  Equivalent to the perlish expression
     * join(';', map($_.getID(), transliterators).
     */
    UnicodeString joinIDs(Transliterator* const transliterators[],
                          int32_t transCount);

    void freeTransliterators(void);

    void computeMaximumContextLength(void);

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline UClassID
CompoundTransliterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
CompoundTransliterator::getDynamicClassID() const
{ return CompoundTransliterator::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: funcrepl.cpp ---
/*
**********************************************************************
*   Copyright (c) 2002, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   02/04/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"
#include "unicode/uniset.h"
#include "funcrepl.h"

static const UChar AMPERSAND = 38; // '&'
static const UChar OPEN[]    = {40,32,0}; // "( "
static const UChar CLOSE[]   = {32,41,0}; // " )"

U_NAMESPACE_BEGIN

const char FunctionReplacer::fgClassID=0;

/**
 * Construct a replacer that takes the output of the given
 * replacer, passes it through the given transliterator, and emits
 * the result as output.
 */
FunctionReplacer::FunctionReplacer(Transliterator* adoptedTranslit,
                                   UnicodeFunctor* adoptedReplacer) {
    translit = adoptedTranslit;
    replacer = adoptedReplacer;
}

/**
 * Copy constructor.
 */
FunctionReplacer::FunctionReplacer(const FunctionReplacer& other) {
    translit = other.translit->clone();
    replacer = other.replacer->clone();
}

/**
 * Destructor
 */
FunctionReplacer::~FunctionReplacer() {
    delete translit;
    delete replacer;
}

/**
 * Implement UnicodeFunctor
 */
UnicodeFunctor* FunctionReplacer::clone() const {
    return new FunctionReplacer(*this);
}

/**
 * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
 * and return the pointer.
 */
UnicodeReplacer* FunctionReplacer::toReplacer() const {
    return (UnicodeReplacer*) this;
}

/**
 * UnicodeReplacer API
 */
int32_t FunctionReplacer::replace(Replaceable& text,
                                  int32_t start,
                                  int32_t limit,
                                  int32_t& cursor) {

    // First delegate to subordinate replacer
    int32_t len = replacer->toReplacer()->replace(text, start, limit, cursor);
    limit = start + len;

    // Now transliterate
    limit = translit->transliterate(text, start, limit);

    return limit - start;
}

/**
 * UnicodeReplacer API
 */
UnicodeString& FunctionReplacer::toReplacerPattern(UnicodeString& rule,
                                                   UBool escapeUnprintable) const {
    UnicodeString str;
    rule.truncate(0);
    rule.append(AMPERSAND);
    rule.append(translit->getID());
    rule.append(OPEN);
    rule.append(replacer->toReplacer()->toReplacerPattern(str, escapeUnprintable));
    rule.append(CLOSE);
    return rule;
}

/**
 * Implement UnicodeReplacer
 */
void FunctionReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UnicodeSet set;
    toUnionTo.addAll(translit->getTargetSet(set));
}

/**
 * UnicodeFunctor API
 */
void FunctionReplacer::setData(const TransliterationRuleData* d) {
    replacer->setData(d);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof

--- NEW FILE: funcrepl.h ---
/*
**********************************************************************
*   Copyright (c) 2002, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   02/04/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unifunct.h"
#include "unicode/unirepl.h"

U_NAMESPACE_BEGIN

class Transliterator;

/**
 * A replacer that calls a transliterator to generate its output text.
 * The input text to the transliterator is the output of another
 * UnicodeReplacer object.  That is, this replacer wraps another
 * replacer with a transliterator.
 *
 * Added U_I18N_API to make the definition of a global operator delete work.
 * See Jitterbug 2581. markus 2002dec17
 *
 * @author Alan Liu
 */
class U_I18N_API FunctionReplacer : public UnicodeFunctor, public UnicodeReplacer {

 private:

    /**
     * The transliterator.  Must not be null.  OWNED.
     */
    Transliterator* translit;

    /**
     * The replacer object.  This generates text that is then
     * processed by 'translit'.  Must not be null.  OWNED.
     */
    UnicodeFunctor* replacer;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;

 public:

    /**
     * Construct a replacer that takes the output of the given
     * replacer, passes it through the given transliterator, and emits
     * the result as output.
     */
    FunctionReplacer(Transliterator* adoptedTranslit,
                     UnicodeFunctor* adoptedReplacer);

    /**
     * Copy constructor.
     */
    FunctionReplacer(const FunctionReplacer& other);

    /**
     * Destructor
     */
    virtual ~FunctionReplacer();

    /**
     * Implement UnicodeFunctor
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * UnicodeReplacer API
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * UnicodeReplacer API
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& rule,
                                             UBool escapeUnprintable) const;

    /**
     * Implement UnicodeReplacer
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

    /**
     * UnicodeFunctor API
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof

--- NEW FILE: hextouni.h ---
/*
**********************************************************************
*   Copyright (C) 1999-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef HEXTOUNI_H
#define HEXTOUNI_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"

U_NAMESPACE_BEGIN

/**
 * A transliterator that converts from hexadecimal Unicode escape
 * sequences to the characters they represent.  For example, "U+0040"
 * and '\u0040'.  A default HexToUnicodeTransliterator recognizes the
 * prefixes "U+", "u+", "&#92;U", and "&#92;u".  Hex values may be
 * upper- or lowercase.  By calling the applyPattern() method, one
 * or more custom prefix/suffix pairs may be specified.  See
 * applyPattern() for details.
 *
 * @author Alan Liu
 * @internal Use transliterator factory methods instead since this class will be removed in that release.
 */
class U_I18N_API HexToUnicodeTransliterator : public Transliterator {

    /**
     * ID for this transliterator.
     */
    static const char _ID[];

    /**
     * The pattern used by the default constructor
     */
    static const UChar DEFAULT_PATTERN[];

    // Character constants defined here to avoid ASCII dependency
    enum {
        SEMICOLON = 0x003B, // ';'
        ZERO      = 0x0030, // '0'
        POUND     = 0x0023, // '#'
        BACKSLASH = 0x005C  // '\\'
    };

    /**
     * The pattern for this transliterator
     */
    UnicodeString pattern;

    /**
     * The processed pattern specification.  See applyPattern() for
     * details.
     */
    UnicodeString affixes;

    /**
     * The number of different affix sets in affixes.
     */
    int32_t affixCount;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;

public:

    /**
     * Constructs a transliterator that recognizes the standard
     * prefixes "&#92;u", "&#92;U", "u+", and "U+", each with no
     * suffix.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter = 0);

    /**
     * Constructs a custom transliterator with the given pattern.
     * @see #applyPattern
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    HexToUnicodeTransliterator(const UnicodeString& pattern,
                               UErrorCode& status);

    /**
     * Constructs a custom transliterator with the given pattern
     * and filter.
     * @see #applyPattern
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    HexToUnicodeTransliterator(const UnicodeString& pattern,
                               UnicodeFilter* adoptedFilter,
                               UErrorCode& status);

    /**
     * Destructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual ~HexToUnicodeTransliterator();

    /**
     * Copy constructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    HexToUnicodeTransliterator(const HexToUnicodeTransliterator&);

    /**
     * Assignment operator.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    HexToUnicodeTransliterator& operator=(const HexToUnicodeTransliterator&);

    /**
     * Transliterator API.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    Transliterator* clone(void) const;

    /**
     * Set the patterns recognized by this transliterator.  One or
     * more patterns may be specified, separated by semicolons (';').
     * Each pattern contains zero or more prefix characters, one or
     * more digit characters, and zero or more suffix characters.  The
     * digit characters indicates optional digits ('#') followed by
     * required digits ('0').  The total number of digits cannot
     * exceed 4, and must be at least 1 required digit.  Use a
     * backslash ('\\') to escape any of the special characters.  An
     * empty pattern is allowed; it specifies a transliterator that
     * does nothing.
     *
     * <p>Example: "U+0000;<###0>" specifies two patterns.  The first
     * has a prefix of "U+", exactly four digits, and no suffix.  The
     * second has a prefix of "<", between one and four digits, and a
     * suffix of ">".
     *
     * <p><pre>
     * pattern := spec | ( pattern ';' spec )
     * spec := prefix-char* digit-spec suffix-char*
     * digit-spec := '#'* '0'+
     * prefix-char := [^special-char] | '\\' special-char
     * suffix-char := [^special-char] | '\\' special-char
     * special-char := ';' | '0' | '#' | '\\'
     * </pre>
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    void applyPattern(const UnicodeString& thePattern, UErrorCode& status);

    /**
     * Return this transliterator's pattern.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    const UnicodeString& toPattern(void) const;

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
                                     UBool isIncremental) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();
};

inline HexToUnicodeTransliterator::~HexToUnicodeTransliterator() {}

inline UClassID
HexToUnicodeTransliterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
HexToUnicodeTransliterator::getDynamicClassID() const
{ return HexToUnicodeTransliterator::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: i18n.vcproj ---
<?xml version="1.0" encoding = "Windows-1252"?>
<VisualStudioProject
	ProjectType="Visual C++"
	Version="7.00"
	Name="i18n"
	SccProjectName=""
	SccLocalPath="">
	<Platforms>
		<Platform
			Name="Win32"/>
	</Platforms>
	<Configurations>
		<Configuration
			Name="Release|Win32"
			OutputDirectory=".\..\..\lib"
			IntermediateDirectory=".\Release"
			ConfigurationType="2"
			UseOfMFC="0"
			ATLMinimizesCRunTimeLibraryUsage="FALSE"
[...1149 lines suppressed...]
					<Tool
						Name="VCCustomBuildTool"
						CommandLine="copy    $(InputPath)    ..\..\include\unicode
"
						Outputs="..\..\include\unicode\utrans.h"/>
				</FileConfiguration>
				<FileConfiguration
					Name="Debug|Win32">
					<Tool
						Name="VCCustomBuildTool"
						CommandLine="copy    $(InputPath)    ..\..\include\unicode
"
						Outputs="..\..\include\unicode\utrans.h"/>
				</FileConfiguration>
			</File>
		</Filter>
	</Files>
	<Globals>
	</Globals>
</VisualStudioProject>

--- NEW FILE: japancal.cpp ---
/*
*******************************************************************************
* Copyright (C) 2003, International Business Machines Corporation and         *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* File JAPANCAL.CPP
*
* Modification History:
*  05/16/2003    srl     copied from buddhcal.cpp
*
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "japancal.h"
#include "unicode/gregocal.h"

//#define U_DEBUG_JCAL

#ifdef U_DEBUG_JCAL
#include <stdio.h>
#endif

U_NAMESPACE_BEGIN

const char JapaneseCalendar::fgClassID = 0; // Value is irrelevant

//  Gregorian date of each emperor's ascension
//  Years are AD, months are 1-based.
static const struct { 
   int16_t year;
   int8_t  month;
   int8_t  day;
} kEraInfo[] =  {
      //  Year  Month Day
      {   645,    6, 19 },   // Taika   0
      {   650,    2, 15 },   // Hakuchi 1
      {   672,    1,  1 },   // Hakuho  2
      {   686,    7, 20 },   // Shucho  3
      {   701,    3, 21 },   // Taiho   4
      {   704,    5, 10 },   // Keiun   5
      {   708,    1, 11 },   // Wado    6
      {   715,    9,  2 },   // Reiki   7
      {   717,   11, 17 },   // Yoro    8
      {   724,    2,  4 },   // Jinki   9
      {   729,    8,  5 },   // Tempyo  10
      {   749,    4, 14 },   // Tempyo-kampo 11
      {   749,    7,  2 },   // Tempyo-shoho 12
      {   757,    8, 18 },   // Tempyo-hoji  13
      {   765,    1,  7 },   // Tempho-jingo 14
      {   767,    8, 16 },   // Jingo-keiun  15
      {   770,   10,  1 },   // Hoki         16
      {   781,    1,  1 },   // Ten-o        17
      {   782,    8, 19 },   // Enryaku      18
      {   806,    5, 18 },   // Daido        19
      {   810,    9, 19 },   // Konin        20
      {   824,    1,  5 },   // Tencho
      {   834,    1,  3 },   // Showa
      {   848,    6, 13 },   // Kajo
      {   851,    4, 28 },   // Ninju
      {   854,   11, 30 },   // Saiko
      {   857,    2, 21 },   // Tennan
      {   859,    4, 15 },   // Jogan
      {   877,    4, 16 },   // Genkei
      {   885,    2, 21 },   // Ninna
      {   889,    4, 27 },   // Kampyo       30
      {   898,    4, 26 },   // Shotai
      {   901,    7, 15 },   // Engi
      {   923,    4, 11 },   // Encho
      {   931,    4, 26 },   // Shohei
      {   938,    5, 22 },   // Tengyo
      {   947,    4, 22 },   // Tenryaku
      {   957,   10, 27 },   // Tentoku
      {   961,    2, 16 },   // Owa
      {   964,    7, 10 },   // Koho
      {   968,    8, 13 },   // Anna        40
      {   970,    3, 25 },   // Tenroku
      {   973,   12, 20 },   // Ten-en
      {   976,    7, 13 },   // Jogen
      {   978,   11, 29 },   // Tengen
      {   983,    4, 15 },   // Eikan
      {   985,    4, 27 },   // Kanna
      {   987,    4,  5 },   // Ei-en
      {   989,    8,  8 },   // Eiso
      {   990,   11,  7 },   // Shoryaku
      {   995,    2, 22 },   // Chotoku      50
      {   999,    1, 13 },   // Choho
      {  1004,    7, 20 },   // Kanko
      {  1012,   12, 25 },   // Chowa
      {  1017,    4, 23 },   // Kannin
      {  1021,    2,  2 },   // Jian
      {  1024,    7, 13 },   // Manju
      {  1028,    7, 25 },   // Chogen
      {  1037,    4, 21 },   // Choryaku
      {  1040,   11, 10 },   // Chokyu
      {  1044,   11, 24 },   // Kantoku      60
      {  1046,    4, 14 },   // Eisho
      {  1053,    1, 11 },   // Tengi
      {  1058,    8, 29 },   // Kohei
      {  1065,    8,  2 },   // Jiryaku
      {  1069,    4, 13 },   // Enkyu
      {  1074,    8, 23 },   // Shoho
      {  1077,   11, 17 },   // Shoryaku
      {  1081,    2, 10 },   // Eiho
      {  1084,    2,  7 },   // Otoku
      {  1087,    4,  7 },   // Kanji       70
      {  1094,   12, 15 },   // Kaho
      {  1096,   12, 17 },   // Eicho
      {  1097,   11, 21 },   // Shotoku
      {  1099,    8, 28 },   // Kowa
      {  1104,    2, 10 },   // Choji
      {  1106,    4,  9 },   // Kasho
      {  1108,    8,  3 },   // Tennin
      {  1110,    7, 13 },   // Ten-ei
      {  1113,    7, 13 },   // Eikyu
      {  1118,    4,  3 },   // Gen-ei      80
      {  1120,    4, 10 },   // Hoan
      {  1124,    4,  3 },   // Tenji
      {  1126,    1, 22 },   // Daiji
      {  1131,    1, 29 },   // Tensho
      {  1132,    8, 11 },   // Chosho
      {  1135,    4, 27 },   // Hoen
      {  1141,    7, 10 },   // Eiji
      {  1142,    4, 28 },   // Koji
      {  1144,    2, 23 },   // Tenyo
      {  1145,    7, 22 },   // Kyuan      90
      {  1151,    1, 26 },   // Ninpei
      {  1154,   10, 28 },   // Kyuju
      {  1156,    4, 27 },   // Hogen
      {  1159,    4, 20 },   // Heiji
      {  1160,    1, 10 },   // Eiryaku
      {  1161,    9,  4 },   // Oho
      {  1163,    3, 29 },   // Chokan
      {  1165,    6,  5 },   // Eiman
      {  1166,    8, 27 },   // Nin-an
      {  1169,    4,  8 },   // Kao       100
      {  1171,    4, 21 },   // Shoan
      {  1175,    7, 28 },   // Angen
      {  1177,    8,  4 },   // Jisho
      {  1181,    7, 14 },   // Yowa
      {  1182,    5, 27 },   // Juei
      {  1184,    4, 16 },   // Genryuku
      {  1185,    8, 14 },   // Bunji
      {  1190,    4, 11 },   // Kenkyu
      {  1199,    4, 27 },   // Shoji
      {  1201,    2, 13 },   // Kennin     110
      {  1204,    2, 20 },   // Genkyu
      {  1206,    4, 27 },   // Ken-ei
      {  1207,   10, 25 },   // Shogen
      {  1211,    3,  9 },   // Kenryaku
      {  1213,   12,  6 },   // Kenpo
      {  1219,    4, 12 },   // Shokyu
      {  1222,    4, 13 },   // Joo
      {  1224,   11, 20 },   // Gennin
      {  1225,    4, 20 },   // Karoku
      {  1227,   12, 10 },   // Antei      120
      {  1229,    3,  5 },   // Kanki
      {  1232,    4,  2 },   // Joei
      {  1233,    4, 15 },   // Tempuku
      {  1234,   11,  5 },   // Bunryaku
      {  1235,    9, 19 },   // Katei
      {  1238,   11, 23 },   // Ryakunin
      {  1239,    2,  7 },   // En-o
      {  1240,    7, 16 },   // Ninji
      {  1243,    2, 26 },   // Kangen
      {  1247,    2, 28 },   // Hoji      130
      {  1249,    3, 18 },   // Kencho
      {  1256,   10,  5 },   // Kogen
      {  1257,    3, 14 },   // Shoka
      {  1259,    3, 26 },   // Shogen
      {  1260,    4, 13 },   // Bun-o
      {  1261,    2, 20 },   // Kocho
      {  1264,    2, 28 },   // Bun-ei
      {  1275,    4, 25 },   // Kenji
      {  1278,    2, 29 },   // Koan
      {  1288,    4, 28 },   // Shoo      140
      {  1293,    8, 55 },   // Einin
      {  1299,    4, 25 },   // Shoan
      {  1302,   11, 21 },   // Kengen
      {  1303,    8,  5 },   // Kagen
      {  1306,   12, 14 },   // Tokuji
      {  1308,   10,  9 },   // Enkei
      {  1311,    4, 28 },   // Ocho
      {  1312,    3, 20 },   // Showa
      {  1317,    2,  3 },   // Bunpo
      {  1319,    4, 28 },   // Geno      150
      {  1321,    2, 23 },   // Genkyo
      {  1324,   12,  9 },   // Shochu
      {  1326,    4, 26 },   // Kareki
      {  1329,    8, 29 },   // Gentoku
      {  1331,    8,  9 },   // Genko
      {  1334,    1, 29 },   // Kemmu
      {  1336,    2, 29 },   // Engen
      {  1340,    4, 28 },   // Kokoku
      {  1346,   12,  8 },   // Shohei
      {  1370,    7, 24 },   // Kentoku       160
      {  1372,    4,  1 },   // Bunch\u0169
      {  1375,    5, 27 },   // Tenju
      {  1381,    2, 10 },   // Kowa
      {  1384,    4, 28 },   // Gench\u0169
      {  1384,    2, 27 },   // Meitoku
      {  1379,    3, 22 },   // Koryaku
      {  1387,    8, 23 },   // Kakei
      {  1389,    2,  9 },   // Koo
      {  1390,    3, 26 },   // Meitoku
      {  1394,    7,  5 },   // Oei           170
      {  1428,    4, 27 },   // Shocho
      {  1429,    9,  5 },   // Eikyo
      {  1441,    2, 17 },   // Kakitsu
      {  1444,    2,  5 },   // Bun-an
      {  1449,    7, 28 },   // Hotoku
      {  1452,    7, 25 },   // Kyotoku
      {  1455,    7, 25 },   // Kosho
      {  1457,    9, 28 },   // Choroku
      {  1460,   12, 21 },   // Kansho
      {  1466,    2, 28 },   // Bunsho        180
      {  1467,    3,  3 },   // Onin
      {  1469,    4, 28 },   // Bunmei
      {  1487,    7, 29 },   // Chokyo
      {  1489,    8, 21 },   // Entoku
      {  1492,    7, 19 },   // Meio
      {  1501,    2, 29 },   // Bunki
      {  1504,    2, 30 },   // Eisho
      {  1521,    8, 23 },   // Taiei
      {  1528,    8, 20 },   // Kyoroku
      {  1532,    7, 29 },   // Tenmon       190
      {  1555,   10, 23 },   // Koji
      {  1558,    2, 28 },   // Eiroku
      {  1570,    4, 23 },   // Genki
      {  1573,    7, 28 },   // Tensho
      {  1592,   12,  8 },   // Bunroku
      {  1596,   10, 27 },   // Keicho
      {  1615,    7, 13 },   // Genwa
      {  1624,    2, 30 },   // Kan-ei
      {  1644,   12, 16 },   // Shoho
      {  1648,    2, 15 },   // Keian       200
      {  1652,    9, 18 },   // Shoo
      {  1655,    4, 13 },   // Meiryaku
      {  1658,    7, 23 },   // Manji
      {  1661,    4, 25 },   // Kanbun
      {  1673,    9, 21 },   // Enpo
      {  1681,    9, 29 },   // Tenwa
      {  1684,    2, 21 },   // Jokyo
      {  1688,    9, 30 },   // Genroku
      {  1704,    3, 13 },   // Hoei
      {  1711,    4, 25 },   // Shotoku      210
      {  1716,    6, 22 },   // Kyoho
      {  1736,    4, 28 },   // Genbun
      {  1741,    2, 27 },   // Kanpo
      {  1744,    2, 21 },   // Enkyo
      {  1748,    7, 12 },   // Kan-en
      {  1751,   10, 27 },   // Horyaku
      {  1764,    6,  2 },   // Meiwa
      {  1772,   11, 16 },   // An-ei
      {  1781,    4,  2 },   // Tenmei
      {  1789,    1, 25 },   // Kansei      220
      {  1801,    2,  5 },   // Kyowa
      {  1804,    2, 11 },   // Bunka
      {  1818,    4, 22 },   // Bunsei
      {  1830,   12, 10 },   // Tenpo
      {  1844,   12,  2 },   // Koka
      {  1848,    2, 28 },   // Kaei
      {  1854,   11, 27 },   // Ansei
      {  1860,    3, 18 },   // Man-en
      {  1861,    2, 19 },   // Bunkyu
      {  1864,    2, 20 },   // Genji        230
      {  1865,    4,  7 },   // Keio     231
      {  1868,    9,  8 },   // Meiji    232
      {  1912,    7, 30 },   // Taisho   233
      {  1926,   12, 25 },   // Showa    234
      {  1989,    1,  8 }   // Heisei    235
     };

#define kEraCount (sizeof(kEraInfo)/sizeof(kEraInfo[0]))

const uint32_t JapaneseCalendar::kCurrentEra = (kEraCount-1);

JapaneseCalendar::JapaneseCalendar(const Locale& aLocale, UErrorCode& success)
  :   GregorianCalendar(aLocale, success)
{
    setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly.
}

JapaneseCalendar::~JapaneseCalendar()
{
}

JapaneseCalendar::JapaneseCalendar(const JapaneseCalendar& source)
  : GregorianCalendar(source)
{
}

JapaneseCalendar& JapaneseCalendar::operator= ( const JapaneseCalendar& right)
{
  GregorianCalendar::operator=(right);
  return *this;
}

Calendar* JapaneseCalendar::clone(void) const
{
  return new JapaneseCalendar(*this);
}

const char *JapaneseCalendar::getType() const
{
  return "japanese";
}

int32_t
JapaneseCalendar::getMaximum(UCalendarDateFields field) const
{
  if(field == UCAL_ERA) {
    return kCurrentEra;
  } else {
    return GregorianCalendar::getMaximum(field);
  }
}

int32_t
JapaneseCalendar::getLeastMaximum(UCalendarDateFields field) const
{
  if(field == UCAL_ERA) {
    return kCurrentEra;
  } else {
    return GregorianCalendar::getLeastMaximum(field);
  }
}

int32_t
JapaneseCalendar::monthLength(int32_t month, int32_t year) const
{
  return GregorianCalendar::monthLength(month,year);
}

int32_t
JapaneseCalendar::monthLength(int32_t month) const
{
  UErrorCode status = U_ZERO_ERROR;
  int32_t year = internalGet(UCAL_YEAR);
  // ignore era
  return GregorianCalendar::monthLength(month, getGregorianYear(status));
}

int32_t JapaneseCalendar::getDefaultMonthInYear() const
{
  UErrorCode status  = U_ZERO_ERROR;
  int32_t era = internalGetEra();
  int32_t year = getGregorianYear(status);
  // TODO do we assume we can trust 'era'?  What if it is denormalized?

  int32_t month = GregorianCalendar::getDefaultMonthInYear();

  // Find out if we are at the edge of an era

  if(year == kEraInfo[era].year) {
    // Yes, we're in the first year of this era.
    return kEraInfo[era].month-1;
  }

  if(era < kCurrentEra) { 
    // if we're not in the current era, 
    //    fail_here;
  }

  return month;
}

int32_t JapaneseCalendar::getDefaultDayInMonth(int32_t month) const
{
  UErrorCode status  = U_ZERO_ERROR;
  int32_t era = internalGetEra();
  int32_t year = getGregorianYear(status);
  int32_t day = GregorianCalendar::getDefaultDayInMonth(month);

  if(year == kEraInfo[era].year) {
    if(month == (kEraInfo[era].month-1)) {
      return kEraInfo[era].day;
    }
  }

  return day;
}

int32_t JapaneseCalendar::internalGetEra() const
{
    return isSet(UCAL_ERA) ? internalGet(UCAL_ERA) : kCurrentEra;  
}

int32_t
JapaneseCalendar::getGregorianYear(UErrorCode &status)  const
{
  int32_t year = (fStamp[UCAL_YEAR] != kUnset) ? internalGet(UCAL_YEAR) : 1; // default year = 1
  int32_t era = kCurrentEra;
  if (fStamp[UCAL_ERA] != kUnset) {
    era = internalGet(UCAL_ERA);
  }

  if ((era<0)||(era>kCurrentEra)) {
      status = U_ILLEGAL_ARGUMENT_ERROR;
      return 0 ;
  }
  return year + kEraInfo[era].year - 1;
}

void JapaneseCalendar::timeToFields(UDate theTime, UBool quick, UErrorCode& status)
{
  GregorianCalendar::timeToFields(theTime, quick, status);

  // these are the gregorian era and year
  int32_t era = internalGet(UCAL_ERA);
  int32_t year = internalGet(UCAL_YEAR);
  if(era == GregorianCalendar::BC) {
    year = 1 - year;
  }

  //  grego [e+y] -> e+y
  int32_t low = 0;

  // Short circuit for recent years.  Most modern computations will
  // occur in the current era and won't require the binary search.
  // Note that if the year is == the current era year, then we use
  // the binary search to handle the month/dom comparison.
#ifdef U_DEBUG_JCAL
  fprintf(stderr, "==  %d:%d \n", era, year);
#endif

  if (year > kEraInfo[kCurrentEra].year) {
    low = kCurrentEra;
#ifdef U_DEBUG_JCAL
    fprintf(stderr, " low=%d (special)\n", low);
#endif
  } else {
    // Binary search
    int32_t high = kEraCount;

#ifdef U_DEBUG_JCAL
    fprintf(stderr, " high=%d\n", high);
#endif
    while (low < high - 1) {
      int32_t i = (low + high) / 2;
      int32_t diff = year - kEraInfo[i].year;

#ifdef U_DEBUG_JCAL
      fprintf(stderr, "  d=%d   low=%d, high=%d. Considering %d:M%d D%d Y%d. { we are ?:M%d D%d Y%d }\n",
              diff,low, high, i, kEraInfo[i].month-1, kEraInfo[i].day,  kEraInfo[i].year, internalGet(UCAL_MONTH), internalGet(UCAL_DATE),year);
#endif

      // If years are the same, then compare the months, and if those
      // are the same, compare days of month.  In the ERAS array
      // months are 1-based for easier maintenance.
      if (diff == 0) {
        diff = internalGet(UCAL_MONTH) - (kEraInfo[i].month - 1);
#ifdef U_DEBUG_JCAL
              fprintf(stderr, "diff now %d (M)  = %d - %d - 1\n", diff, internalGet(UCAL_MONTH), kEraInfo[i].month);
#endif
        if (diff == 0) {
          diff = internalGet(UCAL_DATE) - kEraInfo[i].day;
#ifdef U_DEBUG_JCAL
          fprintf(stderr, "diff now %d (D)\n", diff);
#endif
        }
      }
      if (diff >= 0) {
        low = i;
      } else {
        high = i;
      }
#ifdef U_DEBUG_JCAL
      fprintf(stderr, ". low=%d, high=%d, i=%d, diff=%d.. %d\n", low, high, i, diff, year);
#endif

    }
  }

#ifdef U_DEBUG_JCAL
  fprintf(stderr, "  low[era]=%d,.. %d\n", low, year);
#endif
  // Now we've found the last era that starts before this date, so
  // adjust the year to count from the start of that era.  Note that
  // all dates before the first era will fall into the first era by
  // the algorithm.

  internalSet(UCAL_ERA, low);
  internalSet(UCAL_YEAR, year - kEraInfo[low].year + 1);
#ifdef U_DEBUG_JCAL
  fprintf(stderr, "  Set ERA=%d, year=%d\n", low, year-kEraInfo[low].year+1);
#endif

}

/*
  Disable pivoting 
*/
UBool JapaneseCalendar::haveDefaultCentury() const
{
  return FALSE;
}

UDate JapaneseCalendar::defaultCenturyStart() const
{
  return 0;// WRONG
}

int32_t JapaneseCalendar::defaultCenturyStartYear() const
{
  return 0;
}

U_NAMESPACE_END

#endif

--- NEW FILE: japancal.h ---
/*
* Copyright (C) 2003, International Business Machines Corporation and others. All Rights Reserved.
********************************************************************************
*
* File JAPANCAL.H
*
* Modification History:
*
*   Date        Name        Description
*   05/13/2003  srl         copied from gregocal.h
********************************************************************************
*/

#ifndef JAPANCAL_H
#define JAPANCAL_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "unicode/calendar.h"
#include "unicode/gregocal.h"

U_NAMESPACE_BEGIN

/**
 * Concrete class which provides the Japanese calendar.
 * 
 * <code>JapaneseCalendar</code> is a subclass of <code>GregorianCalendar</code>
 * that numbers years and eras based on the reigns of the Japanese emperors.
 * The Japanese calendar is identical to the Gregorian calendar in all respects
 * except for the year and era. The ascension of each emperor to the throne
 * begins a new era, and the years of that era are numbered starting with the
 * year of ascension as year 1.
 * 
 * Note that in the year of an imperial ascension, there are two possible sets
 * of year and era values: that for the old era and for the new. For example, a
 * new era began on January 7, 1989 AD. Strictly speaking, the first six days
 * of that year were in the Showa era, e.g. "January 6, 64 Showa", while the rest
 * of the year was in the Heisei era, e.g. "January 7, 1 Heisei". This class
 * handles this distinction correctly when computing dates. However, in lenient
 * mode either form of date is acceptable as input. 
 * 
 * In modern times, eras have started on January 8, 1868 AD, Gregorian (Meiji),
 * July 30, 1912 (Taisho), December 25, 1926 (Showa), and January 7, 1989 (Heisei). Constants
 * for these eras, suitable for use in the <code>UCAL_ERA</code> field, are provided
 * in this class. Note that the number used for each era is more or
 * less arbitrary. Currently, the era starting in 1053 AD is era #0; however this
 * may change in the future as we add more historical data. Use the predefined
 * constants rather than using actual, absolute numbers.
 * 
 * @internal
 */
class U_I18N_API JapaneseCalendar : public GregorianCalendar {
public:

    /**
     * Useful constants for JapaneseCalendar.  
     * @internal
     */
    static const uint32_t kCurrentEra; // the current era

    /**
     * Constructs a JapaneseCalendar based on the current time in the default time zone
     * with the given locale.
     *
     * @param aLocale  The given locale.
     * @param success  Indicates the status of JapaneseCalendar object construction.
     *                 Returns U_ZERO_ERROR if constructed successfully.
     * @stable ICU 2.0
     */
    JapaneseCalendar(const Locale& aLocale, UErrorCode& success);

    /**
     * Destructor
     * @internal
     */
    virtual ~JapaneseCalendar();

    /**
     * Copy constructor
     * @param source    the object to be copied.
     * @internal
     */
    JapaneseCalendar(const JapaneseCalendar& source);

    /**
     * Default assignment operator
     * @param right    the object to be copied.
     * @internal
     */
    JapaneseCalendar& operator=(const JapaneseCalendar& right);

    /**
     * Create and return a polymorphic copy of this calendar.
     * @return    return a polymorphic copy of this calendar.
     * @internal
     */
    virtual Calendar* clone(void) const;

public:

    /**
     * Override Calendar Returns a unique class ID POLYMORPHICALLY. Pure virtual
     * override. This method is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI. Polymorphic operator==() and clone() methods call
     * this method.
     *
     * @return   The class ID for this object. All objects of a given class have the
     *           same class ID. Objects of other classes have different class IDs.
     * @internal
     */
    virtual UClassID getDynamicClassID(void) const;

    /**
     * Return the class ID for this class. This is useful only for comparing to a return
     * value from getDynamicClassID(). For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return   The class ID for all objects of this class.
     * @internal
     */
    static inline UClassID getStaticClassID(void);

    /**
     * return the calendar type, "japanese".
     *
     * @return calendar type
     * @internal
     */
    virtual const char * getType() const;

    /**
     * @internal 
     * @return TRUE if this calendar has the notion of a default century
     */
    virtual UBool haveDefaultCentury() const;
    virtual UDate defaultCenturyStart() const;
    virtual int32_t defaultCenturyStartYear() const;

    /** 
     * @internal
     * API overrides
     */
    int32_t getMaximum(UCalendarDateFields field) const;
    int32_t getLeastMaximum(UCalendarDateFields field) const;
    inline virtual int32_t getMaximum(EDateFields field) const { return getMaximum((UCalendarDateFields)field); }
    inline virtual int32_t getLeastMaximum(EDateFields field) const { return getLeastMaximum((UCalendarDateFields)field); }

private:
    JapaneseCalendar(); // default constructor not implemented

    static const char fgClassID;

protected:
    virtual int32_t monthLength(int32_t month) const; 
    virtual int32_t monthLength(int32_t month, int32_t year) const; 
    int32_t getGregorianYear(UErrorCode& status) const;
    virtual int32_t internalGetEra() const;
    virtual void timeToFields(UDate theTime, UBool quick, UErrorCode& status);

    /**
     * (Overrides Calendar) Converts Calendar's time field values to GMT as
     * milliseconds. In this case, we have to be concerned with filling in inconsistent
     * information. For example, if the year and era only are set, need to make sure
     * month & date are set correctly.  Ex, 'Heisei 1' starts Jan 8th, not Jan 1st.  
     * Default month and date values will end up giving the wrong Era.
     *
     * @param status  Output param set to success/failure code on exit. If any value
     *                previously set in the time field is invalid, this will be set to
     *                an error status.
     * @stable ICU 2.0
     */

    /***
     * Called by computeJulianDay.  Returns the default month (0-based) for the year,
     * taking year and era into account.  Defaults to 0 for Gregorian, which doesn't care.
     */
    virtual int32_t getDefaultMonthInYear() const;

    /***
     * Called by computeJulianDay.  Returns the default day (1-based) for the month,
     * taking currently-set year and era into account.  Defaults to 1 for Gregorian, which doesn't care. 
     */
    virtual int32_t getDefaultDayInMonth(int32_t month) const;
};

inline UClassID
JapaneseCalendar::getStaticClassID(void)
{ return (UClassID)&fgClassID; }

inline UClassID
JapaneseCalendar::getDynamicClassID(void) const
{ return JapaneseCalendar::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif // _GREGOCAL
//eof

--- NEW FILE: nultrans.h ---
/*
**********************************************************************
*   Copyright (c) 2000-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/11/2000  aliu        Creation.
**********************************************************************
*/
#ifndef NULTRANS_H
#define NULTRANS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"

U_NAMESPACE_BEGIN

/**
 * A transliterator that leaves text unchanged.
 * @author Alan Liu
 * @internal Use transliterator factory methods instead since this class will be removed in that release.
 */
class U_I18N_API NullTransliterator : public Transliterator {

public:

    /**
     * ID for this transliterator.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    static const UChar ID[]; // public for Transliterator

    /**
     * ID for this transliterator.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    static const UChar SHORT_ID[]; // public for Transliterator

    /**
     * Constructs a transliterator.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    NullTransliterator();

    /**
     * Destructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual ~NullTransliterator();

    /**
     * Transliterator API.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    Transliterator* clone(void) const;

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
                                     UBool isIncremental) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

private:

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline NullTransliterator::NullTransliterator() : Transliterator(ID, 0) {}

inline NullTransliterator::~NullTransliterator() {}

inline UClassID
NullTransliterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
NullTransliterator::getDynamicClassID() const
{ return NullTransliterator::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: rbt.h ---
/*
**********************************************************************
*   Copyright (C) 1999-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_H
#define RBT_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"
#include "unicode/utypes.h"
#include "unicode/parseerr.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * <code>RuleBasedTransliterator</code> is a transliterator
 * that reads a set of rules in order to determine how to perform
 * translations. Rule sets are stored in resource bundles indexed by
 * name. Rules within a rule set are separated by semicolons (';').
 * To include a literal semicolon, prefix it with a backslash ('\').
 * Whitespace, as defined by <code>Character.isWhitespace()</code>,
 * is ignored. If the first non-blank character on a line is '#',
 * the entire line is ignored as a comment. 
 * 
 * Each set of rules consists of two groups, one forward, and one
 * reverse. This is a convention that is not enforced; rules for one
 * direction may be omitted, with the result that translations in
 * that direction will not modify the source text. In addition,
 * bidirectional forward-reverse rules may be specified for
 * symmetrical transformations.
 * 
 * Rule syntax 
 * 
 * Rule statements take one of the following forms: 
 * 
 * <dl>
 * <dt><code>$alefmadda=\u0622;</code></dt>
 * <dd>Variable definition. The name on the
 * left is assigned the text on the right. In this example,
 * after this statement, instances of the left hand name,
 * &quot;<code>$alefmadda</code>&quot;, will be replaced by
 * the Unicode character U+0622. Variable names must begin
 * with a letter and consist only of letters, digits, and
 * underscores. Case is significant. Duplicate names cause
 * an exception to be thrown, that is, variables cannot be
 * redefined. The right hand side may contain well-formed
 * text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
 * The right hand side may contain embedded <code>UnicodeSet</code>
 * patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
 * <dd>&nbsp;</dd>
 * <dt><code>ai&gt;$alefmadda;</code></dt>
 * <dd>Forward translation rule. This rule
 * states that the string on the left will be changed to the
 * string on the right when performing forward
 * transliteration.</dd>
 * <dt>&nbsp;</dt>
 * <dt><code>ai<$alefmadda;</code></dt>
 * <dd>Reverse translation rule. This rule
 * states that the string on the right will be changed to
 * the string on the left when performing reverse
 * transliteration.</dd>
 * </dl>
 * 
 * <dl>
 * <dt><code>ai<>$alefmadda;</code></dt>
 * <dd>Bidirectional translation rule. This
 * rule states that the string on the right will be changed
 * to the string on the left when performing forward
 * transliteration, and vice versa when performing reverse
 * transliteration.</dd>
 * </dl>
 * 
 * Translation rules consist of a match pattern and an output
 * string. The match pattern consists of literal characters,
 * optionally preceded by context, and optionally followed by
 * context. Context characters, like literal pattern characters,
 * must be matched in the text being transliterated. However, unlike
 * literal pattern characters, they are not replaced by the output
 * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
 * indicates the characters &quot;<code>def</code>&quot; must be
 * preceded by &quot;<code>abc</code>&quot; for a successful match.
 * If there is a successful match, &quot;<code>def</code>&quot; will
 * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
 * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
 * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
 * (or &quot;<code>123}456</code>&quot;) in which the literal
 * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
 * 
 * 
 * The output string of a forward or reverse rule consists of
 * characters to replace the literal pattern characters. If the
 * output string contains the character '<code>|</code>', this is
 * taken to indicate the location of the cursor after
 * replacement. The cursor is the point in the text at which the
 * next replacement, if any, will be applied. The cursor is usually
 * placed within the replacement text; however, it can actually be
 * placed into the precending or following context by using the
 * special character '<code>@</code>'. Examples:
 * 
 * <blockquote>
 * <code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
 * before a 
 * {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
 * y and z</code>
 * </blockquote>
 * 
 * UnicodeSet
 * 
 * <code>UnicodeSet</code> patterns may appear anywhere that
 * makes sense. They may appear in variable definitions.
 * Contrariwise, <code>UnicodeSet</code> patterns may themselves
 * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
 * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
 * 
 * <code>UnicodeSet</code> patterns may also be embedded directly
 * into rule strings. Thus, the following two rules are equivalent:
 * 
 * <blockquote>
 * <code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this 
 * [aeiou]&gt;'*';
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
 * Another way</code>
 * </blockquote>
 * 
 * See {@link UnicodeSet} for more documentation and examples.
 * 
 * Segments
 * 
 * Segments of the input string can be matched and copied to the
 * output string. This makes certain sets of rules simpler and more
 * general, and makes reordering possible. For example:
 * 
 * <blockquote>
 * <code>([a-z]) &gt; $1 $1;
 * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
 * double lowercase letters 
 * ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code>
 * </blockquote>
 * 
 * The segment of the input string to be copied is delimited by
 * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
 * nine segments may be defined. Segments may not overlap. In the
 * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
 * represent the input string segments, in left-to-right order of
 * definition.
 * 
 * Anchors
 * 
 * Patterns can be anchored to the beginning or the end of the text. This is done with the
 * special characters '<code>^</code>' and '<code>$</code>'. For example:
 * 
 * <blockquote>
 * <code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text 
 * &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
 * of 'a' 
 * &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text 
 * &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
 * of 'z'</code>
 * </blockquote>
 * 
 * It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
 * This is done by including a virtual anchor character '<code>$</code>' at the end of the
 * set pattern. Although this is usually the match chafacter for the end anchor, the set will
 * match either the beginning or the end of the text, depending on its placement. For
 * example:
 * 
 * <blockquote>
 * <code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor 
 * $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start 
 * &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code>
 * </blockquote>
 * 
 * Example 
 * 
 * The following example rules illustrate many of the features of
 * the rule language. 
 * 
 * <table border="0" cellpadding="4">
 * <tr>
 * <td valign="top">Rule 1.</td>
 * <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>
 * </tr>
 * <tr>
 * <td valign="top">Rule 2.</td>
 * <td valign="top" nowrap><code>xyz&gt;r</code></td>
 * </tr>
 * <tr>
 * <td valign="top">Rule 3.</td>
 * <td valign="top" nowrap><code>yz&gt;q</code></td>
 * </tr>
 * </table>
 * 
 * Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
 * yields the following results: 
 * 
 * <table border="0" cellpadding="4">
 * <tr>
 * <td valign="top" nowrap><code>|adefabcdefz</code></td>
 * <td valign="top">Initial state, no rules match. Advance
 * cursor.</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>a|defabcdefz</code></td>
 * <td valign="top">Still no match. Rule 1 does not match
 * because the preceding context is not present.</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>ad|efabcdefz</code></td>
 * <td valign="top">Still no match. Keep advancing until
 * there is a match...</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>ade|fabcdefz</code></td>
 * <td valign="top">...</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adef|abcdefz</code></td>
 * <td valign="top">...</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adefa|bcdefz</code></td>
 * <td valign="top">...</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adefab|cdefz</code></td>
 * <td valign="top">...</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adefabc|defz</code></td>
 * <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;
 * with &quot;<code>xy</code>&quot; and back up the cursor
 * to before the '<code>y</code>'.</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adefabcx|yz</code></td>
 * <td valign="top">Although &quot;<code>xyz</code>&quot; is
 * present, rule 2 does not match because the cursor is
 * before the '<code>y</code>', not before the '<code>x</code>'.
 * Rule 3 does match. Replace &quot;<code>yz</code>&quot;
 * with &quot;<code>q</code>&quot;.</td>
 * </tr>
 * <tr>
 * <td valign="top" nowrap><code>adefabcxq|</code></td>
 * <td valign="top">The cursor is at the end;
 * transliteration is complete.</td>
 * </tr>
 * </table>
 * 
 * The order of rules is significant. If multiple rules may match
 * at some point, the first matching rule is applied. 
 * 
 * Forward and reverse rules may have an empty output string.
 * Otherwise, an empty left or right hand side of any statement is a
 * syntax error. 
 * 
 * Single quotes are used to quote any character other than a
 * digit or letter. To specify a single quote itself, inside or
 * outside of quotes, use two single quotes in a row. For example,
 * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
 * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
 * 
 * 
 * Notes 
 * 
 * While a RuleBasedTransliterator is being built, it checks that
 * the rules are added in proper order. For example, if the rule
 * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
 * then the second rule will throw an exception. The reason is that
 * the second rule can never be triggered, since the first rule
 * always matches anything it matches. In other words, the first
 * rule masks the second rule. 
 * 
 * @author Alan Liu
 * @internal Use transliterator factory methods instead since this class will be removed in that release.
 */
class U_I18N_API RuleBasedTransliterator : public Transliterator {

    /**
     * The data object is immutable, so we can freely share it with
     * other instances of RBT, as long as we do NOT own this object.
     */
    TransliterationRuleData* data;

    /**
     * If true, we own the data object and must delete it.
     */
    UBool isDataOwned;

public:

    /**
     * Constructs a new transliterator from the given rules.
     * @param rules rules, separated by ';'
     * @param direction either FORWARD or REVERSE.
     * @exception IllegalArgumentException if rules are malformed.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UnicodeFilter* adoptedFilter,
                            UParseError& parseError,
                            UErrorCode& status);

    /**
     * Constructs a new transliterator from the given rules.
     * @param rules rules, separated by ';'
     * @param direction either FORWARD or REVERSE.
     * @exception IllegalArgumentException if rules are malformed.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);

    /**
     * Covenience constructor with no filter.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UErrorCode& status);

    /**
     * Covenience constructor with no filter and FORWARD direction.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const UnicodeString& rules,
                            UErrorCode& status);

    /**
     * Covenience constructor with FORWARD direction.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const UnicodeString& rules,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);
 private:

     friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor
    /**
     * Covenience constructor.
     * @param id            the id for the transliterator.
     * @param theData       the rule data for the transliterator.
     * @param adoptedFilter the filter for the transliterator
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            const TransliterationRuleData* theData,
                            UnicodeFilter* adoptedFilter = 0);

    friend class Transliterator; // to access following ct

    /**
     * Internal constructor.
     * @param id            the id for the transliterator.
     * @param theData       the rule data for the transliterator.
     * @param isDataAdopted determine who will own the 'data' object. True, the caller should not delete 'data'.
     */
    RuleBasedTransliterator(const UnicodeString& id,
                            TransliterationRuleData* data,
                            UBool isDataAdopted);

 public:

    /**
     * Copy constructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    RuleBasedTransliterator(const RuleBasedTransliterator&);

    virtual ~RuleBasedTransliterator();

    /**
     * Implement Transliterator API.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    Transliterator* clone(void) const;

 protected:
    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                     UBool isIncremental) const;

 public:
    /**
     * Return a representation of this transliterator as source rules.
     * These rules will produce an equivalent transliterator if used
     * to construct a new transliterator.
     * @param result the string to receive the rules.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \uxxxx or
     * \Uxxxxxxxx.  Unprintable characters are those other than
     * U+000A, U+0020..U+007E.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual UnicodeString& toRules(UnicodeString& result,
                                   UBool escapeUnprintable) const;

 protected:
    /**
     * Implement Transliterator framework
     */
    virtual void handleGetSourceSet(UnicodeSet& result) const;

 public:
    /**
     * Override Transliterator framework
     */
    virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;

    /**
     * Return the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     * <pre>
     * .      Base* polymorphic_pointer = createPolymorphicObject();
     * .      if (polymorphic_pointer->getDynamicClassID() ==
     * .          Derived::getStaticClassID()) ...
     * </pre>
     * @return          The class ID for all objects of this class.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }

    /**
     * Returns a unique class ID <b>polymorphically</b>.  This method
     * is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     * 
     * <p>Concrete subclasses of Transliterator that wish clients to
     * be able to identify them should implement getDynamicClassID()
     * and also a static method and data member:
     * 
     * <pre>
     * static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
     * static char fgClassID;
     * </pre>
     *
     * Subclasses that do not implement this method will have a
     * dynamic class ID of Transliterator::getStatisClassID().
     *
     * @return The class ID for this object. All objects of a given
     * class have the same class ID.  Objects of other classes have
     * different class IDs.
     */
    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); };

private:

    /**
     * Class identifier for RuleBasedTransliterator.
     */
    static const char fgClassID;

    void _construct(const UnicodeString& rules,
                    UTransDirection direction,
                    UParseError& parseError,
                    UErrorCode& status);
};

/**
 * Constructs a new transliterator from the given rules.
 * @param id            the id for the transliterator.
 * @param rules         rules, separated by ';'
 * @param direction     either FORWARD or REVERSE.
 * @param adoptedFilter the filter for this transliterator.
 * @param parseError    Struct to recieve information on position 
 *                      of error if an error is encountered
 * @param status        Output param set to success/failure code.
 * @exception IllegalArgumentException if rules are malformed
 * or direction is invalid.
 */
inline RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UnicodeFilter* adoptedFilter,
                            UParseError& parseError,
                            UErrorCode& status) :
    Transliterator(id, adoptedFilter) {
    _construct(rules, direction,parseError,status);
}

/**
 * Constructs a new transliterator from the given rules.
 * @param id            the id for the transliterator.
 * @param rules         rules, separated by ';'
 * @param direction     either FORWARD or REVERSE.
 * @param adoptedFilter the filter for this transliterator.
 * @param status        Output param set to success/failure code.
 * @exception IllegalArgumentException if rules are malformed
 * or direction is invalid.
 */
inline RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status) :
    Transliterator(id, adoptedFilter) {
    UParseError parseError;
    _construct(rules, direction,parseError, status);
}

/**
 * Covenience constructor with no filter.
 */
inline RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UTransDirection direction,
                            UErrorCode& status) :
    Transliterator(id, 0) {
    UParseError parseError;
    _construct(rules, direction,parseError, status);
}

/**
 * Covenience constructor with no filter and FORWARD direction.
 */
inline RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UErrorCode& status) :
    Transliterator(id, 0) {
    UParseError parseError;
    _construct(rules, UTRANS_FORWARD, parseError, status);
}

/**
 * Covenience constructor with FORWARD direction.
 */
inline RuleBasedTransliterator::RuleBasedTransliterator(
                            const UnicodeString& id,
                            const UnicodeString& rules,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status) :
    Transliterator(id, adoptedFilter) {
    UParseError parseError;
    _construct(rules, UTRANS_FORWARD,parseError, status);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: regexcmp.cpp ---

//
//  file:  regexcmp.cpp
//
//  Copyright (C) 2002-2003 International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains the ICU regular expression compiler, which is responsible
//  for processing a regular expression pattern into the compiled form that
//  is used by the match finding engine.
//

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
[...3493 lines suppressed...]
    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
        usetFlags |= USET_CASE_INSENSITIVE;
    }
    if (fModeFlags & UREGEX_COMMENTS) {
        usetFlags |= USET_IGNORE_SPACE;
    }

    // Build the UnicodeSet from the set pattern we just built up in a string.
    uset = new UnicodeSet(setPattern,  usetFlags, *fStatus);
    if (U_FAILURE(*fStatus)) {
        delete uset;
        uset =  NULL;
    }

    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
    return uset;
};

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

--- NEW FILE: regexcmp.h ---
//
//  regexcmp.h
//
//  Copyright (C) 2002-2003, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for the class RegexCompile
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//

#ifndef RBBISCAN_H
#define RBBISCAN_H

#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/uobject.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

//--------------------------------------------------------------------------------
//
//  class RegexCompile    Contains the regular expression compiler.
//
//--------------------------------------------------------------------------------
static const int    kStackSize = 100;               // The size of the state stack for
                                                    //   pattern parsing.  Corresponds roughly
                                                    //   to the depth of parentheses nesting
                                                    //   that is allowed in the rules.

enum EParseAction {dummy01, dummy02};               // Placeholder enum for the specifier for
                                                    //   actions that are specified in the
                                                    //   rule parsing state table.
struct  RegexTableEl;
class   RegexPattern;

class RegexCompile : public UMemory {
public:

    struct RegexPatternChar {
        UChar32             fChar;
        UBool               fQuoted;
    };

    RegexCompile(RegexPattern *rp, UErrorCode &e);

    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);

    virtual    ~RegexCompile();

    void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.

    static void cleanup();                       // Memory cleanup

    // Categories of parentheses in pattern.
    //   The category is saved in the compile-time parentheses stack frame, and
    //   determines the code to be generated when the matching close ) is encountered.
    enum EParenClass {
        plain        = -1,               // No special handling
        capturing    = -2, 
        atomic       = -3,
        lookAhead    = -4,
        negLookAhead = -5,
        flags        = -6,
        lookBehind   = -7,
        lookBehindN  = -8
    };

private:

    UBool       doParseActions(EParseAction a);
    void        error(UErrorCode e);                   // error reporting convenience function.

    UChar32     nextCharLL();
    UChar32     peekCharLL();
    UnicodeSet  *scanSet();
    UnicodeSet  *scanProp();
    void        handleCloseParen();
    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
                                                     //  at the top of the just completed block
                                                     //  or operation, and optionally ensure that
                                                     //  there is space to add an opcode there.
    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
                                                     //   a reference to a UnicodeSet.
    void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
                               int32_t LoopOp);
    UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
    void        literalChar(UChar32 c);              // Compile a literal char
    void        fixLiterals(UBool split=FALSE);      // Fix literal strings.
    void        insertOp(int32_t where);             // Open up a slot for a new op in the
                                                     //   generated code at the specified location.
    void        emitONE_CHAR(UChar32 c);             // EMit a ONE_CHAR op into the compiled code,
                                                     //   taking case mode into account.
    int32_t     minMatchLength(int32_t start,
                               int32_t end);
    int32_t     maxMatchLength(int32_t start,
                               int32_t end);
    void        matchStartType();
    void        stripNOPs();
    void        OptDotStar();

    UErrorCode                    *fStatus;
    RegexPattern                  *fRXPat;
    UParseError                   *fParseErr;

    //
    //  Data associated with low level character scanning
    //
    int32_t                       fScanIndex;        // Index of current character being processed
                                                     //   in the rule input string.
    int32_t                       fNextIndex;        // Index of the next character, which
                                                     //   is the first character not yet scanned.
    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to 
                                                     //   end of line comments, in favor of (?#...) comments.
    int                           fLineNum;          // Line number in input file.
    int                           fCharNum;          // Char position within the line.
    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
                                                     //   as a single line, not two.
    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.

    RegexPatternChar              fC;                // Current char for parse state machine
                                                     //   processing.

    //
    //   Data for the state machine that parses the regular expression.
    //
    RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
                                                     //   parsing.  index by p[state][char-class]

    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    int                           fStackPtr;           //  and pops as specified in the state
                                                       //  transition rules.

    //
    //  Data associated with the generation of the pcode for the match engine
    //
    int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
    int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
                                                     //   until last flag is scanned.
    UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx

    int32_t                       fStringOpStart;    // While a literal string is being scanned
                                                     //   holds the start index within RegexPattern.
                                                     //   fLiteralText where the string is being stored.

    int32_t                       fPatternLength;    // Length of the input pattern string.

    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
                                                     //   the positions of compiled pattern operations
                                                     //   needing fixup, followed by negative value.  The  
                                                     //   first entry in each frame is the position of the
                                                     //   spot reserved for use when a quantifier
                                                     //   needs to add a SAVE at the start of a (block)
                                                     //   The negative value (-1, -2,...) indicates
                                                     //   the kind of paren that opened the frame.  Some
                                                     //   need special handling on close.

    int32_t                       fMatchOpenParen;   // The position in the compiled pattern
                                                     //   of the slot reserved for a state save
                                                     //   at the start of the most recently processed
                                                     //   parenthesized block.
    int32_t                       fMatchCloseParen;  // The position in the pattern of the first
                                                     //   location after the most recently processed
                                                     //   parenthesized block.

    int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
    int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
                                                     //   initially scanned.  Each new interval
                                                     //   encountered overwrites these values.
                                                     //   -1 for the upper interval value means none
                                                     //   was specified (unlimited occurences.)

    int32_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
                                                     //   pattern, valid while remainder of name is
                                                     //   scanned.
};

U_NAMESPACE_END
#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif   // RBBISCAN_H

--- NEW FILE: regexcst.h ---
//---------------------------------------------------------------------------------
//
// Generated Header File.  Do not edit by hand.
//    This file contains the state table for the ICU Regular Expression Pattern Parser
//    It is generated by the Perl script "regexcst.pl" from
//    the rule parser state definitions file "regexcst.txt".
//
//   Copyright (C) 2002-2003 International Business Machines Corporation 
//   and others. All rights reserved.  
//
//---------------------------------------------------------------------------------
#ifndef RBBIRPT_H
#define RBBIRPT_H

U_NAMESPACE_BEGIN
//
// Character classes for regex pattern scanning.
//
    static const uint8_t kRuleSet_digit_char = 128;
    static const uint8_t kRuleSet_white_space = 129;
    static const uint8_t kRuleSet_rule_char = 130;

enum Regex_PatternParseAction {
    doPossessivePlus,
    doCloseParen,
    doProperty,
    doBeginMatchMode,
    doOrOperator,
    doOpenCaptureParen,
    doBadOpenParenType,
    doRuleError,
    doIntevalLowerDigit,
    doBackslashs,
    doOctal,
    doNGOpt,
    doBackslashw,
    doMismatchedParenErr,
    doOpenLookBehind,
    doBackslashz,
    doIntervalError,
    doStar,
    doCaret,
    doEnterQuoteMode,
    doNGStar,
    doMatchMode,
    doIntervalUpperDigit,
    doOpenLookAheadNeg,
    doPlus,
    doOpenNonCaptureParen,
    doBackslashA,
    doBackslashB,
    doNGPlus,
    doSetMatchMode,
    doPatFinish,
    doBackslashD,
    doPossessiveInterval,
    doEscapeError,
    doBackslashG,
    doSuppressComments,
    doMatchModeParen,
    doOpt,
    doInterval,
    doLiteralChar,
    doIntervalInit,
    doOpenAtomicParen,
    doBackslashS,
    doOpenLookAhead,
    doBackRef,
    doDollar,
    doDotAny,
    doBackslashW,
    doBackslashX,
    doScanUnicodeSet,
    doBackslashZ,
    doPerlInline,
    doPossessiveOpt,
    doNOP,
    doConditionalExpr,
    doExit,
    doNGInterval,
    doPatStart,
    doBackslashb,
    doPossessiveStar,
    doBackslashd,
    doIntervalSame,
    doOpenLookBehindNeg,
    rbbiLastAction};

//-------------------------------------------------------------------------------
//
//  RegexTableEl       represents the structure of a row in the transition table
//                     for the pattern parser state machine.
//-------------------------------------------------------------------------------
struct RegexTableEl {
    Regex_PatternParseAction      fAction;
    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character
                                                    // 128-255:  character class index
    uint8_t                       fNextState;       // 0-250:    normal next-state numbers
                                                    // 255:      pop next-state from stack.
    uint8_t                       fPushState;
    UBool                         fNextChar;
};

static const struct RegexTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doPatStart, 255, 2,0,  FALSE}     //  1      start
    , {doLiteralChar, 254, 14,0,  TRUE}     //  2      term
    , {doLiteralChar, 130, 14,0,  TRUE}     //  3 
    , {doScanUnicodeSet, 91 /* [ */, 14,0,  TRUE}     //  4 
    , {doNOP, 40 /* ( */, 27,0,  TRUE}     //  5 
    , {doDotAny, 46 /* . */, 14,0,  TRUE}     //  6 
    , {doCaret, 94 /* ^ */, 2,0,  TRUE}     //  7 
    , {doDollar, 36 /* $ */, 2,0,  TRUE}     //  8 
    , {doNOP, 92 /* \ */, 79,0,  TRUE}     //  9 
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  10 
    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  11 
    , {doPatFinish, 253, 2,0,  FALSE}     //  12 
    , {doRuleError, 255, 100,0,  FALSE}     //  13 
    , {doNOP, 42 /* * */, 57,0,  TRUE}     //  14      expr-quant
    , {doNOP, 43 /* + */, 60,0,  TRUE}     //  15 
    , {doNOP, 63 /* ? */, 63,0,  TRUE}     //  16 
    , {doIntervalInit, 123 /* { */, 66,0,  TRUE}     //  17 
    , {doNOP, 40 /* ( */, 23,0,  TRUE}     //  18 
    , {doNOP, 255, 20,0,  FALSE}     //  19 
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  20      expr-cont
    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  21 
    , {doNOP, 255, 2,0,  FALSE}     //  22 
    , {doSuppressComments, 63 /* ? */, 25,0,  TRUE}     //  23      open-paren-quant
    , {doNOP, 255, 27,0,  FALSE}     //  24 
    , {doNOP, 35 /* # */, 46, 14, TRUE}     //  25      open-paren-quant2
    , {doNOP, 255, 29,0,  FALSE}     //  26 
    , {doSuppressComments, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
    , {doOpenCaptureParen, 255, 2, 14, FALSE}     //  28 
    , {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE}     //  29      open-paren-extended
    , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE}     //  30 
    , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE}     //  31 
    , {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE}     //  32 
    , {doNOP, 60 /* < */, 43,0,  TRUE}     //  33 
    , {doNOP, 35 /* # */, 46, 2, TRUE}     //  34 
    , {doBeginMatchMode, 105 /* i */, 49,0,  FALSE}     //  35 
    , {doBeginMatchMode, 109 /* m */, 49,0,  FALSE}     //  36 
    , {doBeginMatchMode, 115 /* s */, 49,0,  FALSE}     //  37 
    , {doBeginMatchMode, 120 /* x */, 49,0,  FALSE}     //  38 
    , {doBeginMatchMode, 45 /* - */, 49,0,  FALSE}     //  39 
    , {doConditionalExpr, 40 /* ( */, 100,0,  TRUE}     //  40 
    , {doPerlInline, 123 /* { */, 100,0,  TRUE}     //  41 
    , {doBadOpenParenType, 255, 100,0,  FALSE}     //  42 
    , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE}     //  43      open-paren-lookbehind
    , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE}     //  44 
    , {doBadOpenParenType, 255, 100,0,  FALSE}     //  45 
    , {doNOP, 41 /* ) */, 255,0,  TRUE}     //  46      paren-comment
    , {doMismatchedParenErr, 253, 100,0,  FALSE}     //  47 
    , {doNOP, 255, 46,0,  TRUE}     //  48 
    , {doMatchMode, 105 /* i */, 49,0,  TRUE}     //  49      paren-flag
    , {doMatchMode, 109 /* m */, 49,0,  TRUE}     //  50 
    , {doMatchMode, 115 /* s */, 49,0,  TRUE}     //  51 
    , {doMatchMode, 120 /* x */, 49,0,  TRUE}     //  52 
    , {doMatchMode, 45 /* - */, 49,0,  TRUE}     //  53 
    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  54 
    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  55 
    , {doNOP, 255, 100,0,  FALSE}     //  56 
    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  57      quant-star
    , {doPossessiveStar, 43 /* + */, 20,0,  TRUE}     //  58 
    , {doStar, 255, 20,0,  FALSE}     //  59 
    , {doNGPlus, 63 /* ? */, 20,0,  TRUE}     //  60      quant-plus
    , {doPossessivePlus, 43 /* + */, 20,0,  TRUE}     //  61 
    , {doPlus, 255, 20,0,  FALSE}     //  62 
    , {doNGOpt, 63 /* ? */, 20,0,  TRUE}     //  63      quant-opt
    , {doPossessiveOpt, 43 /* + */, 20,0,  TRUE}     //  64 
    , {doOpt, 255, 20,0,  FALSE}     //  65 
    , {doNOP, 129, 66,0,  TRUE}     //  66      interval-open
    , {doNOP, 128, 69,0,  FALSE}     //  67 
    , {doIntervalError, 255, 100,0,  FALSE}     //  68 
    , {doIntevalLowerDigit, 128, 69,0,  TRUE}     //  69      interval-lower
    , {doNOP, 44 /* , */, 73,0,  TRUE}     //  70 
    , {doIntervalSame, 125 /* } */, 76,0,  TRUE}     //  71 
    , {doIntervalError, 255, 100,0,  FALSE}     //  72 
    , {doIntervalUpperDigit, 128, 73,0,  TRUE}     //  73      interval-upper
    , {doNOP, 125 /* } */, 76,0,  TRUE}     //  74 
    , {doIntervalError, 255, 100,0,  FALSE}     //  75 
    , {doNGInterval, 63 /* ? */, 20,0,  TRUE}     //  76      interval-type
    , {doPossessiveInterval, 43 /* + */, 20,0,  TRUE}     //  77 
    , {doInterval, 255, 20,0,  FALSE}     //  78 
    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  79      backslash
    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  80 
    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  81 
    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  82 
    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  83 
    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  84 
    , {doProperty, 78 /* N */, 14,0,  FALSE}     //  85 
    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  86 
    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  87 
    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  88 
    , {doBackslashS, 83 /* S */, 14,0,  TRUE}     //  89 
    , {doBackslashs, 115 /* s */, 14,0,  TRUE}     //  90 
    , {doBackslashW, 87 /* W */, 14,0,  TRUE}     //  91 
    , {doBackslashw, 119 /* w */, 14,0,  TRUE}     //  92 
    , {doBackslashX, 88 /* X */, 14,0,  TRUE}     //  93 
    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  94 
    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  95 
    , {doOctal, 48 /* 0 */, 14,0,  TRUE}     //  96 
    , {doBackRef, 128, 14,0,  TRUE}     //  97 
    , {doEscapeError, 253, 100,0,  FALSE}     //  98 
    , {doLiteralChar, 255, 14,0,  TRUE}     //  99 
    , {doExit, 255, 100,0,  TRUE}     //  100      errorDeath
 };
static const char * const RegexStateNames[] = {    0,
     "start",
     "term",
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
     "expr-quant",
    0,
    0,
    0,
    0,
    0,
     "expr-cont",
    0,
    0,
     "open-paren-quant",
    0,
     "open-paren-quant2",
    0,
     "open-paren",
    0,
     "open-paren-extended",
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
     "open-paren-lookbehind",
    0,
    0,
     "paren-comment",
    0,
    0,
     "paren-flag",
    0,
    0,
    0,
    0,
    0,
    0,
    0,
     "quant-star",
    0,
    0,
     "quant-plus",
    0,
    0,
     "quant-opt",
    0,
    0,
     "interval-open",
    0,
    0,
     "interval-lower",
    0,
    0,
    0,
     "interval-upper",
    0,
    0,
     "interval-type",
    0,
    0,
     "backslash",
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
     "errorDeath",
    0};

U_NAMESPACE_END
#endif

--- NEW FILE: regexcst.pl ---
#!/usr/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2003, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression paser state table data into initialized C data.
#            Usage:
#                   cd icu/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#             The output file, regexcst.h, is included by some of the .cpp regex
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             regexcst.h generated file is put back into cvs.
#
#             See regexcst.txt for a description of the input format for this script.
#
#             This script is derived from rbbicst.pl, which peforms the same function
#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#             merged?
#
#*********************************************************************
#   Copyright (C) 2002 International Business Machines Corporation   *
#   and others. All rights reserved.                                 *
#*********************************************************************

$num_states = 1;         # Always the state number for the line being compiled.
$line_num  = 0;          # The line number in the input file.

$states{"pop"} = 255;    # Add the "pop"  to the list of defined state names.
                         # This prevents any state from being labelled with "pop",
                         #  and resolves references to "pop" in the next state field.

line_loop: while (<>) {
    chomp();
    $line = $_;
    @fields = split();
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for ($i=0; $i<@fields; $i++) {
        if ($fields[$i] =~ /^#/) {
            @fields = @fields[0 .. $i-1];
            last;
        }
    }
    # ignore blank lines, and those with no fields left after stripping comments..
    if (@fields == 0) {
        next;
    }

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name  of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if (@fields[0] =~ /.*:$/) {
        $state_name = @fields[0];
        $state_name =~ s/://;        # strip off the colon from the state name.

        if ($states{$state_name} != 0) {
            print "  rbbicst: at line $line-num duplicate definition of state $state_name\n";
        }
        $states{$state_name} = $num_states;
        $stateNames[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        if (@fields == 1) {
            next line_loop;
        }
        shift @fields;                       # shift off label field in preparation
                                             #  for handling the rest of the line.
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'.'$/) {
        # We've got a quoted literal character.
        $state_literal_chars[$num_states] = $fields[0];
        $state_literal_chars[$num_states] =~ s/'//g;
    } else {
        # We've got the name of a character class.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /[\W]/) {
            print "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
            print "     scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # do the 'n' flag
    #
    $state_flag[$num_states] = "FALSE";
    if ($fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # do the destination state.
    #
    $state_dest_state[$num_states] = $fields[0];
    if ($fields[0] eq "") {
        print "  rbbicsts:  at line $line_num, destination state missing.\n";
        exit(-1);
    }
    shift @fields;

    #
    # do the push state, if present.
    #
    if ($fields[0] =~ /^\^/) {
        $fields[0] =~ s/^\^//;
        $state_push_state[$num_states] = $fields[0];
        if ($fields[0] eq "" ) {
            print "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if ($fields[0] ne "") {
        $state_func_name[$num_states] = $fields[0];
        shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #
    if (@fields > 0) {
       print "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
       print "     scanning $fields[0]\n";
   }
   $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything,  so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#
for ($state=1; $state < $num_states; $state++) {
    if ($state_char_class[$state] ne "") {
        if ($charClasses{$state_char_class[$state]} == 0) {
            $charClasses{$state_char_class[$state]} = 1;
        }
    }
    if ($state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    if ($actions{$state_action_name[$state]} == 0) {
        $actions{$state_func_name[$state]} = 1;
    }
}

#
# Check that all of the destination states have been defined
#
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
for ($state=1; $state<$num_states; $state++) {
   if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
       print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
       $errors++;
   }
   if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
       print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
       $errors++;
   }
}

die if ($errors>0);

print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File.  Do not edit by hand.\n";
print "//    This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "//    It is generated by the Perl script \"regexcst.pl\" from\n";
print "//    the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "//   Copyright (C) 2002-2003 International Business Machines Corporation \n";
print "//   and others. All rights reserved.  \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indicies of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
$i = 128;                   # State Table values for Unicode char sets range from 128-250.
                            # Sets "default", "quoted", etc. get special handling.
                            #  They have no corresponding UnicodeSet object in the state machine,
                            #    but are handled by special case code.  So we emit no reference
                            #    to a UnicodeSet object to them here.
foreach $setName (keys %charClasses) {
    if ($setName eq "default") {
        $charClasses{$setName} = 255;}
    elsif ($setName eq "quoted") {
        $charClasses{$setName} = 254;}
    elsif ($setName eq "eof") {
        $charClasses{$setName} = 253;}
    else {
        # Normal character class.  Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
       print "    static const uint8_t kRuleSet_$setName = $i;\n";
        $charClasses{$setName} = $i;
        $i++;
    }
}
print "\n\n";

#
# Emit the enum for the actions to be performed.
#
print "enum Regex_PatternParseAction {\n";
foreach $act (keys %actions) {
    print "    $act,\n";
}
print "    rbbiLastAction};\n\n";

#
# Emit the struct definition for transtion table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "//  RegexTableEl       represents the structure of a row in the transition table\n";
print "//                     for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print "    Regex_PatternParseAction      fAction;\n";
print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
print "                                                    // 128-255:  character class index\n";
print "    uint8_t                       fNextState;       // 0-250:    normal next-state numbers\n";
print "                                                    // 255:      pop next-state from stack.\n";
print "    uint8_t                       fPushState;\n";
print "    UBool                         fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for ($state=1; $state < $num_states; $state++) {
    print "    , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        $c = $state_literal_chars[$state];
        printf(" %d /* $c */,", ord($c));   #  use numeric value, so EBCDIC machines are ok.
    }else {
        print " $charClasses{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if ($state_push_state[$state] eq "") {
        print "0, ";
    } else {
        print " $states{$state_push_state[$state]},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print "    //  $state ";
    if ($stateNames[$state] ne "") {
        print "     $stateNames[$state]";
    }
    print "\n";
};
print " };\n";

#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for ($state=0; $state<$num_states; $state++) {
    if ($stateNames[$state] ne "") {
        print "     \"$stateNames[$state]\",\n";
    } else {
        print "    0,\n";
    }
}
print "    0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";

--- NEW FILE: regexcst.txt ---

#*****************************************************************************
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action    
#   input-char           n next-state           ^push-state     action    
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, peform the actions and go to the state specified on this line.
#            The input character is tested sequentally, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            

#
#  start state, scan position is at the beginning of the pattern.
#
start:
   default                 term                                     doPatStart

#
#  term.  At a position where we can accept the start most items in a pattern.
#
term:
    quoted               n expr-quant                               doLiteralChar
    rule_char            n expr-quant                               doLiteralChar
    '['                  n expr-quant                               doScanUnicodeSet
    '('                  n open-paren                     
    '.'                  n expr-quant                               doDotAny
    '^'                  n term                                     doCaret
    '$'                  n term                                     doDollar
    '\'                  n backslash
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    eof	                   term                                     doPatFinish
    default                errorDeath                               doRuleError

#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#
expr-quant:
    '*'                  n  quant-star                       
    '+'                  n  quant-plus                              
    '?'                  n  quant-opt     
    '{'                  n  interval-open                          doIntervalInit
    '('                  n  open-paren-quant
    default                 expr-cont 

#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#
expr-cont:
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    default                 term                                    

#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                      doSuppressComments
    default                 open-paren

open-paren-quant2:
    '#'                  n  paren-comment   ^expr-quant
    default                 open-paren-extended

#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'                  n  open-paren-extended                     doSuppressComments
    default                 term            ^expr-quant             doOpenCaptureParen

open-paren-extended:
    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment   ^term
    'i'                     paren-flag                              doBeginMatchMode
    'm'                     paren-flag                              doBeginMatchMode
    's'                     paren-flag                              doBeginMatchMode
    'x'                     paren-flag                              doBeginMatchMode
    '-'                     paren-flag                              doBeginMatchMode
    '('                  n  errorDeath                              doConditionalExpr
    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType

open-paren-lookbehind:
    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    default                 errorDeath                              doBadOpenParenType

#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#                    TODO:  should parens nest here?  Check what perl does.
#
paren-comment:
    ')'                  n  pop
    eof		                errorDeath                              doMismatchedParenErr
    default              n  paren-comment

#
#  paren-flag    Scanned a (?ismx-ismx  flag setting 
#                 
paren-flag:
    'i'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    's'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
    ')'                  n  term                                    doSetMatchMode
    ':'                  n  term              ^expr-quant           doMatchModeParen
    default                 errorDeath

#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
quant-star:
     '?'                 n  expr-cont                               doNGStar               #  *?
     '+'                 n  expr-cont                               doPossessiveStar       #  *+
     default                expr-cont                               doStar

#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
quant-plus:
     '?'                 n  expr-cont                               doNGPlus               #  *?
     '+'                 n  expr-cont                               doPossessivePlus       #  *+
     default                expr-cont                               doPlus

#
#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
#                  between plain '?', '??', '?+'
#
quant-opt:
     '?'                 n  expr-cont                               doNGOpt                 #  ??
     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
     default                expr-cont                               doOpt                   #  ?

#
#   Interval         scanning a '{', the opening delimiter for an interval specification
#                                   {number} or {min, max} or {min, }
#
interval-open:
    white_space          n  interval-open                                  # TODO:  is white space allowed here in non-free mode?
    digit_char              interval-lower                          
    default                 errorDeath                              doIntervalError

interval-lower:
    digit_char           n  interval-lower                          doIntevalLowerDigit
    ','			         n  interval-upper
    '}'                  n  interval-type                           doIntervalSame             # {n}
    default                 errorDeath                              doIntervalError

interval-upper:
    digit_char           n  interval-upper                          doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                              doIntervalError

interval-type:
    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    default                 expr-cont                               doInterval                  # {m,n}

#
#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
#                                  The low level next-char function will have preprocessed
#                                  some of them already; those won't come here.
backslash:
   'A'                   n  term                                    doBackslashA
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
   'N'                      expr-quant                              doProperty       #   \N{NAME}  named char
   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   'P'                      expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
   'S'                   n  expr-quant                              doBackslashS
   's'                   n  expr-quant                              doBackslashs
   'W'                   n  expr-quant                              doBackslashW
   'w'                   n  expr-quant                              doBackslashw
   'X'                   n  expr-quant                              doBackslashX
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
   '0'                   n  expr-quant                              doOctal
   digit_char	         n  expr-quant                              doBackRef         #  Will scan multiple digits
   eof                      errorDeath                              doEscapeError
   default               n  expr-quant		                    doLiteralChar     #  Escaped literal char.		       

#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n errorDeath                               doExit

--- NEW FILE: regeximp.h ---
// 
//   Copyright (C) 2002-2003 International Business Machines Corporation 
//   and others. All rights reserved.  
//
//   file:  regeximp.h
//
//           ICU Regular Expressions,
//               Definitions of constant values used in the compiled form of
//               a regular expression pattern.
//

#ifndef _REGEXIMP_H
#define _REGEXIMP_H

#include "cmemory.h"

U_NAMESPACE_BEGIN

//
//  debugging support.  Enable one or more of the #defines immediately following
//
#ifdef _DEBUG
//#define REGEX_SCAN_DEBUG
//#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
#endif
//  End of #defines inteded to be directly set.

#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
#define REGEX_DEBUG 1
#include <stdio.h>
#endif

#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF printf
#else
#define REGEX_SCAN_DEBUG_PRINTF
#endif

#ifdef REGEX_DUMP_DEBUG
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_DUMP_DEBUG_PRINTF
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF printf
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_RUN_DEBUG_PRINTF
#endif

//
//  Opcode types     In the compiled form of the regexp, these are the type, or opcodes,
//                   of the entries.
//
enum {
     URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words.
     URX_RESERVED_OP_N = 255,  // For multi-operand ops, negative operand values.
     URX_BACKTRACK     = 1,
     URX_END           = 2,
     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
     URX_STRING        = 4,    // Value field is index of string start
     URX_STRING_LEN    = 5,    // Value field is string length (code units)
     URX_STATE_SAVE    = 6,    // Value field is pattern position to push
     URX_NOP           = 7,
     URX_START_CAPTURE = 8,    // Value field is capture group number.
     URX_END_CAPTURE   = 9,    // Value field is capture group number
     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.   
     URX_SETREF        = 11,   // Value field is index of set in array of sets.
     URX_DOTANY        = 12, 
     URX_JMP           = 13,   // Value field is destination position in
                                                    //   the pattern.
     URX_FAIL          = 14,   // Stop match operation,  No match.

     URX_JMP_SAV       = 15,   // Operand:  JMP destination location
     URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B
     URX_BACKSLASH_G   = 17, 
     URX_JMP_SAV_X     = 18,   // Conditional JMP_SAV,
                               //    Used in (x)+, breaks loop on zero length match.
                               //    Operand:  Jmp destination.
     URX_BACKSLASH_X   = 19,
     URX_BACKSLASH_Z   = 20,   // \z   Unconditional end of line.

     URX_DOTANY_ALL    = 21,   // ., in the . matches any mode.
     URX_BACKSLASH_D   = 22,   // Value field:  0:  \d    1:  \D
     URX_CARET         = 23,   // Value field:  1:  multi-line mode.
     URX_DOLLAR        = 24,  // Also for \Z

     URX_CTR_INIT      = 25,   // Counter Inits for {Interval} loops.
     URX_CTR_INIT_NG   = 26,   //   3 kinds, normal, non-greedy, and possessive.
                               //   These are 4 word opcodes.  See description.
                               //    First Operand:  Data loc of counter variable
                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx 
                               //                    at the end of the loop.
                               //    3rd   Operand:  Minimum count.
                               //    4th   Operand:  Max count, -1 for unbounded.

     URX_DOTANY_PL     = 27,   // .+, match rest of the line.  Fail already at end.

     URX_CTR_LOOP      = 28,   // Loop Ops for {interval} loops.
     URX_CTR_LOOP_NG   = 29,   //   Also in three flavors.
                               //   Operand is loc of corresponding CTR_INIT.

     URX_DOTANY_ALL_PL = 30,   // .+, match rest of the Input.  Fail if already at end

     URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers
                               //   back into compiled pattern code, and thus must
                               //   be relocated when inserting/deleting ops in code.

     URX_STO_SP        = 32,   // Store the stack ptr.  Operand is location within
                               //   matcher data (not stack data) to store it.
     URX_LD_SP         = 33,   // Load the stack pointer.  Operand is location
                               //   to load from.
     URX_BACKREF       = 34,   // Back Reference.  Parameter is the index of the
                               //   capture group variables in the state stack frame.
     URX_STO_INP_LOC   = 35,   // Store the input location.  Operand is location
                               //   within the matcher stack frame.
     URX_JMPX          = 36,  // Conditional JMP.
                               //   First Operand:  JMP target location.
                               //   Second Operand:  Data location containing an 
                               //     input position.  If current input position ==
                               //     saved input position, FAIL rather than taking
                               //     the JMP
     URX_LA_START      = 37,   // Starting a LookAround expression.
                               //   Save InputPos and SP in static data.
                               //   Operand:  Static data offset for the save
     URX_LA_END        = 38,   // Ending a Lookaround expression.
                               //   Restore InputPos and Stack to saved values.
                               //   Operand:  Static data offset for saved data.
     URX_ONECHAR_I     = 39,   // Test for case-insensitive match of a literal character.
                               //   Operand:  the literal char.
     URX_STRING_I      = 40,   // Case insensitive string compare.
                               //   First Operand:  Index of start of string in string literals
                               //   Second Operand (next word in compiled code):
                               //     the length of the string.
     URX_BACKREF_I     = 41,   // Case insensitive back reference.
                               //   Parameter is the index of the
                               //   capture group variables in the state stack frame.
     URX_DOLLAR_M      = 42,   // $ in multi-line mode.
     URX_CARET_M       = 43,   // ^ in multi-line mode.
     URX_LB_START      = 44,   // LookBehind Start.
                               //   Paramater is data location
     URX_LB_CONT       = 45,   // LookBehind Continue.
                               //   Param 0:  the data location
                               //   Param 1:  The minimum length of the look-behind match
                               //   Param 2:  The max length of the look-behind match
     URX_LB_END        = 46,   // LookBehind End.
                               //   Parameter is the data location.
                               //     Check that match ended at the right spot,
                               //     Restore original input string len.
     URX_LBN_CONT      = 47,   // Negative LookBehind Continue
                               //   Param 0:  the data location
                               //   Param 1:  The minimum length of the look-behind match
                               //   Param 2:  The max     length of the look-behind match
                               //   Param 3:  The pattern loc following the look-behind block.
     URX_LBN_END       = 48,   // Negative LookBehind end
                               //   Parameter is the data location.
                               //   Check that the match ended at the right spot.
     URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated  
                               //   Operand is index of set in array of sets.
     URX_LOOP_SR_I     = 50,   // Init a [set]* loop.
                               //   Operand is the sets index in array of user sets.
     URX_LOOP_C        = 51,   // Continue a [set]* or OneChar* loop.
                               //   Operand is a matcher static data location.
                               //   Must always immediately follow  LOOP_x_I instruction.
     URX_LOOP_DOT_I    = 52    // .*, initialization of the optimized loop.
                               //   Operand value:
                               //      0:  Normal (. doesn't match new-line) mode.
                               //      1:  . matches new-line mode.

};           

// Keep this list of opcode names in sync with the above enum
//   Used for debug printing only.
#define URX_OPCODE_NAMES       \
        "               ",     \
        "BACKTRACK",           \
        "END",                 \
        "ONECHAR",             \
        "STRING",              \
        "STRING_LEN",          \
        "STATE_SAVE",          \
        "NOP",                 \
        "START_CAPTURE",       \
        "END_CAPTURE",         \
        "URX_STATIC_SETREF",   \
        "SETREF",              \
        "DOTANY",              \
        "JMP",                 \
        "FAIL",                \
        "JMP_SAV",             \
        "BACKSLASH_B",         \
        "BACKSLASH_G",         \
        "JMP_SAV_X",           \
        "BACKSLASH_X",         \
        "BACKSLASH_Z",         \
        "DOTANY_ALL",          \
        "BACKSLASH_D",         \
        "CARET",               \
        "DOLLAR",              \
        "CTR_INIT",            \
        "CTR_INIT_NG",         \
        "DOTANY_PL",           \
        "CTR_LOOP",            \
        "CTR_LOOP_NG",         \
        "DOTANY_ALL_PL",       \
        "RELOC_OPRND",         \
        "STO_SP",              \
        "LD_SP",               \
        "BACKREF",             \
        "STO_INP_LOC",         \
        "JMPX",                \
        "LA_START",            \
        "LA_END",              \
        "ONECHAR_I",           \
        "STRING_I",            \
        "BACKREF_I",           \
        "DOLLAR_M",            \
        "CARET_M",             \
        "LB_START",            \
        "LB_CONT",             \
        "LB_END",              \
        "LBN_CONT",            \
        "LBN_END",             \
        "STAT_SETREF_N",       \
        "LOOP_SR_I",           \
        "LOOP_C",              \
        "LOOP_DOT_I"

//
//  Convenience macros for assembling and disassembling a compiled operation.
//
#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
#define URX_TYPE(x)          ((uint32_t)(x) >> 24) 
#define URX_VAL(x)           ((x) & 0xffffff)

//
//  Access to Unicode Sets composite character properties
//     The sets are accessed by the match engine for things like \w (word boundary)
//     
enum {
     URX_ISWORD_SET  = 1,
     URX_ISALNUM_SET = 2,
     URX_ISALPHA_SET = 3,
     URX_ISSPACE_SET = 4,

     URX_GC_NORMAL,          // Sets for finding grapheme cluster boundaries.
     URX_GC_EXTEND,
     URX_GC_CONTROL,
     URX_GC_L,
     URX_GC_LV,
     URX_GC_LVT,
     URX_GC_V,
     URX_GC_T,

     URX_LAST_SET,

     URX_NEG_SET     = 0x800000          // Flag bit to reverse sense of set
                                         //   membership test.
};

//
//  Match Engine State Stack Frame Layout.
//
struct REStackFrame {
    int32_t            fInputIdx;        // Position of next character in the input string
    int32_t            fPatIdx;          // Position of next Op in the compiled pattern
    int32_t            fExtra[2];        // Extra state, for capture group start/ends
                                         //   atomic parentheses, repeat counts, etc.
                                         //   Locations assigned at pattern compile time.
};

//
//  Start-Of-Match type.  Used by find() to quickly scan to positions where a
//                        match might start before firing up the full match engine.
//
enum StartOfMatch {
    START_NO_INFO,             // No hint available.
    START_CHAR,                // Match starts with a literal code point.
    START_SET,                 // Match starts with something matching a set.
    START_START,               // Match starts at start of buffer only (^ or \A)
    START_LINE,                // Match starts with ^ in multi-line mode.
    START_STRING               // Match starts with a literal string.
};

#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
                               (v)==START_CHAR?    "START_CHAR"    : \
                               (v)==START_SET?     "START_SET"     : \
                               (v)==START_START?   "START_START"   : \
                               (v)==START_LINE?    "START_LINE"    : \
                               (v)==START_STRING?  "START_STRING"  : \
                                                   "ILLEGAL")

//
//  8 bit set, to fast-path latin-1 set membership tests.
//
struct Regex8BitSet : public UMemory {
    inline Regex8BitSet();
    inline void operator = (const Regex8BitSet &s);
    inline void init(const UnicodeSet *src);
    inline UBool contains(UChar32 c);
    inline void  add(UChar32 c);
    int8_t d[32];
};

inline Regex8BitSet::Regex8BitSet() {
    uprv_memset(d, 0, sizeof(d));
}

inline UBool Regex8BitSet::contains(UChar32 c) {
    // No bounds checking!  This is deliberate.
    return ((d[c>>3] & 1 <<(c&7)) != 0);
};

inline void  Regex8BitSet::add(UChar32 c) {
    d[c>>3] |= 1 << (c&7);
};

inline void Regex8BitSet::init(const UnicodeSet *s) {
    if (s != NULL) {
        for (int i=0; i<255; i++) {
            if (s->contains(i)) {
                this->add(i);
            }
        }
    }
}

inline void Regex8BitSet::operator = (const Regex8BitSet &s) {
   uprv_memcpy(d, s.d, sizeof(d));
}

U_NAMESPACE_END
#endif

--- NEW FILE: regexst.cpp ---
//
//  regexst.h
//
//  Copyright (C) 2003, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains class RegexStaticSets
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//
//  RegexStaticSets groups together the common UnicodeSets that are needed
//   for compiling or executing RegularExpressions.  This grouping simplifies
//   the thread safe lazy creation and sharing of these sets across
//   all instances of regular expressions.
//
#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/regex.h"
#include "uprops.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "ucln_in.h"
#include "umutex.h"

#include "regexcst.h"   // Contains state table for the regex pattern parser.
                        //   generated by a Perl script.
#include "regexst.h"

U_NAMESPACE_BEGIN

//----------------------------------------------------------------------------------------
//
// Unicode Set pattern strings for all of the required constant sets.
//               Initialized with hex values for portability to EBCDIC based machines.
//                Really ugly, but there's no good way to avoid it.
//
//----------------------------------------------------------------------------------------

// "Rule Char" Characters are those with no special meaning, and therefore do not
//    need to be escaped to appear as literals in a regexp.  Expressed
//    as the inverse of those needing escaping --  [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
static const UChar gRuleSet_rule_char_pattern[]       = {
 //   [    ^      \     *     \     ?     \     +     \     [     \     (     /     )
    0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
 //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
    0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};

static const UChar gRuleSet_digit_char_pattern[] = {
//    [    0      -    9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
//static const UnicodeSet *gRuleDigits = NULL;

//
//   Here are the backslash escape characters that ICU's unescape() function
//    will handle.
//
static const UChar gUnescapeCharPattern[] = {
//    [     a     c     e     f     n     r     t     u     U     x    ]
    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};

//
//  White space characters that may appear within a pattern in free-form mode
//
static const UChar gRuleWhiteSpacePattern[] = {
    /* "[[:Cf:][:WSpace:]]" */
    91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
        83, 112, 97, 99, 101, 58, 93, 93, 0 };

//
//  Unicode Set Definitions for Regular Expression  \w
//
static const UChar gIsWordPattern[] = {
//    [     \     p     {     L     l     }     \     p     {     L     u     }
    0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x6c, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x75, 0x7d,
//          \     p     {     L     t     }     \     p     {     L     o     }
          0x5c, 0x70, 0x7b, 0x4c, 0x74, 0x7d, 0x5c, 0x70, 0x7b, 0x4c, 0x6f, 0x7d,
//          \     p     {     N     d     }     _     ]
          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, 0x5f, 0x5d, 0};

//
//  Unicode Set Definitions for Regular Expression  \s
//
    static const UChar gIsSpacePattern[] = {
//    [     \     t     \     n     \     f     \     r     \     p     {     Z     }     ]
    0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d,  0};

//
//  UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
    static const UChar gGC_ControlPattern[] = {
//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]    
    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, 
//    [     :     C     c     :     ]     [     :     C     f     :     ]     ] 
    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};

    static const UChar gGC_ExtendPattern[] = {
//    [     \     p     {     G     r     a     p     h     e     m     e     _
    0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
//    E     x     t     e     n     d     }     ]
    0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};

    static const UChar gGC_LPattern[] = {
//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
//    l     a     b     l     e     _     T     y     p     e     =     L     }     ]
    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d,  0x5d, 0}; 

    static const UChar gGC_VPattern[] = {
//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
//    l     a     b     l     e     _     T     y     p     e     =     V     }     ]
    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d,  0x5d, 0}; 

    static const UChar gGC_TPattern[] = {
//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
//    l     a     b     l     e     _     T     y     p     e     =     T     }    ]
    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; 

    static const UChar gGC_LVPattern[] = {
//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
//    l     a     b     l     e     _     T     y     p     e     =     L     V     }     ]
    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; 

    static const UChar gGC_LVTPattern[] = {
//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
//    l     a     b     l     e     _     T     y     p     e     =     L     V     T     }     ]
    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0}; 

RegexStaticSets *RegexStaticSets::gStaticSets = NULL;

RegexStaticSets::RegexStaticSets(UErrorCode *status) {
    // First zero out everything  
    int i;
    for (i=0; i<URX_LAST_SET; i++) {
        fPropSets[i] = NULL;
    }
    for (i=0; i<10; i++) {
        fRuleSets[i] = NULL;
    }
    fUnescapeCharSet = NULL;
    fRuleDigits      = NULL;
    fEmptyString     = NULL;

    // Then init the sets to their correct values.
    fPropSets[URX_ISWORD_SET]  = new UnicodeSet(gIsWordPattern,     *status);
    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern,    *status);    
    fPropSets[URX_GC_EXTEND]   = new UnicodeSet(gGC_ExtendPattern,  *status);
    fPropSets[URX_GC_CONTROL]  = new UnicodeSet(gGC_ControlPattern, *status);
    fPropSets[URX_GC_L]        = new UnicodeSet(gGC_LPattern,       *status);
    fPropSets[URX_GC_V]        = new UnicodeSet(gGC_VPattern,       *status);
    fPropSets[URX_GC_T]        = new UnicodeSet(gGC_TPattern,       *status);
    fPropSets[URX_GC_LV]       = new UnicodeSet(gGC_LVPattern,      *status);
    fPropSets[URX_GC_LVT]      = new UnicodeSet(gGC_LVTPattern,     *status);
    if (U_FAILURE(*status)) {
        // Bail out if we were unable to create the above sets.
        // The rest of the initialization needs them, so we cannot proceed.
        return;
    }

    //
    // The following sets  are dynamically constructed, because their
    //   intialization strings would be unreasonable.
    //

    //
    //  "Normal" is the set of characters that don't need special handling
    //            when finding grapheme cluster boundaries.
    //
    fPropSets[URX_GC_NORMAL] = new UnicodeSet;
    fPropSets[URX_GC_NORMAL]->complement();
    fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);

    // Initialize the 8-bit fast bit sets from the parallel full
    //   UnicodeSets.
    for (i=0; i<URX_LAST_SET; i++) {
        fPropSets8[i].init(fPropSets[i]);
    }

    // Sets used while parsing rules, but not referenced from the parse state table
    fRuleSets[kRuleSet_rule_char-128]   = new UnicodeSet(gRuleSet_rule_char_pattern,  *status);
    fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(gRuleWhiteSpacePattern,      *status);
    fRuleSets[kRuleSet_digit_char-128]  = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
    fRuleDigits                         = new UnicodeSet(gRuleSet_digit_char_pattern, *status);
    fUnescapeCharSet                    = new UnicodeSet(gUnescapeCharPattern,        *status);

    // Empty UnicodeString, for use by matchers with NULL input.
    fEmptyString = new UnicodeString;
};

RegexStaticSets::~RegexStaticSets() {
    int i;

    for (i=0; i<URX_LAST_SET; i++) {
        delete fPropSets[i];
        fPropSets[i] = NULL;
    }
    for (i=0; i<10; i++) {
        delete fRuleSets[i];
        fRuleSets[i] = NULL;
    }
    delete fUnescapeCharSet;
    fUnescapeCharSet = NULL;
    delete fRuleDigits;
    fRuleDigits = NULL;
    delete fEmptyString;
    fEmptyString = NULL;
};

void RegexStaticSets::initGlobals(UErrorCode *status) {
    umtx_lock(NULL);
    RegexStaticSets *p = gStaticSets;
    umtx_unlock(NULL);
    if (p == NULL) {
        p = new RegexStaticSets(status);
        if (U_FAILURE(*status)) {
            delete p;
            return;
        }
        umtx_lock(NULL);
        if (gStaticSets == NULL) {
            gStaticSets = p;
            p = NULL;
        }
        umtx_unlock(NULL);
        if (p) {
            delete p;
        }
        ucln_i18n_registerCleanup();
    }
}

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

--- NEW FILE: regexst.h ---
//
//  regexst.h
//
//  Copyright (C) 2003, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for the class RegexStaticSets
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//
//  RegexStaticSets groups together the common UnicodeSets that are needed
//   for compiling or executing RegularExpressions.  This grouping simplifies
//   the thread safe lazy creation and sharing of these sets across
//   all instances of regular expressions.
//

#ifndef REGEXST_H
#define REGEXST_H

#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "regeximp.h"

U_NAMESPACE_BEGIN

class  UnicodeSet;

class RegexStaticSets : public UMemory {
public:
    static RegexStaticSets *gStaticSets;  // Ptr to all lazily initialized constant
                                          //   shared sets.

    RegexStaticSets(UErrorCode *status);         
    ~RegexStaticSets();
    static void    initGlobals(UErrorCode *status);

    UnicodeSet    *fPropSets[URX_LAST_SET];     // The sets for common regex items, e.g. \s
    Regex8BitSet   fPropSets8[URX_LAST_SET];    // Fast bitmap sets for latin-1 range for above.

    UnicodeSet    *fRuleSets[10];               // Sets used while parsing regexp patterns.
    UnicodeSet    *fUnescapeCharSet;            // Set of chars handled by unescape when
                                                //   encountered with a \ in a pattern.
    UnicodeSet    *fRuleDigits;
    UnicodeString *fEmptyString;                // An empty string, to be used when a matcher
                                                //   is created with no input.

};

U_NAMESPACE_END
#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif   // REGEXST_H

--- NEW FILE: rematch.cpp ---
//
//  file:  rematch.cpp    
//
//         Contains the implementation of class RegexMatcher,
//         which is one of the main API classes for the ICU regular expression package.
//
/*
**************************************************************************
*   Copyright (C) 2002-2003 International Business Machines Corporation  *
*   and others. All rights reserved.                                     *
**************************************************************************
*/

#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/regex.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
[...2091 lines suppressed...]
        if (fTraceDebug) {
            REGEX_RUN_DEBUG_PRINTF("No match\n\n");
        }
    }

    fFrame = fp;                // The active stack frame when the engine stopped.
                                //   Contains the capture group results that we need to
                                //    access later.

    return;
}

const char RegexMatcher::fgClassID = 0;

U_NAMESPACE_END

#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

--- NEW FILE: repattrn.cpp ---
//
//  file:  repattrn.cpp    
//
/*
***************************************************************************
*   Copyright (C) 2002-2003 International Business Machines Corporation   *
*   and others. All rights reserved.                                      *
***************************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

#include "unicode/regex.h"
#include "uassert.h"
#include "uvector.h"
#include "uvectr32.h"
#include "regexcmp.h"
#include "regeximp.h"
#include "regexst.h"

U_NAMESPACE_BEGIN

//--------------------------------------------------------------------------
//
//    RegexPattern    Default Constructor
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern() {
    // Init all of this instances data.
    init();

    // Lazy init of all shared global sets.
    RegexStaticSets::initGlobals(&fDeferredStatus);
};

//--------------------------------------------------------------------------
//
//   Copy Constructor        Note:  This is a rather inefficient implementation,
//                                  but it probably doesn't matter.
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
    init(); 
    *this = other;
}

//--------------------------------------------------------------------------
//
//    Assignmenet Operator
//
//--------------------------------------------------------------------------
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    if (this == &other) {
        // Source and destination are the same.  Don't do anything.
        return *this;
    }

    // Clean out any previous contents of object being assigned to.
    zap();

    // Give target object a default initialization
    init();

    // Copy simple fields
    fPattern          = other.fPattern;
    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fDeferredStatus   = other.fDeferredStatus;
    fMinMatchLen      = other.fMinMatchLen;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
    fStaticSets       = other.fStaticSets; 

    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    *fInitialChars    = *other.fInitialChars;
    *fInitialChars8   = *other.fInitialChars8;
    fInitialChar      = other.fInitialChar;

    //  Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

    //  Copy the Unicode Sets.  
    //    Could be made more efficient if the sets were reference counted and shared,
    //    but I doubt that pattern copying will be particularly common. 
    //    Note:  init() already added an empty element zero to fSets
    int32_t i;
    int32_t  numSets = other.fSets->size();
    fSets8 = new Regex8BitSet[numSets];
    for (i=1; i<numSets; i++) {
        if (U_FAILURE(fDeferredStatus)) {
            return *this;
        }
        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
        if (newSet == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
        fSets->addElement(newSet, fDeferredStatus);
        fSets8[i] = other.fSets8[i];
    }

    return *this;
}

//--------------------------------------------------------------------------
//
//    init        Shared initialization for use by constructors.
//                Bring an uninitialized RegexPattern up to a default state.
//
//--------------------------------------------------------------------------
void RegexPattern::init() {
    fFlags            = 0;
    fDeferredStatus   = U_ZERO_ERROR;
    fMinMatchLen      = 0;
    fMaxCaptureDigits = 1;  
    fStaticSets       = NULL;
    fFrameSize        = 0;
    fDataSize         = 0;
    fStartType        = START_NO_INFO;
    fInitialStringIdx = 0;
    fInitialStringLen = 0;
    fInitialChars     = NULL;
    fInitialChars8    = NULL;
    fInitialChar      = 0;
    fSets8            = NULL;

    fCompiledPat      = new UVector32(fDeferredStatus);
    fGroupMap         = new UVector32(fDeferredStatus);
    fSets             = new UVector(fDeferredStatus);
    fInitialChars     = new UnicodeSet;
    fInitialChars8    = new Regex8BitSet;
    if (U_FAILURE(fDeferredStatus)) {
        return;
    }
    if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
        fInitialChars == NULL || fInitialChars8 == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Slot zero of the vector of sets is reserved.  Fill it here.
    fSets->addElement((int32_t)0, fDeferredStatus);
}

//--------------------------------------------------------------------------
//
//   zap            Delete everything owned by this RegexPattern. 
//
//--------------------------------------------------------------------------
void RegexPattern::zap() {
    delete fCompiledPat;
    fCompiledPat = NULL;
    int i;
    for (i=1; i<fSets->size(); i++) {
        UnicodeSet *s;
        s = (UnicodeSet *)fSets->elementAt(i);
        if (s != NULL) {
            delete s;
        }
    }
    delete fSets;
    fSets = NULL;
    delete fGroupMap;
    fGroupMap = NULL;
    delete fInitialChars;
    fInitialChars = NULL;
    delete fInitialChars8;
    fInitialChars8 = NULL;
    delete[] fSets8;
    fSets8 = NULL;
}

//--------------------------------------------------------------------------
//
//   Destructor
//
//--------------------------------------------------------------------------
RegexPattern::~RegexPattern() {
    zap();
};

//--------------------------------------------------------------------------
//
//   Clone
//
//--------------------------------------------------------------------------
RegexPattern  *RegexPattern::clone() const { 
    RegexPattern  *copy = new RegexPattern(*this);
    return copy;
};

//--------------------------------------------------------------------------
//
//   operator ==   (comparison)    Consider to patterns to be == if the
//                                 pattern strings and the flags are the same.
//
//--------------------------------------------------------------------------
UBool   RegexPattern::operator ==(const RegexPattern &other) const {
    UBool r = this->fFlags    == other.fFlags &&
              this->fPattern  == other.fPattern &&
              this->fDeferredStatus == other.fDeferredStatus;
    return r;
}

//---------------------------------------------------------------------
//
//   compile        
//
//---------------------------------------------------------------------
RegexPattern  *RegexPattern::compile(
                             const UnicodeString &regex,
                             uint32_t             flags,
                             UParseError          &pe,
                             UErrorCode           &status)  {

    if (U_FAILURE(status)) {
        return NULL;
    }

    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
                              UREGEX_DOTALL   | UREGEX_MULTILINE;

    if ((flags & ~allFlags) != 0) {
        status = U_REGEX_INVALID_FLAG;
        return NULL;
    }

    if ((flags & UREGEX_CANON_EQ) != 0) {
        status = U_REGEX_UNIMPLEMENTED;
        return NULL;
    }

    RegexPattern *This = new RegexPattern;
    if (This == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(This->fDeferredStatus)) {
        status = This->fDeferredStatus;
        return NULL;
    }
    This->fFlags = flags;

    RegexCompile     compiler(This, status);
    compiler.compile(regex, pe, status);

    return This;
};

//
//   compile with default flags.
//
RegexPattern *RegexPattern::compile( const UnicodeString &regex,
        UParseError          &pe,
        UErrorCode           &err) 
{
    return compile(regex, 0, pe, err); 
}

//
//   compile with no UParseErr parameter.
//
RegexPattern *RegexPattern::compile( const UnicodeString &regex,
        uint32_t             flags,
        UErrorCode           &err) 
{
    UParseError pe;
    return compile(regex, flags, pe, err); 
}

//---------------------------------------------------------------------
//
//   flags
//
//---------------------------------------------------------------------
uint32_t RegexPattern::flags() const {
    return fFlags;
}

//---------------------------------------------------------------------
//
//   matcher(UnicodeString, err)
//
//---------------------------------------------------------------------
RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode          &status)  const {
    RegexMatcher    *retMatcher = matcher(status);
    if (retMatcher != NULL) {
        retMatcher->reset(input);
    }
    return retMatcher;
};

//---------------------------------------------------------------------
//
//   matcher(status)
//
//---------------------------------------------------------------------
RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
    RegexMatcher    *retMatcher = NULL;

    if (U_FAILURE(status)) {
        return NULL;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return NULL;
    }

    retMatcher = new RegexMatcher(this); 
    if (retMatcher == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    return retMatcher;
};

//---------------------------------------------------------------------
//
//   matches        Convenience function to test for a match, starting
//                  with a pattern string and a data string.
//
//---------------------------------------------------------------------
UBool RegexPattern::matches(const UnicodeString   &regex,
              const UnicodeString   &input,
                    UParseError     &pe,
                    UErrorCode      &status) {

    if (U_FAILURE(status)) {return FALSE;}

    UBool         retVal;
    RegexPattern *pat     = NULL;
    RegexMatcher *matcher = NULL;

    pat     = RegexPattern::compile(regex, 0, pe, status);
    matcher = pat->matcher(input, status);
    retVal  = matcher->matches(status);

    delete matcher;
    delete pat;
    return retVal;
}

//---------------------------------------------------------------------
//
//   pattern
//
//---------------------------------------------------------------------
UnicodeString RegexPattern::pattern() const {
    return fPattern;
}

//---------------------------------------------------------------------
//
//   split
//
//---------------------------------------------------------------------
int32_t  RegexPattern::split(const UnicodeString &input,
        UnicodeString    dest[],
        int32_t          destCapacity,
        UErrorCode       &status) const
{
    if (U_FAILURE(status)) {
        return 0;
    };

    RegexMatcher  m(this);
    int32_t r = m.split(input, dest, destCapacity, status);
    return r;
}

//---------------------------------------------------------------------
//
//   dump    Output the compiled form of the pattern.
//           Debugging function only.
//
//---------------------------------------------------------------------
void   RegexPattern::dumpOp(int32_t index) const {
#if defined(REGEX_DEBUG)
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);
    int32_t pinnedType  = type;
    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_DOTANY_PL:
    case URX_DOTANY_ALL_PL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF("%c", c);
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
            }
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString s;
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF("NOT ");
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
            }
        }
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF("??????");
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
#endif
}

void   RegexPattern::dump() const {
#if defined(REGEX_DEBUG)
    int      index;
    int      i;

    REGEX_DUMP_DEBUG_PRINTF("Original Pattern:  ");
    for (i=0; i<fPattern.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
    REGEX_DUMP_DEBUG_PRINTF("   Min Match Length:  %d\n", fMinMatchLen);
    REGEX_DUMP_DEBUG_PRINTF("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));   
    if (fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF("    Initial match sting: \"");
        for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]);   // TODO:  non-printables, surrogates.
        }

    } else if (fStartType == START_SET) {
        int32_t numSetChars = fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF("     Match First Chars : ");
        for (i=0; i<numSetChars; i++) {
            UChar32 c = fInitialChars->charAt(i);
            if (0x20<c && c <0x7e) { 
                REGEX_DUMP_DEBUG_PRINTF("%c ", c);
            } else {
                REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
            }
        }
        if (numSetChars < fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF(" ...");
        }
        REGEX_DUMP_DEBUG_PRINTF("\n");

    } else if (fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF("    First char of Match : ");
        if (0x20 < fInitialChar && fInitialChar<0x7e) {
                REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
            } else {
                REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
            }
    }

    REGEX_DUMP_DEBUG_PRINTF("\nIndex   Binary     Type             Operand\n"
           "-------------------------------------------\n");
    for (index = 0; index<fCompiledPat->size(); index++) {
        dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF("\n\n");
#endif
};

const char RegexPattern::fgClassID = 0;

//----------------------------------------------------------------------------------
//
//   regex_cleanup      Memory cleanup function, free/delete all
//                      cached memory.  Called by ICU's u_cleanup() function.
//
//----------------------------------------------------------------------------------
U_CFUNC UBool 
regex_cleanup(void) {
    RegexCompile::cleanup();
    return TRUE;
};

U_NAMESPACE_END
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS

--- NEW FILE: strrepl.cpp ---
/*
**********************************************************************
*   Copyright (c) 2002, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/21/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "strrepl.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"

U_NAMESPACE_BEGIN

const UChar EMPTY[] = { 0 }; // empty string: ""

const char StringReplacer::fgClassID=0;

/**
 * Construct a StringReplacer that sets the emits the given output
 * text and sets the cursor to the given position.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theCursorPos cursor position that will be returned by
 * the replace() method
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               int32_t theCursorPos,
                               const TransliterationRuleData* theData) {
    output = theOutput;
    cursorPos = theCursorPos;
    hasCursor = TRUE;
    data = theData;
    isComplex = TRUE;
}

/**
 * Construct a StringReplacer that sets the emits the given output
 * text and does not modify the cursor.
 * @param theOutput text that will replace input text when the
 * replace() method is called.  May contain stand-in characters
 * that represent nested replacers.
 * @param theData transliterator context object that translates
 * stand-in characters to UnicodeReplacer objects
 */
StringReplacer::StringReplacer(const UnicodeString& theOutput,
                               const TransliterationRuleData* theData) {
    output = theOutput;
    cursorPos = 0;
    hasCursor = FALSE;
    data = theData;
    isComplex = TRUE;
}

/**
 * Copy constructor.
 */
StringReplacer::StringReplacer(const StringReplacer& other) {
    output = other.output;
    cursorPos = other.cursorPos;
    hasCursor = other.hasCursor;
    data = other.data;
    isComplex = other.isComplex;
}

/**
 * Destructor
 */
StringReplacer::~StringReplacer() {
}

/**
 * Implement UnicodeFunctor
 */
UnicodeFunctor* StringReplacer::clone() const {
    return new StringReplacer(*this);
}

/**
 * Implement UnicodeFunctor
 */
UnicodeReplacer* StringReplacer::toReplacer() const {
    return (UnicodeReplacer*) this;
}

/**
 * UnicodeReplacer API
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when addition characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    }        

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                --n;
            }
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}

/**
 * UnicodeReplacer API
 */
UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
                                                 UBool escapeUnprintable) const {
    rule.truncate(0);
    UnicodeString quoteBuf;

    int32_t cursor = cursorPos;

    // Handle a cursor preceding the output
    if (hasCursor && cursor < 0) {
        while (cursor++ < 0) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        // Fall through and append '|' below
    }

    for (int32_t i=0; i<output.length(); ++i) {
        if (hasCursor && i == cursor) {
            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i); // Ok to use 16-bits here

        UnicodeReplacer* r = data->lookupReplacer(c);
        if (r == NULL) {
            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
            UnicodeString buf;
            r->toReplacerPattern(buf, escapeUnprintable);
            buf.insert(0, (UChar)0x20);
            buf.append((UChar)0x20);
            ICU_Utility::appendToRule(rule, buf,
                                      TRUE, escapeUnprintable, quoteBuf);
        }
    }

    // Handle a cursor after the output.  Use > rather than >= because
    // if cursor == output.length() it is at the end of the output,
    // which is the default position, so we need not emit it.
    if (hasCursor && cursor > output.length()) {
        cursor -= output.length();
        while (cursor-- > 0) {
            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }
    // Flush quoteBuf out to result
    ICU_Utility::appendToRule(rule, -1,
                              TRUE, escapeUnprintable, quoteBuf);

    return rule;
}

/**
 * Implement UnicodeReplacer
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
	ch = output.char32At(i);
	UnicodeReplacer* r = data->lookupReplacer(ch);
	if (r == NULL) {
	    toUnionTo.add(ch);
	} else {
	    r->addReplacementSetTo(toUnionTo);
	}
    }
}

/**
 * UnicodeFunctor API
 */
void StringReplacer::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<output.length()) {
        UChar32 c = output.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += UTF_CHAR_LENGTH(c);
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof

--- NEW FILE: strrepl.h ---
/*
**********************************************************************
*   Copyright (c) 2002, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/21/2002  aliu        Creation.
**********************************************************************
*/

#ifndef STRREPL_H
#define STRREPL_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unifunct.h"
#include "unicode/unirepl.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * A replacer that produces static text as its output. The text may
 * contain transliterator stand-in characters that represent nested
 * UnicodeReplacer objects, making it possible to encode a tree of
 * replacers in a StringReplacer. A StringReplacer that contains such
 * stand-ins is called a complex StringReplacer. A complex
 * StringReplacer has a slower processing loop than a non-complex one.
 * @author Alan Liu
 */
class StringReplacer : public UnicodeFunctor, public UnicodeReplacer {

 private:

    /**
     * Output text, possibly containing stand-in characters that
     * represent nested UnicodeReplacers.
     */
    UnicodeString output;

    /**
     * Cursor position.  Value is ignored if hasCursor is false.
     */
    int32_t cursorPos;

    /**
     * True if this object outputs a cursor position.
     */
    UBool hasCursor;

    /**
     * A complex object contains nested replacers and requires more
     * complex processing.  StringReplacers are initially assumed to
     * be complex.  If no nested replacers are seen during processing,
     * then isComplex is set to false, and future replacements are
     * short circuited for better performance.
     */
    UBool isComplex;

    /**
     * Object that translates stand-in characters in 'output' to
     * UnicodeReplacer objects.
     */
    const TransliterationRuleData* data;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;

 public:

    /**
     * Construct a StringReplacer that sets the emits the given output
     * text and sets the cursor to the given position.
     * @param theOutput text that will replace input text when the
     * replace() method is called.  May contain stand-in characters
     * that represent nested replacers.
     * @param theCursorPos cursor position that will be returned by
     * the replace() method
     * @param theData transliterator context object that translates
     * stand-in characters to UnicodeReplacer objects
     */
    StringReplacer(const UnicodeString& theOutput,
                   int32_t theCursorPos,
                   const TransliterationRuleData* theData);

    /**
     * Construct a StringReplacer that sets the emits the given output
     * text and does not modify the cursor.
     * @param theOutput text that will replace input text when the
     * replace() method is called.  May contain stand-in characters
     * that represent nested replacers.
     * @param theData transliterator context object that translates
     * stand-in characters to UnicodeReplacer objects
     */
    StringReplacer(const UnicodeString& theOutput,
                   const TransliterationRuleData* theData);

    /**
     * Copy constructor.
     */
    StringReplacer(const StringReplacer& other);

    /**
     * Destructor
     */
    virtual ~StringReplacer();

    /**
     * Implement UnicodeFunctor
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * UnicodeReplacer API
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * UnicodeReplacer API
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
                                             UBool escapeUnprintable) const;

    /**
     * Implement UnicodeReplacer
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

    /**
     * UnicodeFunctor API
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

//eof

--- NEW FILE: tridpars.cpp ---
/*
**********************************************************************
*   Copyright (c) 2002-2003, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/14/2002  aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "tridpars.h"
#include "hash.h"
#include "mutex.h"
#include "ucln_in.h"
#include "unicode/parsepos.h"
#include "unicode/translit.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utrans.h"
#include "util.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

static const UChar ID_DELIM    = 0x003B; // ;
static const UChar TARGET_SEP  = 0x002D; // -
static const UChar VARIANT_SEP = 0x002F; // /
static const UChar OPEN_REV    = 0x0028; // (
static const UChar CLOSE_REV   = 0x0029; // )

static const UChar EMPTY[]     = {0}; // ""
static const UChar ANY[]       = {65,110,121,0}; // "Any"
static const UChar ANY_NULL[]  = {65,110,121,45,78,117,108,108,0}; // "Any-Null"

static const int32_t FORWARD = UTRANS_FORWARD;
static const int32_t REVERSE = UTRANS_REVERSE;

static Hashtable* SPECIAL_INVERSES = NULL;

/**
 * The mutex controlling access to SPECIAL_INVERSES
 */
static UMTX LOCK = 0;

TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
                                     const UnicodeString& v, UBool sawS,
                                     const UnicodeString& f) {
    source = s;
    target = t;
    variant = v;
    sawSource = sawS;
    filter = f;
}

TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
                                           const UnicodeString& f) {
    canonID = c;
    basicID = b;
    filter = f;
}

TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
    canonID = c;
    basicID = b;
}

Transliterator* TransliteratorIDParser::SingleID::createInstance() {
    Transliterator* t;
    if (basicID.length() == 0) {
        t = createBasicInstance(ANY_NULL, &canonID);
    } else {
        t = createBasicInstance(basicID, &canonID);
    }
    if (t != NULL) {
        if (filter.length() != 0) {
            UErrorCode ec = U_ZERO_ERROR;
            UnicodeSet *set = new UnicodeSet(filter, ec);
            if (U_FAILURE(ec)) {
                delete set;
            } else {
                t->adoptFilter(set);
            }
        }
    }
    return t;
}

/**
 * Parse a single ID, that is, an ID of the general form
 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
 * optional, the filters optional, and the variants optional.
 * @param id the id to be parsed
 * @param pos INPUT-OUTPUT parameter.  On input, the position of
 * the first character to parse.  On output, the position after
 * the last character parsed.
 * @param dir the direction.  If the direction is REVERSE then the
 * SingleID is constructed for the reverse direction.
 * @return a SingleID object or NULL
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
                                      int32_t dir) {

    int32_t start = pos;

    // The ID will be of the form A, A(), A(B), or (B), where
    // A and B are filter IDs.
    Specs* specsA = NULL;
    Specs* specsB = NULL;
    UBool sawParen = FALSE;

    // On the first pass, look for (B) or ().  If this fails, then
    // on the second pass, look for A, A(B), or A().
    for (int32_t pass=1; pass<=2; ++pass) {
        if (pass == 2) {
            specsA = parseFilterID(id, pos, TRUE);
            if (specsA == NULL) {
                pos = start;
                return NULL;
            }
        }
        if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
            sawParen = TRUE;
            if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
                specsB = parseFilterID(id, pos, TRUE);
                // Must close with a ')'
                if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
                    delete specsA;
                    pos = start;
                    return NULL;
                }
            }
            break;
        }
    }

    // Assemble return results
    SingleID* single;
    if (sawParen) {
        if (dir == FORWARD) {
            SingleID* b = specsToID(specsB, FORWARD);
            single = specsToID(specsA, FORWARD);
            single->canonID.append(OPEN_REV)
                .append(b->canonID).append(CLOSE_REV);
            if (specsA != NULL) {
                single->filter = specsA->filter;
            }
            delete b;
        } else {
            SingleID* a = specsToID(specsA, FORWARD);
            single = specsToID(specsB, FORWARD);
            single->canonID.append(OPEN_REV)
                .append(a->canonID).append(CLOSE_REV);
            if (specsB != NULL) {
                single->filter = specsB->filter;
            }
            delete a;
        }
    } else {
        // assert(specsA != NULL);
        if (dir == FORWARD) {
            single = specsToID(specsA, FORWARD);
        } else {
            single = specsToSpecialInverse(*specsA);
            if (single == NULL) {
                single = specsToID(specsA, REVERSE);
            }
        }
        single->filter = specsA->filter;
    }

    delete specsA;
    delete specsB;

    return single;
}

/**
 * Parse a filter ID, that is, an ID of the general form
 * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
 * @param id the id to be parsed
 * @param pos INPUT-OUTPUT parameter.  On input, the position of
 * the first character to parse.  On output, the position after
 * the last character parsed.
 * @return a SingleID object or null if the parse fails
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {

    int32_t start = pos;

    Specs* specs = parseFilterID(id, pos, TRUE);
    if (specs == NULL) {
        pos = start;
        return NULL;
    }

    // Assemble return results
    SingleID* single = specsToID(specs, FORWARD);
    single->filter = specs->filter;
    delete specs;
    return single;
}

/**
 * Parse a global filter of the form "[f]" or "([f])", depending
 * on 'withParens'.
 * @param id the pattern the parse
 * @param pos INPUT-OUTPUT parameter.  On input, the position of
 * the first character to parse.  On output, the position after
 * the last character parsed.
 * @param dir the direction.
 * @param withParens INPUT-OUTPUT parameter.  On entry, if
 * withParens is 0, then parens are disallowed.  If it is 1,
 * then parens are requires.  If it is -1, then parens are
 * optional, and the return result will be set to 0 or 1.
 * @param canonID OUTPUT parameter.  The pattern for the filter
 * added to the canonID, either at the end, if dir is FORWARD, or
 * at the start, if dir is REVERSE.  The pattern will be enclosed
 * in parentheses if appropriate, and will be suffixed with an
 * ID_DELIM character.  May be NULL.
 * @return a UnicodeSet object or NULL.  A non-NULL results
 * indicates a successful parse, regardless of whether the filter
 * applies to the given direction.  The caller should discard it
 * if withParens != (dir == REVERSE).
 */
UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                                      int32_t dir,
                                                      int32_t& withParens,
                                                      UnicodeString* canonID) {
    UnicodeSet* filter = NULL;
    int32_t start = pos;

    if (withParens == -1) {
        withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0;
    } else if (withParens == 1) {
        if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
            pos = start;
            return NULL;
        }
    }

    ICU_Utility::skipWhitespace(id, pos, TRUE);

    if (UnicodeSet::resemblesPattern(id, pos)) {
        ParsePosition ppos(pos);
        UErrorCode ec = U_ZERO_ERROR;
        filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, ec);
        /* test for NULL */
        if (filter == 0) {
            pos = start;
            return 0;
        }
        if (U_FAILURE(ec)) {
            delete filter;
            pos = start;
            return NULL;
        }

        UnicodeString pattern;
        id.extractBetween(pos, ppos.getIndex(), pattern);
        pos = ppos.getIndex();

        if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
            pos = start;
            return NULL;
        }

        // In the forward direction, append the pattern to the
        // canonID.  In the reverse, insert it at zero, and invert
        // the presence of parens ("A" <-> "(A)").
        if (canonID != NULL) {
            if (dir == FORWARD) {
                if (withParens == 1) {
                    pattern.insert(0, OPEN_REV);
                    pattern.append(CLOSE_REV);
                }
                canonID->append(pattern).append(ID_DELIM);
            } else {
                if (withParens == 0) {
                    pattern.insert(0, OPEN_REV);
                    pattern.append(CLOSE_REV);
                }
                canonID->insert(0, pattern);
                canonID->insert(pattern.length(), ID_DELIM);
            }
        }
    }

    return filter;
}

U_CDECL_BEGIN
static void U_CALLCONV _deleteSingleID(void* obj) {
    delete (TransliteratorIDParser::SingleID*) obj;
}

static void U_CALLCONV _deleteTransliterator(void* obj) {
    delete (Transliterator*) obj;
}
U_CDECL_END

/**
 * Parse a compound ID, consisting of an optional forward global
 * filter, a separator, one or more single IDs delimited by
 * separators, an an optional reverse global filter.  The
 * separator is a semicolon.  The global filters are UnicodeSet
 * patterns.  The reverse global filter must be enclosed in
 * parentheses.
 * @param id the pattern the parse
 * @param dir the direction.
 * @param canonID OUTPUT parameter that receives the canonical ID,
 * consisting of canonical IDs for all elements, as returned by
 * parseSingleID(), separated by semicolons.  Previous contents
 * are discarded.
 * @param list OUTPUT parameter that receives a list of SingleID
 * objects representing the parsed IDs.  Previous contents are
 * discarded.
 * @param globalFilter OUTPUT parameter that receives a pointer to
 * a newly created global filter for this ID in this direction, or
 * NULL if there is none.
 * @return TRUE if the parse succeeds, that is, if the entire
 * id is consumed without syntax error.
 */
UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
                                              UnicodeString& canonID,
                                              UVector& list,
                                              UnicodeSet*& globalFilter) {
    UErrorCode ec = U_ZERO_ERROR;
    int32_t i;
    int32_t pos = 0;
    int32_t withParens = 1;
    list.removeAllElements();
    UnicodeSet* filter;
    globalFilter = NULL;
    canonID.truncate(0);

    // Parse leading global filter, if any
    withParens = 0; // parens disallowed
    filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
    if (filter != NULL) {
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            // Not a global filter; backup and resume
            canonID.truncate(0);
            pos = 0;
        }
        if (dir == FORWARD) {
            globalFilter = filter;
        } else {
            delete filter;
        }
        filter = NULL;
    }

    UBool sawDelimiter = TRUE;
    for (;;) {
        SingleID* single = parseSingleID(id, pos, dir);
        if (single == NULL) {
            break;
        }
        if (dir == FORWARD) {
            list.addElement(single, ec);
        } else {
            list.insertElementAt(single, 0, ec);
        }
        if (U_FAILURE(ec)) {
            goto FAIL;
        }
        if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
            sawDelimiter = FALSE;
            break;
        }
    }

    if (list.size() == 0) {
        goto FAIL;
    }

    // Construct canonical ID
    for (i=0; i<list.size(); ++i) {
        SingleID* single = (SingleID*) list.elementAt(i);
        canonID.append(single->canonID);
        if (i != (list.size()-1)) {
            canonID.append(ID_DELIM);
        }
    }

    // Parse trailing global filter, if any, and only if we saw
    // a trailing delimiter after the IDs.
    if (sawDelimiter) {
        withParens = 1; // parens required
        filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
        if (filter != NULL) {
            // Don't require trailing ';', but parse it if present
            ICU_Utility::parseChar(id, pos, ID_DELIM);

            if (dir == REVERSE) {
                globalFilter = filter;
            } else {
                delete filter;
            }
            filter = NULL;
        }
    }

    // Trailing unparsed text is a syntax error
    ICU_Utility::skipWhitespace(id, pos, TRUE);
    if (pos != id.length()) {
        goto FAIL;
    }

    return TRUE;

 FAIL:
    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
    list.removeAllElements();
    list.setDeleter(save);
    delete globalFilter;
    globalFilter = NULL;
    return FALSE;
}

/**
 * Convert the elements of the 'list' vector, which are SingleID
 * objects, into actual Transliterator objects.  In the course of
 * this, some (or all) entries may be removed.  If all entries
 * are removed, the NULL transliterator will be added.
 *
 * Delete entries with empty basicIDs; these are generated by
 * elements like "(A)" in the forward direction, or "A()" in
 * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
 * SingleID entries to actual transliterators.
 *
 * Also, optionally, insert the given transliterator at the given
 * position.  This effectively happens before anything else.
 *
 * @param list vector of SingleID objects.  On exit, vector
 * of one or more Transliterators.
 * @param insert Transliterator to insert, or NULL if none.
 * Adopted.
 * @param insertIndex index from 0..list.size()-1, at which
 * to place 'insert', or -1 if none.
 * @return new value of insertIndex.  The index will shift if
 * there are empty items, like "(Lower)", with indices less than
 * insertIndex.
 */
int32_t TransliteratorIDParser::instantiateList(UVector& list,
                                                Transliterator* insert,
                                                int32_t insertIndex,
                                                UErrorCode& ec) {
    UVector tlist(ec);
    if (U_FAILURE(ec)) {
        goto RETURN;
    }
    tlist.setDeleter(_deleteTransliterator);

    Transliterator* t;
    int32_t i;
    for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
        if (insertIndex == i) {
            insertIndex = tlist.size();
            tlist.addElement(insert, ec);
            if (U_FAILURE(ec)) {
                goto RETURN;
            }
            insert = NULL;
        }

        // We run the loop too long by one, so we can
        // do an insert after the last element
        if (i==list.size()) {
            break;
        }

        SingleID* single = (SingleID*) list.elementAt(i);
        if (single->basicID.length() != 0) {
            t = single->createInstance();
            if (t == NULL) {
                ec = U_INVALID_ID;
                goto RETURN;
            }
            tlist.addElement(t, ec);
            if (U_FAILURE(ec)) {
                delete t;
                goto RETURN;
            }
        }
    }

    // An empty list is equivalent to a NULL transliterator.
    if (tlist.size() == 0) {
        t = createBasicInstance(ANY_NULL, NULL);
        if (t == NULL) {
            // Should never happen
            ec = U_INTERNAL_TRANSLITERATOR_ERROR;
        }
        tlist.addElement(t, ec);
        if (U_FAILURE(ec)) {
            delete t;
        }
    }

 RETURN:

    UObjectDeleter *save = list.setDeleter(_deleteSingleID);
    list.removeAllElements();

    if (U_SUCCESS(ec)) {
        list.setDeleter(_deleteTransliterator);

        while (tlist.size() > 0) {
            t = (Transliterator*) tlist.orphanElementAt(0);
            list.addElement(t, ec);
            if (U_FAILURE(ec)) {
                delete t;
                list.removeAllElements();
                break;
            }
        }
    }

    delete insert; // Clean up in case of failure
    list.setDeleter(save);
    return insertIndex;
}

/**
 * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
 * S-T/V, or S/V-T.  If the source is missing, return a source of
 * ANY.
 * @param id the id string, in any of several forms
 * @return an array of 4 strings: source, target, variant, and
 * isSourcePresent.  If the source is not present, ANY will be
 * given as the source, and isSourcePresent will be NULL.  Otherwise
 * isSourcePresent will be non-NULL.  The target may be empty if the
 * id is not well-formed.  The variant may be empty.
 */
void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
                                     UnicodeString& source,
                                     UnicodeString& target,
                                     UnicodeString& variant,
                                     UBool& isSourcePresent) {
    source = ANY;
    target.truncate(0);
    variant.truncate(0);

    int32_t sep = id.indexOf(TARGET_SEP);
    int32_t var = id.indexOf(VARIANT_SEP);
    if (var < 0) {
        var = id.length();
    }
    isSourcePresent = FALSE;

    if (sep < 0) {
        // Form: T/V or T (or /V)
        id.extractBetween(0, var, target);
        id.extractBetween(var, id.length(), variant);
    } else if (sep < var) {
        // Form: S-T/V or S-T (or -T/V or -T)
        if (sep > 0) {
            id.extractBetween(0, sep, source);
            isSourcePresent = TRUE;
        }
        id.extractBetween(++sep, var, target);
        id.extractBetween(var, id.length(), variant);
    } else {
        // Form: (S/V-T or /V-T)
        if (var > 0) {
            id.extractBetween(0, var, source);
            isSourcePresent = TRUE;
        }
        id.extractBetween(var, sep++, variant);
        id.extractBetween(sep, id.length(), target);
    }

    if (variant.length() > 0) {
        variant.remove(0, 1);
    }
}

/**
 * Given source, target, and variant strings, concatenate them into a
 * full ID.  If the source is empty, then "Any" will be used for the
 * source, so the ID will always be of the form s-t/v or s-t.
 */
void TransliteratorIDParser::STVtoID(const UnicodeString& source,
                                     const UnicodeString& target,
                                     const UnicodeString& variant,
                                     UnicodeString& id) {
    id = source;
    if (id.length() == 0) {
        id = ANY;
    }
    id.append(TARGET_SEP).append(target);
    if (variant.length() != 0) {
        id.append(VARIANT_SEP).append(variant);
    }
}

/**
 * Register two targets as being inverses of one another.  For
 * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
 * Transliterator to form the following inverse relationships:
 *
 * <pre>NFC => NFD
 * Any-NFC => Any-NFD
 * NFD => NFC
 * Any-NFD => Any-NFC</pre>
 *
 * (Without the special inverse registration, the inverse of NFC
 * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
 * that the presence or absence of "Any-" is preserved.
 *
 * <p>The relationship is symmetrical; registering (a, b) is
 * equivalent to registering (b, a).
 *
 * <p>The relevant IDs must still be registered separately as
 * factories or classes.
 *
 * <p>Only the targets are specified.  Special inverses always
 * have the form Any-Target1 <=> Any-Target2.  The target should
 * have canonical casing (the casing desired to be produced when
 * an inverse is formed) and should contain no whitespace or other
 * extraneous characters.
 *
 * @param target the target against which to register the inverse
 * @param inverseTarget the inverse of target, that is
 * Any-target.getInverse() => Any-inverseTarget
 * @param bidirectional if TRUE, register the reverse relation
 * as well, that is, Any-inverseTarget.getInverse() => Any-target
 */
void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
                                                    const UnicodeString& inverseTarget,
                                                    UBool bidirectional) {
    init();

    // If target == inverseTarget then force bidirectional => FALSE
    if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
        bidirectional = FALSE;
    }

    umtx_init(&LOCK);
    Mutex lock(&LOCK);

    UErrorCode ec = U_ZERO_ERROR;
    SPECIAL_INVERSES->put(target, new UnicodeString(inverseTarget), ec);
    if (bidirectional) {
        SPECIAL_INVERSES->put(inverseTarget, new UnicodeString(target), ec);
    }
}

//----------------------------------------------------------------
// Private implementation
//----------------------------------------------------------------

/**
 * Parse an ID into component pieces.  Take IDs of the form T,
 * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
 * source of ANY.
 * @param id the id string, in any of several forms
 * @param pos INPUT-OUTPUT parameter.  On input, pos is the
 * offset of the first character to parse in id.  On output,
 * pos is the offset after the last parsed character.  If the
 * parse failed, pos will be unchanged.
 * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed
 * at any location between specs or delimiters, and is returned
 * as the fifth string in the array.
 * @return a Specs object, or NULL if the parse failed.  If
 * neither source nor target was seen in the parsed id, then the
 * parse fails.  If allowFilter is TRUE, then the parsed filter
 * pattern is returned in the Specs object, otherwise the returned
 * filter reference is NULL.  If the parse fails for any reason
 * NULL is returned.
 */
TransliteratorIDParser::Specs*
TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
                                      UBool allowFilter) {
    UnicodeString first;
    UnicodeString source;
    UnicodeString target;
    UnicodeString variant;
    UnicodeString filter;
    UChar delimiter = 0;
    int32_t specCount = 0;
    int32_t start = pos;

    // This loop parses one of the following things with each
    // pass: a filter, a delimiter character (either '-' or '/'),
    // or a spec (source, target, or variant).
    for (;;) {
        ICU_Utility::skipWhitespace(id, pos, TRUE);
        if (pos == id.length()) {
            break;
        }

        // Parse filters
        if (allowFilter && filter.length() == 0 &&
            UnicodeSet::resemblesPattern(id, pos)) {

            ParsePosition ppos(pos);
            UErrorCode ec = U_ZERO_ERROR;
            UnicodeSet set(id, ppos, USET_IGNORE_SPACE, ec);
            if (U_FAILURE(ec)) {
                pos = start;
                return NULL;
            }
            id.extractBetween(pos, ppos.getIndex(), filter);
            pos = ppos.getIndex();
            continue;
        }

        if (delimiter == 0) {
            UChar c = id.charAt(pos);
            if ((c == TARGET_SEP && target.length() == 0) ||
                (c == VARIANT_SEP && variant.length() == 0)) {
                delimiter = c;
                ++pos;
                continue;
            }
        }

        // We are about to try to parse a spec with no delimiter
        // when we can no longer do so (we can only do so at the
        // start); break.
        if (delimiter == 0 && specCount > 0) {
            break;
        }

        UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
        if (spec.length() == 0) {
            // Note that if there was a trailing delimiter, we
            // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
            // are legal.
            break;
        }

        switch (delimiter) {
        case 0:
            first = spec;
            break;
        case TARGET_SEP:
            target = spec;
            break;
        case VARIANT_SEP:
            variant = spec;
            break;
        }
        ++specCount;
        delimiter = 0;
    }

    // A spec with no prior character is either source or target,
    // depending on whether an explicit "-target" was seen.
    if (first.length() != 0) {
        if (target.length() == 0) {
            target = first;
        } else {
            source = first;
        }
    }

    // Must have either source or target
    if (source.length() == 0 && target.length() == 0) {
        pos = start;
        return NULL;
    }

    // Empty source or target defaults to ANY
    UBool sawSource = TRUE;
    if (source.length() == 0) {
        source = ANY;
        sawSource = FALSE;
    }
    if (target.length() == 0) {
        target = ANY;
    }

    return new Specs(source, target, variant, sawSource, filter);
}

/**
 * Givens a Spec object, convert it to a SingleID object.  The
 * Spec object is a more unprocessed parse result.  The SingleID
 * object contains information about canonical and basic IDs.
 * @return a SingleID; never returns NULL.  Returned object always
 * has 'filter' field of NULL.
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
    UnicodeString canonID;
    UnicodeString basicID;
    UnicodeString basicPrefix;
    if (specs != NULL) {
        UnicodeString buf;
        if (dir == FORWARD) {
            if (specs->sawSource) {
                buf.append(specs->source).append(TARGET_SEP);
            } else {
                basicPrefix = specs->source;
                basicPrefix.append(TARGET_SEP);
            }
            buf.append(specs->target);
        } else {
            buf.append(specs->target).append(TARGET_SEP).append(specs->source);
        }
        if (specs->variant.length() != 0) {
            buf.append(VARIANT_SEP).append(specs->variant);
        }
        basicID = basicPrefix;
        basicID.append(buf);
        if (specs->filter.length() != 0) {
            buf.insert(0, specs->filter);
        }
        canonID = buf;
    }
    return new SingleID(canonID, basicID);
}

/**
 * Given a Specs object, return a SingleID representing the
 * special inverse of that ID.  If there is no special inverse
 * then return NULL.
 * @return a SingleID or NULL.  Returned object always has
 * 'filter' field of NULL.
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::specsToSpecialInverse(const Specs& specs) {
    if (0!=specs.source.caseCompare(ANY, U_FOLD_CASE_DEFAULT)) {
        return NULL;
    }
    init();

    UnicodeString* inverseTarget;

    umtx_init(&LOCK);
    umtx_lock(&LOCK);
    inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
    umtx_unlock(&LOCK);

    if (inverseTarget != NULL) {
        // If the original ID contained "Any-" then make the
        // special inverse "Any-Foo"; otherwise make it "Foo".
        // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
        UnicodeString buf;
        if (specs.filter.length() != 0) {
            buf.append(specs.filter);
        }
        if (specs.sawSource) {
            buf.append(ANY).append(TARGET_SEP);
        }
        buf.append(*inverseTarget);

        UnicodeString basicID(ANY);
        basicID.append(TARGET_SEP).append(*inverseTarget);

        if (specs.variant.length() != 0) {
            buf.append(VARIANT_SEP).append(specs.variant);
            basicID.append(VARIANT_SEP).append(specs.variant);
        }
        return new SingleID(buf, basicID);
    }
    return NULL;
}

/**
 * Glue method to get around access problems in C++.  This would
 * ideally be inline but we want to avoid a circular header
 * dependency.
 */
Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}

/**
 * Initialize static memory.
 */
void TransliteratorIDParser::init() {
    if (SPECIAL_INVERSES != NULL) {
        return;
    }

    Hashtable* special_inverses = new Hashtable(TRUE);
    special_inverses->setValueDeleter(uhash_deleteUnicodeString);

    umtx_init(&LOCK);
    umtx_lock(&LOCK);
    if (SPECIAL_INVERSES == NULL) {
        SPECIAL_INVERSES = special_inverses;
        special_inverses = NULL;
    }
    umtx_unlock(&LOCK);
    delete special_inverses;

    ucln_i18n_registerCleanup();
}

/**
 * Free static memory.
 */
void TransliteratorIDParser::cleanup() {
    if (SPECIAL_INVERSES) {
        delete SPECIAL_INVERSES;
        SPECIAL_INVERSES = NULL;
    }
    umtx_destroy(&LOCK);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof

--- NEW FILE: tridpars.h ---
/*
**********************************************************************
*   Copyright (c) 2002, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/28/2002  aliu        Creation.
**********************************************************************
*/
#ifndef TRIDPARS_H
#define TRIDPARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class Transliterator;
class UnicodeSet;
class UVector;

/**
 * Parsing component for transliterator IDs.  This class contains only
 * static members; it cannot be instantiated.  Methods in this class
 * parse various ID formats, including the following:
 *
 * A basic ID, which contains source, target, and variant, but no
 * filter and no explicit inverse.  Examples include
 * "Latin-Greek/UNGEGN" and "Null".
 *
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
class TransliteratorIDParser /* not : public UObject because all methods are static */ {

 public:

    /**
     * A structure containing the parsed data of a filtered ID, that
     * is, a basic ID optionally with a filter.
     *
     * 'source' and 'target' will always be non-null.  The 'variant'
     * will be non-null only if a non-empty variant was parsed.
     *
     * 'sawSource' is true if there was an explicit source in the
     * parsed id.  If there was no explicit source, then an implied
     * source of ANY is returned and 'sawSource' is set to false.
     * 
     * 'filter' is the parsed filter pattern, or null if there was no
     * filter.
     */
    class Specs : public UMemory {
    public:
        UnicodeString source; // not null
        UnicodeString target; // not null
        UnicodeString variant; // may be null
        UnicodeString filter; // may be null
        UBool sawSource;
        Specs(const UnicodeString& s, const UnicodeString& t,
              const UnicodeString& v, UBool sawS,
              const UnicodeString& f);

    private:

        Specs(const Specs &other); // forbid copying of this class
        Specs &operator=(const Specs &other); // forbid copying of this class
    };

    /**
     * A structure containing the canonicalized data of a filtered ID,
     * that is, a basic ID optionally with a filter.
     *
     * 'canonID' is always non-null.  It may be the empty string "".
     * It is the id that should be assigned to the created
     * transliterator.  It _cannot_ be instantiated directly.
     *
     * 'basicID' is always non-null and non-empty.  It is always of
     * the form S-T or S-T/V.  It is designed to be fed to low-level
     * instantiation code that only understands these two formats.
     *
     * 'filter' may be null, if there is none, or non-null and
     * non-empty.
     */
    class SingleID : public UMemory {
    public:
        UnicodeString canonID;
        UnicodeString basicID;
        UnicodeString filter;
        SingleID(const UnicodeString& c, const UnicodeString& b,
                 const UnicodeString& f);
        SingleID(const UnicodeString& c, const UnicodeString& b);
        Transliterator* createInstance();

    private:

        SingleID(const SingleID &other); // forbid copying of this class
        SingleID &operator=(const SingleID &other); // forbid copying of this class
    };

    /**
     * Parse a filter ID, that is, an ID of the general form
     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @return a SingleID object or null if the parse fails
     */
    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);

    /**
     * Parse a single ID, that is, an ID of the general form
     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
     * optional, the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.  If the direction is REVERSE then the
     * SingleID is constructed for the reverse direction.
     * @return a SingleID object or null
     */
    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
                                  int32_t dir);

    /**
     * Parse a global filter of the form "[f]" or "([f])", depending
     * on 'withParens'.
     * @param id the pattern the parse
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.
     * @param withParens INPUT-OUTPUT parameter.  On entry, if
     * withParens[0] is 0, then parens are disallowed.  If it is 1,
     * then parens are requires.  If it is -1, then parens are
     * optional, and the return result will be set to 0 or 1.
     * @param canonID OUTPUT parameter.  The pattern for the filter
     * added to the canonID, either at the end, if dir is FORWARD, or
     * at the start, if dir is REVERSE.  The pattern will be enclosed
     * in parentheses if appropriate, and will be suffixed with an
     * ID_DELIM character.  May be null.
     * @return a UnicodeSet object or null.  A non-null results
     * indicates a successful parse, regardless of whether the filter
     * applies to the given direction.  The caller should discard it
     * if withParens != (dir == REVERSE).
     */
    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                         int32_t dir,
                                         int32_t& withParens,
                                         UnicodeString* canonID);

    /**
     * Parse a compound ID, consisting of an optional forward global
     * filter, a separator, one or more single IDs delimited by
     * separators, an an optional reverse global filter.  The
     * separator is a semicolon.  The global filters are UnicodeSet
     * patterns.  The reverse global filter must be enclosed in
     * parentheses.
     * @param id the pattern the parse
     * @param dir the direction.
     * @param canonID OUTPUT parameter that receives the canonical ID,
     * consisting of canonical IDs for all elements, as returned by
     * parseSingleID(), separated by semicolons.  Previous contents
     * are discarded.
     * @param list OUTPUT parameter that receives a list of SingleID
     * objects representing the parsed IDs.  Previous contents are
     * discarded.
     * @param globalFilter OUTPUT parameter that receives a pointer to
     * a newly created global filter for this ID in this direction, or
     * null if there is none.
     * @return true if the parse succeeds, that is, if the entire
     * id is consumed without syntax error.
     */
    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
                                 UnicodeString& canonID,
                                 UVector& list,
                                 UnicodeSet*& globalFilter);

    /**
     * Convert the elements of the 'list' vector, which are SingleID
     * objects, into actual Transliterator objects.  In the course of
     * this, some (or all) entries may be removed.  If all entries
     * are removed, the Null transliterator will be added.
     *
     * Delete entries with empty basicIDs; these are generated by
     * elements like "(A)" in the forward direction, or "A()" in
     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
     * SingleID entries to actual transliterators.
     *
     * Also, optionally, insert the given transliterator at the given
     * position.  This effectively happens before anything else.
     *
     * @param list vector of SingleID objects.  On exit, vector
     * of one or more Transliterators.
     * @param insert Transliterator to insert, or null if none.
     * @param insertIndex index from 0..list.size()-1, at which
     * to place 'insert', or -1 if none.
     * @param ec Output param to receive a success or an error code.
     * @return new value of insertIndex.  The index will shift if
     * there are empty items, like "(Lower)", with indices less than
     * insertIndex.
     */
    static int32_t instantiateList(UVector& list,
                                   Transliterator* insert,
                                   int32_t insertIndex,
                                   UErrorCode& ec);

    /**
     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
     * S-T/V, or S/V-T.  If the source is missing, return a source of
     * ANY.
     * @param id the id string, in any of several forms
     * @param source          the given source.
     * @param target          the given target.
     * @param variant         the given variant
     * @param isSourcePresent If TRUE then the source is present. 
     *                        If the source is not present, ANY will be
     *                        given as the source, and isSourcePresent will be null
     * @return an array of 4 strings: source, target, variant, and
     * isSourcePresent.  If the source is not present, ANY will be
     * given as the source, and isSourcePresent will be null.  Otherwise
     * isSourcePresent will be non-null.  The target may be empty if the
     * id is not well-formed.  The variant may be empty.
     */
    static void IDtoSTV(const UnicodeString& id,
                        UnicodeString& source,
                        UnicodeString& target,
                        UnicodeString& variant,
                        UBool& isSourcePresent);

    /**
     * Given source, target, and variant strings, concatenate them into a
     * full ID.  If the source is empty, then "Any" will be used for the
     * source, so the ID will always be of the form s-t/v or s-t.
     */
    static void STVtoID(const UnicodeString& source,
                        const UnicodeString& target,
                        const UnicodeString& variant,
                        UnicodeString& id);

    /**
     * Register two targets as being inverses of one another.  For
     * example, calling registerSpecialInverse("NFC", "NFD", true) causes
     * Transliterator to form the following inverse relationships:
     *
     * <pre>NFC => NFD
     * Any-NFC => Any-NFD
     * NFD => NFC
     * Any-NFD => Any-NFC</pre>
     *
     * (Without the special inverse registration, the inverse of NFC
     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
     * that the presence or absence of "Any-" is preserved.
     *
     * <p>The relationship is symmetrical; registering (a, b) is
     * equivalent to registering (b, a).
     *
     * <p>The relevant IDs must still be registered separately as
     * factories or classes.
     *
     * <p>Only the targets are specified.  Special inverses always
     * have the form Any-Target1 <=> Any-Target2.  The target should
     * have canonical casing (the casing desired to be produced when
     * an inverse is formed) and should contain no whitespace or other
     * extraneous characters.
     *
     * @param target the target against which to register the inverse
     * @param inverseTarget the inverse of target, that is
     * Any-target.getInverse() => Any-inverseTarget
     * @param bidirectional if true, register the reverse relation
     * as well, that is, Any-inverseTarget.getInverse() => Any-target
     */
    static void registerSpecialInverse(const UnicodeString& target,
                                       const UnicodeString& inverseTarget,
                                       UBool bidirectional);

    /**
     * Free static memory.
     */
    static void cleanup();

 private:
    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // forbid instantiation
    TransliteratorIDParser();

    /**
     * Parse an ID into component pieces.  Take IDs of the form T,
     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
     * source of ANY.
     * @param id the id string, in any of several forms
     * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
     * offset of the first character to parse in id.  On output,
     * pos[0] is the offset after the last parsed character.  If the
     * parse failed, pos[0] will be unchanged.
     * @param allowFilter if true, a UnicodeSet pattern is allowed
     * at any location between specs or delimiters, and is returned
     * as the fifth string in the array.
     * @return a Specs object, or null if the parse failed.  If
     * neither source nor target was seen in the parsed id, then the
     * parse fails.  If allowFilter is true, then the parsed filter
     * pattern is returned in the Specs object, otherwise the returned
     * filter reference is null.  If the parse fails for any reason
     * null is returned.
     */
    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
                                UBool allowFilter);

    /**
     * Givens a Specs object, convert it to a SingleID object.  The
     * Spec object is a more unprocessed parse result.  The SingleID
     * object contains information about canonical and basic IDs.
     * @param specs the given Specs object.
     * @param dir   either FORWARD or REVERSE.
     * @return a SingleID; never returns null.  Returned object always
     * has 'filter' field of null.
     */
    static SingleID* specsToID(const Specs* specs, int32_t dir);

    /**
     * Given a Specs object, return a SingleID representing the
     * special inverse of that ID.  If there is no special inverse
     * then return null.
     * @param specs the given Specs.
     * @return a SingleID or null.  Returned object always has
     * 'filter' field of null.
     */
    static SingleID* specsToSpecialInverse(const Specs& specs);

    /**
     * Glue method to get around access problems in C++.
     * @param id the id string for the transliterator, in any of several forms
     * @param canonID the given canonical ID
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    /**
     * Initialize static memory.
     */
    static void init();

    friend class SingleID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif

--- NEW FILE: ucurr.cpp ---
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "unicode/ucurr.h"
#include "unicode/locid.h"
#include "unicode/resbund.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "iculserv.h"
#include "ucln_in.h"

//------------------------------------------------------------
// Constants

// Default currency meta data of last resort.  We try to use the
// defaults encoded in the meta data resource bundle.  If there is a
// configuration/build error and these are not available, we use these
// hard-coded defaults (which should be identical).
static const int32_t LAST_RESORT_DATA[] = { 2, 0 };

// POW10[i] = 10^i, i=0..MAX_POW10
static const int32_t POW10[] = { 1, 10, 100, 1000, 10000, 100000,
                                 1000000, 10000000, 100000000, 1000000000 };

static const int32_t MAX_POW10 = (sizeof(POW10)/sizeof(POW10[0])) - 1;

#define ISO_COUNTRY_CODE_LENGTH 3

//------------------------------------------------------------
// Resource tags

// Tag for meta-data, in root.
static const char CURRENCY_META[] = "CurrencyMeta";

// Tag for map from countries to currencies, in root.
static const char CURRENCY_MAP[] = "CurrencyMap";

// Tag for default meta-data, in CURRENCY_META
static const char DEFAULT_META[] = "DEFAULT";

// Variant for legacy pre-euro mapping in CurrencyMap
static const char VAR_PRE_EURO[] = "PREEURO";

// Variant for legacy euro mapping in CurrencyMap
static const char VAR_EURO[] = "EURO";

// Variant delimiter
static const char VAR_DELIM[] = "_";

// Tag for localized display names (symbols) of currencies
static const char CURRENCIES[] = "Currencies";

// Marker character indicating that a display name is a ChoiceFormat
// pattern.  Strings that start with one mark are ChoiceFormat
// patterns.  Strings that start with 2 marks are static strings, and
// the first mark is deleted.
static const UChar CHOICE_FORMAT_MARK = 0x003D; // Equals sign

//------------------------------------------------------------
// Code

/**
 * Unfortunately, we have to convert the UChar* currency code to char*
 * to use it as a resource key.
 */
static inline char*
myUCharsToChars(char* resultOfLen4, const UChar* currency) {
    u_UCharsToChars(currency, resultOfLen4, ISO_COUNTRY_CODE_LENGTH);
    resultOfLen4[ISO_COUNTRY_CODE_LENGTH] = 0;
    return resultOfLen4;
}

/**
 * Internal function to look up currency data.  Result is an array of
 * two integers.  The first is the fraction digits.  The second is the
 * rounding increment, or 0 if none.  The rounding increment is in
 * units of 10^(-fraction_digits).
 */
static const int32_t*
_findMetaData(const UChar* currency) {

    // Get CurrencyMeta resource out of root locale file.  [This may
    // move out of the root locale file later; if it does, update this
    // code.]
    UErrorCode ec = U_ZERO_ERROR;
    ResourceBundle currencyMeta =
        ResourceBundle((char*)0, Locale(""), ec).get(CURRENCY_META, ec);

    if (U_FAILURE(ec)) {
        // Config/build error; return hard-coded defaults
        return LAST_RESORT_DATA;
    }

    // Look up our currency, or if that's not available, then DEFAULT
    char buf[ISO_COUNTRY_CODE_LENGTH+1];
    ResourceBundle rb = currencyMeta.get(myUCharsToChars(buf, currency), ec);
    if (U_FAILURE(ec)) {
        rb = currencyMeta.get(DEFAULT_META, ec);
        if (U_FAILURE(ec)) {
            // Config/build error; return hard-coded defaults
            return LAST_RESORT_DATA;
        }
    }

    int32_t len;
    const int32_t *data = rb.getIntVector(len, ec);
    if (U_FAILURE(ec) || len < 2) {
        // Config/build error; return hard-coded defaults
        return LAST_RESORT_DATA;
    }

    return data;
}

// ------------------------------------------
//
// Registration
//
//-------------------------------------------

// don't use ICUService since we don't need fallback

struct CReg;

/* Remember to call umtx_init(&gCRegLock) before using it! */
static UMTX gCRegLock = 0;
static CReg* gCRegHead = 0;

struct CReg : public UMemory {
    CReg *next;
    UChar iso[ISO_COUNTRY_CODE_LENGTH+1];
    char  id[ULOC_FULLNAME_CAPACITY];

    CReg(const UChar* _iso, const char* _id)
        : next(0)
    {
        int32_t len = uprv_strlen(_id);
        if (len > (int32_t)(sizeof(id)-1)) {
            len = (sizeof(id)-1);
        }
        uprv_strncpy(id, _id, len);
        id[len] = 0;
        uprv_memcpy(iso, _iso, ISO_COUNTRY_CODE_LENGTH * sizeof(const UChar));
        iso[ISO_COUNTRY_CODE_LENGTH] = 0;
    }

    static UCurrRegistryKey reg(const UChar* _iso, const char* _id, UErrorCode* status)
    {
        if (status && U_SUCCESS(*status) && _iso && _id) {
            CReg* n = new CReg(_iso, _id);
            if (n) {
                umtx_init(&gCRegLock);
                Mutex mutex(&gCRegLock);
                if (!gCRegHead) {
                    ucln_i18n_registerCleanup();
                }
                n->next = gCRegHead;
                gCRegHead = n;
                return n;
            }
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
        return 0;
    }

    static UBool unreg(UCurrRegistryKey key) {
        umtx_init(&gCRegLock);
        Mutex mutex(&gCRegLock);
        if (gCRegHead == key) {
            gCRegHead = gCRegHead->next;
            delete (CReg*)key;
            return TRUE;
        }

        CReg* p = gCRegHead;
        while (p) {
            if (p->next == key) {
                p->next = ((CReg*)key)->next;
                delete (CReg*)key;	
                return TRUE;
            }
            p = p->next;
        }

        return FALSE;
    }

    static const UChar* get(const char* id) {
        umtx_init(&gCRegLock);
        Mutex mutex(&gCRegLock);
        CReg* p = gCRegHead;
        while (p) {
            if (uprv_strcmp(id, p->id) == 0) {
                return p->iso;
            }
            p = p->next;
        }
        return NULL;
    }

    /* This doesn't need to be thread safe. It's for u_cleanup only. */
    static void cleanup(void) {
        while (gCRegHead) {
            CReg* n = gCRegHead;
            gCRegHead = gCRegHead->next;
            delete n;
        }
        umtx_destroy(&gCRegLock);
    }
};

// -------------------------------------

static void
idForLocale(const char* locale, char* buffer, int capacity, UErrorCode* ec)
{
    // !!! this is internal only, assumes buffer is not null and capacity is sufficient
    // Extract the country name and variant name.  We only
    // recognize two variant names, EURO and PREEURO.
    char variant[ULOC_FULLNAME_CAPACITY];
    uloc_getCountry(locale, buffer, capacity, ec);
    uloc_getVariant(locale, variant, sizeof(variant), ec);
    if (0 == uprv_strcmp(variant, VAR_PRE_EURO) ||
        0 == uprv_strcmp(variant, VAR_EURO))
    {
        uprv_strcat(buffer, VAR_DELIM);
        uprv_strcat(buffer, variant);
    }
}

// -------------------------------------

U_CAPI UCurrRegistryKey U_EXPORT2
ucurr_register(const UChar* isoCode, const char* locale, UErrorCode *status) 
{
    if (status && U_SUCCESS(*status)) {
        char id[ULOC_FULLNAME_CAPACITY];
        idForLocale(locale, id, sizeof(id), status);
        return CReg::reg(isoCode, id, status);
    }
    return NULL;
}

// -------------------------------------

U_CAPI UBool U_EXPORT2
ucurr_unregister(UCurrRegistryKey key, UErrorCode* status) 
{
    if (status && U_SUCCESS(*status)) {
        return CReg::unreg(key);
    }
    return FALSE;
}

// -------------------------------------

U_CAPI const UChar* U_EXPORT2
ucurr_forLocale(const char* locale, UErrorCode* ec) {
    if (ec != NULL && U_SUCCESS(*ec)) {
        char id[ULOC_FULLNAME_CAPACITY];
        idForLocale(locale, id, sizeof(id), ec);
        if (U_FAILURE(*ec)) {
            return NULL;
        }

        const UChar* result = CReg::get(id);
        if (result) {
            return result;
        }

        // Look up the CurrencyMap element in the root bundle.
        UResourceBundle* rb = ures_open(NULL, "", ec);
        UResourceBundle* cm = ures_getByKey(rb, CURRENCY_MAP, NULL, ec);
        int32_t len;
        const UChar* s = ures_getStringByKey(cm, id, &len, ec);
        ures_close(cm);
        ures_close(rb);

        if (U_SUCCESS(*ec)) {
            return s;
        }
    }
    return NULL;
}

// end registration

/**
 * Modify the given locale name by removing the rightmost _-delimited
 * element.  If there is none, empty the string ("" == root).
 * NOTE: The string "root" is not recognized; do not use it.
 * @return TRUE if the fallback happened; FALSE if locale is already
 * root ("").
 */
static UBool fallback(char *loc) {
    if (!*loc) {
        return FALSE;
    }
    UErrorCode status = U_ZERO_ERROR;
    uloc_getParent(loc, loc, uprv_strlen(loc), &status);
 /*
    char *i = uprv_strrchr(loc, '_');
    if (i == NULL) {
        i = loc;
    }
    *i = 0;
 */
    return TRUE;
}

U_CAPI const UChar* U_EXPORT2
ucurr_getName(const UChar* currency,
              const char* locale,
              UCurrNameStyle nameStyle,
              UBool* isChoiceFormat, // fillin
              int32_t* len, // fillin
              UErrorCode* ec) {

    // Look up the Currencies resource for the given locale.  The
    // Currencies locale data looks like this:
    //|en {
    //|  Currencies { 
    //|    USD { "US$", "US Dollar" }
    //|    CHF { "Sw F", "Swiss Franc" }
    //|    INR { "=0#Rs|1#Re|1<Rs", "=0#Rupees|1#Rupee|1<Rupees" }
    //|    //...
    //|  }
    //|}

    if (ec == NULL || U_FAILURE(*ec)) {
        return 0;
    }

    int32_t choice = (int32_t) nameStyle;
    if (choice < 0 || choice > 1) {
        *ec = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // In the future, resource bundles may implement multi-level
    // fallback.  That is, if a currency is not found in the en_US
    // Currencies data, then the en Currencies data will be searched.
    // Currently, if a Currencies datum exists in en_US and en, the
    // en_US entry hides that in en.

    // We want multi-level fallback for this resource, so we implement
    // it manually.

    // Use a separate UErrorCode here that does not propagate out of
    // this function.
    UErrorCode ec2 = U_ZERO_ERROR;

    char loc[ULOC_FULLNAME_CAPACITY];
    uloc_getName(locale, loc, sizeof(loc), &ec2);
    if (U_FAILURE(ec2) || ec2 == U_STRING_NOT_TERMINATED_WARNING) {
        *ec = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    char buf[ISO_COUNTRY_CODE_LENGTH+1];
    myUCharsToChars(buf, currency);

    const UChar* s = NULL;

    // Multi-level resource inheritance fallback loop
    for (;;) {
        ec2 = U_ZERO_ERROR;
        UResourceBundle* rb = ures_open(NULL, loc, &ec2);
        UResourceBundle* curr = ures_getByKey(rb, CURRENCIES, NULL, &ec2);
        UResourceBundle* names = ures_getByKey(curr, buf, NULL, &ec2);
        s = ures_getStringByIndex(names, choice, len, &ec2);
        ures_close(names);
        ures_close(curr);
        ures_close(rb);

        // If we've succeeded we're done.  Otherwise, try to fallback.
        // If that fails (because we are already at root) then exit.
        if (U_SUCCESS(ec2) || !fallback(loc)) {
            break;
        }
    }

    // Determine if this is a ChoiceFormat pattern.  One leading mark
    // indicates a ChoiceFormat.  Two indicates a static string that
    // starts with a mark.  In either case, the first mark is ignored,
    // if present.  Marks in the rest of the string have no special
    // meaning.
    *isChoiceFormat = FALSE;
    if (U_SUCCESS(ec2)) {
        U_ASSERT(s != NULL);
        int32_t i=0;
        while (i < *len && s[i] == CHOICE_FORMAT_MARK && i < 2) {
            ++i;
        }
        *isChoiceFormat = (i == 1);
        if (i != 0) ++s; // Skip over first mark
        return s;
    }

    // If we fail to find a match, use the ISO 4217 code
    *len = u_strlen(currency); // Should == ISO_COUNTRY_CODE_LENGTH, but maybe not...?
    return currency;
}

//!// This API is now redundant.  It predates ucurr_getName, which
//!// replaces and extends it.
//!U_CAPI const UChar* U_EXPORT2
//!ucurr_getSymbol(const UChar* currency,
//!                const char* locale,
//!    int32_t* len, // fillin
//!                UErrorCode* ec) {
//!    UBool isChoiceFormat;
//!    const UChar* s = ucurr_getName(currency, locale, UCURR_SYMBOL_NAME,
//!                                   &isChoiceFormat, len, ec);
//!    if (isChoiceFormat) {
//!        // Don't let ChoiceFormat patterns out through this API
//!        *len = u_strlen(currency); // Should == 3, but maybe not...?
//!        return currency;
//!    }
//!    return s;
//!}

U_CAPI int32_t U_EXPORT2
ucurr_getDefaultFractionDigits(const UChar* currency) {
    return (_findMetaData(currency))[0];
}

U_CAPI double U_EXPORT2
ucurr_getRoundingIncrement(const UChar* currency) {
    const int32_t *data = _findMetaData(currency);

    // If there is no rounding, or if the meta data is invalid,
    // return 0.0 to indicate no rounding.  A rounding value
    // (data[1]) of 0 or 1 indicates no rounding.
    if (data[1] < 2 || data[0] < 0 || data[0] > MAX_POW10) {
        return 0.0;
    }

    // Return data[1] / 10^(data[0]).  The only actual rounding data,
    // as of this writing, is CHF { 2, 5 }.
    return double(data[1]) / POW10[data[0]];
}

/**
 * Release all static memory held by currency.
 */
U_CFUNC UBool currency_cleanup(void) {
    CReg::cleanup();
    return TRUE;
}

#endif /* #if !UCONFIG_NO_FORMATTING */

//eof

--- NEW FILE: unitohex.h ---
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef UNITOHEX_H
#define UNITOHEX_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class UnicodeFilter;

/**
 * A transliterator that converts from Unicode characters to
 * hexadecimal Unicode escape sequences. It outputs a
 * prefix specified in the constructor and optionally converts the hex
 * digits to uppercase.
 *
 * The format of the output is set by a pattern. This pattern
 * follows the same syntax as <code>HexToUnicodeTransliterator</code>,
 * except it does not allow multiple specifications. The pattern sets
 * the prefix string, suffix string, and minimum and maximum digit
 * count. There are no setters or getters for these attributes; they
 * are set only through the pattern.
 *
 * The setUppercase() and isUppercase() methods control whether 'a'
 * through 'f' or 'A' through 'F' are output as hex digits. This is
 * not controlled through the pattern; only through the methods. The
 * default is uppercase.
 *
 * @author Alan Liu
 * @internal Use transliterator factory methods instead since this class will be removed in that release.
 */
class U_I18N_API UnicodeToHexTransliterator : public Transliterator {

private:

    // Character constants defined here to avoid ASCII dependency
    enum {
        ZERO      = 0x0030, // '0'
        POUND     = 0x0023, // '#'
        BACKSLASH = 0x005C  // '\\'
    };

    static const UChar HEX_DIGITS[32];

    /**
     * ID for this transliterator.
     */
    static const char _ID[];

    /**
     * The pattern set by applyPattern() and returned by toPattern().
     */
    UnicodeString pattern;

    /**
     * The string preceding the hex digits, parsed from the pattern.
     */
    UnicodeString prefix;

    /**
     * The string following the hex digits, parsed from the pattern.
     */
    UnicodeString suffix;

    /**
     * The minimum number of hex digits to output, between 1 and 4,
     * inclusive.  Parsed from the pattern.
     */
    int8_t minDigits;

    /**
     * If TRUE, output uppercase hex digits; otherwise output
     * lowercase.  Set by setUppercase() and returned by isUppercase().
     */
    UBool uppercase;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;

public:

    /**
     * Constructs a transliterator.
     * @param pattern The pattern for this transliterator.  See
     * applyPattern() for pattern syntax.
     * @param isUppercase if true, the four hex digits will be
     * converted to uppercase; otherwise they will be lowercase.
     * @param adoptedFilter the filter for this transliterator, or
     * NULL if none.  Adopted by this transliterator.
     * @param status Error code indicating success or failure
     * to parse pattern.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    UnicodeToHexTransliterator(const UnicodeString& pattern,
                               UBool isUppercase,
                               UnicodeFilter* adoptedFilter,
                               UErrorCode& status);

    /**
     * Constructs an uppercase transliterator with no filter.
     * @param pattern The pattern for this transliterator.  See
     * applyPattern() for pattern syntax.
     * @param status Error code indicating success or failure
     * to parse pattern.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    UnicodeToHexTransliterator(const UnicodeString& pattern,
                               UErrorCode& status);

    /**
     * Constructs a transliterator with the default prefix "\u"
     * that outputs uppercase hex digits.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    UnicodeToHexTransliterator(UnicodeFilter* adoptedFilter = 0);

    /**
     * Destructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual ~UnicodeToHexTransliterator();

    /**
     * Copy constructor.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    UnicodeToHexTransliterator(const UnicodeToHexTransliterator&);

    /**
     * Assignment operator.
     * @stable ICU 2.0
     */
    UnicodeToHexTransliterator& operator=(const UnicodeToHexTransliterator&);

    /**
     * Transliterator API.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual Transliterator* clone(void) const;

    /**
     * Set the pattern recognized by this transliterator.  The pattern
     * must contain zero or more prefix characters, one or more digit
     * characters, and zero or more suffix characters.  The digit
     * characters indicates optional digits ('#') followed by required
     * digits ('0').  The total number of digits cannot exceed 4, and
     * must be at least 1 required digit.  Use a backslash ('\\') to
     * escape any of the special characters.  An empty pattern is not
     * allowed.
     *
     * <p>Example: "U+0000" specifies a prefix of "U+", exactly four
     * digits, and no suffix.  "<###0>" has a prefix of "<", between
     * one and four digits, and a suffix of ">".
     *
     * <p><pre>
     * pattern := prefix-char* digit-spec suffix-char*
     * digit-spec := '#'* '0'+
     * prefix-char := [^special-char] | '\\' special-char
     * suffix-char := [^special-char] | '\\' special-char
     * special-char := ';' | '0' | '#' | '\\'
     * </pre>
     *
     * <p>Limitations: There is no way to set the uppercase attribute
     * in the pattern.  (applyPattern() does not alter the uppercase
     * attribute.)
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    void applyPattern(const UnicodeString& thePattern, UErrorCode& status);

    /**
     * Return this transliterator's pattern.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    const UnicodeString& toPattern(void) const;

    /**
     * Returns true if this transliterator outputs uppercase hex digits.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual UBool isUppercase(void) const;

    /**
     * Sets if this transliterator outputs uppercase hex digits.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void setUppercase(UBool outputUppercase);

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @internal Use transliterator factory methods instead since this class will be removed in that release.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                     UBool isIncremental) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();
};

inline UnicodeToHexTransliterator::~UnicodeToHexTransliterator() {}

inline UClassID
UnicodeToHexTransliterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
UnicodeToHexTransliterator::getDynamicClassID() const
{ return UnicodeToHexTransliterator::getStaticClassID(); }

U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif