[sword-cvs] icu-sword/source/common/unicode .cvsignore,1.2,1.3 brkiter.h,NONE,1.1 caniter.h,NONE,1.1 chariter.h,1.3,1.4 dbbi.h,NONE,1.1 docmain.h,1.3,1.4 locid.h,1.3,1.4 normlzr.h,1.4,1.5 parseerr.h,NONE,1.1 parsepos.h,NONE,1.1 platform.h.in,1.4,1.5 pmacos.h,1.2,1.3 pos400.h,1.2,1.3 putil.h,1.3,1.4 pwin32.h,1.4,1.5 rbbi.h,NONE,1.1 rep.h,1.3,1.4 resbund.h,1.3,1.4 schriter.h,1.3,1.4 strenum.h,NONE,1.1 ubidi.h,1.3,1.4 ubrk.h,NONE,1.1 ucat.h,NONE,1.1 uchar.h,1.4,1.5 uchriter.h,1.3,1.4 uclean.h,1.3,1.4 ucnv.h,1.3,1.4 ucnv_cb.h,1.3,1.4 ucnv_err.h,1.3,1.4 uconfig.h,NONE,1.1 udata.h,1.3,1.4 uenum.h,NONE,1.1 uidna.h,NONE,1.1 uiter.h,NONE,1.1 uloc.h,1.3,1.4 umachine.h,1.3,1.4 umisc.h,1.2,1.3 unifilt.h,NONE,1.1 unifunct.h,NONE,1.1 unimatch.h,NONE,1.1 uniset.h,NONE,1.1 unistr.h,1.4,1.5 unorm.h,1.3,1.4 uobject.h,NONE,1.1 urename.h,1.4,1.5 urep.h,1.3,1.4 ures.h,1.4,1.5 uscript.h,1.4,1.5 uset.h,NONE,1.1 usetiter.h,NONE,1.1 ushape.h,1.3,1.4 ustring.h,1.3,1.4 utf.h,1.3,1.4 utf16.h,1.2,1.3 utf32.h,1.2,1.3 utf8.h,1.3,1.4 utf_old.h,NONE,1.1 utypes.h,1.8,1.9 uversion.h,1.4,1.5

sword@www.crosswire.org sword@www.crosswire.org
Tue, 9 Sep 2003 19:43:16 -0700


Update of /usr/local/cvsroot/icu-sword/source/common/unicode
In directory www:/tmp/cvs-serv19862/source/common/unicode

Added Files:
	.cvsignore brkiter.h caniter.h chariter.h dbbi.h docmain.h 
	locid.h normlzr.h parseerr.h parsepos.h platform.h.in pmacos.h 
	pos400.h putil.h pwin32.h rbbi.h rep.h resbund.h schriter.h 
	strenum.h ubidi.h ubrk.h ucat.h uchar.h uchriter.h uclean.h 
	ucnv.h ucnv_cb.h ucnv_err.h uconfig.h udata.h uenum.h uidna.h 
	uiter.h uloc.h umachine.h umisc.h unifilt.h unifunct.h 
	unimatch.h uniset.h unistr.h unorm.h uobject.h urename.h 
	urep.h ures.h uscript.h uset.h usetiter.h ushape.h ustring.h 
	utf.h utf16.h utf32.h utf8.h utf_old.h utypes.h uversion.h 
Log Message:
ICU 2.6 commit


--- NEW FILE: brkiter.h ---
/*
********************************************************************************
*   Copyright (C) 1997-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
********************************************************************************
*
* File brkiter.h
*
* Modification History:
*
*   Date        Name        Description
*   02/18/97    aliu        Added typedef for TextCount.  Made DONE const.
*   05/07/97    aliu        Fixed DLL declaration.
*   07/09/97    jfitz       Renamed BreakIterator and interface synced with JDK
*   08/11/98    helena      Sync-up JDK1.2.
*   01/13/2000  helena      Added UErrorCode parameter to createXXXInstance methods.
********************************************************************************
*/

#ifndef BRKITER_H
#define BRKITER_H

#include "unicode/utypes.h"

#if UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

/*
 * Allow the declaration of APIs with pointers to BreakIterator
 * even when break iteration is removed from the build.
 */
class BreakIterator;

U_NAMESPACE_END

#else

#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/locid.h"
#include "unicode/ubrk.h"
#include "unicode/strenum.h"

U_NAMESPACE_BEGIN

typedef const void* URegistryKey;

/**
 * The BreakIterator class implements methods for finding the location
 * of boundaries in text. BreakIterator is an abstract base class.
 * Instances of BreakIterator maintain a current position and scan over
 * text returning the index of characters where boundaries occur.
 * <P>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <P>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <P>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <P>
 * Character boundary analysis allows users to interact with
 * characters as they expect to, for example, when moving the cursor
 * through a text string. Character boundary analysis provides correct
 * navigation of through character strings, regardless of how the
 * character is stored.  For example, an accented character might be
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
 * <P>
 * This is the interface for all text boundaries.
 * <P>
 * Examples:
 * <P>
 * Helper function to output text
 * <pre>
 * \code
 *    void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
 *    {
 *        UnicodeString textBuffer, temp;
 *        CharacterIterator *strIter = iterator.createText();
 *        strIter->getText(temp);
 *        cout << " " << start << " " << end << " |"
 *             << temp.extractBetween(start, end, textBuffer)
 *             << "|" << endl;
 *        delete strIter;
 *    }
 * \endcode
 * </pre>
 * Print each element in order:
 * <pre>
 * \code
 *    void printEachForward( BreakIterator& boundary)
 *    {
 *       int32_t start = boundary.first();
 *       for (int32_t end = boundary.next();
 *         end != BreakIterator::DONE;
 *         start = end, end = boundary.next())
 *         {
 *             printTextRange( boundary, start, end );
 *         }
 *    }
 * \code
 * </pre>
 * Print each element in reverse order:
 * <pre>
 * \code
 *    void printEachBackward( BreakIterator& boundary)
 *    {
 *       int32_t end = boundary.last();
 *       for (int32_t start = boundary.previous();
 *         start != BreakIterator::DONE;
 *         end = start, start = boundary.previous())
 *         {
 *             printTextRange( boundary, start, end );
 *         }
 *    }
 * \endcode
 * </pre>
 * Print first element
 * <pre>
 * \code
 *    void printFirst(BreakIterator& boundary)
 *    {
 *        int32_t start = boundary.first();
 *        int32_t end = boundary.next();
 *        printTextRange( boundary, start, end );
 *    }
 * \endcode
 * </pre>
 * Print last element
 * <pre>
 *  \code
 *    void printLast(BreakIterator& boundary)
 *    {
 *        int32_t end = boundary.last();
 *        int32_t start = boundary.previous();
 *        printTextRange( boundary, start, end );
 *    }
 * \endcode
 * </pre>
 * Print the element at a specified position
 * <pre>
 * \code
 *    void printAt(BreakIterator &boundary, int32_t pos )
 *    {
 *        int32_t end = boundary.following(pos);
 *        int32_t start = boundary.previous();
 *        printTextRange( boundary, start, end );
 *    }
 * \endcode
 * </pre>
 * Creating and using text boundaries
 * <pre>
 * \code
 *       void BreakIterator_Example( void )
 *       {
 *           BreakIterator* boundary;
 *           UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
 *           cout << "Examining: " << stringToExamine << endl;
 *
 *           //print each sentence in forward and reverse order
 *           boundary = BreakIterator::createSentenceInstance( Locale::US );
 *           boundary->setText(stringToExamine);
 *           cout << "----- forward: -----------" << endl;
 *           printEachForward(*boundary);
 *           cout << "----- backward: ----------" << endl;
 *           printEachBackward(*boundary);
 *           delete boundary;
 *
 *           //print each word in order
 *           boundary = BreakIterator::createWordInstance();
 *           boundary->setText(stringToExamine);
 *           cout << "----- forward: -----------" << endl;
 *           printEachForward(*boundary);
 *           //print first element
 *           cout << "----- first: -------------" << endl;
 *           printFirst(*boundary);
 *           //print last element
 *           cout << "----- last: --------------" << endl;
 *           printLast(*boundary);
 *           //print word at charpos 10
 *           cout << "----- at pos 10: ---------" << endl;
 *           printAt(*boundary, 10 );
 *
 *           delete boundary;
 *       }
 * \endcode
 * </pre>
 */
class U_COMMON_API BreakIterator : public UObject {
public:
    /**
     *  destructor
     *  @stable ICU 2.0
     */
    virtual ~BreakIterator();

    /**
     * Return true if another object is semantically equal to this
     * one. The other object should be an instance of the same subclass of
     * BreakIterator. Objects of different subclasses are considered
     * unequal.
     * <P>
     * Return true if this BreakIterator is at the same position in the
     * same text, and is the same class and type (word, line, etc.) of
     * BreakIterator, as the argument.  Text is considered the same if
     * it contains the same characters, it need not be the same
     * object, and styles are not considered.
     * @stable ICU 2.0
     */
    virtual UBool operator==(const BreakIterator&) const = 0;

    /**
     * Returns the complement of the result of operator==
     * @param rhs The BreakIterator to be compared for inequality
     * @return the complement of the result of operator==
     * @stable ICU 2.0
     */
    UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }

    /**
     * Return a polymorphic copy of this object.  This is an abstract
     * method which subclasses implement.
     * @stable ICU 2.0
     */
    virtual BreakIterator* clone(void) const = 0;

    /**
     * Return a polymorphic class ID for this object. Different subclasses
     * will return distinct unequal values.
     * @stable ICU 2.0
     */
    virtual UClassID getDynamicClassID(void) const = 0;

    /**
     * Return a CharacterIterator over the text being analyzed.
     * Changing the state of the returned iterator can have undefined consequences
     * on the operation of the break iterator.  If you need to change it, clone it first.
     * @stable ICU 2.0
     */
    virtual const CharacterIterator& getText(void) const = 0;

    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
     * @param text The UnicodeString used to change the text.
     * @stable ICU 2.0
     */
    virtual void  setText(const UnicodeString &text) = 0;

    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
     * @param it The CharacterIterator used to change the text.
     * @stable ICU 2.0
     */
    virtual void  adoptText(CharacterIterator* it) = 0;

    /**
     * DONE is returned by previous() and next() after all valid
     * boundaries have been returned.
     * @stable ICU 2.0
     */
    static const int32_t DONE;

    /**
     * Return the index of the first character in the text being scanned.
     * @stable ICU 2.0
     */
    virtual int32_t first(void) = 0;

    /**
     * Return the index immediately BEYOND the last character in the text being scanned.
     * @stable ICU 2.0
     */
    virtual int32_t last(void) = 0;

    /**
     * Return the boundary preceding the current boundary.
     * @return The character index of the previous text boundary or DONE if all
     * boundaries have been returned.
     * @stable ICU 2.0
     */
    virtual int32_t previous(void) = 0;

    /**
     * Return the boundary following the current boundary.
     * @return The character index of the next text boundary or DONE if all
     * boundaries have been returned.
     * @stable ICU 2.0
     */
    virtual int32_t next(void) = 0;

    /**
     * Return character index of the current interator position within the text.
     * @return The boundary most recently returned.
     * @stable ICU 2.0
     */
    virtual int32_t current(void) const = 0;

    /**
     * Return the first boundary following the specified offset.
     * The value returned is always greater than the offset or
     * the value BreakIterator.DONE
     * @param offset the offset to begin scanning.
     * @return The first boundary after the specified offset.
     * @stable ICU 2.0
     */
    virtual int32_t following(int32_t offset) = 0;

    /**
     * Return the first boundary preceding the specified offset.
     * The value returned is always smaller than the offset or
     * the value BreakIterator.DONE
     * @param offset the offset to begin scanning.
     * @return The first boundary before the specified offset.
     * @stable ICU 2.0
     */
    virtual int32_t preceding(int32_t offset) = 0;

    /**
     * Return true if the specfied position is a boundary position.
     * As a side effect, the current position of the iterator is set
     * to the first boundary position at or following the specified offset.
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @stable ICU 2.0
     */
    virtual UBool isBoundary(int32_t offset) = 0;

    /**
     * Return the nth boundary from the current boundary
     * @param n which boundary to return.  A value of 0
     * does nothing.  Negative values move to previous boundaries
     * and positive values move to later boundaries.
     * @return The index of the nth boundary from the current position, or
     * DONE if there are fewer than |n| boundaries in the specfied direction.
     * @stable ICU 2.0
     */
    virtual int32_t next(int32_t n) = 0;

    /**
     * Create BreakIterator for word-breaks using the given locale.
     * Returns an instance of a BreakIterator implementing word breaks.
     * WordBreak is useful for word selection (ex. double click)
     * @param where the locale.
     * @param status the error code
     * @return A BreakIterator for word-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable ICU 2.0
     */
    static BreakIterator* createWordInstance(const Locale& where,
                                                   UErrorCode& status);

    /**
     * Create BreakIterator for line-breaks using specified locale.
     * Returns an instance of a BreakIterator implementing line breaks. Line
     * breaks are logically possible line breaks, actual line breaks are
     * usually determined based on display width.
     * LineBreak is useful for word wrapping text.
     * @param where the locale.
     * @param status The error code.
     * @return A BreakIterator for line-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable ICU 2.0
     */
    static BreakIterator* createLineInstance(const Locale& where,
                                                   UErrorCode& status);

    /**
     * Create BreakIterator for character-breaks using specified locale
     * Returns an instance of a BreakIterator implementing character breaks.
     * Character breaks are boundaries of combining character sequences.
     * @param where the locale.
     * @param status The error code.
     * @return A BreakIterator for character-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable ICU 2.0
     */
    static BreakIterator* createCharacterInstance(const Locale& where,
                                                        UErrorCode& status);

    /**
     * Create BreakIterator for sentence-breaks using specified locale
     * Returns an instance of a BreakIterator implementing sentence breaks.
     * @param where the locale.
     * @param status The error code.
     * @return A BreakIterator for sentence-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable ICU 2.0
     */
    static BreakIterator* createSentenceInstance(const Locale& where,
                                                       UErrorCode& status);

    /**
     * Create BreakIterator for title-casing breaks using the specified locale
     * Returns an instance of a BreakIterator implementing title breaks.
     * The iterator returned locates title boundaries as described for 
     * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
     * please use Word Boundary iterator.{@link createWordInstance()}
     *
     * @param where the locale.
     * @param status The error code.
     * @return A BreakIterator for title-breaks.  The UErrorCode& status
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.
     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
     * used; neither the requested locale nor any of its fall back locales
     * could be found.
     * The caller owns the returned object and is responsible for deleting it.
     * @stable ICU 2.1
     */
    static BreakIterator* createTitleInstance(const Locale& where,
                                                       UErrorCode& status);

    /**
     * Get the set of Locales for which TextBoundaries are installed.
     * <p><b>Note:</b> this will not return locales added through the register
     * call.</p>
     * @param count the output parameter of number of elements in the locale list
     * @return available locales
     * @stable ICU 2.0
     */
    static const Locale* getAvailableLocales(int32_t& count);

    /**
     * Get name of the object for the desired Locale, in the desired langauge.
     * @param objectLocale must be from getAvailableLocales.
     * @param displayLocale specifies the desired locale for output.
     * @param name the fill-in parameter of the return value
     * Uses best match.
     * @return user-displayable name
     * @stable ICU 2.0
     */
    static UnicodeString& getDisplayName(const Locale& objectLocale,
                                         const Locale& displayLocale,
                                         UnicodeString& name);

    /**
     * Get name of the object for the desired Locale, in the langauge of the
     * default locale.
     * @param objectLocale must be from getMatchingLocales
     * @param name the fill-in parameter of the return value
     * @return user-displayable name
     * @stable ICU 2.0
     */
    static UnicodeString& getDisplayName(const Locale& objectLocale,
                                         UnicodeString& name);

    /**
     * Thread safe client-buffer-based cloning operation
     *    Do NOT call delete on a safeclone, since 'new' is not used to create it.
     * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
     * If buffer is not large enough, new memory will be allocated.
     * @param BufferSize reference to size of allocated space.
     * If BufferSize == 0, a sufficient size for use in cloning will
     * be returned ('pre-flighting')
     * If BufferSize is not enough for a stack-based safe clone,
     * new memory will be allocated.
     * @param status to indicate whether the operation went on smoothly or there were errors
     *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
     *  necessary.
     * @return pointer to the new clone
     *
     * @stable ICU 2.0
     */
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
                                               UErrorCode &status) = 0;

    /**
     *   Determine whether the BreakIterator was created in user memory by
     *   createBufferClone(), and thus should not be deleted.  Such objects
     *   must be closed by an explicit call to the destructor (not delete).
     *  @stable ICU 2.0
     */
    inline UBool isBufferClone(void);

    /**
     * Register a new break iterator of the indicated kind, to use in the given locale.
     * The break iterator will be adoped.  Clones of the iterator will be returned
     * if a request for a break iterator of the given kind matches or falls back to
     * this locale.
     * @param toAdopt the BreakIterator instance to be adopted
     * @param locale the Locale for which this instance is to be registered
     * @param kind the type of iterator for which this instance is to be registered
     * @param status the in/out status code, no special meanings are assigned
     * @return a registry key that can be used to unregister this instance
     * @draft ICU 2.4
     */
    static URegistryKey registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status);

    /**
     * Unregister a previously-registered BreakIterator using the key returned from the
     * register call.  Key becomes invalid after a successful call and should not be used again.
     * The BreakIterator corresponding to the key will be deleted.
     * @param key the registry key returned by a previous call to registerInstance
     * @param status the in/out status code, no special meanings are assigned
     * @return TRUE if the iterator for the key was successfully unregistered
     * @draft ICU 2.4
     */
    static UBool unregister(URegistryKey key, UErrorCode& status);

    /**
     * Return a StringEnumeration over the locales available at the time of the call, 
     * including registered locales.
     * @return a StringEnumeration over the locales available at the time of the call
     * @draft ICU 2.4
     */
    static StringEnumeration* getAvailableLocales(void);

 private:
    static BreakIterator* makeCharacterInstance(const Locale& loc, UErrorCode& status);
    static BreakIterator* makeWordInstance(const Locale& loc, UErrorCode& status);
    static BreakIterator* makeLineInstance(const Locale& loc, UErrorCode& status);
    static BreakIterator* makeSentenceInstance(const Locale& loc, UErrorCode& status);
    static BreakIterator* makeTitleInstance(const Locale& loc, UErrorCode& status);

    static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
    static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

    friend class ICUBreakIteratorFactory;
    friend class ICUBreakIteratorService;

protected:
    /** @internal */
    BreakIterator();
    /** @internal */
    UBool fBufferClone;
    /** @internal */
    BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
private:
    /**
     * The assignment operator has no real implementation.
     * It's provided to make the compiler happy. Do not call.
     */
    BreakIterator& operator=(const BreakIterator&) { return *this; }
};

inline UBool BreakIterator::isBufferClone()
{
    return fBufferClone;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif // _BRKITER
//eof


--- NEW FILE: caniter.h ---
/*
 *******************************************************************************
 * Copyright (C) 1996-2003, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

#ifndef CANITER_H
#define CANITER_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"

/** Should permutation skip characters with combining class zero
 *  Should be either TRUE or FALSE. This is a compile time option
 *  @draft ICU 2.4
 */
#ifndef CANITER_SKIP_ZEROES
#define CANITER_SKIP_ZEROES TRUE
#endif

U_NAMESPACE_BEGIN

class Hashtable;

/**
 * This class allows one to iterate through all the strings that are canonically equivalent to a given
 * string. For example, here are some sample results:
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
1: \u0041\u030A\u0064\u0307\u0327
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
2: \u0041\u030A\u0064\u0327\u0307
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
3: \u0041\u030A\u1E0B\u0327
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
4: \u0041\u030A\u1E11\u0307
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
5: \u00C5\u0064\u0307\u0327
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
6: \u00C5\u0064\u0327\u0307
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
7: \u00C5\u1E0B\u0327
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
8: \u00C5\u1E11\u0307
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
9: \u212B\u0064\u0307\u0327
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
10: \u212B\u0064\u0327\u0307
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
11: \u212B\u1E0B\u0327
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
12: \u212B\u1E11\u0307
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
 * since it has not been optimized for that situation.
 * Note, CanonicalIterator is not intended to be subclassed.
 * @author M. Davis
 * @author C++ port by V. Weinstein
 * @draft ICU 2.4
 */
class U_COMMON_API CanonicalIterator : public UObject {
public:
    /**
     * Construct a CanonicalIterator object
     * @param source    string to get results for
     * @param status    Fill-in parameter which receives the status of this operation.
     * @draft ICU 2.4
     */
    CanonicalIterator(const UnicodeString &source, UErrorCode &status);    

    /** Destructor
     *  Cleans pieces
     * @draft ICU 2.4
     */
    ~CanonicalIterator();

    /**
     * Gets the NFD form of the current source we are iterating over.
     * @return gets the source: NOTE: it is the NFD form of source
     * @draft ICU 2.4
     */
    UnicodeString getSource();    

    /**
     * Resets the iterator so that one can start again from the beginning.
     * @draft ICU 2.4
     */
    void reset();    

    /**
     * Get the next canonically equivalent string.
	 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
     * @return the next string that is canonically equivalent. A bogus string is returned when
     * the iteration is done.
     * @draft ICU 2.4
     */
    UnicodeString next();    

    /**
     * Set a new source for this iterator. Allows object reuse.
     * @param newSource     the source string to iterate against. This allows the same iterator to be used
     *                     while changing the source string, saving object creation.
     * @param status        Fill-in parameter which receives the status of this operation.
     * @draft ICU 2.4
     */
    void setSource(const UnicodeString &newSource, UErrorCode &status);    

    /**
     * Dumb recursive implementation of permutation. 
     * TODO: optimize
     * @param source     the string to find permutations for
     * @param skipZeros  determine if skip zeros
     * @param result     the results in a set.
     * @param status       Fill-in parameter which receives the status of this operation.
     * @internal 
     */
    static void permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status);     
    
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

private:
    // ===================== PRIVATES ==============================
    // private default constructor
    CanonicalIterator(); 
       

    /**
     * Copy constructor. Private for now.
     * @internal
     */
    CanonicalIterator(const CanonicalIterator& other);

    /**
     * Assignment operator. Private for now.
     * @internal
     */
    CanonicalIterator& operator=(const CanonicalIterator& other);

    // fields
    UnicodeString source;
    UBool done;

    // 2 dimensional array holds the pieces of the string with
    // their different canonically equivalent representations
    UnicodeString **pieces;
    int32_t pieces_length;
    int32_t *pieces_lengths;

    // current is used in iterating to combine pieces
    int32_t *current;
    int32_t current_length;
    
    // transient fields
    UnicodeString buffer;
    
    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
    UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
    
    //Set getEquivalents2(String segment);
    Hashtable *getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status);
    //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);
    
    /**
     * See if the decomposition of cp2 is at segment starting at segmentPos 
     * (with canonical rearrangment!)
     * If so, take the remainder, and return the equivalents 
     */
    //Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);
    Hashtable *extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
    //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);

    void cleanPieces();

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline UClassID
CanonicalIterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
CanonicalIterator::getDynamicClassID() const 
{ return CanonicalIterator::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif


--- NEW FILE: dbbi.h ---
/*
**********************************************************************
*   Copyright (C) 1999-2003 IBM Corp. All rights reserved.
**********************************************************************
*   Date        Name        Description
*   12/1/99    rgillam     Complete port from Java.
*   01/13/2000 helena      Added UErrorCode to ctors.
**********************************************************************
*/

#ifndef DBBI_H
#define DBBI_H

#include "unicode/rbbi.h"

#if !UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

/* forward declaration */
class DictionaryBasedBreakIteratorTables;

/**
 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
 * to further subdivide ranges of text beyond what is possible using just the
 * state-table-based algorithm.  This is necessary, for example, to handle
 * word and line breaking in Thai, which doesn't use spaces between words.  The
 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
 * up text as far as possible, and then contiguous ranges of letters are
 * repeatedly compared against a list of known words (i.e., the dictionary)
 * to divide them up into words.
 *
 * <p>Applications do not normally need to include this header.</p>
 *
 * <p>This class will probably be deprecated in a future release of ICU, and replaced
 *  with a more flexible and capable dictionary based break iterator.  This change
 *  should be invisible to applications, because creation and use of instances of
 *  DictionaryBasedBreakIterator is through the factories and abstract
 *  API on class BreakIterator, which will remain stable.</p>
 *
 * <p>This class is not intended to be subclassed.</p>
 *
 *
 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
 * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
 * name is used to identify characters in words in the dictionary.  The idea is that
 * if the iterator passes over a chunk of text that includes two or more characters
 * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
 * derives additional break positions (if possible) using the dictionary.
 *
 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
 * file.  It follows a prescribed search path to locate the dictionary (right now,
 * it looks for it in /com/ibm/text/resources in each directory in the classpath,
 * and won't find it in JAR files, but this location is likely to change).  The
 * dictionary file is in a serialized binary format.  We have a very primitive (and
 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
 * currently making it public.  Contact us for help.
 * <p>
 * <b> NOTE </b>  The DictionaryBasedIterator class is still under development.  The
 * APIs are not in stable condition yet.  
 */
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {

private:

    /**
     * when a range of characters is divided up using the dictionary, the break
     * positions that are discovered are stored here, preventing us from having
     * to use either the dictionary or the state table again until the iterator
     * leaves this range of text
     */
    int32_t* cachedBreakPositions;

    /**
     * The number of elements in cachedBreakPositions
     */
    int32_t numCachedBreakPositions;

    /**
     * if cachedBreakPositions is not null, this indicates which item in the
     * cache the current iteration position refers to
     */
    int32_t positionInCache;

    DictionaryBasedBreakIteratorTables  *fTables;

    /**
     * Class ID
     */
    static const char fgClassID;

    /**=======================================================================
     * Create a dictionary based break boundary detection iterator.  
     * @param tablesImage The location for the dictionary to be loaded into memory
     * @param dictionaryFilename The name of the dictionary file 
     * @param status the error code status
     * @return A dictionary based break detection iterator.  The UErrorCode& status 
     * parameter is used to return status information to the user.
     * To check whether the construction succeeded or not, you should check
     * the value of U_SUCCESS(err).  If you wish more detailed information, you
     * can check for informational error results which still indicate success.  For example,
     * U_FILE_ACCESS_ERROR will be returned if the file does not exist.
     * The caller owns the returned object and is responsible for deleting it.
     ======================================================================= */
    DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status);

public:
    //=======================================================================
    // boilerplate
    //=======================================================================

    /**
     * Destructor
     * @stable ICU 2.0
     */
    virtual ~DictionaryBasedBreakIterator();

    /**
     * Default constructor.  Creates an "empty" break iterator.
     * Such an iterator can subsequently be assigned to.
     * @return the newly created DictionaryBaseBreakIterator.
     * @stable ICU 2.0
     */
     DictionaryBasedBreakIterator();

     /**
      * Copy constructor.
      * @param other The DictionaryBasedBreakIterator to be copied.
      * @return the newly created DictionaryBasedBreakIterator.
      * @stable ICU 2.0
      */
     DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other);

    /**
     * Assignment operator. 
     * @param that The object to be copied.
     * @return the newly set DictionaryBasedBreakIterator.
     * @stable ICU 2.0
     */
    DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);

    /**
     * Returns a newly-constructed RuleBasedBreakIterator with the same
     * behavior, and iterating over the same text, as this one.
     * @return Returns a newly-constructed RuleBasedBreakIterator.
     * @stable ICU 2.0
     */
    virtual BreakIterator* clone(void) const;

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================
    /**
     * Advances the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one.
     * @stable ICU 2.0
     */
    virtual int32_t previous(void);

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     * @stable ICU 2.0
     */
    virtual int32_t following(int32_t offset);

    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    virtual int32_t preceding(int32_t offset);

    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
     * This method is to implement a simple version of RTTI, since not all
     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     *
     * @return          The class ID for this object. All objects of a
     *                  given class have the same class ID.  Objects of
     *                  other classes have different class IDs.
     * @stable ICU 2.0
     */
    virtual UClassID getDynamicClassID(void) const;

    /**
     * Returns the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return          The class ID for all objects of this class.
     * @stable ICU 2.0
     */
    static inline UClassID getStaticClassID(void);

protected:
    //=======================================================================
    // implementation
    //=======================================================================
    /**
     * This method is the actual implementation of the next() method.  All iteration
     * vectors through here.  This method initializes the state machine to state 1
     * and advances through the text character by character until we reach the end
     * of the text or the state machine transitions to state 0.  We update our return
     * value every time the state machine passes through a possible end state.
     * @internal
     */
    virtual int32_t handleNext(void);

    /**
     * removes the cache of break positions (usually in response to a change in
     * position of some sort)
     * @internal
     */
    virtual void reset(void);

    /**
     *  init    Initialize a dbbi.  Common routine for use by constructors.
     *  @internal
     */
    void init();

    /**
     * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. 
     * If buffer is not large enough, new memory will be allocated.
     * @param BufferSize reference to size of allocated space. 
     * If BufferSize == 0, a sufficient size for use in cloning will 
     * be returned ('pre-flighting')
     * If BufferSize is not enough for a stack-based safe clone, 
     * new memory will be allocated.
     * @param status to indicate whether the operation went on smoothly or there were errors
     *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were 
     *  necessary.
     * @return pointer to the new clone
     * @internal
     */
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
                                               UErrorCode &status);


private:
    /**
     * This is the function that actually implements the dictionary-based
     * algorithm.  Given the endpoints of a range of text, it uses the
     * dictionary to determine the positions of any boundaries in this
     * range.  It stores all the boundary positions it discovers in
     * cachedBreakPositions so that we only have to do this work once
     * for each time we enter the range.
     * @param startPos The start position of a range of text
     * @param endPos The end position of a range of text
     * @param status The error code status
     */
    void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);


    /*
     * HSYS : Please revisit with Rich, the ctors of the DBBI class is currently
     * marked as private.
     */
    friend class DictionaryBasedBreakIteratorTables;
    friend class BreakIterator;
};

inline UClassID
DictionaryBasedBreakIterator::getStaticClassID(void)
{ return (UClassID)(&fgClassID); }

inline UClassID
DictionaryBasedBreakIterator::getDynamicClassID(void) const
{ return DictionaryBasedBreakIterator::getStaticClassID(); }

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif




--- NEW FILE: parseerr.h ---
/*
**********************************************************************
*   Copyright (C) 1999-2000, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   03/14/00    aliu        Creation.
*   06/27/00    aliu        Change from C++ class to C struct
**********************************************************************
*/
#ifndef PARSEERR_H
#define PARSEERR_H

#include "unicode/utypes.h"


/**
 * The capacity of the context strings in UParseError.
 * @stable ICU 2.0
 */ 
enum { U_PARSE_CONTEXT_LEN = 16 };

/**
 * A UParseError struct is used to returned detailed information about
 * parsing errors.  It is used by ICU parsing engines that parse long
 * rules, patterns, or programs, where the text being parsed is long
 * enough that more information than a UErrorCode is needed to
 * localize the error.
 *
 * <p>The code field is an integer error code specific to each parsing
 * engine, but globally unique.  See the engine header file for
 * possible values.  The line, offset, and context fields are
 * optional; parsing engines may choose not to use to use them.
 *
 * <p>Examples of engines which use UParseError (or may use it in the
 * future) are RuleBasedTransliterator and RuleBasedBreakIterator.
 * 
 * @stable ICU 2.0
 */
typedef struct UParseError {

    /**
     * An integer indicating the type of error.  If no error was
     * encountered, the parse engine sets this to zero, and the
     * other fields' values should be ignored.
     *
     * <p>Each parse engine should use a range of codes from
     * 0xNNNN0001 to 0xNNNNFFFF, where NNNN is a 16-bit integer
     * between 0x0001 and 0xFFFF unique to each parse engine.
     * Parse engines should define the enum PARSE_ERROR_BASE
     * to be 0xNNNN0000.
     */
    /*int32_t        code; */

    /**
     * The line on which the error occured.  If the parse engine
     * is not using this field, it should set it to zero.  Otherwise
     * it should be a positive integer. The default value of this field
     * is -1. It will be set to 0 if the code populating this struct is not
     * using line numbers.
     * @stable ICU 2.0    
     */
    int32_t        line;

    /**
     * The character offset to the error.  If the line field is
     * being used, then this offset is from the start of the line.
     * If the line field is not being used, then this offset is from
     * the start of the text.The default value of this field
     * is -1. It will be set to appropriate value by the code that 
     * populating the struct.
     * @stable ICU 2.0   
     */
    int32_t    offset;

    /**
     * Textual context before the error.  Null-terminated.
     * May be the empty string if not implemented by parser.
     * @stable ICU 2.0   
     */
    UChar          preContext[U_PARSE_CONTEXT_LEN];

    /**
     * Textual context after the error.  Null-terminated.
     * May be the empty string if not implemented by parser.
     * @stable ICU 2.0   
     */
    UChar          postContext[U_PARSE_CONTEXT_LEN];

} UParseError;

#endif

--- NEW FILE: parsepos.h ---
/*
* Copyright (C) {1997-2003}, International Business Machines Corporation and others. All Rights Reserved.
*******************************************************************************
*
* File PARSEPOS.H
*
* Modification History:
*
*   Date        Name        Description
*   07/09/97    helena      Converted from java.
*   07/17/98    stephen     Added errorIndex support.
*   05/11/99    stephen     Cleaned up.
*******************************************************************************
*/

#ifndef PARSEPOS_H
#define PARSEPOS_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"

U_NAMESPACE_BEGIN

/**
 * <code>ParsePosition</code> is a simple class used by <code>Format</code>
 * and its subclasses to keep track of the current position during parsing.
 * The <code>parseObject</code> method in the various <code>Format</code>
 * classes requires a <code>ParsePosition</code> object as an argument.
 *
 * <p>
 * By design, as you parse through a string with different formats,
 * you can use the same <code>ParsePosition</code>, since the index parameter
 * records the current position.
 *
 * The ParsePosition class is not suitable for subclassing.
 *
 * @version     1.3 10/30/97
 * @author      Mark Davis, Helena Shih
 * @see         java.text.Format
 */

class U_COMMON_API ParsePosition : public UObject {
public:
    /**
     * Default constructor, the index starts with 0 as default.
     * @stable ICU 2.0
     */
    ParsePosition()
        : UObject()
      { this->index = 0; this->errorIndex = -1; }

    /**
     * Create a new ParsePosition with the given initial index.
     * @param newIndex the new text offset.
     * @stable ICU 2.0
     */
    ParsePosition(int32_t newIndex)
        : UObject()
      {    this->index = newIndex; this->errorIndex = -1; }

    /**
     * Copy constructor
     * @param copy the object to be copied from.
     * @stable ICU 2.0
     */
    ParsePosition(const ParsePosition& copy)
        : UObject(copy)
      {    this->index = copy.index; this->errorIndex = copy.errorIndex; }

    /**
     * Destructor
     * @stable ICU 2.0
     */
    ~ParsePosition() {}

    /**
     * Assignment operator
     * @stable ICU 2.0
     */
    ParsePosition&      operator=(const ParsePosition& copy);

    /**
     * Equality operator.
     * @return TRUE if the two parse positions are equal, FALSE otherwise.
     * @stable ICU 2.0
     */
    UBool              operator==(const ParsePosition& that) const;

    /**
     * Equality operator.
     * @return TRUE if the two parse positions are not equal, FALSE otherwise.
     * @stable ICU 2.0
     */
    UBool              operator!=(const ParsePosition& that) const;

    /**
     * Retrieve the current parse position.  On input to a parse method, this
     * is the index of the character at which parsing will begin; on output, it
     * is the index of the character following the last character parsed.
     * @return the current index.
     * @stable ICU 2.0
     */
    int32_t getIndex(void) const;

    /**
     * Set the current parse position.
     * @param index the new index.
     * @stable ICU 2.0
     */
    void setIndex(int32_t index);

    /**
     * Set the index at which a parse error occurred.  Formatters
     * should set this before returning an error code from their
     * parseObject method.  The default value is -1 if this is not
     * set.
     * @stable ICU 2.0
     */
    void setErrorIndex(int32_t ei);

    /**
     * Retrieve the index at which an error occurred, or -1 if the
     * error index has not been set.
     * @stable ICU 2.0
     */
    int32_t getErrorIndex(void) const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

private:
    /**
     * Input: the place you start parsing.
     * <br>Output: position where the parse stopped.
     * This is designed to be used serially,
     * with each call setting index up for the next one.
     */
    int32_t index;

    /**
     * The index at which a parse error occurred.
     */
    int32_t errorIndex;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline UClassID
ParsePosition::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
ParsePosition::getDynamicClassID() const
{ return ParsePosition::getStaticClassID(); }

inline ParsePosition&
ParsePosition::operator=(const ParsePosition& copy)
{
  index = copy.index;
  errorIndex = copy.errorIndex;
  return *this;
}

inline UBool
ParsePosition::operator==(const ParsePosition& copy) const
{
  if(index != copy.index || errorIndex != copy.errorIndex)
  return FALSE;
  else
  return TRUE;
}

inline UBool
ParsePosition::operator!=(const ParsePosition& copy) const
{
  return !operator==(copy);
}

inline int32_t
ParsePosition::getIndex() const
{
  return index;
}

inline void
ParsePosition::setIndex(int32_t offset)
{
  this->index = offset;
}

inline int32_t
ParsePosition::getErrorIndex() const
{
  return errorIndex;
}

inline void
ParsePosition::setErrorIndex(int32_t ei)
{
  this->errorIndex = ei;
}
U_NAMESPACE_END

#endif






--- NEW FILE: rbbi.h ---
/*
***************************************************************************
*   Copyright (C) 1999-2003 International Business Machines Corporation   *
*   and others. All rights reserved.                                      *
***************************************************************************

**********************************************************************
*   Date        Name        Description
*   10/22/99    alan        Creation.
*   11/11/99    rgillam     Complete port from Java.
**********************************************************************
*/

#ifndef RBBI_H
#define RBBI_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/parseerr.h"

struct UTrie;

U_NAMESPACE_BEGIN

struct RBBIDataHeader;
class  RuleBasedBreakIteratorTables;
class  BreakIterator;
class  RBBIDataWrapper;



/**
 * A subclass of BreakIterator whose behavior is specified using a list of rules.
 * <p>Instances of this class are most commonly created by the factory methods of
 *  BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
 *  and then used via the abstract API in class BreakIterator</p>
 *
 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
 *
 * <p>This class is not intended to be subclassed.  (Class DictionaryBasedBreakIterator
 *    is a subclass, but that relationship is effectively internal to the ICU 
 *    implementation.  The subclassing interface to RulesBasedBreakIterator is
 *    not part of the ICU API, and may not remain stable.</p>
 *
 */
class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {

protected:
    /**
     * The character iterator through which this BreakIterator accesses the text
     * @internal
     */
    CharacterIterator*  fText;

    /**
     * The rule data for this BreakIterator instance
     * @internal
     */
    RBBIDataWrapper    *fData;
    /** @internal */
    UTrie              *fCharMappings;

    /** Rule {tag} value for the most recent match. 
     *  @internal
    */
    int32_t             fLastBreakTag;

    /**
     * Rule tag value valid flag.
     * Some iterator operations don't intrinsically set the correct tag value.
     * This flag lets us lazily compute the value if we are ever asked for it.
     * @internal
     */
    UBool               fLastBreakTagValid;

    /**
     * Counter for the number of characters encountered with the "dictionary"
     *   flag set.  Normal RBBI iterators don't use it, although the code
     *   for updating it is live.  Dictionary Based break iterators (a subclass
     *   of us) access this field directly.
     * @internal
     */
    uint32_t           fDictionaryCharCount;

    /**
     * Debugging flag.  Trace operation of state machine when true.
     * @internal
     */
    static UBool        fTrace;



private:
    /**
     * Class ID
     */
    static const char fgClassID;

protected:
    //=======================================================================
    // constructors
    //=======================================================================

    /**
     * This constructor uses the udata interface to create a BreakIterator
     * whose internal tables live in a memory-mapped file.  "image" is a pointer
     * to the beginning of that file.
     * @internal
     */
    RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);

    /**
     * Constructor from a flattened set of RBBI data in malloced memory.
     *             RulesBasedBreakIterators built from a custom set of rules
     *             are created via this constructor; the rules are compiled
     *             into memory, then the break iterator is constructed here.
     *
     *             The break iterator adopts the memory, and will
     *             free it when done.
     * @internal
     */
    RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);

    friend class RBBIRuleBuilder; /** @internal */
    friend class BreakIterator;



public:

    /** Default constructor.  Creates an empty shell of an iterator, with no
     *  rules or text to iterate over.   Object can subsequently be assigned to.
     *  @draft ICU 2.2
     */
    RuleBasedBreakIterator();

    /**
     * Copy constructor.  Will produce a break iterator with the same behavior,
     * and which iterates over the same text, as the one passed in.
     * @param that The RuleBasedBreakIterator passed to be copied
     * @stable ICU 2.0
     */
    RuleBasedBreakIterator(const RuleBasedBreakIterator& that);

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
     * @param rules The break rules to be used.
     * @param parseError  In the event of a syntax error in the rules, provides the location
     *                    within the rules of the problem.
     * @param status Information on any errors encountered.
     *  @draft ICU 2.2
     */
    RuleBasedBreakIterator( const UnicodeString    &rules,
                             UParseError           &parseError,
                             UErrorCode            &status);
    /**
     * Destructor
     *  @stable ICU 2.0
     */
    virtual ~RuleBasedBreakIterator();

    /**
     * Assignment operator.  Sets this iterator to have the same behavior,
     * and iterate over the same text, as the one passed in.
     * @param that The RuleBasedBreakItertor passed in
     * @return the newly created RuleBasedBreakIterator
     *  @stable ICU 2.0
     */
    RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);

    /**
     * Equality operator.  Returns TRUE if both BreakIterators are of the
     * same class, have the same behavior, and iterate over the same text.
     * @param that The BreakIterator to be compared for equality
     * @Return TRUE if both BreakIterators are of the
     * same class, have the same behavior, and iterate over the same text.
     *  @stable ICU 2.0
     */
    virtual UBool operator==(const BreakIterator& that) const;

    /**
     * Not-equal operator.  If operator== returns TRUE, this returns FALSE,
     * and vice versa.
     * @param that The BreakIterator to be compared for inequality
     * @return TRUE if both BreakIterators are not same.
     *  @stable ICU 2.0
     */
    UBool operator!=(const BreakIterator& that) const;

    /**
     * Returns a newly-constructed RuleBasedBreakIterator with the same
     * behavior, and iterating over the same text, as this one.
     * Differs from the copy constructor in that it is polymorphic, and
     *   will correctly clone (copy) a derived class.
     * clone() is thread safe.  Multiple threads may simultaeneously
     * clone the same source break iterator.
     *  @stable ICU 2.0
     */
    virtual BreakIterator* clone() const;

    /**
     * Compute a hash code for this BreakIterator
     * @return A hash code
     *  @stable ICU 2.0
     */
    virtual int32_t hashCode(void) const;

    /**
     * Returns the description used to create this iterator
     * @return the description used to create this iterator
     *  @stable ICU 2.0
     */
    virtual const UnicodeString& getRules(void) const;

    //=======================================================================
    // BreakIterator overrides
    //=======================================================================

    /**
     * Return a CharacterIterator over the text being analyzed.  This version
     * of this method returns the actual CharacterIterator we're using internally.
     * Changing the state of this iterator can have undefined consequences.  If
     * you need to change it, clone it first.
     * @return An iterator over the text being analyzed.
     *  @stable ICU 2.0
     */
    virtual const CharacterIterator& getText(void) const;


    /**
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
     * @param newText An iterator over the text to analyze.  The BreakIterator
     * takes ownership of the character iterator.  The caller MUST NOT delete it!
     *  @stable ICU 2.0
     */
    virtual void adoptText(CharacterIterator* newText);

    /**
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
     * @param newText The text to analyze.
     *  @stable ICU 2.0
     */
    virtual void setText(const UnicodeString& newText);

    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
     * @return The offset of the beginning of the text.
     *  @stable ICU 2.0
     */
    virtual int32_t first(void);

    /**
     * Sets the current iteration position to the end of the text.
     * (i.e., the CharacterIterator's ending offset).
     * @return The text's past-the-end offset.
     *  @stable ICU 2.0
     */
    virtual int32_t last(void);

    /**
     * Advances the iterator either forward or backward the specified number of steps.
     * Negative values move backward, and positive values move forward.  This is
     * equivalent to repeatedly calling next() or previous().
     * @param n The number of steps to move.  The sign indicates the direction
     * (negative is backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from
     * the current one.
     *  @stable ICU 2.0
     */
    virtual int32_t next(int32_t n);

    /**
     * Advances the iterator to the next boundary position.
     * @return The position of the first boundary after this one.
     *  @stable ICU 2.0
     */
    virtual int32_t next(void);

    /**
     * Moves the iterator backwards, to the last boundary preceding this one.
     * @return The position of the last boundary position preceding this one.
     *  @stable ICU 2.0
     */
    virtual int32_t previous(void);

    /**
     * Sets the iterator to refer to the first boundary position following
     * the specified position.
     * @param offset The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     *  @stable ICU 2.0
     */
    virtual int32_t following(int32_t offset);

    /**
     * Sets the iterator to refer to the last boundary position before the
     * specified position.
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     *  @stable ICU 2.0
     */
    virtual int32_t preceding(int32_t offset);

    /**
     * Returns true if the specfied position is a boundary position.  As a side
     * effect, leaves the iterator pointing to the first boundary position at
     * or after "offset".
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     *  @stable ICU 2.0
     */
    virtual UBool isBoundary(int32_t offset);

    /**
     * Returns the current iteration position.
     * @return The current iteration position.
     * @stable ICU 2.0
     */
    virtual int32_t current(void) const;


    /**
     * Return the status tag from the break rule that determined the most recently
     * returned break position.  The values appear in the rule source
     * within brackets, {123}, for example.  For rules that do not specify a
     * status, a default value of 0 is returned.
     * <p>
     * Of the standard types of ICU break iterators, only the word break
     * iterator provides status values.  The values are defined in
     * <code>enum UWordBreak</code>, and allow distinguishing between words
     * that contain alphabetic letters, "words" that appear to be numbers,
     * punctuation and spaces, words containing ideographic characters, and
     * more.  Call <code>getRuleStatus</code> after obtaining a boundary
     * position from <code>next()<code>, <code>previous()</code>, or 
     * any other break iterator functions that returns a boundary position.
     * <p>
     * @return the status from the break rule that determined the most recently
     * returned break position.
     *
     * @see UWordBreak
     * @draft ICU 2.2
     */
    virtual int32_t getRuleStatus() const;

    /**
     * Returns a unique class ID POLYMORPHICALLY.  Pure virtual override.
     * This method is to implement a simple version of RTTI, since not all
     * C++ compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     *
     * @return          The class ID for this object. All objects of a
     *                  given class have the same class ID.  Objects of
     *                  other classes have different class IDs.
     * @stable ICU 2.0
     */
    inline virtual UClassID getDynamicClassID(void) const;

    /**
     * Returns the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return          The class ID for all objects of this class.
     * @stable ICU 2.0
     */
    inline static UClassID getStaticClassID(void);

    /*
     * Create a clone (copy) of this break iterator in memory provided
     *  by the caller.  The idea is to increase performance by avoiding
     *  a storage allocation.  Use of this functoin is NOT RECOMMENDED.
     *  Performance gains are minimal, and correct buffer management is
     *  tricky.  Use clone() instead.
     *
     * @param stackBuffer  The pointer to the memory into which the cloned object
     *                     should be placed.  If NULL,  allocate heap memory
     *                     for the cloned object.
     * @param BufferSize   The size of the buffer.  If zero, return the required
     *                     buffer size, but do not clone the object.  If the
     *                     size was too small (but not zero), allocate heap
     *                     storage for the cloned object.
     *
     * @param status       Error status.  U_SAFECLONE_ALLOCATED_WARNING will be
     *                     returned if the the provided buffer was too small, and
     *                     the clone was therefore put on the heap.
     *
     * @return  Pointer to the clone object.  This may differ from the stackBuffer
     *          address if the byte alignment of the stack buffer was not suitable
     *          or if the stackBuffer was too small to hold the clone.
     * @stable ICU 2.0
     */
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
                                               UErrorCode &status);


    /**
     * Return the binary form of compiled break rules,
     * which can then be used to create a new break iterator at some
     * time in the future.  Creating a break iterator from pre-compiled rules
     * is much faster than building one from the source form of the
     * break rules.
     *
     * The binary data is can only be used with the same version of ICU
     *  and on the same platform type (processor endian-ness)
     *
     * @param length Returns the length of the binary data.  (Out paramter.)
     *
     * @return   A pointer to the binary (compiled) rule data.  The storage
     *           belongs to the RulesBasedBreakIterator object, not the
     *           caller, and must not be modified or deleted.
     * @internal
     */
    virtual const uint8_t *getBinaryRules(uint32_t &length);


protected:
    //=======================================================================
    // implementation
    //=======================================================================
    /**
     * This method is the actual implementation of the next() method.  All iteration
     * vectors through here.  This method initializes the state machine to state 1
     * and advances through the text character by character until we reach the end
     * of the text or the state machine transitions to state 0.  We update our return
     * value every time the state machine passes through a possible end state.
     * @internal
     */
    virtual int32_t handleNext(void);

    /**
     * This method backs the iterator back up to a "safe position" in the text.
     * This is a position that we know, without any context, must be a break position.
     * The various calling methods then iterate forward from this safe position to
     * the appropriate position to return.  (For more information, see the description
     * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
     * @internal
     */
    virtual int32_t handlePrevious(void);

    /**
     * Dumps caches and performs other actions associated with a complete change
     * in text or iteration position.  This function is a no-op in RuleBasedBreakIterator,
     * but subclasses can and do override it.
     * @internal
     */
    virtual void reset(void);

    /**
      * Return true if the category lookup for this char
      * indicates that it is in the set of dictionary lookup chars.
      * This function is intended for use by dictionary based break iterators.
      * @return true if the category lookup for this char
      * indicates that it is in the set of dictionary lookup chars.
      * @internal
      */
    virtual UBool isDictionaryChar(UChar32);

    /**
      * Common initialization function, used by constructors and bufferClone.
      *   (Also used by DictionaryBasedBreakIterator::createBufferClone().)
      * @internal
      */
    void init();

};

//----------------------------------------------------------------------------------
//
//   Inline Functions Definitions ...
//
//----------------------------------------------------------------------------------

inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
    return !operator==(that);
}

inline UClassID RuleBasedBreakIterator::getStaticClassID(void) {
    return (UClassID)(&fgClassID);
}

inline UClassID RuleBasedBreakIterator::getDynamicClassID(void) const {
    return RuleBasedBreakIterator::getStaticClassID();
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif




--- NEW FILE: strenum.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*/

#ifndef STRENUM_H
#define STRENUM_H

#include "unicode/uobject.h"

U_NAMESPACE_BEGIN

class UnicodeString;

/**
 * Base class for 'pure' C++ implementations of uenum api.  Adds a
 * method that returns the next UnicodeString since in C++ this can
 * be a common storage format for strings.
 *
 * <p>The model is that the enumeration is over strings maintained by
 * a 'service.'  At any point, the service might change, invalidating
 * the enumerator (though this is expected to be rare).  The iterator
 * returns an error if this has occurred.  Lack of the error is no
 * guarantee that the service didn't change immediately after the
 * call, so the returned string still might not be 'valid' on
 * subsequent use.</p>
 *
 * <p>Strings may take the form of const char*, const UChar*, or const
 * UnicodeString*.  The type you get is determine by the variant of
 * 'next' that you call.  In general the StringEnumeration is
 * optimized for one of these types, but all StringEnumerations can
 * return all types.  Returned strings are each terminated with a NUL.
 * Depending on the service data, they might also include embedded NUL
 * characters, so API is provided to optionally return the true
 * length, counting the embedded NULs but not counting the terminating
 * NUL.</p>
 *
 * <p>The pointers returned by next, unext, and snext become invalid
 * upon any subsequent call to the enumeration's destructor, next,
 * unext, snext, or reset.</p>
 *
 * @draft ICU 2.4 
 */
class U_COMMON_API StringEnumeration : public UObject { 
 public:
  /**
   * Destructor.
   * @draft ICU 2.4
   */
  virtual ~StringEnumeration();

  /**
   * <p>Return the number of elements that the iterator traverses.  If
   * the iterator is out of sync with its service, status is set to
   * U_ENUM_OUT_OF_SYNC_ERROR, and the return value is zero.</p>
   *
   * <p>The return value will not change except possibly as a result of
   * a subsequent call to reset, or if the iterator becomes out of sync.</p>
   *
   * <p>This is a convenience function. It can end up being very
   * expensive as all the items might have to be pre-fetched
   * (depending on the storage format of the data being
   * traversed).</p>
   *
   * @param status the error code.
   * @return number of elements in the iterator.
   *
   * @draft ICU 2.4 */
  virtual int32_t count(UErrorCode& status) const = 0;

  /**
   * <p>Returns the next element as a NUL-terminated char*.  If there
   * are no more elements, returns NULL.  If the resultLength pointer
   * is not NULL, the length of the string (not counting the
   * terminating NUL) is returned at that address.  If an error
   * status is returned, the value at resultLength is undefined.</p>
   *
   * <p>The returned pointer is owned by this iterator and must not be
   * deleted by the caller.  The pointer is valid until the next call
   * to next, unext, snext, reset, or the enumerator's destructor.</p>
   *
   * <p>If the iterator is out of sync with its service, status is set
   * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
   *
   * <p>If the native service string is a UChar* string, it is
   * converted to char* with the invariant converter.  If the
   * conversion fails (because a character cannot be converted) then
   * status is set to U_INVARIANT_CONVERSION_ERROR and the return
   * value is undefined (though not NULL).</p>
   *
   * @param status the error code.
   * @param resultLength a pointer to receive the length, can be NULL.
   * @return a pointer to the string, or NULL.
   *
   * @draft ICU 2.4 
   */
  virtual const char* next(int32_t *resultLength, UErrorCode& status) = 0;

  /**
   * <p>Returns the next element as a NUL-terminated UChar*.  If there
   * are no more elements, returns NULL.  If the resultLength pointer
   * is not NULL, the length of the string (not counting the
   * terminating NUL) is returned at that address.  If an error
   * status is returned, the value at resultLength is undefined.</p>
   *
   * <p>The returned pointer is owned by this iterator and must not be
   * deleted by the caller.  The pointer is valid until the next call
   * to next, unext, snext, reset, or the enumerator's destructor.</p>
   *
   * <p>If the iterator is out of sync with its service, status is set
   * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
   *
   * @param status the error code.
   * @param resultLength a ponter to receive the length, can be NULL.
   * @return a pointer to the string, or NULL.
   *
   * @draft ICU 2.4 
   */
  virtual const UChar* unext(int32_t *resultLength, UErrorCode& status) = 0;

  /**
   * <p>Returns the next element a UnicodeString*.  If there are no
   * more elements, returns NULL.</p>
   *
   * <p>The returned pointer is owned by this iterator and must not be
   * deleted by the caller.  The pointer is valid until the next call
   * to next, unext, snext, reset, or the enumerator's destructor.</p>
   *
   * <p>If the iterator is out of sync with its service, status is set
   * to U_ENUM_OUT_OF_SYNC_ERROR and NULL is returned.</p>
   *
   * @param status the error code.
   * @return a pointer to the string, or NULL.
   *
   * @draft ICU 2.4 
   */
  virtual const UnicodeString* snext(UErrorCode& status) = 0;

  /**
   * <p>Resets the iterator.  This re-establishes sync with the
   * service and rewinds the iterator to start at the first
   * element.</p>
   *
   * <p>Previous pointers returned by next, unext, or snext become
   * invalid, and the value returned by count might change.</p>
   *
   * @param status the error code.
   *
   * @draft ICU 2.4 
   */
  virtual void reset(UErrorCode& status) = 0;
};

inline StringEnumeration::~StringEnumeration() {
}

U_NAMESPACE_END

/* STRENUM_H */
#endif


--- NEW FILE: ubrk.h ---
/*
* Copyright (C) 1996-2003, International Business Machines Corporation and others. All Rights Reserved.
*****************************************************************************************
*/

#ifndef UBRK_H
#define UBRK_H

#include "unicode/utypes.h"

/**
 * A text-break iterator.
 *  For usage in C programs.
 */
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
#   define UBRK_TYPEDEF_UBREAK_ITERATOR
    /**
     *  Opaque type representing an ICU Break iterator object.
     *  @stable ICU 2.0
     */
    typedef void UBreakIterator;
#endif

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/parseerr.h"

/**
 * \file
 * \brief C API: BreakIterator
 *
 * <h2> BreakIterator C API </h2>
 *
 * The BreakIterator C API defines  methods for finding the location
 * of boundaries in text. Pointer to a UBreakIterator maintain a
 * current position and scan over text returning the index of characters
 * where boundaries occur.
 * <P>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <P>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <P>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <P>
 * Character boundary analysis allows users to interact with
 * characters as they expect to, for example, when moving the cursor
 * through a text string. Character boundary analysis provides correct
 * navigation of through character strings, regardless of how the
 * character is stored.  For example, an accented character might be
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
 * <P>
 * Title boundary analysis locates all positions,
 * typically starts of words, that should be set to Title Case
 * when title casing the text.
 * <P>
 *
 * This is the interface for all text boundaries.
 * <P>
 * Examples:
 * <P>
 * Helper function to output text
 * <pre>
 * \code
 *    void printTextRange(UChar* str, int32_t start, int32_t end ) {
 *         UChar* result;
 *         UChar* temp;
 *         const char* res;
 *         temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));
 *         result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
 *         u_strcpy(temp, &str[start]);
 *         u_strncpy(result, temp, end-start);
 *         res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
 *         u_austrcpy(res, result);
 *         printf("%s\n", res);
 *    }
 * \endcode
 * </pre>
 * Print each element in order:
 * <pre>
 * \code
 *    void printEachForward( UBreakIterator* boundary, UChar* str) {
 *       int32_t end;
 *       int32_t start = ubrk_first(boundary);
 *       for (end = ubrk_next(boundary)); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
 *             printTextRange(str, start, end );
 *         }
 *    }
 * \endcode
 * </pre>
 * Print each element in reverse order:
 * <pre>
 * \code
 *    void printEachBackward( UBreakIterator* boundary, UChar* str) {
 *       int32_t start;
 *       int32_t end = ubrk_last(boundary);
 *       for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start, start =ubrk_previous(boundary)) {
 *             printTextRange( str, start, end );
 *         }
 *    }
 * \endcode
 * </pre>
 * Print first element
 * <pre>
 * \code
 *    void printFirst(UBreakIterator* boundary, UChar* str) {
 *        int32_t end;
 *        int32_t start = ubrk_first(boundary);
 *        end = ubrk_next(boundary);
 *        printTextRange( str, start, end );
 *    }
 * \endcode
 * </pre>
 * Print last element
 * <pre>
 * \code
 *    void printLast(UBreakIterator* boundary, UChar* str) {
 *        int32_t start;
 *        int32_t end = ubrk_last(boundary);
 *        start = ubrk_previous(boundary);
 *        printTextRange(str, start, end );
 *    }
 * \endcode
 * </pre>
 * Print the element at a specified position
 * <pre>
 * \code
 *    void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) {
 *        int32_t start;
 *        int32_t end = ubrk_following(boundary, pos);
 *        start = ubrk_previous(boundary);
 *        printTextRange(str, start, end );
 *    }
 * \endcode
 * </pre>
 * Creating and using text boundaries
 * <pre>
 * \code
 *       void BreakIterator_Example( void ) {
 *           UBreakIterator* boundary;
 *           UChar *stringToExamine;
 *           stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
 *           u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
 *           printf("Examining: "Aaa bbb ccc. Ddd eee fff.");
 *
 *           //print each sentence in forward and reverse order
 *           boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
 *           printf("----- forward: -----------\n");
 *           printEachForward(boundary, stringToExamine);
 *           printf("----- backward: ----------\n");
 *           printEachBackward(boundary, stringToExamine);
 *           ubrk_close(boundary);
 *
 *           //print each word in order
 *           boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
 *           printf("----- forward: -----------\n");
 *           printEachForward(boundary, stringToExamine);
 *           printf("----- backward: ----------\n");
 *           printEachBackward(boundary, stringToExamine);
 *           //print first element
 *           printf("----- first: -------------\n");
 *           printFirst(boundary, stringToExamine);
 *           //print last element
 *           printf("----- last: --------------\n");
 *           printLast(boundary, stringToExamine);
 *           //print word at charpos 10
 *           printf("----- at pos 10: ---------\n");
 *           printAt(boundary, 10 , stringToExamine);
 *
 *           ubrk_close(boundary);
 *       }
 * \endcode
 * </pre>
 */

/** The possible types of text boundaries.  @stable ICU 2.0 */
typedef enum UBreakIteratorType {
  /** Character breaks  @stable ICU 2.0 */
  UBRK_CHARACTER,
  /** Word breaks @stable ICU 2.0 */
  UBRK_WORD,
  /** Line breaks @stable ICU 2.0 */
  UBRK_LINE,
  /** Sentence breaks @stable ICU 2.0 */
  UBRK_SENTENCE,
  /** 
   * Title Case breaks 
   * The iterator created using this type locates title boundaries as described for 
   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
   * please use Word Boundary iterator.  @draft ICU 2.2
   *
   */
  UBRK_TITLE
} UBreakIteratorType;

/** Value indicating all text boundaries have been returned.
 *  @stable ICU 2.0 
 */
#define UBRK_DONE ((int32_t) -1)


/**
 *  Enum constants for the word break tags returned by
 *  getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  @draft ICU 2.2
*/
typedef enum UWordBreak {
    /** Tag value for "words" that do not fit into any of other categories. 
     *  Includes spaces and most punctuation. */
    UBRK_WORD_NONE           = 0,
    /** Upper bound for tags for uncategorized words. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Tag value for words that appear to be numbers, lower limit.    */
    UBRK_WORD_NUMBER         = 100,
    /** Tag value for words that appear to be numbers, upper limit.    */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Tag value for words that contain letters, excluding
     *  hiragana, katakana or ideographic characters, lower limit.    */
    UBRK_WORD_LETTER         = 200,
    /** Tag value for words containing letters, upper limit  */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Tag value for words containing kana characters, lower limit */
    UBRK_WORD_KANA           = 300,
    /** Tag value for words containing kana characters, upper limit */
    UBRK_WORD_KANA_LIMIT     = 400,
    /** Tag value for words containing ideographic characters, lower limit */
    UBRK_WORD_IDEO           = 400,
    /** Tag value for words containing ideographic characters, upper limit */
    UBRK_WORD_IDEO_LIMIT     = 500
} UWordBreak;


/**
 * Open a new UBreakIterator for locating text boundaries for a specified locale.
 * A UBreakIterator may be used for detecting character, line, word,
 * and sentence breaks in text.
 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
 * UBRK_LINE, UBRK_SENTENCE
 * @param locale The locale specifying the text-breaking conventions.
 * @param text The text to be iterated over.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified locale.
 * @see ubrk_openRules
 * @stable ICU 2.0
 */
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
      const char *locale,
      const UChar *text,
      int32_t textLength,
      UErrorCode *status);

/**
 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
 * The rule syntax is ... (TBD)
 * @param rules A set of rules specifying the text breaking conventions.
 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
 *        used to specify the text to be iterated.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param parseErr   Receives position and context information for any syntax errors
 *                   detected while parsing the rules.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @draft ICU 2.2
 */
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(const UChar     *rules,
               int32_t         rulesLength,
               const UChar     *text,
               int32_t          textLength,
               UParseError     *parseErr,
               UErrorCode      *status);

/**
 * Thread safe cloning operation
 * @param bi iterator to be cloned
 * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
 *  If buffer is not large enough, new memory will be allocated.
 *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
 * @param pBufferSize pointer to size of allocated space.
 *  If *pBufferSize == 0, a sufficient size for use in cloning will
 *  be returned ('pre-flighting')
 *  If *pBufferSize is not enough for a stack-based safe clone,
 *  new memory will be allocated.
 * @param status to indicate whether the operation went on smoothly or there were errors
 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
 * @return pointer to the new clone
 * @stable ICU 2.0
 */
U_CAPI UBreakIterator * U_EXPORT2
ubrk_safeClone(
          const UBreakIterator *bi,
          void *stackBuffer,
          int32_t *pBufferSize,
          UErrorCode *status);

/**
  * A recommended size (in bytes) for the memory buffer to be passed to ubrk_saveClone().
  * @stable ICU 2.0
  */
#define U_BRK_SAFECLONE_BUFFERSIZE 512

/**
* Close a UBreakIterator.
* Once closed, a UBreakIterator may no longer be used.
* @param bi The break iterator to close.
 * @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi);

/**
 * Sets an existing iterator to point to a new piece of text
 * @param bi The iterator to use
 * @param text The text to be set
 * @param textLength The length of the text
 * @param status The error code
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status);

/**
 * Determine the most recently-returned text boundary.
 *
 * @param bi The break iterator to use.
 * @return The character index most recently returned by \Ref{ubrk_next}, \Ref{ubrk_previous},
 * \Ref{ubrk_first}, or \Ref{ubrk_last}.
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi);

/**
 * Determine the text boundary following the current text boundary.
 *
 * @param bi The break iterator to use.
 * @return The character index of the next text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_previous
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the current text boundary.
 *
 * @param bi The break iterator to use.
 * @return The character index of the preceding text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_next
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi);

/**
 * Determine the index of the first character in the text being scanned.
 * This is not always the same as index 0 of the text.
 * @param bi The break iterator to use.
 * @return The character index of the first character in the text being scanned.
 * @see ubrk_last
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi);

/**
 * Determine the index immediately <EM>beyond</EM> the last character in the text being
 * scanned.
 * This is not the same as the last character.
 * @param bi The break iterator to use.
 * @return The character offset immediately <EM>beyond</EM> the last character in the
 * text being scanned.
 * @see ubrk_first
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the specified offset.
 * The value returned is always smaller than offset, or UBRK_DONE.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary preceding offset, or UBRK_DONE.
 * @see ubrk_following
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
           int32_t offset);

/**
 * Determine the text boundary following the specified offset.
 * The value returned is always greater than offset, or UBRK_DONE.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary following offset, or UBRK_DONE.
 * @see ubrk_preceding
 * @stable ICU 2.0
 */
U_CAPI int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
           int32_t offset);

/**
* Get a locale for which text breaking information is available.
* A UBreakIterator in a locale returned by this function will perform the correct
* text breaking for the locale.
* @param index The index of the desired locale.
* @return A locale for which number text breaking information is available, or 0 if none.
* @see ubrk_countAvailable
* @stable ICU 2.0
*/
U_CAPI const char* U_EXPORT2
ubrk_getAvailable(int32_t index);

/**
* Determine how many locales have text breaking information available.
* This function is most useful as determining the loop ending condition for
* calls to \Ref{ubrk_getAvailable}.
* @return The number of locales for which text breaking information is available.
* @see ubrk_getAvailable
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_countAvailable(void);


/**
* Returns true if the specfied position is a boundary position.  As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param bi The break iterator to use.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
U_CAPI  UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);

/**
 * Return the status from the break rule that determined the most recently
 * returned break position.  The values appear in the rule source
 * within brackets, {123}, for example.  For rules that do not specify a
 * status, a default value of 0 is returned.
 * <p>
 * For word break iterators, the possible values are defined in enum UWordBreak.
 * @draft ICU 2.2
 */
U_CAPI  int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif

--- NEW FILE: ucat.h ---
/*
**********************************************************************
* Copyright (c) 2003, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: March 19 2003
* Since: ICU 2.6
**********************************************************************
*/
#ifndef UCAT_H
#define UCAT_H

#include "unicode/utypes.h"
#include "unicode/ures.h"

/**
 * \file
 * \brief C API: Message Catalog Wrappers
 *
 * This C API provides look-alike functions that deliberately resemble
 * the POSIX catopen, catclose, and catgets functions.  The underlying
 * implementation is in terms of ICU resource bundles, rather than
 * POSIX message catalogs.
 *
 * The ICU resource bundles obey standard ICU inheritance policies.
 * To facilitate this, sets and messages are flattened into one tier.
 * This is done by creating resource bundle keys of the form
 * <set_num>%<msg_num> where set_num is the set number and msg_num is
 * the message number, formatted as decimal strings.
 *
 * Example:  Consider a message catalog containing two sets:
 *
 * Set 1: Message 4  = "Good morning."
 *        Message 5  = "Good afternoon."
 *        Message 7  = "Good evening."
 *        Message 8  = "Good night."
 * Set 4: Message 14 = "Please "
 *        Message 19 = "Thank you."
 *        Message 20 = "Sincerely,"
 *
 * The ICU resource bundle source file would, assuming it is named
 * "greet.txt", would look like this:
 *
 * greet
 * {
 *     1%4  { "Good morning." }
 *     1%5  { "Good afternoon." }
 *     1%7  { "Good evening." }
 *     1%8  { "Good night." }
 * 
 *     4%14 { "Please " }
 *     4%19 { "Thank you." }
 *     4%20 { "Sincerely," }
 * }
 *
 * The catgets function is commonly used in combination with functions
 * like printf and strftime.  ICU components like message format can
 * be used instead, although they use a different format syntax.
 * There is an unsupported ICU package, ustdio, that provides some of
 * the POSIX-style formatting API.
 */

U_CDECL_BEGIN

/**
 * An ICU message catalog descriptor, analogous to nl_catd.
 * 
 * @draft ICU 2.6
 */
typedef UResourceBundle* u_nl_catd;

/**
 * Open and return an ICU message catalog descriptor. The descriptor
 * may be passed to u_catgets() to retrieve localized strings.
 *
 * @param name string containing the full path pointing to the
 * directory where the resources reside followed by the package name
 * e.g. "/usr/resource/my_app/resources/guimessages" on a Unix system.
 * If NULL, ICU default data files will be used.
 *
 * Unlike POSIX, environment variables are not interpolated within the
 * name.
 *
 * @param locale the locale for which we want to open the resource. If
 * NULL, the default ICU locale will be used (see uloc_getDefault). If
 * strlen(locale) == 0, the root locale will be used.
 *
 * @param ec input/output error code. Upon output,
 * U_USING_FALLBACK_WARNING indicates that a fallback locale was
 * used. For example, 'de_CH' was requested, but nothing was found
 * there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that the
 * default locale data or root locale data was used; neither the
 * requested locale nor any of its fallback locales were found.
 *
 * @return a message catalog descriptor that may be passed to
 * u_catgets(). If the ec parameter indicates success, then the caller
 * is responsible for calling u_catclose() to close the message
 * catalog. If the ec parameter indicates failure, then NULL will be
 * returned.
 * 
 * @draft ICU 2.6
 */
U_CAPI u_nl_catd U_EXPORT2
u_catopen(const char* name, const char* locale, UErrorCode* ec);

/**
 * Close an ICU message catalog, given its descriptor.
 *
 * @param catd a message catalog descriptor to be closed. May be NULL,
 * in which case no action is taken.
 * 
 * @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
u_catclose(u_nl_catd catd);

/**
 * Retrieve a localized string from an ICU message catalog.
 *
 * @param catd a message catalog descriptor returned by u_catopen.
 *
 * @param set_num the message catalog set number. Sets need not be
 * numbered consecutively.
 *
 * @param msg_num the message catalog message number within the
 * set. Messages need not be numbered consecutively.
 *
 * @param s the default string. This is returned if the string
 * specified by the set_num and msg_num is not found. It must be
 * zero-terminated.
 *
 * @param len fill-in parameter to receive the length of the result.
 * May be NULL, in which case it is ignored.
 *
 * @param ec input/output error code. May be U_USING_FALLBACK_WARNING
 * or U_USING_DEFAULT_WARNING. U_MISSING_RESOURCE_ERROR indicates that
 * the set_num/msg_num tuple does not specify a valid message string
 * in this catalog.
 *
 * @return a pointer to a zero-terminated UChar array which lives in
 * an internal buffer area, typically a memory mapped/DLL file. The
 * caller must NOT delete this pointer. If the call is unsuccessful
 * for any reason, then s is returned.  This includes the situation in
 * which ec indicates a failing error code upon entry to this
 * function.
 * 
 * @draft ICU 2.6
 */
U_CAPI const UChar* U_EXPORT2
u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num,
          const UChar* s,
          int32_t* len, UErrorCode* ec);

U_CDECL_END

#endif /*UCAT_H*/
/*eof*/







--- NEW FILE: uconfig.h ---
/*  
**********************************************************************
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  uconfig.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002sep19
*   created by: Markus W. Scherer
*/

#ifndef __UCONFIG_H__
#define __UCONFIG_H__

/*!
 * \file
 * \brief Switches for excluding parts of ICU library code modules.
 *
 * Allows to build partial, smaller libraries for special purposes.
 * By default, all modules are built.
 * The switches are fairly coarse, controlling large modules.
 * Basic services cannot be turned off.
 *
 * @draft ICU 2.4
 */

/**
 * \def UCONFIG_ONLY_COLLATION
 * This switch turns off modules that are not needed for collation.
 *
 * It does not turn off legacy conversion because that is necessary
 * for ICU to work on EBCDIC platforms (for the default converter).
 * If you want "only collation" and do not build for EBCDIC,
 * then you can #define UCONFIG_NO_LEGACY_CONVERSION 1 as well.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_ONLY_COLLATION
#   define UCONFIG_ONLY_COLLATION 0
#endif

#if UCONFIG_ONLY_COLLATION
    /* common library */
#   define UCONFIG_NO_BREAK_ITERATION 1
#   define UCONFIG_NO_IDNA 1

    /* i18n library */
#   if UCONFIG_NO_COLLATION
#       error Contradictory collation switches in uconfig.h.
#   endif
#   define UCONFIG_NO_FORMATTING 1
#   define UCONFIG_NO_TRANSLITERATION 1
#   define UCONFIG_NO_REGULAR_EXPRESSIONS 1
#endif

/* common library switches -------------------------------------------------- */

/**
 * \def UCONFIG_NO_LEGACY_CONVERSION
 * This switch turns off all converters except for
 * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1)
 * - US-ASCII
 * - ISO-8859-1
 *
 * Turning off legacy conversion is not possible on EBCDIC platforms
 * because they need ibm-37 or ibm-1047 default converters.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_LEGACY_CONVERSION
#   define UCONFIG_NO_LEGACY_CONVERSION 0
#endif

/**
 * \def UCONFIG_NO_NORMALIZATION
 * This switch turns off normalization.
 * It implies turning off several other services as well, for example
 * collation and IDNA.
 *
 * @draft ICU 2.6
 */
#ifndef UCONFIG_NO_NORMALIZATION
#   define UCONFIG_NO_NORMALIZATION 0
#elif UCONFIG_NO_NORMALIZATION
    /* common library */
#   define UCONFIG_NO_IDNA 1

    /* i18n library */
#   if UCONFIG_ONLY_COLLATION
#       error Contradictory collation switches in uconfig.h.
#   endif
#   define UCONFIG_NO_COLLATION 1
#   define UCONFIG_NO_TRANSLITERATION 1
#endif

/**
 * \def UCONFIG_NO_BREAK_ITERATION
 * This switch turns off break iteration.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_BREAK_ITERATION
#   define UCONFIG_NO_BREAK_ITERATION 0
#endif

/**
 * \def UCONFIG_NO_IDNA
 * This switch turns off IDNA.
 *
 * @draft ICU 2.6
 */
#ifndef UCONFIG_NO_IDNA
#   define UCONFIG_NO_IDNA 0
#endif

/* i18n library switches ---------------------------------------------------- */

/**
 * \def 
 * This switch turns off collation and collation-based string search.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_COLLATION
#   define UCONFIG_NO_COLLATION 0
#endif

/**
 * \def UCONFIG_NO_FORMATTING
 * This switch turns off formatting and calendar/timezone services.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_FORMATTING
#   define UCONFIG_NO_FORMATTING 0
#endif

/**
 * \def UCONFIG_NO_TRANSLITERATION
 * This switch turns off transliteration.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_TRANSLITERATION
#   define UCONFIG_NO_TRANSLITERATION 0
#endif

/**
 * \def UCONFIG_NO_REGULAR_EXPRESSIONS
 * This switch turns off regular expressions.
 *
 * @draft ICU 2.4
 */
#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS
#   define UCONFIG_NO_REGULAR_EXPRESSIONS 0
#endif



#endif


--- NEW FILE: uenum.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uenum.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:2
*
*   created on: 2002jul08
*   created by: Vladimir Weinstein
*/

#ifndef __UENUM_H
#define __UENUM_H

#include "unicode/utypes.h"

/**
 * An enumeration object.
 * For usage in C programs.
 * @draft ICU 2.2
 */
struct UEnumeration;
/** structure representing an enumeration object instance @draft ICU 2.2 */
typedef struct UEnumeration UEnumeration;

/**
 * Disposes of resources in use by the iterator.  If en is NULL,
 * does nothing.  After this call, any char* or UChar* pointer
 * returned by uenum_unext() or uenum_next() is invalid.
 * @param en UEnumeration structure pointer
 * @draft ICU 2.2
 */
U_CAPI void U_EXPORT2
uenum_close(UEnumeration* en);

/**
 * Returns the number of elements that the iterator traverses.  If
 * the iterator is out-of-sync with its service, status is set to
 * U_ENUM_OUT_OF_SYNC_ERROR.
 * This is a convenience function. It can end up being very
 * expensive as all the items might have to be pre-fetched (depending
 * on the type of data being traversed). Use with caution and only 
 * when necessary.
 * @param en UEnumeration structure pointer
 * @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the
 *               iterator is out of sync.
 * @return number of elements in the iterator
 * @draft ICU 2.2
 */
U_CAPI int32_t U_EXPORT2
uenum_count(UEnumeration* en, UErrorCode* status);

/**
 * Returns the next element in the iterator's list.  If there are
 * no more elements, returns NULL.  If the iterator is out-of-sync
 * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
 * NULL is returned.  If the native service string is a char* string,
 * it is converted to UChar* with the invariant converter.
 * The result is terminated by (UChar)0.
 * @param en the iterator object
 * @param resultLength pointer to receive the length of the result
 *                     (not including the terminating \0).
 *                     If the pointer is NULL it is ignored.
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service.
 * @return a pointer to the string.  The string will be
 *         zero-terminated.  The return pointer is owned by this iterator
 *         and must not be deleted by the caller.  The pointer is valid
 *         until the next call to any uenum_... method, including
 *         uenum_next() or uenum_unext().  When all strings have been
 *         traversed, returns NULL.
 * @draft ICU 2.2
 */
U_CAPI const UChar* U_EXPORT2
uenum_unext(UEnumeration* en,
            int32_t* resultLength,
            UErrorCode* status);

/**
 * Returns the next element in the iterator's list.  If there are
 * no more elements, returns NULL.  If the iterator is out-of-sync
 * with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
 * NULL is returned.  If the native service string is a UChar*
 * string, it is converted to char* with the invariant converter.
 * The result is terminated by (char)0.  If the conversion fails
 * (because a character cannot be converted) then status is set to
 * U_INVARIANT_CONVERSION_ERROR and the return value is undefined
 * (but non-NULL).
 * @param en the iterator object
 * @param resultLength pointer to receive the length of the result
 *                     (not including the terminating \0).
 *                     If the pointer is NULL it is ignored.
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service.  Set to
 *               U_INVARIANT_CONVERSION_ERROR if the underlying native string is
 *               UChar* and conversion to char* with the invariant converter
 *               fails. This error pertains only to current string, so iteration
 *               might be able to continue successfully.
 * @return a pointer to the string.  The string will be
 *         zero-terminated.  The return pointer is owned by this iterator
 *         and must not be deleted by the caller.  The pointer is valid
 *         until the next call to any uenum_... method, including
 *         uenum_next() or uenum_unext().  When all strings have been
 *         traversed, returns NULL.
 * @draft ICU 2.2
 */
U_CAPI const char* U_EXPORT2
uenum_next(UEnumeration* en,
           int32_t* resultLength,
           UErrorCode* status);

/**
 * Resets the iterator to the current list of service IDs.  This
 * re-establishes sync with the service and rewinds the iterator
 * to start at the first element.
 * @param en the iterator object
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service.  
 * @draft ICU 2.2
 */
U_CAPI void U_EXPORT2
uenum_reset(UEnumeration* en, UErrorCode* status);

#endif

--- NEW FILE: uidna.h ---
/*
 *******************************************************************************
 *
 *   Copyright (C) 2003, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  uidna.h
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2003feb1
 *   created by: Ram Viswanadha
 */

#ifndef __UIDNA_H__
#define __UIDNA_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/parseerr.h"
  
/**
 *\file
 * UIDNA API implements the IDNA protocol as defined in the IDNA RFC 
 * (http://www.ietf.org/rfc/rfc3490.txt).
 * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels 
 * containing non-ASCII code points are required to be processed by
 * ToASCII operation before passing it to resolver libraries. Domain names
 * that are obtained from resolver libraries are required to be processed by
 * ToUnicode operation before displaying the domain name to the user.
 * IDNA requires that implementations process input strings with Nameprep
 * (http://www.ietf.org/rfc/rfc3491.txt), 
 * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt), 
 * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt). 
 * Implementations of IDNA MUST fully implement Nameprep and Punycode; 
 * neither Nameprep nor Punycode are optional.
 * The input and output of ToASCII and ToUnicode operations are Unicode 
 * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
 * multiple times to an input string will yield the same result as applying the operation
 * once.
 * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) 
 * ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string).
 *\end_file
 */

/** 
 * Option to prohibit processing of unassigned codepoints in the input and
 * do not check if the input conforms to STD-3 ASCII rules.
 * 
 * @see  uidna_toASCII uidna_toUnicode
 * @draft ICU 2.6
 */
#define UIDNA_DEFAULT          0x0000
/** 
 * Option to allow processing of unassigned codepoints in the input
 * 
 * @see  uidna_toASCII uidna_toUnicode
 * @draft ICU 2.6
 */
#define UIDNA_ALLOW_UNASSIGNED 0x0001
/** 
 * Option to check if input conforms to STD-3 ASCII rules
 * 
 * @see  uidna_toASCII uidna_toUnicode
 * @draft ICU 2.6
 */
#define UIDNA_USE_STD3_RULES   0x0002
    
/**
 * This function implements the ToASCII operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * ASCII names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g." "www.example.com" is composed of 3 labels 
 * "www","example", and "com".
 *
 *
 * @param src               Input UChar array containing label in Unicode.
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
 * @param dest              Output UChar array with ASCII (ACE encoded) label.
 * @param destCapacity      Size of dest.
 * @param options           A bit set of options:
 *
 *  - UIDNA_DEFAULT         Use default options, i.e., do not process unassigned code points
 *                          and do not use STD3 ASCII rules
 *                          If unassigned code points are found the operation fails with 
 *                          U_UNASSIGNED_CODE_POINT_FOUND error code.
 *
 *  - UIDNA_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
 *                          If this option is set, the unassigned code points are in the input 
 *                          are treated as normal Unicode code points.
 *                          
 *  - UIDNA_USE_STD3_RULES  Use STD3 ASCII rules for host name syntax restrictions
 *                          If this option is set and the input does not satisfy STD3 rules,  
 *                          the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
 *
 * @param parseError        Pointer to UParseError struct to receive information on position 
 *                          of error if an error is encountered. Can be NULL.
 * @param status            ICU in/out error code parameter.
 *                          U_INVALID_CHAR_FOUND if src contains
 *                          unmatched single surrogates.
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
 *                          too many code points.
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
 * @return                  Number of ASCII characters converted.
 * @draft ICU 2.6
 */
U_CAPI int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength, 
              UChar* dest, int32_t destCapacity,
              int32_t options,
              UParseError* parseError,
              UErrorCode* status);


/**
 * This function implements the ToUnicode operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * Unicode names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; for e.g." "www.example.com" is composed of 3 labels 
 * "www","example", and "com".
 *
 * @param src               Input UChar array containing ASCII (ACE encoded) label.
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
 * @param dest Output       Converted UChar array containing Unicode equivalent of label.
 * @param destCapacity      Size of dest.
 * @param options           A bit set of options:
 *  
 *  - UIDNA_DEFAULT         Use default options, i.e., do not process unassigned code points
 *                          and do not use STD3 ASCII rules
 *                          If unassigned code points are found the operation fails with 
 *                          U_UNASSIGNED_CODE_POINT_FOUND error code.
 *
 *  - UIDNA_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
 *                          If this option is set, the unassigned code points are in the input 
 *                          are treated as normal Unicode code points. <b> Note: </b> This option is 
 *                          required on toUnicode operation because the RFC mandates 
 *                          verification of decoded ACE input by applying toASCII and comparing
 *                          its output with source
 *
 *                          
 *                          
 *  - UIDNA_USE_STD3_RULES  Use STD3 ASCII rules for host name syntax restrictions
 *                          If this option is set and the input does not satisfy STD3 rules,  
 *                          the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
 *
 * @param parseError        Pointer to UParseError struct to receive information on position 
 *                          of error if an error is encountered. Can be NULL.
 * @param status            ICU in/out error code parameter.
 *                          U_INVALID_CHAR_FOUND if src contains
 *                          unmatched single surrogates.
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
 *                          too many code points.
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
 * @return                  Number of Unicode characters converted.
 * @draft ICU 2.6
 */
U_CAPI int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
                UChar* dest, int32_t destCapacity,
                int32_t options,
                UParseError* parseError,
                UErrorCode* status);


/**
 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g: "www.example.com". 
 * It is important to note that this operation can fail. If it fails, then the input 
 * domain name cannot be used as an Internationalized Domain Name and the application
 * should have methods defined to deal with the failure.
 * 
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
 * and then convert. This function does not offer that level of granularity. The options once  
 * set will apply to all labels in the domain name
 *
 * @param src               Input UChar array containing IDN in Unicode.
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
 * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
 * @param destCapacity      Size of dest.
 * @param options           A bit set of options:
 *  
 *  - UIDNA_DEFAULT         Use default options, i.e., do not process unassigned code points
 *                          and do not use STD3 ASCII rules
 *                          If unassigned code points are found the operation fails with 
 *                          U_UNASSIGNED_CODE_POINT_FOUND error code.
 *
 *  - UIDNA_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
 *                          If this option is set, the unassigned code points are in the input 
 *                          are treated as normal Unicode code points.
 *                          
 *  - UIDNA_USE_STD3_RULES  Use STD3 ASCII rules for host name syntax restrictions
 *                          If this option is set and the input does not satisfy STD3 rules,  
 *                          the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
 * 
 * @param parseError        Pointer to UParseError struct to receive information on position 
 *                          of error if an error is encountered. Can be NULL.
 * @param status            ICU in/out error code parameter.
 *                          U_INVALID_CHAR_FOUND if src contains
 *                          unmatched single surrogates.
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
 *                          too many code points.
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
 * @return                  Number of ASCII characters converted.
 * @draft ICU 2.6
 */
U_CAPI int32_t U_EXPORT2
uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
                   UChar* dest, int32_t destCapacity,
                   int32_t options,
                   UParseError* parseError,
                   UErrorCode* status);

/**
 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g: "www.example.com". 
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
 * and then convert. This function does not offer that level of granularity. The options once  
 * set will apply to all labels in the domain name
 *
 * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
 * @param dest Output       UChar array containing Unicode equivalent of source IDN.
 * @param destCapacity      Size of dest.
 * @param options           A bit set of options:
 *  
 *  - UIDNA_DEFAULT         Use default options, i.e., do not process unassigned code points
 *                          and do not use STD3 ASCII rules
 *                          If unassigned code points are found the operation fails with 
 *                          U_UNASSIGNED_CODE_POINT_FOUND error code.
 *
 *  - UIDNA_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
 *                          If this option is set, the unassigned code points are in the input 
 *                          are treated as normal Unicode code points.
 *                          
 *  - UIDNA_USE_STD3_RULES  Use STD3 ASCII rules for host name syntax restrictions
 *                          If this option is set and the input does not satisfy STD3 rules,  
 *                          the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
 *
 * @param parseError        Pointer to UParseError struct to receive information on position 
 *                          of error if an error is encountered. Can be NULL.
 * @param status            ICU in/out error code parameter.
 *                          U_INVALID_CHAR_FOUND if src contains
 *                          unmatched single surrogates.
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
 *                          too many code points.
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
 * @return                  Number of ASCII characters converted.
 * @draft ICU 2.6
 */
U_CAPI int32_t U_EXPORT2
uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
                     UChar* dest, int32_t destCapacity,
                     int32_t options,
                     UParseError* parseError,
                     UErrorCode* status);

/**
 * Compare two strings for IDNs for equivalence.
 * This function splits the domain names into labels and compares them.
 * According to IDN RFC, whenever two labels are compared, they are 
 * considered equal if and only if their ASCII forms (obtained by 
 * applying toASCII) match using an case-insensitive ASCII comparison.
 * Two domain names are considered a match if and only if all labels 
 * match regardless of whether label separators match.
 *
 * @param s1                First source string.
 * @param length1           Length of first source string, or -1 if NUL-terminated.
 *
 * @param s2                Second source string.
 * @param length2           Length of second source string, or -1 if NUL-terminated.
 * @param options           A bit set of options:
 *  
 *  - UIDNA_DEFAULT         Use default options, i.e., do not process unassigned code points
 *                          and do not use STD3 ASCII rules
 *                          If unassigned code points are found the operation fails with 
 *                          U_UNASSIGNED_CODE_POINT_FOUND error code.
 *
 *  - UIDNA_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
 *                          If this option is set, the unassigned code points are in the input 
 *                          are treated as normal Unicode code points.
 *                          
 *  - UIDNA_USE_STD3_RULES  Use STD3 ASCII rules for host name syntax restrictions
 *                          If this option is set and the input does not satisfy STD3 rules,  
 *                          the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
 *
 * @param status            ICU error code in/out parameter.
 *                          Must fulfill U_SUCCESS before the function call.
 * @return <0 or 0 or >0 as usual for string comparisons
 * @draft ICU 2.6
 */
U_CAPI int32_t U_EXPORT2
uidna_compare(  const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                int32_t options,
                UErrorCode* status);

#endif /* #if !UCONFIG_NO_IDNA */

#endif

--- NEW FILE: uiter.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uiter.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan18
*   created by: Markus W. Scherer
*/

#ifndef __UITER_H__
#define __UITER_H__

/**
 * \file
 * \brief C API: Unicode Character Iteration
 *
 * @see UCharIterator
 */

#include "unicode/utypes.h"

#ifdef XP_CPLUSPLUS
    U_NAMESPACE_BEGIN

    class CharacterIterator;
    class Replaceable;

    U_NAMESPACE_END
#endif

U_CDECL_BEGIN

struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */

/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef enum UCharIteratorOrigin {
    UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH
} UCharIteratorOrigin;

/** Constants for UCharIterator. @draft ICU 2.6 */
enum {
    /**
     * Constant value that may be returned by UCharIteratorMove
     * indicating that the final UTF-16 index is not known, but that the move succeeded.
     * This can occur when moving relative to limit or length, or
     * when moving relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * It would be very inefficient to have to count from the beginning of the text
     * just to get the current/limit/length index after moving relative to it.
     * The actual index can be determined with getIndex(UITER_CURRENT)
     * which will count the UChars if necessary.
     *
     * @draft ICU 2.6
     */
    UITER_UNKNOWN_INDEX=-2
};

/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 *
 * @draft ICU 2.6
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)

/**
 * Function type declaration for UCharIterator.getIndex().
 *
 * Gets the current position, or the start or limit of the
 * iteration range.
 *
 * This function may perform slowly for UITER_CURRENT after setState() was called,
 * or for UITER_LENGTH, because an iterator implementation may have to count
 * UChars if the underlying storage is not UTF-16.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param origin get the 0, start, limit, length, or current index
 * @return the requested index, or U_SENTINEL in an error condition
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);

/**
 * Function type declaration for UCharIterator.move().
 *
 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
 *
 * Moves the current position relative to the start or limit of the
 * iteration range, or relative to the current position itself.
 * The movement is expressed in numbers of code units forward
 * or backward by specifying a positive or negative delta.
 * Out of bounds movement will be pinned to the start or limit.
 *
 * This function may perform slowly for moving relative to UITER_LENGTH
 * because an iterator implementation may have to count the rest of the
 * UChars if the native storage is not UTF-16.
 *
 * When moving relative to the limit or length, or
 * relative to the current position after setState() was called,
 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
 * determination of the actual UTF-16 index.
 * The actual index can be determined with getIndex(UITER_CURRENT)
 * which will count the UChars if necessary.
 * See UITER_UNKNOWN_INDEX for details.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param delta can be positive, zero, or negative
 * @param origin move relative to the 0, start, limit, length, or current index
 * @return the new index, or U_SENTINEL on an error condition,
 *         or UITER_UNKNOWN_INDEX when the index is not known.
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @see UITER_UNKNOWN_INDEX
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);

/**
 * Function type declaration for UCharIterator.hasNext().
 *
 * Check if current() and next() can still
 * return another code unit.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether current() and next() can still return another code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UBool U_CALLCONV
UCharIteratorHasNext(UCharIterator *iter);

/**
 * Function type declaration for UCharIterator.hasPrevious().
 *
 * Check if previous() can still return another code unit.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether previous() can still return another code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UBool U_CALLCONV
UCharIteratorHasPrevious(UCharIterator *iter);
 
/**
 * Function type declaration for UCharIterator.current().
 *
 * Return the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorCurrent(UCharIterator *iter);

/**
 * Function type declaration for UCharIterator.next().
 *
 * Return the code unit at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit (and post-increment the current index)
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorNext(UCharIterator *iter);

/**
 * Function type declaration for UCharIterator.previous().
 *
 * Decrement the index and return the code unit from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code unit (after pre-decrementing the current index)
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorPrevious(UCharIterator *iter);

/**
 * Function type declaration for UCharIterator.reservedFn().
 * Reserved for future use.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param something some integer argument
 * @return some integer
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorReserved(UCharIterator *iter, int32_t something);

/**
 * Function type declaration for UCharIterator.getState().
 *
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * It is recommended that the state value be calculated to be as small as
 * is feasible. For strings with limited lengths, fewer than 32 bits may
 * be sufficient.
 *
 * This is used together with setState()/UCharIteratorSetState
 * to save and restore the iterator position more efficiently than with
 * getIndex()/move().
 *
 * With some UCharIterator implementations (e.g., UTF-8),
 * getting and setting the UTF-16 index with existing functions
 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
 * relatively slow because the iterator has to "walk" from a known index
 * to the requested one.
 * This takes more time the farther it needs to go.
 *
 * An opaque state value allows an iterator implementation to provide
 * an internal index (UTF-8: the source byte array index) for
 * fast, constant-time restoration.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @see UITER_NO_STATE
 * @draft ICU 2.6
 */
typedef uint32_t U_CALLCONV
UCharIteratorGetState(const UCharIterator *iter);

/**
 * Function type declaration for UCharIterator.setState().
 *
 * Restore the "state" of the iterator using a state word from a getState() call.
 * The iterator object need not be the same one as for which getState() was called,
 * but it must be of the same type (set up using the same uiter_setXYZ function)
 * and it must iterate over the same string
 * (binary identical regardless of memory address).
 * For more about the state word see UCharIteratorGetState.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @draft ICU 2.6
 */
typedef void U_CALLCONV
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);


/**
 * C API for code unit iteration.
 * This can be used as a C wrapper around
 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
 *
 * There are two roles for using UCharIterator:
 *
 * A "provider" sets the necessary function pointers and controls the "protected"
 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
 *
 * Implementations of such C APIs are "callers" of UCharIterator functions;
 * they only use the "public" function pointers and never access the "protected"
 * fields directly.
 *
 * UCharIterator functions return code unit values 0..0xffff,
 * or U_SENTINEL if the iteration bounds are reached.
 *
 * @stable ICU 2.1
 */
struct UCharIterator {
    /**
     * (protected) Pointer to string or wrapped object or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    const void *context;

    /**
     * (protected) Length of string or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t length;

    /**
     * (protected) Start index or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t start;

    /**
     * (protected) Current index or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t index;

    /**
     * (protected) Limit index or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t limit;

    /**
     * (protected) Used by UTF-8 iterators and possibly others.
     * @stable ICU 2.1
     */
    int32_t reservedField;

    /**
     * (public) Returns the current position or the
     * start or limit index of the iteration range.
     *
     * @see UCharIteratorGetIndex
     * @stable ICU 2.1
     */
    UCharIteratorGetIndex *getIndex;

    /**
     * (public) Moves the current position relative to the start or limit of the
     * iteration range, or relative to the current position itself.
     * The movement is expressed in numbers of code units forward
     * or backward by specifying a positive or negative delta.
     *
     * @see UCharIteratorMove
     * @stable ICU 2.1
     */
    UCharIteratorMove *move;

    /**
     * (public) Check if current() and next() can still
     * return another code unit.
     *
     * @see UCharIteratorHasNext
     * @stable ICU 2.1
     */
    UCharIteratorHasNext *hasNext;

    /**
     * (public) Check if previous() can still return another code unit.
     *
     * @see UCharIteratorHasPrevious
     * @stable ICU 2.1
     */
    UCharIteratorHasPrevious *hasPrevious;

    /**
     * (public) Return the code unit at the current position,
     * or U_SENTINEL if there is none (index is at the limit).
     *
     * @see UCharIteratorCurrent
     * @stable ICU 2.1
     */
    UCharIteratorCurrent *current;

    /**
     * (public) Return the code unit at the current index and increment
     * the index (post-increment, like s[i++]),
     * or return U_SENTINEL if there is none (index is at the limit).
     *
     * @see UCharIteratorNext
     * @stable ICU 2.1
     */
    UCharIteratorNext *next;

    /**
     * (public) Decrement the index and return the code unit from there
     * (pre-decrement, like s[--i]),
     * or return U_SENTINEL if there is none (index is at the start).
     *
     * @see UCharIteratorPrevious
     * @stable ICU 2.1
     */
    UCharIteratorPrevious *previous;

    /**
     * (public) Reserved for future use. Currently NULL.
     *
     * @see UCharIteratorReserved
     * @stable ICU 2.1
     */
    UCharIteratorReserved *reservedFn;

    /**
     * (public) Return the state of the iterator, to be restored later with setState().
     * This function pointer is NULL if the iterator does not implement it.
     *
     * @see UCharIteratorGet
     * @draft ICU 2.6
     */
    UCharIteratorGetState *getState;

    /**
     * (public) Restore the iterator state from the state word from a call
     * to getState().
     * This function pointer is NULL if the iterator does not implement it.
     *
     * @see UCharIteratorSet
     * @draft ICU 2.6
     */
    UCharIteratorSetState *setState;
};

/**
 * Helper function for UCharIterator to get the code point
 * at the current index.
 *
 * Return the code point that includes the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 * If the current code unit is a lead or trail surrogate,
 * then the following or preceding surrogate is used to form
 * the code point value.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point
 *
 * @see UCharIterator
 * @see U16_GET
 * @see UnicodeString::char32At()
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter);

/**
 * Helper function for UCharIterator to get the next code point.
 *
 * Return the code point at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point (and post-increment the current index)
 *
 * @see UCharIterator
 * @see U16_NEXT
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter);

/**
 * Helper function for UCharIterator to get the previous code point.
 *
 * Decrement the index and return the code point from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code point (after pre-decrementing the current index)
 *
 * @see UCharIterator
 * @see U16_PREV
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter);

/**
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * This is a convenience function that calls iter->getState(iter)
 * if iter->getState is not NULL;
 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @see UITER_NO_STATE
 * @draft ICU 2.6
 */
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter);

/**
 * Restore the "state" of the iterator using a state word from a getState() call.
 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);

/**
 * Set up a UCharIterator to iterate over a string.
 *
 * Sets the UCharIterator function pointers for iteration over the string s
 * with iteration boundaries start=index=0 and length=limit=string length.
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length.
 * The length field will be ignored.
 *
 * The string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s String to iterate over
 * @param length Length of s, or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);

/**
 * Set up a UCharIterator to iterate over a UTF-16BE string
 * (byte vector with a big-endian pair of bytes per UChar).
 *
 * Everything works just like with a normal UChar iterator (uiter_setString),
 * except that UChars are assembled from byte pairs,
 * and that the length argument here indicates an even number of bytes.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-16BE string to iterate over
 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
 *               (NUL means pair of 0 bytes at even index from s)
 *
 * @see UCharIterator
 * @see uiter_setString
 * @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);

/**
 * Set up a UCharIterator to iterate over a UTF-8 string.
 *
 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
 * with UTF-8 iteration boundaries 0 and length.
 * The implementation counts the UTF-16 index on the fly and
 * lazily evaluates the UTF-16 length of the text.
 *
 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
 * When the reservedField is not 0, then it contains a supplementary code point
 * and the UTF-16 index is between the two corresponding surrogates.
 * At that point, the UTF-8 index is behind that code point.
 *
 * The UTF-8 string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() returns a state value consisting of
 * - the current UTF-8 source byte index (bits 31..1)
 * - a flag (bit 0) that indicates whether the UChar position is in the middle
 *   of a surrogate pair
 *   (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
 *
 * getState() cannot also encode the UTF-16 index in the state value.
 * move(relative to limit or length), or
 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-8 string to iterate over
 * @param length Length of s in bytes, or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);

#ifdef XP_CPLUSPLUS

/**
 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
 *
 * Sets the UCharIterator function pointers for iteration using the
 * CharacterIterator charIter.
 *
 * The CharacterIterator pointer charIter is set into UCharIterator.context
 * without copying or cloning the CharacterIterator object.
 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
 * The iteration index and boundaries are controlled by the CharacterIterator.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param charIter CharacterIterator to wrap
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter);

/**
 * Set up a UCharIterator to iterate over a C++ Replaceable.
 *
 * Sets the UCharIterator function pointers for iteration over the
 * Replaceable rep with iteration boundaries start=index=0 and
 * length=limit=rep->length().
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length=rep->length().
 * The length field will be ignored.
 *
 * The Replaceable pointer rep is set into UCharIterator.context without copying
 * or cloning/reallocating the Replaceable object.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param rep Replaceable to iterate over
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep);

#endif

U_CDECL_END

#endif




--- NEW FILE: unifilt.h ---
/*
* Copyright (C) 1999, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef UNIFILT_H
#define UNIFILT_H

#include "unicode/unifunct.h"
#include "unicode/unimatch.h"

U_NAMESPACE_BEGIN

/**
 * U_ETHER is used to represent character values for positions outside
 * a range.  For example, transliterator uses this to represent
 * characters outside the range contextStart..contextLimit-1.  This
 * allows explicit matching by rules and UnicodeSets of text outside a
 * defined range.
 */
#define U_ETHER ((UChar)0xFFFF)

/**
 * <code>UnicodeFilter</code> defines a protocol for selecting a
 * subset of the full range (U+0000 to U+10FFFF) of Unicode characters.
 * Currently, filters are used in conjunction with classes like {@link
 * Transliterator} to only process selected characters through a
 * transformation.
 *
 * <p>Note: UnicodeFilter currently stubs out two pure virtual methods
 * of its base class, UnicodeMatcher.  These methods are toPattern()
 * and matchesIndexValue().  This is done so that filter classes that
 * are not actually used as matchers -- specifically, those in the
 * UnicodeFilterLogic component, and those in tests -- can continue to
 * work without defining these methods.  As long as a filter is not
 * used in an RBT during real transliteration, these methods will not
 * be called.  However, this breaks the UnicodeMatcher base class
 * protocol, and it is not a correct solution.
 *
 * <p>In the future we may revisit the UnicodeMatcher / UnicodeFilter
 * hierarchy and either redesign it, or simply remove the stubs in
 * UnicodeFilter and force subclasses to implement the full
 * UnicodeMatcher protocol.
 *
 * @see UnicodeFilterLogic
 * @stable ICU 2.0
 */
class U_COMMON_API UnicodeFilter : public UnicodeFunctor, public UnicodeMatcher {

public:
    /**
     * Destructor
     * @stable ICU 2.0
     */
    virtual ~UnicodeFilter();

    /**
     * Returns <tt>true</tt> for characters that are in the selected
     * subset.  In other words, if a character is <b>to be
     * filtered</b>, then <tt>contains()</tt> returns
     * <b><tt>false</tt></b>.
     * @stable ICU 2.0
     */
    virtual UBool contains(UChar32 c) const = 0;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     * and return the pointer.
     * @draft ICU 2.4
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * Implement UnicodeMatcher API.
     * @draft ICU 2.4
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental);

    /**
     * UnicodeFunctor API.  Nothing to do.
     * @draft ICU 2.4
     */
    virtual void setData(const TransliterationRuleData*) {}

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const = 0;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

protected:

    /**
     * @stable ICU 2.0
     */
    UnicodeFilter();

private:

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline UnicodeFilter::UnicodeFilter() {}
inline UnicodeFilter::~UnicodeFilter() {}

U_NAMESPACE_END

#endif

--- NEW FILE: unifunct.h ---
/*
**********************************************************************
*   Copyright (c) 2002-2003, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   01/14/2002  aliu        Creation.
**********************************************************************
*/
#ifndef UNIFUNCT_H
#define UNIFUNCT_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"

U_NAMESPACE_BEGIN

class UnicodeMatcher;
class UnicodeReplacer;
class TransliterationRuleData;

/**
 * <code>UnicodeFunctor</code> is an abstract base class for objects
 * that perform match and/or replace operations on Unicode strings.
 * @author Alan Liu
 * @draft ICU 2.4
 */
class U_COMMON_API UnicodeFunctor : public UObject {

 public:

    /**
     * Destructor
     * @draft ICU 2.4
     */
    virtual ~UnicodeFunctor();

    /**
     * Return a copy of this object.  All UnicodeFunctor objects
     * have to support cloning in order to allow classes using
     * UnicodeFunctor to implement cloning.
     * @draft ICU 2.4
     */
    virtual UnicodeFunctor* clone() const = 0;

    /**
     * Cast 'this' to a UnicodeMatcher* pointer and return the
     * pointer, or null if this is not a UnicodeMatcher*.  Subclasses
     * that mix in UnicodeMatcher as a base class must override this.
     * This protocol is required because a pointer to a UnicodeFunctor
     * cannot be cast to a pointer to a UnicodeMatcher, since
     * UnicodeMatcher is a mixin that does not derive from
     * UnicodeFunctor.
     * @draft ICU 2.4
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * Cast 'this' to a UnicodeReplacer* pointer and return the
     * pointer, or null if this is not a UnicodeReplacer*.  Subclasses
     * that mix in UnicodeReplacer as a base class must override this.
     * This protocol is required because a pointer to a UnicodeFunctor
     * cannot be cast to a pointer to a UnicodeReplacer, since
     * UnicodeReplacer is a mixin that does not derive from
     * UnicodeFunctor.
     * @draft ICU 2.4
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * Return the class ID for this class.  This is useful only for
     * comparing to a return value from getDynamicClassID().  For example:
     * <pre>
     * .      Base* polymorphic_pointer = createPolymorphicObject();
     * .      if (polymorphic_pointer->getDynamicClassID() ==
     * .          Derived::getStaticClassID()) ...
     * </pre>
     * @return          The class ID for all objects of this class.
     * @stable ICU 2.0
     */
    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }

    /**
     * Returns a unique class ID <b>polymorphically</b>.  This method
     * is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI.  Polymorphic operator==() and
     * clone() methods call this method.
     * 
     * <p>Concrete subclasses of UnicodeFunctor that wish clients to
     * be able to identify them should implement getDynamicClassID()
     * and also a static method and data member:
     * 
     * <pre>
     * static UClassID getStaticClassID() { return (UClassID)&fgClassID; }
     * static char fgClassID;
     * </pre>
     *
     * Subclasses that do not implement this method will have a
     * dynamic class ID of UnicodeFunctor::getStatisClassID().
     *
     * @return The class ID for this object. All objects of a given
     * class have the same class ID.  Objects of other classes have
     * different class IDs.
     * @draft ICU 2.4
     */
    virtual UClassID getDynamicClassID(void) const = 0;

    /**
     * Set the data object associated with this functor.  The data
     * object provides context for functor-to-standin mapping.  This
     * method is required when assigning a functor to a different data
     * object.  This function MAY GO AWAY later if the architecture is
     * changed to pass data object pointers through the API.
     * @internal ICU 2.1
     */
    virtual void setData(const TransliterationRuleData*) = 0;

 protected:

    /**
     * @stable ICU 2.0
     */
    UnicodeFunctor();

 private:

    /**
     * Class identifier for subclasses of UnicodeFunctor that do not
     * define their class (anonymous subclasses).
     */
    static const char fgClassID;
};

inline UnicodeFunctor::UnicodeFunctor() {}
inline UnicodeFunctor::~UnicodeFunctor() {}

U_NAMESPACE_END

#endif

--- NEW FILE: unimatch.h ---
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/18/01    aliu        Creation.
**********************************************************************
*/
#ifndef UNIMATCH_H
#define UNIMATCH_H

#include "unicode/utypes.h"

U_NAMESPACE_BEGIN

class Replaceable;
class UnicodeString;
class UnicodeSet;

/**
 * Constants returned by <code>UnicodeMatcher::matches()</code>
 * indicating the degree of match.
 * @draft ICU 2.4
 */
enum UMatchDegree {
    /**
     * Constant returned by <code>matches()</code> indicating a
     * mismatch between the text and this matcher.  The text contains
     * a character which does not match, or the text does not contain
     * all desired characters for a non-incremental match.
     * @draft ICU 2.4
     */
    U_MISMATCH,
    
    /**
     * Constant returned by <code>matches()</code> indicating a
     * partial match between the text and this matcher.  This value is
     * only returned for incremental match operations.  All characters
     * of the text match, but more characters are required for a
     * complete match.  Alternatively, for variable-length matchers,
     * all characters of the text match, and if more characters were
     * supplied at limit, they might also match.
     * @draft ICU 2.4
     */
    U_PARTIAL_MATCH,
    
    /**
     * Constant returned by <code>matches()</code> indicating a
     * complete match between the text and this matcher.  For an
     * incremental variable-length match, this value is returned if
     * the given text matches, and it is known that additional
     * characters would not alter the extent of the match.
     * @draft ICU 2.4
     */
    U_MATCH
};

/**
 * <code>UnicodeMatcher</code> defines a protocol for objects that can
 * match a range of characters in a Replaceable string.
 * @draft ICU 2.4
 */
class U_COMMON_API UnicodeMatcher /* not : public UObject because this is an interface/mixin class */ {

public:
    /**
     * Destructor.
     * @draft ICU 2.4
     */
    virtual inline ~UnicodeMatcher() {};

    /**
     * Return a UMatchDegree value indicating the degree of match for
     * the given text at the given offset.  Zero, one, or more
     * characters may be matched.
     *
     * Matching in the forward direction is indicated by limit >
     * offset.  Characters from offset forwards to limit-1 will be
     * considered for matching.
     * 
     * Matching in the reverse direction is indicated by limit <
     * offset.  Characters from offset backwards to limit+1 will be
     * considered for matching.
     *
     * If limit == offset then the only match possible is a zero
     * character match (which subclasses may implement if desired).
     *
     * As a side effect, advance the offset parameter to the limit of
     * the matched substring.  In the forward direction, this will be
     * the index of the last matched character plus one.  In the
     * reverse direction, this will be the index of the last matched
     * character minus one.
     *
     * <p>Note:  This method is not const because some classes may
     * modify their state as the result of a match.
     *
     * @param text the text to be matched
     * @param offset on input, the index into text at which to begin
     * matching.  On output, the limit of the matched text.  The
     * number of matched characters is the output value of offset
     * minus the input value.  Offset should always point to the
     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
     * both on entry and upon return.
     * @param limit the limit index of text to be matched.  Greater
     * than offset for a forward direction match, less than offset for
     * a backward direction match.  The last character to be
     * considered for matching will be text.charAt(limit-1) in the
     * forward direction or text.charAt(limit+1) in the backward
     * direction.
     * @param incremental if TRUE, then assume further characters may
     * be inserted at limit and check for partial matching.  Otherwise
     * assume the text as given is complete.
     * @return a match degree value indicating a full match, a partial
     * match, or a mismatch.  If incremental is FALSE then
     * U_PARTIAL_MATCH should never be returned.
     * @draft ICU 2.4
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental) = 0;

    /**
     * Returns a string representation of this matcher.  If the result of
     * calling this function is passed to the appropriate parser, it
     * will produce another matcher that is equal to this one.
     * @param result the string to receive the pattern.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * character to their hex escape representations, \uxxxx or
     * \Uxxxxxxxx.  Unprintable characters are those other than
     * U+000A, U+0020..U+007E.
     * @draft ICU 2.4
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const = 0;

    /**
     * Returns TRUE if this matcher will match a character c, where c
     * & 0xFF == v, at offset, in the forward direction (with limit >
     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @draft ICU 2.4
     */
    virtual UBool matchesIndexValue(uint8_t v) const = 0;

    /**
     * Union the set of all characters that may be matched by this object
     * into the given set.
     * @param toUnionTo the set into which to union the source characters
     * @draft ICU 2.4
     */
    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const = 0;
};

U_NAMESPACE_END

#endif

--- NEW FILE: uniset.h ---
/*
**********************************************************************
* Copyright (C) 1999-2003, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   10/20/99    alan        Creation.
**********************************************************************
*/

#ifndef UNICODESET_H
#define UNICODESET_H

#include "unicode/unifilt.h"
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"

U_NAMESPACE_BEGIN
[...1320 lines suppressed...]

inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
    return !operator==(o);
}

inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
    return !containsNone(start, end);
}

inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
    return !containsNone(s);
}

inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
    return !containsNone(s);
}

U_NAMESPACE_END

#endif



--- NEW FILE: uobject.h ---
/*
******************************************************************************
*
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  uobject.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jun26
*   created by: Markus W. Scherer
*/

#ifndef __UOBJECT_H__
#define __UOBJECT_H__

#include "unicode/utypes.h"

U_NAMESPACE_BEGIN

/**
 * \file
 * \brief C++ API: Common ICU base class UObject.
 */

/**  U_OVERRIDE_CXX_ALLOCATION - Define this to override operator new and
 *                               delete in UMemory. Enabled by default for ICU.
 *
 *         Enabling forces all allocation of ICU object types to use ICU's
 *         memory allocation. On Windows, this allows the ICU DLL to be used by
 *         applications that statically link the C Runtime library, meaning that
 *         the app and ICU will be using different heaps.
 *
 * @draft ICU 2.2
 */                              
#ifndef U_OVERRIDE_CXX_ALLOCATION
#define U_OVERRIDE_CXX_ALLOCATION 1
#endif

/**  U_HAVE_PLACEMENT_NEW - Define this to define the placement new and
 *                          delete in UMemory for STL.
 *
 * @draft ICU 2.6
 */                              
#ifndef U_HAVE_PLACEMENT_NEW
#define U_HAVE_PLACEMENT_NEW 1
#endif

/**
 * UMemory is the common ICU base class.
 * All other ICU C++ classes are derived from UMemory (starting with ICU 2.4).
 *
 * This is primarily to make it possible and simple to override the
 * C++ memory management by adding new/delete operators to this base class.
 *
 * To override ALL ICU memory management, including that from plain C code,
 * replace the allocation functions declared in cmemory.h
 *
 * UMemory does not contain any virtual functions.
 * Common "boilerplate" functions are defined in UObject.
 *
 * @draft ICU 2.4
 */
class U_COMMON_API UMemory {
public:

#if U_OVERRIDE_CXX_ALLOCATION
    /**
     * Override for ICU4C C++ memory management.
     * simple, non-class types are allocated using the macros in common/cmemory.h
     * (uprv_malloc(), uprv_free(), uprv_realloc());
     * they or something else could be used here to implement C++ new/delete
     * for ICU4C C++ classes
     * @draft ICU 2.4
     */
    static void *operator new(size_t size);

    /**
     * Override for ICU4C C++ memory management.
     * See new().
     * @draft ICU 2.4
     */
    static void *operator new[](size_t size);

    /**
     * Override for ICU4C C++ memory management.
     * simple, non-class types are allocated using the macros in common/cmemory.h
     * (uprv_malloc(), uprv_free(), uprv_realloc());
     * they or something else could be used here to implement C++ new/delete
     * for ICU4C C++ classes
     * @draft ICU 2.4
     */
    static void operator delete(void *p);

    /**
     * Override for ICU4C C++ memory management.
     * See delete().
     * @draft ICU 2.4
     */
    static void operator delete[](void *p);

#if U_HAVE_PLACEMENT_NEW
    /**
     * Override for ICU4C C++ memory management for STL.
     * See new().
     * @draft ICU 2.6
     */
    static inline void * operator new(size_t, void *ptr) { return ptr; }

    /**
     * Override for ICU4C C++ memory management for STL.
     * See delete().
     * @draft ICU 2.6
     */
    static inline void operator delete(void *, void *) {}
#endif /* U_HAVE_PLACEMENT_NEW */
#endif /* U_OVERRIDE_CXX_ALLOCATION */

    /*
     * Assignment operator not declared. The compiler will provide one
     * which does nothing since this class does not contain any data members.
     * API/code coverage may show the assignment operator as present and
     * untested - ignore.
     * Subclasses need this assignment operator if they use compiler-provided
     * assignment operators of their own. An alternative to not declaring one
     * here would be to declare and empty-implement a protected or public one.
    UMemory &UMemory::operator=(const UMemory &);
     */
};

/**
 * UObject is the common ICU "boilerplate" class.
 * UObject inherits UMemory (starting with ICU 2.4),
 * and all other public ICU C++ classes
 * are derived from UObject (starting with ICU 2.2).
 *
 * UObject contains common virtual functions like for ICU's "poor man's RTTI".
 * It does not contain default implementations of virtual methods
 * like getDynamicClassID to allow derived classes such as Format
 * to declare these as pure virtual.
 *
 * The clone() function is not available in UObject because it is not
 * implemented by all ICU classes.
 * Many ICU services provide a clone() function for their class trees,
 * defined on the service's C++ base class, and all subclasses within that
 * service class tree return a pointer to the service base class
 * (which itself is a subclass of UObject).
 * This is because some compilers do not support covariant (same-as-this)
 * return types; cast to the appropriate subclass if necessary.
 *
 * @draft ICU 2.2
 */
class U_COMMON_API UObject : public UMemory {
public:
    /**
     * Destructor.
     *
     * @draft ICU 2.2
     */
    virtual inline ~UObject() {}

    /**
     * ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const = 0;

protected:
    // the following functions are protected to prevent instantiation and
    // direct use of UObject itself

    // default constructor
    // commented out because UObject is abstract (see getDynamicClassID)
    // inline UObject() {}

    // copy constructor
    // commented out because UObject is abstract (see getDynamicClassID)
    // inline UObject(const UObject &other) {}

#if U_ICU_VERSION_MAJOR_NUM>2 || (U_ICU_VERSION_MAJOR_NUM==2 && U_ICU_VERSION_MINOR_NUM>6)
    // TODO post ICU 2.4  (This comment inserted in 2.2)
    // some or all of the following "boilerplate" functions may be made public
    // in a future ICU4C release when all subclasses implement them

    // assignment operator
    // (not virtual, see "Taligent's Guide to Designing Programs" pp.73..74)
    // commented out because the implementation is the same as a compiler's default
    // UObject &operator=(const UObject &other) { return *this; }

    // comparison operators
    virtual inline UBool operator==(const UObject &other) const { return this==&other; }
    inline UBool operator!=(const UObject &other) const { return !operator==(other); }

    // clone() commented out from the base class:
    // some compilers do not support co-variant return types
    // (i.e., subclasses would have to return UObject * as well, instead of SubClass *)
    // see also UObject class documentation.
    // virtual UObject *clone() const;
#endif

    /*
     * Assignment operator not declared. The compiler will provide one
     * which does nothing since this class does not contain any data members.
     * API/code coverage may show the assignment operator as present and
     * untested - ignore.
     * Subclasses need this assignment operator if they use compiler-provided
     * assignment operators of their own. An alternative to not declaring one
     * here would be to declare and empty-implement a protected or public one.
    UObject &UObject::operator=(const UObject &);
     */
};

U_NAMESPACE_END

#endif





--- NEW FILE: uset.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uset.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002mar07
*   created by: Markus W. Scherer
*
*   C version of UnicodeSet.
*/


/**
 * \file
 * \brief C API: Unicode Set
 *
 * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
 */

#ifndef __USET_H__
#define __USET_H__

#include "unicode/utypes.h"

#ifndef UCNV_H
struct USet;
/**
 * A UnicodeSet.  Use the uset_* API to manipulate.  Create with
 * uset_open*, and destroy with uset_close.
 * @draft ICU 2.4
 */
typedef struct USet USet;
#endif

/**
 * Bitmask values to be passed to the UnicodeSet constructor or
 * applyPattern() taking an option parameter.
 * @draft
 */
enum {
    /**
     * Ignore white space within patterns unless quoted or escaped.
     * @draft
     */
    USET_IGNORE_SPACE = 1,  

    /**
     * Enable case insensitive matching.  E.g., "[ab]" with this flag
     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
     * match all except 'a', 'A', 'b', and 'B'.
     * @draft
     */
    USET_CASE_INSENSITIVE = 2,  

    /**
     * Bitmask for UnicodeSet::closeOver() indicating letter case.
     * This may be ORed together with other selectors.
     * @internal
     */
    USET_CASE = 2,
    /**
     * Enough for any single-code point set
     * @internal
     */
    USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
};

/**
 * A serialized form of a Unicode set.  Limited manipulations are
 * possible directly on a serialized set.  See below.
 * @draft ICU 2.4
 */
typedef struct USerializedSet {
    /**
     * The serialized Unicode Set.
     * @draft ICU 2.4
     */
    const uint16_t *array;
    /**
     * The length of the array that contains BMP characters.
     * @draft ICU 2.4
     */
    int32_t bmpLength;
    /**
     * The total length of the array.
     * @draft ICU 2.4
     */
    int32_t length;
    /**
     * A small buffer for the array to reduce memory allocations.
     * @draft ICU 2.4
     */
    uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
} USerializedSet;

/*********************************************************************
 * USet API
 *********************************************************************/

/**
 * Creates a USet object that contains the range of characters
 * start..end, inclusive.
 * @param start first character of the range, inclusive
 * @param end last character of the range, inclusive
 * @return a newly created USet.  The caller must call uset_close() on
 * it when done.
 * @draft ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end);

/**
 * Creates a set from the given pattern.  See the UnicodeSet class
 * description for the syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param patternLength the length of the pattern, or -1 if null
 * terminated
 * @param ec the error code
 * @draft ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
                 UErrorCode* ec);

/**
 * Creates a set from the given pattern.  See the UnicodeSet class
 * description for the syntax of the pattern language.
 * @param pattern a string specifying what characters are in the set
 * @param patternLength the length of the pattern, or -1 if null
 * terminated
 * @param options bitmask for options to apply to the pattern.
 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
 * @param ec the error code
 * @draft ICU 2.4
 */
U_CAPI USet* U_EXPORT2
uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
                 uint32_t options,
                 UErrorCode* ec);

/**
 * Disposes of the storage used by a USet object.  This function should
 * be called exactly once for objects returned by uset_open().
 * @param set the object to dispose of
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_close(USet* set);

/**
 * Returns a string representation of this set.  If the result of
 * calling this function is passed to a uset_openPattern(), it
 * will produce another set that is equal to this one.
 * @param set the set
 * @param result the string to receive the rules, may be NULL
 * @param resultCapacity the capacity of result, may be 0 if result is NULL
 * @param escapeUnprintable if TRUE then convert unprintable
 * character to their hex escape representations, \uxxxx or
 * \Uxxxxxxxx.  Unprintable characters are those other than
 * U+000A, U+0020..U+007E.
 * @param ec error code.
 * @return length of string, possibly larger than resultCapacity
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
               UChar* result, int32_t resultCapacity,
               UBool escapeUnprintable,
               UErrorCode* ec);

/**
 * Adds the given character to the given USet.  After this call,
 * uset_contains(set, c) will return TRUE.
 * @param set the object to which to add the character
 * @param c the character to add
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_add(USet* set, UChar32 c);

/**
 * Adds all of the elements in the specified set to this set if
 * they're not already present.  This operation effectively
 * modifies this set so that its value is the <i>union</i> of the two
 * sets.  The behavior of this operation is unspecified if the specified
 * collection is modified while the operation is in progress.
 *
 * @param set the object to which to add the set
 * @param additionalSet the source set whose elements are to be added to this set.
 * @draft ICU 2.6
 */
U_CAPI void U_EXPORT2
uset_addAll(USet* set, const USet *additionalSet);

/**
 * Adds the given range of characters to the given USet.  After this call,
 * uset_contains(set, start, end) will return TRUE.
 * @param set the object to which to add the character
 * @param start the first character of the range to add, inclusive
 * @param end the last character of the range to add, inclusive
 * @draft ICU 2.2
 */
U_CAPI void U_EXPORT2
uset_addRange(USet* set, UChar32 start, UChar32 end);

/**
 * Adds the given string to the given USet.  After this call,
 * uset_containsString(set, str, strLen) will return TRUE.
 * @param set the object to which to add the character
 * @param str the string to add
 * @param strLen the length of the string or -1 if null terminated.
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen);

/**
 * Removes the given character from the given USet.  After this call,
 * uset_contains(set, c) will return FALSE.
 * @param set the object from which to remove the character
 * @param c the character to remove
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_remove(USet* set, UChar32 c);

/**
 * Removes the given range of characters from the given USet.  After this call,
 * uset_contains(set, start, end) will return FALSE.
 * @param set the object to which to add the character
 * @param start the first character of the range to remove, inclusive
 * @param end the last character of the range to remove, inclusive
 * @draft ICU 2.2
 */
U_CAPI void U_EXPORT2
uset_removeRange(USet* set, UChar32 start, UChar32 end);

/**
 * Removes the given string to the given USet.  After this call,
 * uset_containsString(set, str, strLen) will return FALSE.
 * @param set the object to which to add the character
 * @param str the string to remove
 * @param strLen the length of the string or -1 if null terminated.
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen);

/**
 * Inverts this set.  This operation modifies this set so that
 * its value is its complement.  This operation does not affect
 * the multicharacter strings, if any.
 * @param set the set
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_complement(USet* set);

/**
 * Removes all of the elements from this set.  This set will be
 * empty after this call returns.
 * @param set the set
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_clear(USet* set);

/**
 * Returns TRUE if the given USet contains no characters and no
 * strings.
 * @param set the set
 * @return true if set is empty
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet* set);

/**
 * Returns TRUE if the given USet contains the given character.
 * @param set the set
 * @param c The codepoint to check for within the set
 * @return true if set contains c
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_contains(const USet* set, UChar32 c);

/**
 * Returns TRUE if the given USet contains all characters c
 * where start <= c && c <= end.
 * @param set the set
 * @param start the first character of the range to test, inclusive
 * @param end the last character of the range to test, inclusive
 * @return TRUE if set contains the range
 * @draft ICU 2.2
 */
U_CAPI UBool U_EXPORT2
uset_containsRange(const USet* set, UChar32 start, UChar32 end);

/**
 * Returns TRUE if the given USet contains the given string.
 * @param set the set
 * @param str the string
 * @param strLen the length of the string or -1 if null terminated.
 * @return true if set contains str
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_containsString(const USet* set, const UChar* str, int32_t strLen);

/**
 * Returns the number of characters and strings contained in the given
 * USet.
 * @param set the set
 * @return a non-negative integer counting the characters and strings
 * contained in set
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set);

/**
 * Returns the number of items in this set.  An item is either a range
 * of characters or a single multicharacter string.
 * @param set the set
 * @return a non-negative integer counting the character ranges
 * and/or strings contained in set
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* set);

/**
 * Returns an item of this set.  An item is either a range of
 * characters or a single multicharacter string.
 * @param set the set
 * @param itemIndex a non-negative integer in the range 0..
 * uset_getItemCount(set)-1
 * @param start pointer to variable to receive first character
 * in range, inclusive
 * @param end pointer to variable to receive last character in range,
 * inclusive
 * @param str buffer to receive the string, may be NULL
 * @param strCapacity capacity of str, or 0 if str is NULL
 * @param ec error code
 * @return the length of the string (>= 2), or 0 if the item is a
 * range, in which case it is the range *start..*end, or -1 if
 * itemIndex is out of range
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* set, int32_t itemIndex,
             UChar32* start, UChar32* end,
             UChar* str, int32_t strCapacity,
             UErrorCode* ec);

/*********************************************************************
 * Serialized set API
 *********************************************************************/

/**
 * Serializes this set into an array of 16-bit integers.  Serialization
 * (currently) only records the characters in the set; multicharacter
 * strings are ignored.
 *
 * The array
 * has following format (each line is one 16-bit integer):
 *
 *  length     = (n+2*m) | (m!=0?0x8000:0)
 *  bmpLength  = n; present if m!=0
 *  bmp[0]
 *  bmp[1]
 *  ...
 *  bmp[n-1]
 *  supp-high[0]
 *  supp-low[0]
 *  supp-high[1]
 *  supp-low[1]
 *  ...
 *  supp-high[m-1]
 *  supp-low[m-1]
 *
 * The array starts with a header.  After the header are n bmp
 * code points, then m supplementary code points.  Either n or m
 * or both may be zero.  n+2*m is always <= 0x7FFF.
 *
 * If there are no supplementary characters (if m==0) then the
 * header is one 16-bit integer, 'length', with value n.
 *
 * If there are supplementary characters (if m!=0) then the header
 * is two 16-bit integers.  The first, 'length', has value
 * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
 *
 * After the header the code points are stored in ascending order.
 * Supplementary code points are stored as most significant 16
 * bits followed by least significant 16 bits.
 *
 * @param set the set
 * @param dest pointer to buffer of destCapacity 16-bit integers.
 * May be NULL only if destCapacity is zero.
 * @param destCapacity size of dest, or zero.  Must not be negative.
 * @param pErrorCode pointer to the error code.  Will be set to
 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF.  Will be set to
 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
 * @return the total length of the serialized format, including
 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
 * than U_BUFFER_OVERFLOW_ERROR.
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);

/**
 * Given a serialized array, fill in the given serialized set object.
 * @param fillSet pointer to result
 * @param src pointer to start of array
 * @param srcLength length of array
 * @return true if the given array is valid, otherwise false
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);

/**
 * Set the USerializedSet to contain the given character (and nothing
 * else).
 * @param fillSet pointer to result
 * @param c The codepoint to set
 * @draft ICU 2.4
 */
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);

/**
 * Returns TRUE if the given USerializedSet contains the given
 * character.
 * @param set the serialized set
 * @param c The codepoint to check for within the set
 * @return true if set contains c
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet* set, UChar32 c);

/**
 * Returns the number of disjoint ranges of characters contained in
 * the given serialized set.  Ignores any strings contained in the
 * set.
 * @param set the serialized set
 * @return a non-negative integer counting the character ranges
 * contained in set
 * @draft ICU 2.4
 */
U_CAPI int32_t U_EXPORT2
uset_getSerializedRangeCount(const USerializedSet* set);

/**
 * Returns a range of characters contained in the given serialized
 * set.
 * @param set the serialized set
 * @param rangeIndex a non-negative integer in the range 0..
 * uset_getSerializedRangeCount(set)-1
 * @param pStart pointer to variable to receive first character
 * in range, inclusive
 * @param pEnd pointer to variable to receive last character in range,
 * inclusive
 * @return true if rangeIndex is valid, otherwise false
 * @draft ICU 2.4
 */
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
                        UChar32* pStart, UChar32* pEnd);

#endif

--- NEW FILE: usetiter.h ---
/*
**********************************************************************
* Copyright (c) 2002-2003, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* $Source: /usr/local/cvsroot/icu-sword/source/common/unicode/usetiter.h,v $ 
**********************************************************************
*/
#ifndef USETITER_H
#define USETITER_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class UnicodeSet;
class UnicodeString;

/**
 * UnicodeSetIterator iterates over the contents of a UnicodeSet.  It
 * iterates over either code points or code point ranges.  After all
 * code points or ranges have been returned, it returns the
 * multicharacter strings of the UnicodSet, if any.
 *
 * <p>To iterate over code points, use a loop like this:
 * <pre>
 * UnicodeSetIterator it(set);
 * while (set.next()) {
 *   if (set.isString()) {
 *     processString(set.getString());
 *   } else {
 *     processCodepoint(set.getCodepoint());
 *   }
 * }
 * </pre>
 *
 * <p>To iterate over code point ranges, use a loop like this:
 * <pre>
 * UnicodeSetIterator it(set);
 * while (it.nextRange()) {
 *   if (it.isString()) {
 *     processString(it.getString());
 *   } else {
 *     processCodepointRange(it.getCodepoint(), it.getCodepointEnd());
 *   }
 * }
 * </pre>
 * @author M. Davis
 * @draft ICU 2.2
 */
class U_COMMON_API UnicodeSetIterator : public UObject {

 protected:
    
    /**
     * Value of <tt>codepoint</tt> if the iterator points to a string.
     * If <tt>codepoint == IS_STRING</tt>, then examine
     * <tt>string</tt> for the current iteration result.
     * @draft ICU 2.4
     */
    enum { IS_STRING = -1 };

    /**
     * Current code point, or the special value <tt>IS_STRING</tt>, if
     * the iterator points to a string.
     * @draft ICU 2.4
     */
    UChar32 codepoint;

    /**
     * When iterating over ranges using <tt>nextRange()</tt>,
     * <tt>codepointEnd</tt> contains the inclusive end of the
     * iteration range, if <tt>codepoint != IS_STRING</tt>.  If
     * iterating over code points using <tt>next()</tt>, or if
     * <tt>codepoint == IS_STRING</tt>, then the value of
     * <tt>codepointEnd</tt> is undefined.
     * @draft ICU 2.4
     */
    UChar32 codepointEnd;

    /**
     * If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
     * to the current string.  If <tt>codepoint != IS_STRING</tt>, the
     * value of <tt>string</tt> is undefined.
     * @draft ICU 2.4
     */
    const UnicodeString* string;

 public:

    /**
     * Create an iterator over the given set.  The iterator is valid
     * only so long as <tt>set</tt> is valid.
     * @param set set to iterate over
     * @draft ICU 2.4
     */
    UnicodeSetIterator(const UnicodeSet& set);
        
    /**
     * Create an iterator over nothing.  <tt>next()</tt> and
     * <tt>nextRange()</tt> return false. This is a convenience
     * constructor allowing the target to be set later.
     * @draft ICU 2.4
     */
    UnicodeSetIterator();
        
    /**
     * Destructor.
     * @draft ICU 2.4
     */
    virtual ~UnicodeSetIterator();

    /**
     * Returns true if the current element is a string.  If so, the
     * caller can retrieve it with <tt>getString()</tt>.  If this
     * method returns false, the current element is a code point or
     * code point range, depending on whether <tt>next()</tt> or
     * <tt>nextRange()</tt> was called, and the caller can retrieve it
     * with <tt>getCodepoint()</tt> and, for a range,
     * <tt>getCodepointEnd()</tt>.
     * @draft ICU 2.4
     */
    inline UBool isString() const;

    /**
     * Returns the current code point, if <tt>isString()</tt> returned
     * false.  Otherwise returns an undefined result.
     * @draft ICU 2.4
     */
    inline UChar32 getCodepoint() const;

    /**
     * Returns the end of the current code point range, if
     * <tt>isString()</tt> returned false and <tt>nextRange()</tt> was
     * called.  Otherwise returns an undefined result.
     * @draft ICU 2.4
     */
    inline UChar32 getCodepointEnd() const;

    /**
     * Returns the current string, if <tt>isString()</tt> returned
     * true.  Otherwise returns an undefined result.
     * @draft ICU 2.4
     */
    inline const UnicodeString& getString() const;

    /**
     * Returns the next element in the set, either a single code point
     * or a string.  If there are no more elements in the set, return
     * false.  If <tt>codepoint == IS_STRING</tt>, the value is a
     * string in the <tt>string</tt> field.  Otherwise the value is a
     * single code point in the <tt>codepoint</tt> field.
     * 
     * <p>The order of iteration is all code points in sorted order,
     * followed by all strings sorted order.  <tt>codepointEnd</tt> is
     * undefined after calling this method.  <tt>string</tt> is
     * undefined unless <tt>codepoint == IS_STRING</tt>.  Do not mix
     * calls to <tt>next()</tt> and <tt>nextRange()</tt> without
     * calling <tt>reset()</tt> between them.  The results of doing so
     * are undefined.
     *
     * @return true if there was another element in the set and this
     * object contains the element.
     * @draft ICU 2.4
     */
    UBool next();
        
    /**
     * Returns the next element in the set, either a code point range
     * or a string.  If there are no more elements in the set, return
     * false.  If <tt>codepoint == IS_STRING</tt>, the value is a
     * string in the <tt>string</tt> field.  Otherwise the value is a
     * range of one or more code points from <tt>codepoint</tt> to
     * <tt>codepointeEnd</tt> inclusive.
     * 
     * <p>The order of iteration is all code points ranges in sorted
     * order, followed by all strings sorted order.  Ranges are
     * disjoint and non-contiguous.  <tt>string</tt> is undefined
     * unless <tt>codepoint == IS_STRING</tt>.  Do not mix calls to
     * <tt>next()</tt> and <tt>nextRange()</tt> without calling
     * <tt>reset()</tt> between them.  The results of doing so are
     * undefined.
     *
     * @return true if there was another element in the set and this
     * object contains the element.
     * @draft ICU 2.4
     */
    UBool nextRange();
        
    /**
     * Sets this iterator to visit the elements of the given set and
     * resets it to the start of that set.  The iterator is valid only
     * so long as <tt>set</tt> is valid.
     * @param set the set to iterate over.
     * @draft ICU 2.4
     */
    void reset(const UnicodeSet& set);
        
    /**
     * Resets this iterator to the start of the set.
     * @draft ICU 2.4
     */
    void reset();
    
    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID();

    // ======================= PRIVATES ===========================
    
 protected:

    // endElement and nextElements are really UChar32's, but we keep
    // them as signed int32_t's so we can do comparisons with
    // endElement set to -1.  Leave them as int32_t's.
    /** The set
     * @draft ICU 2.4
     */
    const UnicodeSet* set;
    /** End range
     * @draft ICU 2.4
     */
    int32_t endRange;
    /** Range
     * @draft ICU 2.4
     */
    int32_t range;
    /** End element
     * @draft ICU 2.4
     */
    int32_t endElement;
    /** Next element
     * @draft ICU 2.4
     */
    int32_t nextElement;
    //UBool abbreviated;
    /** Next string
     * @draft ICU 2.4
     */
    int32_t nextString;
    /** String count
     * @draft ICU 2.4
     */
    int32_t stringCount;

    /** Copy constructor. Disallowed.
     * @draft ICU 2.4
     */
    UnicodeSetIterator(const UnicodeSetIterator&); // disallow

    /** Assignment operator. Disallowed.
     * @draft ICU 2.4
     */
    UnicodeSetIterator& operator=(const UnicodeSetIterator&); // disallow

    /** Load range
     * @draft ICU 2.4
     */
    virtual void loadRange(int32_t range);

private:

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

inline UClassID
UnicodeSetIterator::getStaticClassID()
{ return (UClassID)&fgClassID; }

inline UClassID
UnicodeSetIterator::getDynamicClassID() const
{ return UnicodeSetIterator::getStaticClassID(); }

inline UBool UnicodeSetIterator::isString() const {
    return codepoint == (UChar32)IS_STRING;
}

inline UChar32 UnicodeSetIterator::getCodepoint() const {
    return codepoint;
}

inline UChar32 UnicodeSetIterator::getCodepointEnd() const {
    return codepointEnd;
}

inline const UnicodeString& UnicodeSetIterator::getString() const {
    return *string;
}

U_NAMESPACE_END

#endif







--- NEW FILE: utf_old.h ---
/*
*******************************************************************************
*
*   Copyright (C) 2002-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002sep21
*   created by: Markus W. Scherer
*/

/**
 * \file
 * The macros in utf_old.h are all deprecated and their use discouraged.
[...1121 lines suppressed...]
 */
#define UTF_BACK_N(s, start, i, n) U16_BACK_N(s, start, i, n)

/**
 * Take the random-access index i and adjust it so that it points beyond
 * a code point. The input index points beyond any code unit
 * of a code point and is moved to point beyond the last code unit of the same
 * code point. i is never decremented.
 * In other words, if i points to a trail surrogate that is preceded by a matching
 * lead surrogate, then i is incremented. Otherwise it is not modified.
 * This can be used to start an iteration with UTF_PREV_CHAR() from a random index.
 * Same as UTF16_SET_CHAR_LIMIT.
 * \pre start<i<=length
 * \post start<i<=length
 *
 * @deprecated ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h.
 */
#define UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)

#endif