Index: src/modules/filters/utf8nfc.cpp =================================================================== --- src/modules/filters/utf8nfc.cpp (revision 2135) +++ src/modules/filters/utf8nfc.cpp (working copy) @@ -9,6 +9,9 @@ #include #include +#include +#include +#include #include #include @@ -25,26 +28,21 @@ char UTF8NFC::processText(SWBuf &text, const SWKey *key, const SWModule *module) { - if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering + if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering return -1; - int32_t len = text.length() * 2; - source = new UChar[len + 1]; //each char could become a surrogate pair + UErrorCode status = U_ZERO_ERROR; + UnicodeString source(text.getRawData(), text.length(), conv, status); + UnicodeString target; - // Convert UTF-8 string to UTF-16 (UChars) - len = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err); - target = new UChar[len + 1]; + status = U_ZERO_ERROR; + Normalizer::normalize(source, UNORM_NFC, 0, target, status); - //canonical composition - unorm_normalize(source, len, UNORM_NFC, 0, target, len, &err); + status = U_ZERO_ERROR; + text.setSize(text.size()*2); // potentially, it can grow to 2x the original size + int32_t len = target.extract(text.getRawData(), text.size(), conv, status); + text.setSize(len); - text.setSize(text.size()*2); - len = ucnv_fromUChars(conv, text.getRawData(), text.size(), target, -1, &err); - text.setSize(len); - - delete [] source; - delete [] target; - return 0; } Index: utilities/osis2mod.cpp =================================================================== --- utilities/osis2mod.cpp (revision 2135) +++ utilities/osis2mod.cpp (working copy) @@ -23,6 +23,10 @@ #include #include +#ifdef _ICU_ +#include +#endif + //#define DEBUG // Debug for simple transformation stack @@ -34,6 +38,10 @@ using namespace std; +#ifdef _ICU_ +UTF8NFC normalizer; +#endif + SWText *module = 0; VerseKey *currentVerse = 0; char activeOsisID[255]; @@ -50,11 +58,12 @@ "Jude", "Rev"}; static bool inCanonicalOSISBook = true; // osisID is for a book that is not in Sword's canon +static bool normalize = false; // Whether to normalize UTF-8 to NFC bool isOSISAbbrev(const char *buf) { bool match = false; for (int i = 0; i < 66; i++) { - if (!strcmp(buf, osisabbrevs[i])){ + if (!strcmp(buf, osisabbrevs[i])) { match = true; break; } @@ -141,6 +150,12 @@ makeKJVRef(key); } +#ifdef _ICU_ + if (normalize) { + normalizer.processText(activeVerseText, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks + } +#endif + SWBuf currentText = module->getRawEntry(); if (currentText.length()) { cout << "Appending entry: " << key.getOSISRef() << ": " << activeVerseText << endl; @@ -650,6 +665,8 @@ fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n"); fprintf(stderr, " -c \t encipher module using supplied key\n"); fprintf(stderr, "\t\t\t\t (default no enciphering)\n"); + fprintf(stderr, " -n\t\t\t normalize UTF-8 to NFC (default is to leave text unmodified)\n"); + fprintf(stderr, "\t\t\t\t Note: all UTF-8 texts should be normalized to NFC\n"); exit(-1); } @@ -692,6 +709,13 @@ } usage(*argv, "-b requires one of <2|3|4>"); } + else if (!strcmp(argv[i], "-n")) { + normalize = true; +#ifndef _ICU_ + normalize = false; + cout << program << " is not compiled with support for ICU. Ignoring -n flag." << endl; +#endif + } else if (!strcmp(argv[i], "-c")) { if (i+1 < argc) cipherKey = argv[++i]; else usage(*argv, "-c requires "); @@ -706,7 +730,7 @@ } #ifdef DEBUG - cout << "path: " << path << " osisDoc: " << osisDoc << " create: " << append << " compressType: " << compType << " blockType: " << iType << " cipherKey: " << cipherKey.c_str() << "\n"; + cout << "path: " << path << " osisDoc: " << osisDoc << " create: " << append << " compressType: " << compType << " blockType: " << iType << " cipherKey: " << cipherKey.c_str() << " normalize: " << normalize << "\n"; cout << ""; // exit(-3); #endif @@ -715,8 +739,8 @@ if (!append) { // == 0 then create module // Try to initialize a default set of datafiles and indicies at our // datapath location passed to us from the user. - if ( compressor ){ - if ( zText::createModule(path, iType) ){ + if ( compressor ) { + if ( zText::createModule(path, iType) ) { fprintf(stderr, "error: %s: couldn't create module at path: %s \n", program, path); exit(-3); } @@ -735,7 +759,7 @@ } // Do some initialization stuff - if (compressor){ + if (compressor) { module = new zText(path, 0, 0, iType, compressor); } else{ @@ -744,7 +768,7 @@ SWFilter *cipherFilter = 0; - if (!cipherKey.empty()){ + if (!cipherKey.empty()) { fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() ); cipherFilter = new CipherFilter(cipherKey.c_str()); module->AddRawFilter(cipherFilter);