The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
stringmgr.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * stringmgr.cpp - implementation of class StringMgr
4  *
5  * $Id: stringmgr.cpp 3844 2021-02-14 18:26:54Z scribe $
6  *
7  * Copyright 2004-2013 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #include <ctype.h>
24 #include <stringmgr.h>
25 #include <swlog.h>
26 #include <localemgr.h>
27 #include <utilstr.h>
28 
29 #ifdef _ICU_
30 
31 #include <unicode/utypes.h>
32 #include <unicode/ucnv.h>
33 #include <unicode/ustring.h>
34 #include <unicode/uchar.h>
35 
36 #include <unicode/unistr.h>
37 #include <unicode/translit.h>
38 
39 #include <unicode/locid.h>
40 
41 #else
42 
43 #include <swtoupperdata.h>
44 
45 #endif
46 
47 
49 
50 
52 
54 public:
58 
59 
60 namespace {
61 
/**
 * Structurally checks whether a NUL-terminated byte string is UTF-8.
 *
 * Every byte with the high bit set must be the lead byte of a 2- to 4-byte
 * sequence (110xxxxx, 1110xxxx or 11110xxx) followed by the matching number
 * of continuation bytes (10xxxxxx).  This is a shape check only: overlong
 * encodings and surrogate code points are not rejected.
 *
 * @param txt NUL-terminated bytes to inspect; may be null.  Never modified.
 * @return  1 if at least one multi-byte sequence was seen and all of them
 *            are well formed,
 *         -1 if no byte has its high bit set (plain 7-bit text, including
 *            the empty string and a null pointer),
 *          0 if a byte with the high bit set does not fit a well-formed
 *            sequence.
 */
int isValidUTF8(unsigned char *txt) {
	if (!txt)
		return -1;

	unsigned int countUTF8 = 0;

	for (const unsigned char *p = txt; *p; ++p) {
		// 7-bit bytes are valid in both ASCII and UTF-8
		if (!(*p & 0x80))
			continue;

		// The number of leading 1 bits in the lead byte is the total
		// length of the sequence.
		unsigned char parts = 0;
		for (unsigned char lead = *p; lead & 0x80; lead <<= 1)
			++parts;

		// 10nnnnnn cannot start a sequence, and nothing is longer than 4 bytes
		if (parts == 1 || parts > 4)
			return 0;

		// Each of the remaining bytes must look like 10nnnnnn.  Hitting the
		// terminating NUL here fails the test too, so we never read past it.
		while (--parts) {
			++p;
			if ((*p & 0xc0) != 0x80)
				return 0;
		}

		++countUTF8;
	}

	// At this point it is either UTF-8 or ascii
	return countUTF8 ? 1 : -1;
}
127 
128  char *lowerLatin1(char *buf, unsigned int maxlen = 0) {
129  if (!buf)
130  return 0;
131 
132  char *ret = buf;
133  bool checkMax = maxlen;
134 
135  while (*buf && (!checkMax || maxlen--)) {
136  *buf = SW_tolower(*buf);
137  buf++;
138  }
139 
140  return ret;
141  }
142 }
143 
144 
145 #ifdef _ICU_
146 
147 //here comes our ICUStringMgr reimplementation
148 class ICUStringMgr : public StringMgr {
149 public:
150  virtual char *upperUTF8(char *, unsigned int maxlen = 0) const;
151  virtual char *lowerUTF8(char *, unsigned int maxlen = 0) const;
152  virtual bool isUpper(SW_u32 character) const;
153  virtual bool isLower(SW_u32 character) const;
154  virtual bool isDigit(SW_u32 character) const;
155  virtual bool isAlpha(SW_u32 character) const;
156 
157 protected:
158  virtual bool supportsUnicode() const { return true; };
159 };
160 
161 #endif
162 
163 
167 }
168 
172 }
173 
177 }
178 
183  if (systemStringMgr)
184  delete systemStringMgr;
185 
186  systemStringMgr = newStringMgr;
187 
188  // TODO: this is magic. apparently we have to reset the system localemgr upon changing stringmgr.
189  // setting system stringmgr should be set before localemgr and not possible to change.
190  // rework this design.
192 }
193 
198  if (!systemStringMgr) {
199 #ifdef _ICU_
200  systemStringMgr = new ICUStringMgr();
201 // SWLOGI("created default ICUStringMgr");
202 #else
203  systemStringMgr = new StringMgr();
204 // SWLOGI("created default StringMgr");
205 #endif
206  }
207 
208  return systemStringMgr;
209 }
210 
211 
223 char *StringMgr::upperUTF8(char *t, unsigned int maxlen) const {
224 
225 #ifndef _ICU_
226 
227  SWBuf orig = t;
228  const unsigned char* from = (unsigned char*)orig.c_str();
229  SWBuf text = "";
230  std::map<SW_u32, SW_u32>::const_iterator it = toUpperData.end();
231  while (*from) {
232  SW_u32 ch = getUniCharFromUTF8(&from, true);
233  // should we skip conversion if we run into an invalid UTF8 character?
234  // maybe the string isn't intended to be UTF8
235  // Right now, if ch is bad, then convert to replacement char
236  if (!ch) ch = 0xFFFD;
237 
238  it = toUpperData.find(ch);
239  getUTF8FromUniChar(it == toUpperData.end() ? ch : it->second, &text);
240  }
241  long len = maxlen ? (text.size() < maxlen ? text.size() : (maxlen - 1)) : 0;
242  if (len) memcpy(t, text.c_str(), len);
243  t[len] = 0;
244 #endif
245  return t;
246 /* OLD
247  // try to decide if it's worth trying to toupper. Do we have more
248  // characters which are probably lower latin than not?
249  // we still don't use isValidUTF8 optimally. what if we have 1 unicode
250  // character in the string? should we not try to upper any of the string?
251  // dunno. Best solution is to upper all other characters. Don't have
252  // time to write that before release.
253 
254  long performOp = 0;
255  if (!isValidUTF8((unsigned char *)t)) {
256  performOp = 1;
257  }
258  else {
259  for (const char *ch = t; *ch; ch++) {
260  performOp += (*ch > 0) ? 1 : -1;
261  }
262  }
263 
264  if (performOp > 0) {
265  return upperLatin1(t);
266  }
267 */
268 
269  return t;
270 }
271 
272 
284 char *StringMgr::lowerUTF8(char *t, unsigned int maxlen) const {
285  // try to decide if it's worth trying to tolower. Do we have more
286  // characters which are probably lower latin than not?
287  // we still don't use isValidUTF8 optimally. what if we have 1 unicode
288  // character in the string? should we not try to lower any of the string?
289  // dunno. Best solution is to lower all other characters. Don't have
290  // time to write that before release.
291  long performOp = 0;
292  if (!isValidUTF8((unsigned char *)t)) {
293  performOp = 1;
294  }
295  else {
296  for (const char *ch = t; *ch; ch++) {
297  performOp += (*ch > 0) ? 1 : -1;
298  }
299  }
300 
301  if (performOp > 0) {
302  return lowerLatin1(t);
303  }
304 
305  return t;
306 }
307 
308 bool StringMgr::isUpper(SW_u32 character) const {
309  return isupper(character);
310 }
311 bool StringMgr::isLower(SW_u32 character) const {
312  return islower(character);
313 }
314 bool StringMgr::isDigit(SW_u32 character) const {
315  return isdigit(character);
316 }
317 bool StringMgr::isAlpha(SW_u32 character) const {
318  return isalpha(character);
319 }
320 
321 
322 
327 char *StringMgr::upperLatin1(char *buf, unsigned int maxlen) const {
328  if (!buf)
329  return 0;
330 
331  char *ret = buf;
332  bool checkMax = maxlen;
333 
334  while (*buf && (!checkMax || maxlen--)) {
335  *buf = SW_toupper(*buf);
336  buf++;
337  }
338 
339  return ret;
340 }
341 
343  return true; //default impl has no UTF8 support
344 }
345 
346 
347 #ifdef _ICU_
348 
349 char *ICUStringMgr::upperUTF8(char *buf, unsigned int maxlen) const {
350  char *ret = buf;
351  int max = (int)((maxlen) ? maxlen : strlen(buf));
352 
353  UErrorCode err = U_ZERO_ERROR;
354 
355  if (!buf || !max) {
356  return ret;
357  }
358 
359  UChar *lowerStr = new UChar[max+10];
360  UChar *upperStr = new UChar[max+10];
361 
362  u_strFromUTF8(lowerStr, max+9, 0, buf, -1, &err);
363  if (err != U_ZERO_ERROR) {
364 // SWLog::getSystemLog()->logError("from: %s", u_errorName(err));
365  delete [] lowerStr;
366  delete [] upperStr;
367  return ret;
368  }
369 
370  u_strToUpper(upperStr, max+9, lowerStr, -1, 0, &err);
371  if (err != U_ZERO_ERROR) {
372 // SWLog::getSystemLog()->logError("upperCase: %s", u_errorName(err));
373  delete [] lowerStr;
374  delete [] upperStr;
375  return ret;
376  }
377 
378  ret = u_strToUTF8(ret, max, 0, upperStr, -1, &err);
379 
380  delete [] lowerStr;
381  delete [] upperStr;
382  return ret;
383 }
384 
385 char *ICUStringMgr::lowerUTF8(char *buf, unsigned int maxlen) const {
386  char *ret = buf;
387  int max = (int)((maxlen) ? maxlen : strlen(buf));
388 
389  UErrorCode err = U_ZERO_ERROR;
390 
391  if (!buf || !max) {
392  return ret;
393  }
394 
395  UChar *sourceStr = new UChar[max+10];
396  UChar *resultStr = new UChar[max+10];
397 
398  u_strFromUTF8(sourceStr, max+9, 0, buf, -1, &err);
399  if (err != U_ZERO_ERROR) {
400 // SWLog::getSystemLog()->logError("from: %s", u_errorName(err));
401  delete [] sourceStr;
402  delete [] resultStr;
403  return ret;
404  }
405 
406  u_strToLower(resultStr, max+9, sourceStr, -1, 0, &err);
407  if (err != U_ZERO_ERROR) {
408 // SWLog::getSystemLog()->logError("upperCase: %s", u_errorName(err));
409  delete [] sourceStr;
410  delete [] resultStr;
411  return ret;
412  }
413 
414  ret = u_strToUTF8(ret, max, 0, resultStr, -1, &err);
415 
416  delete [] sourceStr;
417  delete [] resultStr;
418  return ret;
419 }
420 
421 bool ICUStringMgr::isUpper(SW_u32 character) const {
422  return u_isupper(character);
423 }
424 bool ICUStringMgr::isLower(SW_u32 character) const {
425  return u_islower(character);
426 }
427 bool ICUStringMgr::isDigit(SW_u32 character) const {
428  return u_isdigit(character);
429 }
430 bool ICUStringMgr::isAlpha(SW_u32 character) const {
431  return u_isalpha(character);
432 }
433 
434 #endif
435 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
#define SW_tolower(c)
Definition: utilstr.h:75
Definition: swbuf.h:47
class __staticsystemStringMgr _staticsystemStringMgr
#define SW_toupper(c)
Definition: utilstr.h:67
virtual bool isLower(SW_u32 character) const
Definition: stringmgr.cpp:311
virtual bool isDigit(SW_u32 character) const
Definition: stringmgr.cpp:314
static StringMgr * getSystemStringMgr()
Definition: stringmgr.cpp:197
virtual bool supportsUnicode() const
Definition: stringmgr.cpp:342
const char * c_str() const
Definition: swbuf.h:158
static StringMgr * systemStringMgr
Definition: stringmgr.h:41
virtual char * upperUTF8(char *text, unsigned int max=0) const
Definition: stringmgr.cpp:223
virtual bool isUpper(SW_u32 character) const
Definition: stringmgr.cpp:308
static LocaleMgr * getSystemLocaleMgr()
Definition: localemgr.cpp:54
static void setSystemStringMgr(StringMgr *newStringMgr)
Definition: stringmgr.cpp:182
unsigned long size() const
Definition: swbuf.h:185
static void setSystemLocaleMgr(LocaleMgr *newLocaleMgr)
Definition: localemgr.cpp:63
SWBuf * getUTF8FromUniChar(SW_u32 uchar, SWBuf *appendTo)
Definition: utilstr.h:165
virtual char * upperLatin1(char *text, unsigned int max=0) const
Definition: stringmgr.cpp:327
reg_syntax_t ret
Definition: regex.c:1351
unsigned int SW_u32
Definition: sysdata.h:41
virtual bool isAlpha(SW_u32 character) const
Definition: stringmgr.cpp:317
SWORD_NAMESPACE_START std::map< SW_u32, SW_u32 > toUpperData
Definition: swtoupperdata.h:32
virtual ~StringMgr()
Definition: stringmgr.cpp:176
#define SWORD_NAMESPACE_END
Definition: defs.h:40
SW_u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation=false)
Definition: utilstr.h:88
virtual char * lowerUTF8(char *text, unsigned int max=0) const
Definition: stringmgr.cpp:284