The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utilstr.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * utilstr.cpp - String utility functions
4  *
5  * $Id: utilstr.cpp 3844 2021-02-14 18:26:54Z scribe $
6  *
7  * Copyright 1997-2013 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #include <utilstr.h>
24 #include <ctype.h>
25 #include <string.h>
26 
27 #include <sysdata.h>
28 #include <swlog.h>
29 #include <swbuf.h>
30 
31 
33 
/******************************************************************************
 * SW_toupper_array - byte-indexed upper-casing lookup table
 *
 * Entry [c] holds the upper-case form of the single byte c:
 *	0x61-0x7a ('a'-'z')		-> 0x41-0x5a
 *	0xe0-0xfe, except 0xf7		-> 0xc0-0xde
 * Every other byte (including 0xf7 and 0xff) maps to itself.
 * The high half is consistent with an ISO-8859-1 style single-byte layout.
 */
const unsigned char SW_toupper_array[256] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	/* 0x50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	/* 0x60 */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	/* 0x70 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
	/* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	/* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	/* 0xa0 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	/* 0xb0 */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	/* 0xc0 */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	/* 0xd0 */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	/* 0xe0 */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	/* 0xf0 */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff
};
68 
/******************************************************************************
 * SW_tolower_array - byte-indexed lower-casing lookup table
 *
 * Entry [c] holds the lower-case form of the single byte c:
 *	0x41-0x5a ('A'-'Z')		-> 0x61-0x7a
 *	0xc0-0xde, except 0xd7		-> 0xe0-0xfe
 * Every other byte (including 0xd7 and 0xdf) maps to itself.
 *
 * WARNING: this only knows about single bytes and should not normally be
 * used.  A StringMgr which supports UTF-8 should be used instead.
 */
const unsigned char SW_tolower_array[256] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	/* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
	/* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	/* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	/* 0xa0 */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	/* 0xb0 */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	/* 0xc0 */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	/* 0xd0 */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf,
	/* 0xe0 */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	/* 0xf0 */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
107 
108 
109 
/******************************************************************************
 * strstrip - Removes leading and trailing whitespace from a string, in place
 *
 * Whitespace here means space, tab, line feed (10) and carriage return (13).
 *
 * ENT:	istr	- NUL-terminated string to strip; it is modified in place.
 *		  A null pointer is tolerated and handed straight back.
 *
 * RET:	istr
 */

// true for the four byte values strstrip() treats as strippable whitespace
static int strstripIsSpace(char c) {
	return (c == ' ') || (c == '\t') || (c == 10) || (c == 13);
}

char *strstrip(char *istr) {
	if (!istr)
		return istr;

	// size_t rather than int so very long strings are not mis-measured
	const size_t len = strlen(istr);
	if (!len)
		return istr;

	// first character to keep (stops on the terminator if all is whitespace)
	const char *first = istr;
	while (strstripIsSpace(*first))
		++first;

	// one past the last character to keep; never moves in front of 'first'
	const char *last = istr + len;
	while ((last > first) && strstripIsSpace(last[-1]))
		--last;

	// source and destination may overlap, hence memmove and not memcpy
	const size_t kept = (size_t)(last - first);
	memmove(istr, first, kept);
	istr[kept] = 0;

	return istr;
}
134 
135 
136 /******************************************************************************
137  * stristr - Scans a string for the occurrence of a given substring, no case
138  *
139  * ENT: scans s1 for the first occurrence of the substring s2, ingnoring case
140  *
141  * RET: a pointer to the element in s1, where s2 begins (points to s2 in s1).
142  * If s2 does not occur in s1, returns null.
143  */
144 
145 const char *stristr(const char *s1, const char *s2) {
146  int tLen = (int)strlen(s2);
147  int cLen = (int)strlen(s1);
148  char *target = new char [ tLen + 1 ];
149  int i, j;
150  const char *retVal = 0;
151 
152  strcpy(target, s2);
153  for (i = 0; i < tLen; i++)
154  target[i] = SW_toupper(target[i]);
155 
156  for (i = 0; i < (cLen - tLen)+1; i++) {
157  if (SW_toupper(s1[i]) == (unsigned char)*target) {
158  for (j = 1; j < tLen; j++) {
159  if (SW_toupper(s1[i+j]) != (unsigned char)target[j])
160  break;
161  }
162  if (j == tLen) {
163  retVal = s1+i;
164  break;
165  }
166  }
167  }
168  delete [] target;
169  return retVal;
170 }
171 
172 /******************************************************************************
173  * strnicmp - compares the first n bytes of 2 strings ignoring case
174  *
175  * ENT: compares s1 to s2 comparing the first n byte ingnoring case
176  *
177  * RET: same as strcmp
178  */
179 
180 int strnicmp(const char *s1, const char *s2, int len) {
181  int tLen = (int)strlen(s2);
182  int cLen = (int)strlen(s1);
183  char diff;
184  int i;
185  for (i = 0; ((i < len) && (i < tLen) && (i < cLen)); i++) {
186  if ((diff = SW_toupper(*s1) - SW_toupper(*s2)))
187  return diff;
188  s1++;
189  s2++;
190  }
191  return (i < len) ? cLen - tLen : 0;
192 }
193 
/******************************************************************************
 * stricmp - compares 2 strings ignoring case
 *
 * Forwards to the case-insensitive comparison routine chosen at compile time:
 * strcasecmp when __GNUC__ is defined, otherwise _stricmp when _WIN32_WCE is
 * defined, otherwise the global stricmp.
 *
 * ENT:	s1, s2	- strings to compare
 *
 * RET:	whatever the selected routine returns
 */
int stricmp(const char *s1, const char *s2) {
#if defined(__GNUC__)
	return ::strcasecmp(s1, s2);
#elif defined(_WIN32_WCE)
	return ::_stricmp(s1, s2);
#else
	return ::stricmp(s1, s2);
#endif
}
205 
206 
207 SWBuf assureValidUTF8(const char *buf) {
208 
209  SWBuf myCopy = buf;
210  const unsigned char *b = (const unsigned char *)myCopy.c_str();
211  const unsigned char *q = 0;
212  bool invalidChar = false;
213  while (*b) {
214  q = b;
215  if (!getUniCharFromUTF8(&b)) {
216  long len = b - q;
217  if (len) {
218  invalidChar = true;
219  for (long start = q - (const unsigned char *)myCopy.c_str(); len; len--) {
220  myCopy[start+len-1] = 0x1a; // unicode replacement character
221  }
222 
223  }
224  }
225  }
226  if (invalidChar) {
227 // SWLog::getSystemLog()->logWarning("Changing invalid UTF-8 string (%s) to (%s)\n", buf, myCopy.c_str());
228  }
229  return myCopy;
230 }
231 
232 
233 /****
234  * This can be called to convert a UTF8 stream to an SWBuf which manages
235  * a wchar_t[]
236  * access buffer with (wchar_t *)SWBuf::getRawData();
237  *
238  */
239 SWBuf utf8ToWChar(const char *buf) {
240 
241  const char *q = 0;
242  SWBuf wcharBuf;
243  while (*buf) {
244  q = buf;
245  wchar_t wc = getUniCharFromUTF8((const unsigned char **)&buf);
246  if (!wc) {
247  // if my buffer was advanced but nothing was converted, I had invalid data
248  if (buf - q) {
249  // invalid bytes in UTF8 stream
250  wcharBuf.append((wchar_t)0x1a); // unicode replacement character
251  }
252  }
253  else wcharBuf.append(wc);
254  }
255  return wcharBuf;
256 }
257 
258 
259 /****
260  * This can be called to convert a wchar_t[] to a UTF-8 SWBuf
261  *
262  */
263 SWBuf wcharToUTF8(const wchar_t *buf) {
264 
265  SWBuf utf8Buf;
266  if (buf) {
267  while (*buf) {
268  getUTF8FromUniChar(*buf++, &utf8Buf);
269  }
270  }
271  return utf8Buf;
272 }
273 
274 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
SWBuf assureValidUTF8(const char *buf)
Definition: utilstr.cpp:207
Definition: swbuf.h:47
const unsigned char SW_tolower_array[256]
Definition: utilstr.cpp:73
#define SW_toupper(c)
Definition: utilstr.h:67
SWORD_NAMESPACE_START const unsigned char SW_toupper_array[256]
Definition: utilstr.cpp:34
SWBuf wcharToUTF8(const wchar_t *buf)
Definition: utilstr.cpp:263
SWBuf utf8ToWChar(const char *buf)
Definition: utilstr.cpp:239
int stricmp(const char *s1, const char *s2)
Definition: utilstr.cpp:194
const char * stristr(const char *s1, const char *s2)
Definition: utilstr.cpp:145
const char * c_str() const
Definition: swbuf.h:158
SWBuf & append(const char *str, long max=-1)
Definition: swbuf.h:274
SWBuf * getUTF8FromUniChar(SW_u32 uchar, SWBuf *appendTo)
Definition: utilstr.h:165
char * strstrip(char *istr)
Definition: utilstr.cpp:118
int strnicmp(const char *s1, const char *s2, int len)
Definition: utilstr.cpp:180
#define SWORD_NAMESPACE_END
Definition: defs.h:40
SW_u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation=false)
Definition: utilstr.h:88