The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utilstr.h
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * utilstr.h - prototypes for string utility functions
4  *
5  * $Id: utilstr.h 3765 2020-07-26 10:31:09Z scribe $
6  *
7  * Copyright 1997-2013 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #ifndef UTILSTR_H
24 #define UTILSTR_H
25 
26 #include <defs.h>
27 #include <sysdata.h>
28 #include <swbuf.h>
29 
31 
32 
/******************************************************************************
 * stdstr - replaces the string held in *ipstr with a newly allocated copy
 *          of istr
 *
 * ENT:	ipstr        - address of the string pointer to update; whatever
 *                     buffer it currently holds is released with delete []
 *	istr         - string to copy into *ipstr
 *                     0 - just release the current buffer and set *ipstr
 *                         to 0
 *	memPadFactor - the new buffer is allocated with
 *                     (strlen(istr) + 1) * memPadFactor bytes;
 *                     a factor of 0 is treated as 1 so the copy always fits
 *
 * RET:	*ipstr (the new copy, or 0 when istr is 0)
 */

inline char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor = 1) {
	char *copy = 0;
	if (istr) {
		size_t len = strlen(istr) + 1;
		// a pad factor of 0 would allocate fewer bytes than we copy below
		size_t factor = memPadFactor ? memPadFactor : 1;
		copy = new char [ len * factor ];
		// copy BEFORE releasing the old buffer: istr is allowed to be
		// *ipstr itself or to point somewhere inside it
		memcpy(copy, istr, len);
	}
	// delete [] on a null pointer is a no-op, no guard needed
	delete [] *ipstr;
	*ipstr = copy;
	return *ipstr;
}
55 
56 SWDLLEXPORT char *strstrip (char *istr);
57 SWDLLEXPORT const char *stristr (const char *s1, const char *s2);
58 SWDLLEXPORT int strnicmp(const char *s1, const char *s2, int len);
59 SWDLLEXPORT int stricmp(const char *s1, const char *s2);
60 
/******************************************************************************
 * SW_toupper - array of uppercase values for any given Latin-1 value
 *
 * use this instead of toupper() for fast lookups on accented characters
 *
 * NOTE: the macro argument is parenthesized before it is narrowed to
 * unsigned char, so an expression argument such as SW_toupper(c + 1) is
 * converted as a whole and the index always stays within 0..255
 */
extern const unsigned char SW_toupper_array[256];
#define SW_toupper(c) (SW_toupper_array[(unsigned char)(c)])
68 
/******************************************************************************
 * SW_tolower - array of lowercase values for any given Latin-1 value
 *
 * use this instead of tolower() for fast lookups on accented characters
 *
 * NOTE: the macro argument is parenthesized before it is narrowed to
 * unsigned char, so an expression argument such as SW_tolower(c + 1) is
 * converted as a whole and the index always stays within 0..255
 */
extern const unsigned char SW_tolower_array[256];
#define SW_tolower(c) (SW_tolower_array[(unsigned char)(c)])
76 
77 
78 /******************************************************************************
79  * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
80  * and increments buf to start of next codepoint
81  *
82  * ENT: buf - address of a utf8 buffer
83  *
84  * RET: buf - incremented past last byte used in computing the current codepoint
85  * unicode codepoint value (0 with buf incremented is invalid UTF8 byte
86  */
87 
88 inline SW_u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation = false) {
89  SW_u32 ch = 0;
90 
91  //case: We're at the end
92  if (!(**buf)) {
93  return ch;
94  }
95 
96  //case: ANSI
97  if (!(**buf & 128)) {
98  ch = **buf;
99  (*buf)++;
100  return ch;
101  }
102 
103  //case: Invalid UTF-8 (illegal continuing byte in initial position)
104  if ((**buf >> 6) == 2) {
105  (*buf)++;
106  return ch;
107  }
108 
109 
110  //case: 2+ byte codepoint
111  int subsequent = 1;
112  if ((**buf & 32) == 0) { subsequent = 1; }
113  else if ((**buf & 16) == 0) { subsequent = 2; }
114  else if ((**buf & 8) == 0) { subsequent = 3; }
115  else if ((**buf & 4) == 0) { subsequent = 4; }
116  else if ((**buf & 2) == 0) { subsequent = 5; }
117  else if ((**buf & 1) == 0) { subsequent = 6; }
118  else subsequent = 7; // is this legal?
119 
120  ch = **buf & (0xFF>>(subsequent + 1));
121 
122  for (int i = 1; i <= subsequent; ++i) {
123  // subsequent byte did not begin with 10XXXXXX
124  // move our buffer to here and error out
125  // this also catches our null if we hit the string terminator
126  if (((*buf)[i] >> 6) != 2) {
127  *buf += i;
128  return 0;
129  }
130  ch <<= 6;
131  ch |= (*buf)[i] & 63;
132  }
133  *buf += (subsequent+1);
134 
135  if (!skipValidation) {
136  // I THINK THIS IS STUPID BUT THE SPEC SAYS NO MORE THAN 4 BYTES
137  if (subsequent > 3) ch = 0;
138  // AGAIN stupid, but spec says UTF-8 can't use more than 21 bits
139  if (ch > 0x1FFFFF) ch = 0;
140  // This would be out of Unicode bounds
141  if (ch > 0x10FFFF) ch = 0;
142  // these would be values which could be represented in less bytes
143  if (ch < 0x80 && subsequent > 0) ch = 0;
144  if (ch < 0x800 && subsequent > 1) ch = 0;
145  if (ch < 0x10000 && subsequent > 2) ch = 0;
146  if (ch < 0x200000 && subsequent > 3) ch = 0;
147  }
148 
149  return ch;
150 }
151 
152 
153 /******************************************************************************
154  * getUTF8FromUniChar - retrieves us UTF8 string from a
155  * Unicode codepoint
156  *
157  * ENT: uchar - unicode codepoint value
158  *
159  * RET: buf - a UTF8 string which consists of the proper UTF8 sequence of
160  * bytes for the given Unicode codepoint
161  * NOTE: for speed and thread safety, this method now requires a buffer
162  * to work with
163  */
164 
165 inline SWBuf *getUTF8FromUniChar(SW_u32 uchar, SWBuf *appendTo) {
166  unsigned long base = appendTo->size();
167 
168  // This would be out of Unicode bounds
169  if (uchar > 0x10FFFF) uchar = 0xFFFD;
170  char bytes = uchar < 0x80 ? 1 : uchar < 0x800 ? 2 : uchar < 0x10000 ? 3 : 4;
171  appendTo->setSize(base+bytes);
172  switch (bytes) {
173  case 1:
174  (*appendTo)[base ] = (unsigned char)uchar;
175  break;
176  case 2:
177  (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
178  uchar >>= 6;
179  (*appendTo)[base ] = (unsigned char)(0xc0 | (uchar & 0x1f));
180  break;
181  case 3:
182  (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
183  uchar >>= 6;
184  (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
185  uchar >>= 6;
186  (*appendTo)[base ] = (unsigned char)(0xe0 | (uchar & 0x0f));
187  break;
188  case 4:
189  (*appendTo)[base+3] = (unsigned char)(0x80 | (uchar & 0x3f));
190  uchar >>= 6;
191  (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
192  uchar >>= 6;
193  (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
194  uchar >>= 6;
195  (*appendTo)[base ] = (unsigned char)(0xf0 | (uchar & 0x07));
196  break;
197  }
198 /*
199  else if (uchar < 0x4000000) {
200  appendTo->setSize(base+5);
201  i = uchar & 0x3f;
202  (*appendTo)[base+4] = (unsigned char)(0x80 | i);
203  uchar >>= 6;
204 
205  i = uchar & 0x3f;
206  (*appendTo)[base+3] = (unsigned char)(0x80 | i);
207  uchar >>= 6;
208 
209  i = uchar & 0x3f;
210  (*appendTo)[base+2] = (unsigned char)(0x80 | i);
211  uchar >>= 6;
212 
213  i = uchar & 0x3f;
214  (*appendTo)[base+1] = (unsigned char)(0x80 | i);
215  uchar >>= 6;
216 
217  i = uchar & 0x03;
218  (*appendTo)[base] = (unsigned char)(0xf8 | i);
219  }
220  else if (uchar < 0x80000000) {
221  appendTo->setSize(base+6);
222  i = uchar & 0x3f;
223  (*appendTo)[base+5] = (unsigned char)(0x80 | i);
224  uchar >>= 6;
225 
226  i = uchar & 0x3f;
227  (*appendTo)[base+4] = (unsigned char)(0x80 | i);
228  uchar >>= 6;
229 
230  i = uchar & 0x3f;
231  (*appendTo)[base+3] = (unsigned char)(0x80 | i);
232  uchar >>= 6;
233 
234  i = uchar & 0x3f;
235  (*appendTo)[base+2] = (unsigned char)(0x80 | i);
236  uchar >>= 6;
237 
238  i = uchar & 0x3f;
239  (*appendTo)[base+1] = (unsigned char)(0x80 | i);
240  uchar >>= 6;
241 
242  i = uchar & 0x01;
243  (*appendTo)[base] = (unsigned char)(0xfc | i);
244  }
245 */
246  return appendTo;
247 }
248 
249 
250 /******************************************************************************
251  * assureValidUTF8 - iterates the supplied UTF-8 buffer and checks for validity
252  * replacing invalid bytes if necessary and returning a
253  * verified UTF8 buffer, leaving the original input
254  * unchanged.
255  *
256  * ENT: buf - a utf8 buffer
257  *
258  * RET: input buffer validated and any problems fixed by substituting a
259  * replacement character for bytes not valid.
260  */
261 SWBuf assureValidUTF8(const char *buf);
262 
263 /****
264  * This can be called to convert a UTF8 stream to an SWBuf which manages
265  * a wchar_t[]
266  * access buffer with (wchar_t *)SWBuf::getRawData();
267  *
268  */
269 SWBuf utf8ToWChar(const char *buf);
270 
271 /****
272  * This can be called to convert a wchar_t[] to a UTF-8 SWBuf
273  *
274  */
275 SWBuf wcharToUTF8(const wchar_t *buf);
276 
277 
278 
280 #endif
#define SWORD_NAMESPACE_START
Definition: defs.h:39
SWBuf assureValidUTF8(const char *buf)
Definition: utilstr.cpp:207
Definition: swbuf.h:47
const unsigned char SW_tolower_array[256]
Definition: utilstr.cpp:73
#define SWDLLEXPORT
Definition: defs.h:171
SWORD_NAMESPACE_START const unsigned char SW_toupper_array[256]
Definition: utilstr.cpp:34
SWBuf wcharToUTF8(const wchar_t *buf)
Definition: utilstr.cpp:263
SWBuf utf8ToWChar(const char *buf)
Definition: utilstr.cpp:239
int stricmp(const char *s1, const char *s2)
Definition: utilstr.cpp:194
SWORD_NAMESPACE_START char * stdstr(char **ipstr, const char *istr, unsigned int memPadFactor=1)
Definition: utilstr.h:44
const char * stristr(const char *s1, const char *s2)
Definition: utilstr.cpp:145
unsigned long size() const
Definition: swbuf.h:185
SWBuf * getUTF8FromUniChar(SW_u32 uchar, SWBuf *appendTo)
Definition: utilstr.h:165
char * strstrip(char *istr)
Definition: utilstr.cpp:118
int strnicmp(const char *s1, const char *s2, int len)
Definition: utilstr.cpp:180
unsigned int SW_u32
Definition: sysdata.h:41
#define SWORD_NAMESPACE_END
Definition: defs.h:40
SW_u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation=false)
Definition: utilstr.h:88
void setSize(unsigned long len)
Definition: swbuf.h:255
static time_t base
Definition: ftpparse.c:47