The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
scsuutf8.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * scsuutf8.cpp - SWFilter descendant to convert a SCSU character to
4  * UTF-8
5  *
6  * $Id: scsuutf8.cpp 3727 2020-04-29 02:33:23Z scribe $
7  *
8  * Copyright 2001-2014 CrossWire Bible Society (http://www.crosswire.org)
9  * CrossWire Bible Society
10  * P. O. Box 2528
11  * Tempe, AZ 85280-2528
12  *
13  * This program is free software; you can redistribute it and/or modify it
14  * under the terms of the GNU General Public License as published by the
15  * Free Software Foundation version 2.
16  *
17  * This program is distributed in the hope that it will be useful, but
18  * WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * General Public License for more details.
21  *
22  */
23 
24 /* This class is based on:
25  * http://czyborra.com/scsu/scsu.c written by Roman Czyborra@dds.nl
26  *
27  * This is a deflator to UTF-8 output for input compressed in SCSU,
28  * the (Reuters) Standard Compression Scheme for Unicode as described
29  * in http://www.unicode.org/unicode/reports/tr6.html
30  */
31 
32 #include <scsuutf8.h>
33 #include <swbuf.h>
34 #ifdef _ICU_
35 #include <unicode/unistr.h>
36 #endif
37 
38 
39 
41 
42 
44 #ifdef _ICU_
45  : err()
46 #endif
47 {
48 #ifdef _ICU_
49 
50  // initialize SCSU converter
51  scsuConv = ucnv_open("SCSU", &err);
52  // initialize UTF-8 converter
53  utf8Conv = ucnv_open("UTF-8", &err);
54 #else
55  active = 0;
56  mode = 0;
57 #endif
58 }
59 
61 #ifdef _ICU_
62  ucnv_close(scsuConv);
63  ucnv_close(utf8Conv);
64 #endif
65 }
66 
67 #ifndef _ICU_
68 unsigned short SCSUUTF8::start[] = {0x0000,0x0080,0x0100,0x0300,0x2000,0x2080,0x2100,0x3000};
69 unsigned short SCSUUTF8::slide[] = {0x0080,0x00C0,0x0400,0x0600,0x0900,0x3040,0x30A0,0xFF00};
70 unsigned short SCSUUTF8::win[] = {
71  0x0000, 0x0080, 0x0100, 0x0180, 0x0200, 0x0280, 0x0300, 0x0380,
72  0x0400, 0x0480, 0x0500, 0x0580, 0x0600, 0x0680, 0x0700, 0x0780,
73  0x0800, 0x0880, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
74  0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x0F80,
75  0x1000, 0x1080, 0x1100, 0x1180, 0x1200, 0x1280, 0x1300, 0x1380,
76  0x1400, 0x1480, 0x1500, 0x1580, 0x1600, 0x1680, 0x1700, 0x1780,
77  0x1800, 0x1880, 0x1900, 0x1980, 0x1A00, 0x1A80, 0x1B00, 0x1B80,
78  0x1C00, 0x1C80, 0x1D00, 0x1D80, 0x1E00, 0x1E80, 0x1F00, 0x1F80,
79  0x2000, 0x2080, 0x2100, 0x2180, 0x2200, 0x2280, 0x2300, 0x2380,
80  0x2400, 0x2480, 0x2500, 0x2580, 0x2600, 0x2680, 0x2700, 0x2780,
81  0x2800, 0x2880, 0x2900, 0x2980, 0x2A00, 0x2A80, 0x2B00, 0x2B80,
82  0x2C00, 0x2C80, 0x2D00, 0x2D80, 0x2E00, 0x2E80, 0x2F00, 0x2F80,
83  0x3000, 0x3080, 0x3100, 0x3180, 0x3200, 0x3280, 0x3300, 0x3800,
84  0xE000, 0xE080, 0xE100, 0xE180, 0xE200, 0xE280, 0xE300, 0xE380,
85  0xE400, 0xE480, 0xE500, 0xE580, 0xE600, 0xE680, 0xE700, 0xE780,
86  0xE800, 0xE880, 0xE900, 0xE980, 0xEA00, 0xEA80, 0xEB00, 0xEB80,
87  0xEC00, 0xEC80, 0xED00, 0xED80, 0xEE00, 0xEE80, 0xEF00, 0xEF80,
88  0xF000, 0xF080, 0xF100, 0xF180, 0xF200, 0xF280, 0xF300, 0xF380,
89  0xF400, 0xF480, 0xF500, 0xF580, 0xF600, 0xF680, 0xF700, 0xF780,
90  0xF800, 0xF880, 0xF900, 0xF980, 0xFA00, 0xFA80, 0xFB00, 0xFB80,
91  0xFC00, 0xFC80, 0xFD00, 0xFD80, 0xFE00, 0xFE80, 0xFF00, 0xFF80,
92  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
93  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
94  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
95  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
96  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
97  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
98  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
99  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
100  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
101  0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
102  0x0000, 0x00C0, 0x0250, 0x0370, 0x0530, 0x3040, 0x30A0, 0xFF60,
103 };
104 
105 int SCSUUTF8::UTF8Output(unsigned long uchar, SWBuf* utf8Buf)
106 {
107  // join UTF-16 surrogates without any pairing sanity checks
108  if (uchar >= 0xd800 && uchar <= 0xdbff) {
109  d = uchar & 0x3ff;
110  return 0;
111  }
112  if (uchar >= 0xdc00 && uchar <= 0xdfff) {
113  uchar = uchar + 0x2400 + d * 0x400;
114  }
115 
116  // output one character as UTF-8 multibyte sequence
117 
118  if (uchar < 0x80) {
119  utf8Buf += uchar;
120  }
121  else if (uchar < 0x800) {
122  utf8Buf += (0xc0 | (uchar>>6));
123  utf8Buf += (0x80 | (uchar & 0x3f));
124  }
125  else if (uchar < 0x10000) {
126  utf8Buf += (0xe0 | (uchar>>12));
127  utf8Buf += (0x80 | (uchar>>6 & 0x3f));
128  utf8Buf += (0x80 | (uchar & 0x3f));
129  }
130  else if (uchar < 0x200000) {
131  utf8Buf += (0xf0 | (uchar>>18));
132  utf8Buf += (0x80 | (uchar>>12 & 0x3f));
133  utf8Buf += (0x80 | (uchar>>6 & 0x3f));
134  utf8Buf += (0x80 | (uchar & 0x3f));
135  }
136 
137  return 0;
138 }
139 #endif
140 
141 char SCSUUTF8::processText(SWBuf &text, const SWKey *key, const SWModule *module) {
142  if ((unsigned long)key < 2) // hack, we're en(1)/de(0)ciphering
143  return -1;
144 
145 #ifdef _ICU_
146  // Try decoding with ICU if possible
147  err = U_ZERO_ERROR;
148  icu::UnicodeString utf16Text(text.getRawData(), text.length(), scsuConv, err);
149  err = U_ZERO_ERROR;
150  int32_t len = utf16Text.extract(text.getRawData(), text.size(), utf8Conv, err);
151  if (len > (int32_t)text.size()+1) {
152  text.setSize(len+1);
153  utf16Text.extract(text.getRawData(), text.size(), utf8Conv, err);
154  }
155 #else
156  // If ICU is unavailable, decode using Czyborra's decoder
157  SWBuf utf8Buf = "";
158  int len = text.length();
159  const char* scsuString = text.c_str();
160 
161  for (int i = 0; i < len;) {
162 
163  if (i >= len) break;
164  c = scsuString[i++];
165 
166  if (c >= 0x80)
167  {
168  UTF8Output(c - 0x80 + slide[active], &utf8Buf);
169  }
170  else if (c >= 0x20 && c <= 0x7F)
171  {
172  UTF8Output(c, &utf8Buf);
173  }
174  else if (c == 0x0 || c == 0x9 || c == 0xA || c == 0xC || c == 0xD)
175  {
176  UTF8Output(c, &utf8Buf);
177  }
178  else if (c >= 0x1 && c <= 0x8) // SQn
179  {
180  if (i >= len) break;
181  d = scsuString[i++]; // single quote
182 
183  UTF8Output(d < 0x80 ? d + start[c - 0x1] :
184  d - 0x80 + slide[c - 0x1], &utf8Buf);
185  }
186  else if (c >= 0x10 && c <= 0x17) // SCn
187  {
188  active = c - 0x10; // change window
189  }
190  else if (c >= 0x18 && c <= 0x1F) // SDn
191  {
192  active = c - 0x18; // define window
193  if (i >= len) break;
194  slide[active] = win[(unsigned char)scsuString[i++]];
195  }
196  else if (c == 0xB) // SDX
197  {
198  if (i >= len) break;
199  c = scsuString[i++];
200 
201  if (i >= len) break;
202  d = scsuString[i++];
203 
204  slide[active = c>>5] = 0x10000 + (((c & 0x1F) << 8 | d) << 7);
205  }
206  else if (c == 0xE) // SQU
207  {
208  if (i >= len) break;
209  c = scsuString[i++]; // SQU
210 
211  if (i >= len) break;
212  UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
213  }
214  else if (c == 0xF) // SCU
215  {
216  mode = 1; // change to Unicode mode
217 
218  while (mode)
219  {
220  if (i >= len) break;
221  c = scsuString[i++];
222 
223  if (c <= 0xDF || c >= 0xF3)
224  {
225  if (i >= len) break;
226  UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
227  }
228  else if (c == 0xF0) // UQU
229  {
230  if (i >= len) break;
231  c = scsuString[i++];
232 
233  if (i >= len) break;
234  UTF8Output(c << 8 | scsuString[i++], &utf8Buf);
235  }
236  else if (c >= 0xE0 && c <= 0xE7) // UCn
237  {
238  active = c - 0xE0;
239  mode = 0;
240  }
241  else if (c >= 0xE8 && c <= 0xEF) // UDn
242  {
243  if (i >= len) break;
244  slide[active=c-0xE8] = win[(unsigned char)scsuString[i++]];
245  mode = 0;
246  }
247  else if (c == 0xF1) // UDX
248  {
249  if (i >= len) break;
250  c = scsuString[i++];
251 
252  if (i >= len) break;
253  d = scsuString[i++];
254 
255  slide[active = c>>5] =
256  0x10000 + (((c & 0x1F) << 8 | d) << 7);
257  mode = 0;
258  }
259  }
260  }
261  }
262 #endif
263 
264  return 0;
265 }
266 
267 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
static unsigned short start[8]
Definition: scsuutf8.h:50
static unsigned short slide[8]
Definition: scsuutf8.h:51
static unsigned short win[256]
Definition: scsuutf8.h:52
Definition: swbuf.h:47
unsigned long length() const
Definition: swbuf.h:197
bool mode
Definition: scsuutf8.h:47
SWText * module
Definition: osis2mod.cpp:105
unsigned long d
Definition: scsuutf8.h:48
~SCSUUTF8()
Definition: scsuutf8.cpp:60
char * getRawData()
Definition: swbuf.h:379
const char * c_str() const
Definition: swbuf.h:158
unsigned char active
Definition: scsuutf8.h:46
SCSUUTF8()
Definition: scsuutf8.cpp:43
unsigned long size() const
Definition: swbuf.h:185
unsigned long c
Definition: scsuutf8.h:48
virtual char processText(SWBuf &text, const SWKey *key=0, const SWModule *module=0)
Definition: scsuutf8.cpp:141
#define SWORD_NAMESPACE_END
Definition: defs.h:40
int UTF8Output(unsigned long, SWBuf *utf8Buf)
Definition: scsuutf8.cpp:105
Definition: swkey.h:77
void setSize(unsigned long len)
Definition: swbuf.h:255