The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utf8arabicpoints.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * utf8arabicpoints.cpp - SWFilter descendant to remove UTF-8
4  * Arabic vowel points
5  *
6  * $Id: utf8arabicpoints.cpp 3439 2016-10-23 08:32:02Z scribe $
7  *
8  * Copyright 2009-2013 CrossWire Bible Society (http://www.crosswire.org)
9  * CrossWire Bible Society
10  * P. O. Box 2528
11  * Tempe, AZ 85280-2528
12  *
13  * This program is free software; you can redistribute it and/or modify it
14  * under the terms of the GNU General Public License as published by the
15  * Free Software Foundation version 2.
16  *
17  * This program is distributed in the hope that it will be useful, but
18  * WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * General Public License for more details.
21  *
22  */
23 
24 
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <utf8arabicpoints.h>
28 
29 
31 
32 namespace {
33 
34  static const char oName[] = "Arabic Vowel Points";
35  static const char oTip[] = "Toggles Arabic Vowel Points";
36 
37  static const StringList *oValues() {
38  static const SWBuf choices[3] = {"On", "Off", ""};
39  static const StringList oVals(&choices[0], &choices[2]);
40  return &oVals;
41  }
42 
43 
44  static char *nextMark(const char* from, int* mark_size) {
45  // Arabic vowel points currently targeted for elimination:
46  // Table entries excerpted from
47  // http://www.utf8-chartable.de/unicode-utf8-table.pl.
48  // Code UTF-8 Description
49  // point
50  // ----- --------- -----------
51  // U+064B d9 8b ARABIC FATHATAN
52  // U+064C d9 8c ARABIC DAMMATAN
53  // U+064D d9 8d ARABIC KASRATAN
54  // U+064E d9 8e ARABIC FATHA
55  // U+064F d9 8f ARABIC DAMMA
56  // U+0650 d9 90 ARABIC KASRA
57  // U+0651 d9 91 ARABIC SHADDA
58  // U+0652 d9 92 ARABIC SUKUN
59  // U+0653 d9 93 ARABIC MADDAH ABOVE
60  // U+0654 d9 94 ARABIC HAMZA ABOVE
61  // U+0655 d9 95 ARABIC HAMZA BELOW
62  //
63  // U+FC5E ef b1 9e ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM
64  // U+FC5F ef b1 9f ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM
65  // U+FC60 ef b1 a0 ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM
66  // U+FC61 ef b1 a1 ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM
67  // U+FC62 ef b1 a2 ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM
68  // U+FC63 ef b1 a3 ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM
69  //
70  // U+FE70 ef b9 b0 ARABIC FATHATAN ISOLATED FORM
71  // U+FE71 ef b9 b1 ARABIC TATWEEL WITH FATHATAN ABOVE
72  // U+FE72 ef b9 b2 ARABIC DAMMATAN ISOLATED FORM
73  // U+FE73 ef b9 b3 ARABIC TAIL FRAGMENT
74  // U+FE74 ef b9 b4 ARABIC KASRATAN ISOLATED FORM
75  // U+FE75 ef b9 b5 ???
76  // U+FE76 ef b9 b6 ARABIC FATHA ISOLATED FORM
77  // U+FE77 ef b9 b7 ARABIC FATHA MEDIAL FORM
78  // U+FE78 ef b9 b8 ARABIC DAMMA ISOLATED FORM
79  // U+FE79 ef b9 b9 ARABIC DAMMA MEDIAL FORM
80  // U+FE7A ef b9 ba ARABIC KASRA ISOLATED FORM
81  // U+FE7B ef b9 bb ARABIC KASRA MEDIAL FORM
82  // U+FE7C ef b9 bc ARABIC SHADDA ISOLATED FORM
83  // U+FE7D ef b9 bd ARABIC SHADDA MEDIAL FORM
84  // U+FE7E ef b9 be ARABIC SUKUN ISOLATED FORM
85  // U+FE7F ef b9 bf ARABIC SUKUN MEDIAL FORM
86 
87  unsigned char* byte = (unsigned char*) from;
88  for (; *byte; ++byte) {
89  if (byte[0] == 0xD9) {
90  if (byte[1] >= 0x8B && byte[1] <= 0x95) {
91  *mark_size = 2;
92  break;
93  }
94  continue;
95  }
96  if (byte[0] == 0xEF) {
97  if (byte[1] == 0xB1) {
98  if (byte[2] >= 0x9E && byte[2] <= 0xA3) {
99  *mark_size = 3;
100  break;
101  }
102  continue;
103  }
104  if (byte[1] == 0xB9) {
105  if (byte[2] >= 0xB0 && byte[2] <= 0xBF) {
106  *mark_size = 3;
107  break;
108  }
109  continue;
110  }
111  }
112  }
113  return (char*)byte;
114  }
115 }
116 
117 
119 }
120 
121 
123 
124 
125 
126 char UTF8ArabicPoints::processText(SWBuf &text, const SWKey *, const SWModule *) {
127  // A non-zero/true option setting means that setOptionValue("On")
128  // was called which apparently means that Arabic Vowel Marks are ENABLED,
129  // so the filter's actions are DISABLED.
130  if (option)
131  return 0;
132 
133  // Eliminate Arabic vowel marks from the text.
134  // The recognized marks are determined by the "nextMark" function.
135 
136  // If nextMark were polymorphic (a virtual function or a function
137  // pointer), this function could be generically used in any filter that
138  // only removed (vs. replaced) areas of text based on the arbitrary
139  // match criteria encapsulated in the specific nextMark
140  // implementation.
141  int mark_size = 0;
142  char* mark_pos = nextMark(text.c_str(), &mark_size);
143 
144  // Here and at the end of the loop,
145  // test BOTH mark_pos AND *mark_pos for safety and to give nextMark
146  // the option of returning either NULL or a pointer to the null
147  // terminator when done.
148  if (!mark_pos || !*mark_pos)
149  return 0; // no marks found.
150 
151  // Purposely granting write access into SWBuf internal buffer via
152  // "end_of_output" avoids a needless temporary SWBuf copy.
153  // Everything before the first mark is already in its final position
154  // and can be safely ignored. So start appending at the current mark.
155  char* end_of_output = mark_pos;
156 
157  // For consistency, input starts at (vs. after) the first mark as well
158  // -- not a problem since the mark itself gets skipped, anyway.
159  const char* start_of_input = mark_pos;
160  do {
161  // At this point, "mark_pos" and "mark_pos+mark_size" delimit
162  // the text to drop.
163  // "start_of_input" is either mark_pos or any text between the
164  // end of any previous mark and the current mark_pos.
165  // This text is now ready to be moved into the output.
166  int ready_size = (int)(mark_pos - start_of_input);
167  if (ready_size > 0) {
168  // Append the input text before the current mark to the
169  // output.
170  // Must use bcopy vs. strncpy because the final
171  // end_of_output may overtake the original
172  // start_of_input.
173  memmove(end_of_output, start_of_input, ready_size);
174  // Keep appending to end_of_output.
175  end_of_output += ready_size;
176  }
177  // Ensure the mark never gets copied.
178  start_of_input = mark_pos + mark_size;
179  // Find the next mark.
180  mark_pos = nextMark(start_of_input, &mark_size);
181 
182  } while (mark_pos && *mark_pos); // No more marks.
183 
184  // Copy any trailing input text AND always the terminating null.
185  memmove(end_of_output, start_of_input, strlen(start_of_input)+1);
186  return 0;
187 }
188 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
Definition: swbuf.h:47
static char * nextMark(const char *from, int *mark_size)
static const StringList * oValues()
const char * c_str() const
Definition: swbuf.h:158
std::list< SWBuf > StringList
Definition: swmodule.cpp:91
static const char oName[]
static const char * choices[4]
static const char oTip[]
#define SWORD_NAMESPACE_END
Definition: defs.h:40
Definition: swkey.h:77
virtual char processText(SWBuf &text, const SWKey *key=0, const SWModule *module=0)