The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
osisplain.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * osisplain.cpp - An SWFilter that provides stripping of OSIS tags
4  *
5  * $Id: osisplain.cpp 3623 2019-05-19 02:47:41Z scribe $
6  *
7  * Copyright 2003-2013 CrossWire Bible Society (http://www.crosswire.org)
8  * CrossWire Bible Society
9  * P. O. Box 2528
10  * Tempe, AZ 85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #include <stdlib.h>
24 #include <osisplain.h>
25 #include <ctype.h>
26 #include <versekey.h>
27 #include <stringmgr.h>
28 #include <utilxml.h>
29 #include <swmodule.h>
30 
32 
33 
34 namespace {
35 
// Per-render state for OSISPlain; extends BasicFilterUserData and is created by
// createUserData() below.
// NOTE(review): handleToken() also accesses u->w, u->tag and u->hiType; their
// declarations are not shown in this listing (listing lines 38, 39 and 41 are absent).
36  class MyUserData : public BasicFilterUserData {
37  public:
// Testament number of the key being rendered. createUserData() sets it from
// vkey->getTestament() or falls back to 2 (NT); handleToken() uses it to pick the
// 'G' (testament > 1) or 'H' prefix for lemma values that carry no prefix.
40  char testament;
// Forwards module and key unchanged to the BasicFilterUserData constructor.
42  MyUserData(const SWModule *module, const SWKey *key) : BasicFilterUserData(module, key) {}
43  };
44 }
45 
46 
// NOTE(review): the signature line of this function is absent from this listing;
// presumably this is the body of the OSISPlain constructor -- confirm against the
// original file.
// Tokens are delimited by '<' and '>'.
48  setTokenStart("<");
49  setTokenEnd(">");
50 
// Escape sequences are delimited by '&' and ';'.
51  setEscapeStart("&");
52  setEscapeEnd(";");
53 
55 
// Replace the five predefined XML entities with their literal characters.
56  addEscapeStringSubstitute("amp", "&");
57  addEscapeStringSubstitute("apos", "'");
58  addEscapeStringSubstitute("lt", "<");
59  addEscapeStringSubstitute("gt", ">");
60  addEscapeStringSubstitute("quot", "\"");
61 
// Tokens replaced by a plain newline; handleToken() tries these substitutions
// (via substituteToken) before any of its manual processing.
63  addTokenSubstitute("title", "\n");
64  addTokenSubstitute("/title", "\n");
65  addTokenSubstitute("/l", "\n");
66  addTokenSubstitute("lg", "\n");
67  addTokenSubstitute("/lg", "\n");
68 
70 }
71 
72 
// NOTE(review): the signature line of this function is absent from this listing;
// the body builds and returns the MyUserData instance for the given module and key.
// The object is allocated with new and returned as a raw pointer; where it is
// deleted is not visible here -- presumably the caller owns it, confirm.
74  MyUserData *u = new MyUserData(module, key);
// Use the key's testament when a vkey is available, otherwise assume 2.
75  u->testament = (u->vkey) ? u->vkey->getTestament() : 2; // default to NT
76  return u;
77 }
78 
79 
80 bool OSISPlain::processStage(char stage, SWBuf &text, char *&from, BasicFilterUserData *userData) {
81  // this is a strip filter so we want to do this as optimized as possible. Avoid calling
82  // getUniCharFromUTF8 for slight speed improvement
83 
84  if (stage == PRECHAR) {
85  if ((unsigned)from[0] == 0xC2 && (unsigned)from[1] == 0xAD) return true; // skip soft hyphens
86  }
87  return false;
88 }
89 
90 
91 bool OSISPlain::handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData) {
92  // manually process if it wasn't a simple substitution
93  if (!substituteToken(buf, token)) {
94  MyUserData *u = (MyUserData *)userData;
95  if (((*token == 'w') && (token[1] == ' ')) ||
96  ((*token == '/') && (token[1] == 'w') && (!token[2]))) {
97  u->tag = token;
98 
99  bool start = false;
100  if (*token == 'w') {
101  if (token[strlen(token)-1] != '/') {
102  u->w = token;
103  return true;
104  }
105  start = true;
106  }
107  u->tag = (start) ? token : u->w.c_str();
108  bool show = true; // to handle unplaced article in kjv2003-- temporary till combined
109 
110  SWBuf lastText = (start) ? "stuff" : u->lastTextNode.c_str();
111 
112  const char *attrib;
113  const char *val;
114  if ((attrib = u->tag.getAttribute("xlit"))) {
115  val = strchr(attrib, ':');
116  val = (val) ? (val + 1) : attrib;
117  buf.append(" <");
118  buf.append(val);
119  buf.append('>');
120  }
121  if ((attrib = u->tag.getAttribute("gloss"))) {
122  buf.append(" <");
123  buf.append(attrib);
124  buf.append('>');
125  }
126  if ((attrib = u->tag.getAttribute("lemma"))) {
127  int count = u->tag.getAttributePartCount("lemma", ' ');
128  int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0
129  do {
130  char gh;
131  attrib = u->tag.getAttribute("lemma", i, ' ');
132  if (i < 0) i = 0; // to handle our -1 condition
133  val = strchr(attrib, ':');
134  val = (val) ? (val + 1) : attrib;
135  if ((strchr("GH", *val)) && (isdigit(val[1]))) {
136  gh = *val;
137  val++;
138  }
139  else {
140  gh = (u->testament>1) ? 'G' : 'H';
141  }
142  if ((!strcmp(val, "3588")) && (lastText.length() < 1))
143  show = false;
144  else {
145  buf.append(" <");
146  buf.append(gh);
147  buf.append(val);
148  buf.append(">");
149  }
150  } while (++i < count);
151  }
152  if ((attrib = u->tag.getAttribute("morph")) && (show)) {
153  int count = u->tag.getAttributePartCount("morph", ' ');
154  int i = (count > 1) ? 0 : -1; // -1 for whole value cuz it's faster, but does the same thing as 0
155  do {
156  attrib = u->tag.getAttribute("morph", i, ' ');
157  if (i < 0) i = 0; // to handle our -1 condition
158  val = strchr(attrib, ':');
159  val = (val) ? (val + 1) : attrib;
160  if ((*val == 'T') && (strchr("GH", val[1])) && (isdigit(val[2])))
161  val+=2;
162  buf.append(" (");
163  buf.append(val);
164  buf.append(')');
165  } while (++i < count);
166  }
167  if ((attrib = u->tag.getAttribute("POS"))) {
168  val = strchr(attrib, ':');
169  val = (val) ? (val + 1) : attrib;
170 
171  buf.append(" <");
172  buf.append(val);
173  buf.append('>');
174  }
175  }
176 
177  // <note> tag
178  else if (!strncmp(token, "note", 4)) {
179  if (!strstr(token, "strongsMarkup")) { // leave strong's markup notes out, in the future we'll probably have different option filters to turn different note types on or off
180  buf.append(" [");
181  }
182  else u->suspendTextPassThru = true;
183  if (u->module) {
184  XMLTag tag = token;
185  SWBuf swordFootnote = tag.getAttribute("swordFootnote");
186  SWBuf footnoteBody = u->module->getEntryAttributes()["Footnote"][swordFootnote]["body"];
187  buf.append(u->module->renderText(footnoteBody));
188  }
189  }
190  else if (!strncmp(token, "/note", 5)) {
191  if (!u->suspendTextPassThru)
192  buf.append("] ");
193  else u->suspendTextPassThru = false;
194  }
195 
196  // <p> paragraph tag
197  else if (((*token == 'p') && ((token[1] == ' ') || (!token[1]))) ||
198  ((*token == '/') && (token[1] == 'p') && (!token[2]))) {
199  userData->supressAdjacentWhitespace = true;
200  buf.append('\n');
201  }
202 
203  // Milestoned paragraph, created by osis2mod
204  // <div type="paragraph" sID... />
205  // <div type="paragraph" eID... />
206  else if (!strcmp(u->tag.getName(), "div") && u->tag.getAttribute("type") && (!strcmp(u->tag.getAttribute("type"), "x-p") || !strcmp(u->tag.getAttribute("type"), "paragraph")) &&
207  (u->tag.isEmpty() && (u->tag.getAttribute("sID") || u->tag.getAttribute("eID")))) {
208  userData->supressAdjacentWhitespace = true;
209  buf.append('\n');
210  }
211 
212  // <lb .../>
213  else if (!strncmp(token, "lb", 2)) {
214  userData->supressAdjacentWhitespace = true;
215  buf.append('\n');
216  }
217  else if (!strncmp(token, "l", 1) && strstr(token, "eID")) {
218  userData->supressAdjacentWhitespace = true;
219  buf.append('\n');
220  }
221  else if (!strncmp(token, "/divineName", 11)) {
222  // Get the end portion of the string, and upper case it
223  char* end = buf.getRawData();
224  end += buf.size() - u->lastTextNode.size();
225  toupperstr(end);
226  }
227  else if (!strncmp(token, "hi", 2)) {
228 
229  // handle both OSIS 'type' and TEI 'rend' attributes
230  // there is no officially supported OSIS overline attribute,
231  // thus either TEI overline or OSIS x-overline would be best,
232  // but we have used "ol" in the past, as well. Once a valid
233  // OSIS overline attribute is made available, these should all
234  // eventually be deprecated and never documented that they are supported.
235  if (strstr(token, "rend=\"ol\"") || strstr(token, "rend=\"x-overline\"") || strstr(token, "rend=\"overline\"")
236  || strstr(token, "type=\"ol\"") || strstr(token, "type=\"x-overline\"") || strstr(token, "type=\"overline\"")) {
237  u->hiType = "overline";
238  }
239  else u->hiType = "";
240  u->suspendTextPassThru = true;
241  }
242  else if (!strncmp(token, "/hi", 3)) {
243  if (u->hiType == "overline") {
244  const unsigned char *b = (const unsigned char *)u->lastTextNode.c_str();
245  while (*b) {
246  const unsigned char *o = b;
247  if (getUniCharFromUTF8(&b)) {
248  while (o != b) buf.append(*(o++));
249  buf.append((unsigned char)0xCC);
250  buf.append((unsigned char)0x85);
251  }
252  }
253  }
254  else {
255  buf.append("* ");
256  buf.append(u->lastSuspendSegment);
257  buf.append(" *");
258  }
259  u->suspendTextPassThru = false;
260  }
261 
262  else if ((!strncmp(token, "q", 1) && (u->tag.getAttribute("marker")))) {
263  buf.append(u->tag.getAttribute("marker"));
264  }
265 
266 
267  // <milestone type="line"/>
268  else if (!strncmp(token, "milestone", 9)) {
269  const char* type = strstr(token+10, "type=\"");
270  if (type && strncmp(type+6, "line", 4)) { //we check for type != line
271  userData->supressAdjacentWhitespace = true;
272  buf.append('\n');
273  }
274  if (u->tag.getAttribute("marker")) {
275  buf.append(u->tag.getAttribute("marker"));
276  }
277  }
278 
279  else {
280  return false; // we still didn't handle token
281  }
282  }
283  return true;
284 }
285 
286 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
static const char PRECHAR
Definition: swbasicfilter.h:97
void setTokenEnd(const char *tokenEnd)
Definition: swbuf.h:47
unsigned long length() const
Definition: swbuf.h:197
virtual bool processStage(char stage, SWBuf &text, char *&from, BasicFilterUserData *userData)
Definition: osisplain.cpp:80
SWText * module
Definition: osis2mod.cpp:105
Definition: utilxml.h:38
void setTokenCaseSensitive(bool val)
void setEscapeStart(const char *escStart)
bool substituteToken(SWBuf &buf, const char *token)
void addEscapeStringSubstitute(const char *findString, const char *replaceString)
void setTokenStart(const char *tokenStart)
char * getRawData()
Definition: swbuf.h:379
MyUserData(const SWModule *module, const SWKey *key)
Definition: osisplain.cpp:42
SWBuf & append(const char *str, long max=-1)
Definition: swbuf.h:274
virtual bool handleToken(SWBuf &buf, const char *token, BasicFilterUserData *userData)
Definition: osisplain.cpp:91
unsigned long size() const
Definition: swbuf.h:185
const char * getAttribute(const char *attribName, int partNum=-1, char partSplit= '|') const
Definition: utilxml.cpp:230
virtual void setStageProcessing(char stages)
void setEscapeStringCaseSensitive(bool val)
void addTokenSubstitute(const char *findString, const char *replaceString)
char * toupperstr(char *t, unsigned int max=0)
Definition: stringmgr.h:107
void setEscapeEnd(const char *escEnd)
#define SWORD_NAMESPACE_END
Definition: defs.h:40
Definition: swkey.h:77
virtual BasicFilterUserData * createUserData(const SWModule *module, const SWKey *key)
Definition: osisplain.cpp:73
SW_u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation=false)
Definition: utilstr.h:88