[sword-svn] r3401 - trunk/utilities

dmsmith at crosswire.org dmsmith at crosswire.org
Sat Feb 6 10:12:20 MST 2016


Author: dmsmith
Date: 2016-02-06 10:12:20 -0700 (Sat, 06 Feb 2016)
New Revision: 3401

Modified:
   trunk/utilities/osis2mod.cpp
Log:
Added entity handling. div type='colophon' is no longer converted to milestones.

Modified: trunk/utilities/osis2mod.cpp
===================================================================
--- trunk/utilities/osis2mod.cpp	2016-02-06 16:41:53 UTC (rev 3400)
+++ trunk/utilities/osis2mod.cpp	2016-02-06 17:12:20 UTC (rev 3401)
@@ -1213,6 +1213,7 @@
 	static std::stack<XMLTag> bspTagStack;
 	static int sID = 1;
 	char buf[11];
+	SWBuf typeAttr = t.getAttribute("type");
 
 	// Support simplification transformations
 	if (t.isEmpty()) {
@@ -1240,9 +1241,10 @@
 		//   abbr       When would this ever cross a boundary?
 		//   seg        as it is used for a divineName hack
 		//   foreign    so that it can be easily italicized
+		//   div type="colophon" so that it can be treated as a block
 		else if (tagName == "chapter" ||
 			 tagName == "closer"  ||
-			 tagName == "div"     ||
+			 (tagName == "div" && typeAttr != "colophon") ||
 			 tagName == "l"       ||
 			 tagName == "lg"      ||
 			 tagName == "q"       ||
@@ -1272,11 +1274,13 @@
 			}
 
 			bspTagStack.pop();
+			SWBuf topTypeAttr = topToken.getAttribute("type");
 
 			// Look for the milestoneable container tags handled above.
+			// Have to treat div type="colophon" differently
 			if (tagName == "chapter" ||
 			    tagName == "closer"  ||
-			    tagName == "div"     ||
+			    (tagName == "div" && topTypeAttr != "colophon") ||
 			    tagName == "l"       ||
 			    tagName == "lg"      ||
 			    tagName == "p"       ||
@@ -1429,6 +1433,14 @@
 		CS_SEEN_ENDING_GREATER_THAN
 	} t_commentstate;
 
+	typedef enum {  // Classification of the entity currently being scanned after an '&'
+		ET_NUM,  // '#' followed the '&'; only digits 0-9 are accepted afterwards. NOTE(review): the 'x' of an XML hex reference (&#xHH;) is therefore rejected here -- confirm this is intended
+		ET_HEX,  // 'x' or 'X' followed the '&'; switches to ET_CHAR if a letter outside A-F/a-f appears
+		ET_CHAR,  // named-entity candidate consisting of [A-Za-z0-9]
+		ET_NONE,  // initial value, and reset whenever an '&' starts a new entity; nothing classified yet
+		ET_ERR  // a character not allowed for the current type ended the entity before ';'
+	} t_entitytype;
+
 	activeOsisID[0] = '\0';
 
 	strcpy(currentOsisID,"N/A");
@@ -1449,6 +1461,13 @@
 	bool inWhitespace = false;
 	bool seeingSpace = false;
 	unsigned char curChar = '\0';
+	SWBuf entityToken;
+	bool inentity = false;
+	t_entitytype entitytype = ET_NONE;
+	unsigned char attrQuoteChar = '\0';
+	bool inattribute = false;
+	unsigned int linePos = 1;
+	unsigned int charPos = 0;
 
 	while (infile.good()) {
 
@@ -1465,16 +1484,221 @@
 		// Does a SWORD module actually require this?
 		if (curChar == '\n') {
 			curChar = ' ';
+			charPos = 0;
+			linePos++;
 		}
+		charPos++;
 
+		// Look for entities:
+		// These are of the form &#dddd;, &xHHHH; or &llll;
+		// where dddd is a sequence of digits
+		//       HHHH is a sequence of [A-Fa-f0-9]
+		//       llll is amp, lt, gt, quot or apos
+		//            but we will look for a sequence of [A-Za-z0-9]
+		// All but &amp;, &lt;, &gt;, &quot;, &apos; will produce a WARNING
+		// In the future:
+		//    &#dddd; and &xHHHH; should be converted to UTF-8,
+		//        with a WARNING if the text is not UTF-8
+		//    &llll; other than the xml standard 5 should produce a WARNING
+
+		// For entity diagnostics track whether the text is an attribute value
+		if (inattribute && (curChar == '\'' || curChar == '"')) {
+			if (attrQuoteChar == curChar) {
+				inattribute = false;
+				attrQuoteChar = '\0';
+			}
+			else {
+				attrQuoteChar = curChar;
+			}
+		}
+		if (intoken && curChar == '=') {
+			inattribute = true;
+			attrQuoteChar = '\0';
+		}
+
+		if (!inentity && curChar == '&') {
+			inentity = true;
+			entitytype = ET_NONE;
+			entityToken = "&";
+			continue;
+		}
+
+		if (inentity) {
+			if (curChar == ';') {
+				inentity = false;
+			}
+			else {
+				switch (entitytype) {
+				    case ET_NONE:
+					// A hex entity cannot start with X in XML, but it can in HTML
+					// Allow for it here and complain later
+					if (curChar == 'x' || curChar == 'X') {
+						entitytype = ET_HEX;
+					}
+					else
+					if (curChar == '#') {
+						entitytype = ET_NUM;
+					}
+					else
+					if ((curChar >= 'A' && curChar <= 'Z') ||
+					    (curChar >= 'a' && curChar <= 'z') ||
+					    (curChar >= '0' && curChar <= '9')) {
+						entitytype = ET_CHAR;
+					}
+					else {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+
+				    case ET_NUM :
+					if (!(curChar >= '0' && curChar <= '9')) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    case ET_HEX :
+					if ((curChar >= 'G' && curChar <= 'Z') ||
+					    (curChar >= 'g' && curChar <= 'z')) {
+						// Starts out as a HEX entity, but it isn't one
+						entitytype = ET_CHAR;
+					}
+					else
+					if (!((curChar >= 'A' && curChar <= 'F') ||
+					      (curChar >= 'a' && curChar <= 'f') ||
+					      (curChar >= '0' && curChar <= '9'))) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    case ET_CHAR :
+					if (!((curChar >= 'A' && curChar <= 'Z') ||
+					      (curChar >= 'a' && curChar <= 'z') ||
+					      (curChar >= '0' && curChar <= '9'))) {
+						inentity = false;
+						entitytype = ET_ERR;
+					}
+					break;
+				    default:
+					cout << "FATAL(ENTITY): unknown entitytype on entity end: " << entitytype << endl;
+					exit(EXIT_BAD_NESTING);
+				}
+			}
+
+			if (entitytype != ET_ERR) {
+				entityToken.append((char) curChar);
+			}
+
+			// If inentity is now false, scanning of this entity has just ended:
+			// either curChar is ';' (a complete entity, though possibly not a valid one)
+			// or an illegal character was seen and entitytype is ET_ERR.
+			if (!inentity) {
+				switch (entitytype) {
+				    case ET_ERR :
+					// Remove the leading &
+					entityToken << 1;
+					cout << "WARNING(PARSE): malformed entity, replacing &" << entityToken << " with &amp;" << entityToken << endl;
+					if (intoken) {
+						token.append("&amp;");
+						token.append(entityToken);
+					}
+					else {
+						text.append("&amp;");
+						text.append(entityToken);
+					}
+					break;
+				    case ET_HEX :
+					if (entityToken[1] != 'x') {
+						cout << "WARNING(PARSE): HEX entity must begin with &x, found " << entityToken << endl;
+					}
+					else {
+						cout << "WARNING(PARSE): SWORD does not search HEX entities, found " << entityToken << endl;
+					}
+					break;
+				    case ET_CHAR :
+					if (strcmp(entityToken, "&amp;")  &&
+				            strcmp(entityToken, "&lt;")   &&
+				            strcmp(entityToken, "&gt;")   &&
+				            strcmp(entityToken, "&quot;") &&
+				            strcmp(entityToken, "&apos;")) {
+						cout << "WARNING(PARSE): XML only supports 5 Character entities &amp;, &lt;, &gt;, &quot; and &apos;, found " << entityToken << endl;
+					}
+					else
+					if (!strcmp(entityToken, "&apos;")) {
+						cout << "WARNING(PARSE): While valid for XML, XHTML does not support &apos;." << endl;
+						if (!inattribute) {
+							cout << "WARNING(PARSE): &apos; is unnecessary outside of attribute values. Replacing with '. " << endl;
+							entityToken = "'";
+						}
+						else {
+							switch (attrQuoteChar) {
+							    case '"' :
+								cout << "WARNING(PARSE): &apos; is unnecessary inside double quoted attribute values. Replacing with '. " << endl;
+								entityToken = "'";
+								break;
+							    case '\'' :
+								cout << "WARNING(PARSE): &apos; is only needed within single quoted attribute values. Considering using double quoted attribute and replacing with '." << endl;
+								break;
+							}
+						}
+					}
+					else
+					if (!strcmp(entityToken, "&quot;")) {
+						cout << "WARNING(PARSE): While valid for XML, &quot; is only needed within double quoted attribute values" << endl;
+						if (!inattribute) {
+							cout << "WARNING(PARSE): &quot; is unnecessary outside of attribute values. Replace with \"." << endl;
+							entityToken = "\"";
+						}
+						else {
+							switch (attrQuoteChar) {
+							    case '"' :
+								cout << "WARNING(PARSE): &quot; is only needed within double quoted attribute values. Considering using single quoted attribute and replacing with \"." << endl;
+								break;
+							    case '\'' :
+								cout << "WARNING(PARSE): &quot; is unnecessary inside single quoted attribute values. Replace with \"." << endl;
+								entityToken = "\"";
+								break;
+							}
+						}
+					}
+					break;
+				    case ET_NUM :
+					cout << "WARNING(PARSE): SWORD does not search numeric entities, found " << entityToken << endl;
+					break;
+				    case ET_NONE :
+				    default:
+					break;
+				}
+
+				// Put the entity into the stream.
+				if (intoken) {
+					token.append(entityToken);
+				}
+				else {
+					text.append(entityToken);
+				}
+
+				if (curChar == ';') {
+					// The character was handled, so go get the next one.
+					continue;
+				}
+			}
+			else {
+				// The character was handled, so go get the next one.
+				continue;
+			}
+		}
+
+
 		if (!intoken && curChar == '<') {
 			intoken = true;
 			token = "<";
+			inattribute = false;
+			attrQuoteChar = '\0';
 			continue;
 		}
 
 		// Handle XML comments starting with "<!--", ending with "-->"
-
 		if (intoken && !incomment) {
 			switch (commentstate) {
 				case CS_NOT_IN_COMMENT :




More information about the sword-cvs mailing list