[sword-cvs] sword/utilities osis2mod.cpp,NONE,1.1 Makefile.am,1.15,1.16

sword@www.crosswire.org sword@www.crosswire.org
Mon, 26 May 2003 11:57:30 -0700


Update of /usr/local/cvsroot/sword/utilities
In directory www:/tmp/cvs-serv26339/utilities

Modified Files:
	Makefile.am 
Added Files:
	osis2mod.cpp 
Log Message:
	Added first cut of osis2mod using the ESV as a
		basic template.  Works with VerseKey mods
		only and doesn't capture all data.



--- NEW FILE: osis2mod.cpp ---
#include <ctype.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>

#ifndef __GNUC__
#include <io.h>
#else
#include <unistd.h>
#endif

#include <swmgr.h>
#include <rawtext.h>
#include <iostream>
#include <swbuf.h>
#include <utilxml.h>

#ifndef O_BINARY
#define O_BINARY 0
#endif

#ifndef NO_SWORD_NAMESPACE
using namespace sword;
#endif

using namespace std;

/**
 * Reads the next line from fd into a freshly allocated, NUL-terminated buffer.
 *
 * Leading carriage returns, spaces and tabs are skipped, the terminating
 * line feed is consumed but not stored, and trailing CR / LF / space / tab
 * characters are trimmed.  A final line that is not terminated by a line
 * feed is returned in full.
 *
 * The line is located by scanning forward one byte at a time and then
 * seeking back to re-read it, so fd must be seekable.
 *
 * @param fd   open file descriptor positioned at the start of a line
 * @param buf  in/out: any buffer previously handed out through *buf is
 *             freed and replaced by a new[]-allocated one (never null on
 *             return); the caller releases the last one with delete []
 * @return 0 when a line (possibly empty) was read; non-zero once no further
 *         line is available (end of file, read error, or unseekable fd)
 */
char readline(int fd, char **buf) {
	delete [] *buf;		// deleting a null pointer is a no-op
	*buf = 0;

	char ch = 0;		// initialised so an immediate EOF never tests garbage
	int len;

	long index = lseek(fd, 0, SEEK_CUR);
	if (index < 0) {
		// not seekable: the scan-then-reread approach below cannot work
		*buf = new char [ 1 ];
		**buf = 0;
		return 1;
	}

	// clean up any preceding white space
	while ((len = read(fd, &ch, 1)) == 1) {
		if ((ch != 13) && (ch != ' ') && (ch != '\t'))
			break;
		index++;
	}

	// scan forward to the line feed ending this line (or to EOF / error)
	while ((len == 1) && (ch != 10))
		len = read(fd, &ch, 1);

	// Only a line that really ended in a line feed has a terminator to drop;
	// a line cut short by EOF keeps every byte that was scanned.
	const bool terminated = (len == 1);
	long size = lseek(fd, 0, SEEK_CUR) - index;
	if (terminated)
		size--;
	if (size < 0)
		size = 0;

	*buf = new char [ size + 1 ];
	**buf = 0;

	if (size > 0) {
		lseek(fd, index, SEEK_SET);
		int got = read(fd, *buf, size);
		if (got < 0)
			got = 0;
		(*buf)[got] = 0;
		if (terminated)
			read(fd, &ch, 1);   //pop terminating char

		// clean up any trailing junk on buf
		if (got > 0) {
			for (char *it = *buf + (got - 1); it > *buf; it--) {
				if ((*it != 10) && (*it != 13) && (*it != ' ') && (*it != '\t'))
					break;
				*it = 0;
			}
		}
	}

	// Done only when the input ran out (or failed) without yielding any data.
	return (!terminated && (size == 0)) ? 1 : 0;
}


bool isKJVRef(const char *buf) {
	VerseKey vk, test;
	vk.AutoNormalize(0);
	vk.Headings(1);	// turn on mod/testmnt/book/chap headings
	vk.Persist(1);
	// lets do some tests on the verse --------------
	vk = buf;
	test = buf;

	if (vk.Testament() && vk.Book() && vk.Chapter() && vk.Verse()) { // if we're not a heading
//		std::cerr << (const char*)vk << " == "  << (const char*)test << std::endl;
		return (vk == test);
	}
	else return true;	// no check if we're a heading... Probably bad.
}


/**
 * Dumps one entry to standard output: a "Verse: " line carrying the key,
 * then a "TEXT: " line carrying the text, followed by a blank line.
 *
 * @param key   key of the entry being reported
 * @param text  text collected for that key
 */
void writeEntry(VerseKey &key, SWBuf &text) {
	cout << "Verse: " << key << "\n"
	     << "TEXT: " << text << "\n\n";
}


bool handleToken(SWBuf &text, XMLTag token) {
	static VerseKey currentVerse;
	static bool inHeader = false;
	static SWBuf headerType = "";
	static SWBuf header = "";
	static SWBuf lastTitle = "";
	static int titleOffset = -1;

	currentVerse.Headings(0);
	currentVerse.AutoNormalize(0);

	if ((!strcmp(token.getName(), "title")) && (!token.isEndTag())) {
		titleOffset = text.length();
		return false;
	}
	if ((!strcmp(token.getName(), "title")) && (token.isEndTag())) {
		lastTitle = (text.c_str() + titleOffset);
		lastTitle += token;
		return false;
	}
	if ((!strcmp(token.getName(), "div")) && (!token.isEndTag()) && (token.getAttribute("osisID"))) {
		if (!strcmp(token.getAttribute("type"), "book")) {
			if (inHeader) {	// this one should never happen, but just in case
				cout << "HEADING ";
				writeEntry(currentVerse, text);
				inHeader = false;
			}
			currentVerse = token.getAttribute("osisID");
			currentVerse.Chapter(0);
			currentVerse.Verse(0);
			inHeader = true;
			headerType = "book";
			lastTitle = "";
			text = "";
		}
		if (!strcmp(token.getAttribute("type"), "chapter")) {
			if (inHeader) {
				cout << "HEADING ";
				writeEntry(currentVerse, text);
				inHeader = false;
			}

			currentVerse = token.getAttribute("osisID");
			currentVerse.Verse(0);
			inHeader = true;
			headerType = "chap";
			lastTitle = "";
			text = "";
		}
	}
	if ((!strcmp(token.getName(), "verse")) && (!token.isEndTag())) {
		if (inHeader) {
			cout << "HEADING ";
			writeEntry(currentVerse, text);
			inHeader = false;
		}

		currentVerse = token.getAttribute("osisID");
		text = "";
		return true;
	}
	if ((!strcmp(token.getName(), "verse")) && (token.isEndTag())) {
		if (lastTitle.length()) {
			SWBuf titleHead = lastTitle;
			char *end = strchr(lastTitle.getRawData(), '>');
			titleHead.setSize((end - lastTitle.getRawData())+1);
			XMLTag titleTag(titleHead);
			titleTag.setAttribute("type", "section");
			titleTag.setAttribute("subtype", "x-preverse");
			text = SWBuf(titleTag) + SWBuf(end+1) + text;
		}
		writeEntry(currentVerse, text);
		lastTitle = "";
		text = "";
		return true;
	}
	return false;
}




/**
 * Entry point: osis2mod <path/to/mod/files> <osisDoc> [0|1]
 *
 * Opens the OSIS document, splits it into tags and character data, and
 * passes every tag to handleToken(); tags that handleToken() does not
 * consume are kept inline in the accumulated text.
 *
 * Exit codes: -1 bad usage, -2 input file cannot be opened, -3 module
 * creation failed, 0 otherwise.
 */
int main(int argc, char **argv) {

	// Let's test our command line arguments
	if (argc < 3) {
		fprintf(stderr, "usage: %s <path/to/mod/files> <osisDoc> [0|1 - create|augment module]\n\n", argv[0]);
		exit(-1);
	}


	if ((argc>3)&&(strcmp(argv[3], "1"))) {	// != 1 then create module
	// Try to initialize a default set of datafiles and indices at our
	// datapath location passed to us from the user.
		if (RawText::createModule(argv[1])) {
			fprintf(stderr, "error: %s: couldn't create module at path: %s \n", argv[0], argv[1]);
			exit(-3);
		}
		// NOTE(review): create mode stops here without importing <osisDoc>;
		// confirm that this is intended.
		exit(0);
	}

	// Let's see if we can open our input file
	int fd = open(argv[2], O_RDONLY|O_BINARY);
	if (fd < 0) {
		fprintf(stderr, "error: %s: couldn't open input file: %s \n", argv[0], argv[2]);
		exit(-2);
	}

	// Do some initialization stuff
	char *buffer = 0;
	RawText mod(argv[1]);	// open our datapath with our RawText driver.
	VerseKey vk;
	vk.AutoNormalize(0);
	vk.Headings(1);	// turn on mod/testmnt/book/chap headings
	vk.Persist(1);

	mod.setKey(vk);

	mod = TOP;
	  
	int successive = 0;  //part of hack below

	char *from;
	SWBuf token;
	SWBuf text;
	bool intoken = false;

	while (!readline(fd, &buffer)) {
		for (from = buffer; *from; from++) {
			if (*from == '<') {
				intoken = true;
				token = "<";
				continue;
			}

			// A '>' only closes a tag we are actually inside of; a literal
			// '>' in character data is legal XML and stays in the text.
			if ((*from == '>') && intoken) {
				intoken = false;
				token += ">";
				if (!handleToken(text, token.c_str())) {
					text += token;
				}
				continue;
			}

			if (intoken)
				token += *from;
			else	text += *from;
		}

		// readline() drops the line break together with the white space
		// around it; put a single blank back so that words and tag
		// attributes on adjacent lines are not glued together.
		if (*buffer) {
			if (intoken)
				token += ' ';
			else if (text.length())
				text += ' ';
		}
	}
	// clear up our buffer that readline might have allocated
	delete [] buffer;

	close(fd);
	return 0;
}



		
/*
		string verseText = "";

		// chapter number
		if (!strncmp("$$$ ", buffer, 4)) {
			buffer[7] = 0;
			chapter = atoi(buffer+4);
			continue;
		}
		// header
		if (!strncmp("<TD COLSPAN=4 VALIGN=TOP><FONT SIZE=2><B>", buffer, 41)) {
			char *end = strstr(buffer+41, "</B>");
			*end = 0;
			header = buffer+41;
			continue;
		}
		// verse number
		if (!strncmp("<TD VALIGN=TOP ALIGN=RIGHT WIDTH=12><FONT SIZE=2 COLOR=RED><B><SUP>", buffer, 67)) {
			char *end = strstr(buffer+67, "</SUP>");
			*end = 0;
			verse = atoi(buffer+67);
			continue;
		}
		// Actual verse data
		if (!strncmp("<TD VALIGN=TOP><FONT SIZE=2>", buffer, 28)) {
			char *end = strstr(buffer+28, "</FONT>");
			*end = 0;
		}
		// extra
		else {
			continue;
		}

		verseText = buffer + 28;

		if (header.length()) {
			verseText = "<title type=\"section\" subtype=\"x-preverse\">" + header + "</title>" + verseText;
			header = "";
		}

		string vsbuf = argv[3];
		sprintf(tmpBuf, "%i", chapter);
		vsbuf += ((string)" ") + tmpBuf;
		sprintf(tmpBuf, "%i", verse);
		vsbuf += ((string)":") + tmpBuf;
		vk = vsbuf.c_str();
		if (vk.Error()) {
			std::cerr << "Error parsing key: " << vsbuf << "\n";
			exit(-5);
		}
		string orig = mod.getRawEntry();

		if (!isKJVRef(vsbuf.c_str())) {
			VerseKey origVK = vk;
//	This block is functioning improperly -- problem with AutoNormalize???
//			do {
//				vk--;
//			}
//			while (!vk.Error() && !isKJVRef(vk));

			//hack to replace above:
			successive++;
			vk -= successive;
			orig = mod.getRawEntry();

			std::cerr << "Not a valid KJV ref: " << origVK << "\n";
			std::cerr << "appending to ref: " << vk << "\n";
			orig += " [ (";
			orig += origVK;
			orig += ") ";
			orig += verseText;
			orig += " ] ";
			verseText = orig.c_str();
		}
		else {
		  successive = 0;
		}

		if (orig.length() > 1)
			   std::cerr << "Warning, overwriting verse: " << vk << std::endl;
		  
		// ------------- End verse tests -----------------
		std::cout << "adding "<< vk << "\n";
		mod << verseText.c_str();	// save text to module at current position
	}

*/

Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/sword/utilities/Makefile.am,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** Makefile.am	13 Feb 2003 22:27:53 -0000	1.15
--- Makefile.am	26 May 2003 18:57:28 -0000	1.16
***************
*** 5,9 ****
  noinst_PROGRAMS = cipherraw ciphertest ciphertest2 lexdump mkfastmod \
  mod2vpl vpl2mod stepdump step2vpl mod2zmod gbfidx modwrite addvs addld emptyvss \
! txt2sword addgb imp2gbs imp2ld imp2vs mod2imp thml2gbs mod2osis xml2gbs
  
  if ICU
--- 5,9 ----
  noinst_PROGRAMS = cipherraw ciphertest ciphertest2 lexdump mkfastmod \
  mod2vpl vpl2mod stepdump step2vpl mod2zmod gbfidx modwrite addvs addld emptyvss \
! txt2sword addgb imp2gbs imp2ld imp2vs mod2imp thml2gbs mod2osis xml2gbs osis2mod
  
  if ICU
***************
*** 43,44 ****
--- 43,45 ----
  mod2osis_SOURCES = mod2osis.cpp
  xml2gbs_SOURCES = xml2gbs.cpp
+ osis2mod_SOURCES = osis2mod.cpp