[sword-svn] r98 - in trunk: . textsstats

Fri Sep 7 15:44:35 MST 2007

Author: scribe
Date: 2007-09-07 15:44:35 -0700 (Fri, 07 Sep 2007)
New Revision: 98

Added:
   trunk/textsstats/
   trunk/textsstats/Makefile
   trunk/textsstats/stats.cpp
Log:
Added some basic greek text statistical analysis


Added: trunk/textsstats/Makefile
===================================================================

--- trunk/textsstats/Makefile	                        (rev 0)
+++ trunk/textsstats/Makefile	2007-09-07 22:44:35 UTC (rev 98)
@@ -0,0 +1,10 @@
+# build the text statistics tool(s) against the installed SWORD library (located via pkg-config)
+TARGETS= stats
+.PHONY: all clean
+all: $(TARGETS)
+
+clean:
+	rm -f $(TARGETS)
+
+.cpp:
+	g++ -g `pkg-config --static --cflags sword` $< -o $@ `pkg-config --static --libs sword`

Added: trunk/textsstats/stats.cpp
===================================================================
--- trunk/textsstats/stats.cpp	                        (rev 0)
+++ trunk/textsstats/stats.cpp	2007-09-07 22:44:35 UTC (rev 98)
@@ -0,0 +1,358 @@
+/******************************************************************************
+ *  stats.cpp - Basic statistical analysis of Greek text (character sequence frequencies)
+ *
+ * Copyright 2007 CrossWire Bible Society (http://www.crosswire.org)
+ *	CrossWire Bible Society
+ *	P. O. Box 2528
+ *	Tempe, AZ  85280-2528
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contributors:
+ *	Lyndon Drake <lyndon at arotau dot com>
+ *	Troy A. Griffitts <scribe at crosswire dot org>
+ */
+
+#include <map>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+#include <swmgr.h>
+#include <swbuf.h>
+#include <swmodule.h>
+#include <utf8utf16.h>
+#include <utf16utf8.h>
+#include <versekey.h>
+#include <thmlplain.h>
+
+using namespace sword;
+using namespace std;
+
+namespace {
+	const int GREEK_START = 0x370;	// lowest UTF-16 code unit accepted as part of a sequence
+	const int GREEK_END   = 0x3FF;	// highest accepted code unit (inclusive)
+}	// anonymous namespace
+
+// Holds one KJV translation phrase for a greek/hebrew word, plus any other
+// greek/hebrew words that were combined to make this KJV phrase
+// e.g. hO QEOS = QEOS: [+ hO ]: God
+class Phrase {
+public:
+	Phrase() : phrase("") {}
+	SWBuf phrase;		// the KJV phrase text
+	vector<SWBuf> with;	// the words combined with it
+
+	// three-way comparison: phrase text decides first, then the 'with' lists are
+	// compared entry by entry; if one list is a prefix of the other, the longer one
+	// is the greater. returns negative / zero / positive.
+	int compare(const Phrase &right) const {
+		const int byPhrase = phrase.compare(right.phrase);
+		if (byPhrase != 0) return byPhrase;
+		vector<SWBuf>::const_iterator mine   = with.begin();
+		vector<SWBuf>::const_iterator theirs = right.with.begin();
+		for (; mine != with.end() && theirs != right.with.end(); ++mine, ++theirs) {
+			const int byWord = mine->compare(*theirs);
+			if (byWord != 0) return byWord;
+		}
+		if (mine != with.end()) return 1;
+		return (theirs != right.with.end()) ? -1 : 0;
+	}
+	// relational operators, every one of them phrased in terms of compare()
+	bool operator ==(const Phrase &other) const { return compare(other) == 0; }
+	bool operator !=(const Phrase &other) const { return compare(other) != 0; }
+	bool operator > (const Phrase &other) const { return compare(other) >  0; }
+	bool operator < (const Phrase &other) const { return compare(other) <  0; }
+	bool operator <=(const Phrase &other) const { return compare(other) <= 0; }
+	bool operator >=(const Phrase &other) const { return compare(other) >= 0; }
+};
+
+// KJV phrases and their occurrence frequency
+typedef map<Phrase, int> KJVPhrases;
+
+// primary result class: one counted item and what is known about it
+class Word {
+public:
+	// lexical form of this word in utf8 greek/hebrew
+	SWBuf utf8;
+
+	// the item as a run of UTF-16 code units
+	vector<unsigned short> utf16;
+
+	// strongs number for this word (e.g. G3588)
+	SWBuf strong;
+
+	// frequency of occurrence in the iterated text
+	int freq;
+
+	// definition pulled from short strongs def
+	SWBuf def;
+
+	// kjv translation phrases and their frequencies
+	KJVPhrases kjvFreq;
+
+	// starts out with empty strings and a zero count; the containers default to empty
+	Word()
+		: utf8(""), strong(""), freq(0), def("")
+	{}
+};
+
+
+string itoa(int v) { ostringstream out; out << v; return out.str(); }	// decimal text of v
+
+
+bool compareFreq(const Word &w1, const Word &w2) {	// sort() predicate: higher freq first
+	return w2.freq < w1.freq;
+}
+
+
+bool compareKJVFreq(const KJVPhrases::const_iterator &i1, const KJVPhrases::const_iterator &i2) {	// sort() predicate: larger count first
+	return i2->second < i1->second;
+}
+
+
+// Sort and pretty up all the KJV phrases for a word into one buffer: entries joined by "; ",
+// most frequent first. A capitalized phrase (except "God"/"Lord") is folded into its
+// lowercase-initial twin when one exists. 'in' is by value: folding edits this local copy.
+SWBuf prettyKJVFreq(KJVPhrases in) {
+	vector<KJVPhrases::const_iterator> sorted;
+	for (KJVPhrases::const_iterator it = in.begin(); it != in.end(); ++it) {
+		Phrase k = it->first;
+		// go through unsigned char: handing a negative char to toupper/tolower is undefined
+		const unsigned char first = k.phrase.size() ? (unsigned char)k.phrase[0] : 0;
+		if (first && toupper(first) == first && k.phrase != "God" && k.phrase != "Lord") {
+			k.phrase[0] = (char)tolower(first);
+			if (k != it->first) {
+				KJVPhrases::iterator twin = in.find(k);
+				if (twin != in.end()) {
+					twin->second += it->second;
+					continue;	// don't list us: our freq was added to the lowercase entry
+				}
+			}
+		}
+		sorted.push_back(it);
+	}
+	sort(sorted.begin(), sorted.end(), compareKJVFreq);
+	SWBuf retVal;
+	for (vector<KJVPhrases::const_iterator>::const_iterator it = sorted.begin(); it != sorted.end(); ++it) {
+		const Phrase &p = (*it)->first;
+		if (retVal.size()) retVal += "; ";
+		if (p.with.size()) {	// prepend 'with other strongs' if present
+			retVal += "[+";
+			for (vector<SWBuf>::const_iterator w = p.with.begin(); w != p.with.end(); ++w) retVal.appendFormatted(" %s", w->c_str());
+			retVal += " ] ";
+		}
+		retVal.appendFormatted("%s (%d)", p.phrase.c_str(), (*it)->second);
+	}
+	return retVal;
+}
+
+
+// take utf8 text and spit out equiv. text substituting escaped codes for multibyte chars
+// java .properties files wants this format (flashcard .flash lessons use this format)
+// code units below 128 pass through unchanged; all others become \uXXXX (four upper-case hex digits)
+// inText is taken by value: the converter rewrites that buffer in place
+SWBuf escapedUTF8(SWBuf inText) {
+	static UTF8UTF16 convert;
+	convert.processText(inText);	// inText is read as UTF-16 code units from here on
+	SWBuf retBuf;
+	// NOTE(review): relies on the converted buffer ending in a zero code unit -- confirm against UTF8UTF16
+	for (const unsigned short *unit = (const unsigned short *)inText.getRawData(); *unit; ++unit) {
+		if (*unit < 128) {
+			retBuf += (char)*unit;
+		}
+		else {
+			// %.4X rather than %.4x plus a manual fix-up, so all four hex digits come out upper case
+			retBuf.appendFormatted("\\u%.4X", *unit);
+		}
+	}
+	return retBuf;
+}
+
+SWBuf toUTF8(const vector<unsigned short> &utf16) {	// UTF-16 code units in, converted SWBuf out
+	static UTF16UTF8 convert;
+	SWBuf retVal;
+	// two bytes for every code unit plus two more for the terminating zero unit
+	retVal.size((utf16.size()+1)*2);
+	unsigned short *out = (unsigned short *)retVal.getRawData();
+	for (vector<unsigned short>::const_iterator unit = utf16.begin(); unit != utf16.end(); ++unit) {
+		*out++ = *unit;
+	}
+	*out = 0;
+	convert.processText(retVal);	// converter works on retVal itself
+	return retVal;
+}
+
+// output a simple CSV ('|' separated really) format for importing into OOo or excel
+// one row per entry: freq|sequence text|number of UTF-16 code units; a blank line ends the list
+void outputCSV(const vector<Word> &seqList) {
+	for (vector<Word>::size_type row = 0; row < seqList.size(); ++row) {
+		const Word &entry = seqList[row];
+		cout << entry.freq << "|" << toUTF8(entry.utf16).c_str() << "|" << entry.utf16.size() << "\n";
+	}
+	std::cout << std::endl;
+}
+
+
+/**
+ * output our flashcard .flash file format
+ *
+ * seqList - the words to write, in lesson order
+ * outputDir - directory path where to write files, e.g. "./hebFreq"
+ * kjvFreq - if true, process KJV translation frequencies and use these as
+ *		the word answers; otherwise, use short strongs defs.
+ * maxPerLesson - maximum number of words per lesson
+ *
+ * Entries lacking a word or an answer are skipped; if a lesson file cannot
+ * be opened, a message goes to cerr and output stops.
+ */
+void outputFlash(const vector<Word> &seqList, const char *outputDir = ".", bool kjvFreq = true, int maxPerLesson = 25) {
+	ThMLPlain strip;
+	ofstream ofile;
+	int wordCount    = 0;
+	int lessonNumber = 0;
+	int startFreq    = 0;
+	int lastFreq     = 0;
+
+	vector<Word>::const_iterator it = seqList.begin();
+	while (it != seqList.end()) {
+		const Word &w = (*it);
+		// key on the stream, not on wordCount: a skipped word leaves wordCount at 0, and
+		// calling open() on an already open ofstream fails and silences every later write
+		if (!ofile.is_open()) {
+			SWBuf fname = outputDir;
+			fname += "/lesson";
+			fname.appendFormatted("%d", lessonNumber);
+			fname += ".flash";
+			ofile.open(fname);
+			if (!ofile) { cerr << "Unable to open " << fname.c_str() << " for writing" << endl; return; }
+		}
+
+		SWBuf word = w.utf8;
+		word.trim();
+		SWBuf answers = "";
+		if (kjvFreq) {		// answers as KJV phrases, cut off at 200 bytes
+			answers = prettyKJVFreq(w.kjvFreq);
+			if (answers.size() > 200) answers.size(200);
+		}
+		else {			// or, if we would rather have short strongs
+			answers = w.def;
+			strip.processText(answers);	// remove html tags
+			answers.replaceBytes("\n\r", ' ');	// remove newlines
+		}
+
+		// be sure we have both a word and an answer
+		if (word.size() && answers.size()) {
+			if (!wordCount) startFreq = w.freq;	// first word actually written to this lesson
+			ofile << "word" << wordCount << "=" << escapedUTF8(word) << "\n";
+			ofile << "answers" << wordCount << "=" << answers << "\n";
+			lastFreq = w.freq;
+			wordCount++;
+		}
+		it++;
+		if (it == seqList.end() || wordCount >= maxPerLesson) {
+			// close lesson
+			SWBuf lessonTitle = "";
+			lessonTitle.appendFormatted("lessonTitle=%.3d Freqs. %d-%d\n", lessonNumber, startFreq, lastFreq);
+			ofile << lessonTitle;
+			ofile << "wordCount=" << wordCount << "\n";
+			ofile.close();
+			wordCount = 0;
+			lessonNumber++;
+		}
+	}
+}
+
+
+/**
+ * do the work
+ *
+ * Counts every run of seqLength consecutive code units in GREEK_START..GREEK_END
+ * found in the WHNU module text, after setting the "Greek Accents" option to Off.
+ *
+ * range - the range of verses to process (e.g. "mat-rev")
+ * seqLength - number of UTF-16 code units per counted sequence; values
+ *		below 1 yield an empty result
+ *
+ * returns one Word per distinct sequence (utf16 and freq filled in), most frequent first
+ */
+vector<Word> processSequences(const char *range, int seqLength) {
+	vector<Word> sorted;
+	if (seqLength < 1) return sorted;	// nothing meaningful to count
+
+	SWMgr manager;
+	manager.setGlobalOption("Greek Accents", "Off");
+	UTF8UTF16 toUTF16;
+
+	map<vector<unsigned short>, Word> seqList;
+
+	SWModule *tmpBible = manager.getModule("WHNU");
+	if (!tmpBible) {
+		cerr << "Unable to locate WHNU module" << endl;
+		exit(1);
+	}
+	SWModule &bible = *tmpBible;
+
+	VerseKey parser;
+	ListKey r = parser.ParseVerseList(range, 0, true);
+	r.Persist(true);
+	bible.setKey(r);
+	for (bible = TOP; !bible.Error(); bible++) {
+		bible.RenderText();		// force an entry lookup to resolve key to something in the index
+		SWBuf text = bible.StripText();
+		toUTF16.processText(text);
+		for (unsigned short *i = (unsigned short *)text.getRawData(); *i; i++) {
+			vector<unsigned short> seq;
+			int j;
+			for (j = 0; ((j < seqLength) && (i[j] >= GREEK_START) && (i[j] <= GREEK_END)); j++) {
+				seq.push_back(i[j]);
+			}
+			if (seq.size() == (vector<unsigned short>::size_type)seqLength) {
+				seqList[seq].freq++;
+			}
+			else if (!i[j]) {
+				// we don't need to process the rest of this text as all remaining seq lengths will fail
+				break;
+			}
+		}
+	}
+
+	for (map<vector<unsigned short>, Word>::iterator it = seqList.begin(); it != seqList.end(); ++it) {
+		it->second.utf16 = it->first;	// pull utf16 key from map and populate Word
+		sorted.push_back(it->second);	// put only word in sorted container
+	}
+	sort(sorted.begin(), sorted.end(), compareFreq);
+	return sorted;
+}
+
+
+// usage: stats [minLength [maxLength [range]]] -- prints sequence frequencies as '|'-separated rows
+int main(int argc, char **argv)
+{
+	int minLength = 1;
+	int maxLength = 3;
+	const char *range = "mat-rev";	// const: a string literal must not bind to plain char *
+
+	if (argc > 1) minLength = atoi(argv[1]);
+	if (argc > 2) maxLength = atoi(argv[2]);
+	if (argc > 3) range = argv[3];
+	if (minLength < 1) minLength = 1;	// a sequence needs at least one code unit
+
+	vector<Word> results;
+	for (int i = minLength; i <= maxLength; i++) {
+		vector<Word> pass = processSequences(range, i);
+		results.insert(results.end(), pass.begin(), pass.end());
+	}
+	sort(results.begin(), results.end(), compareFreq);
+	outputCSV(results);
+	return 0;
+}
+