The SWORD Project  1.9.0.svnversion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
lzsscomprs.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * lzssomprs.cpp - LZSSCompress: a driver class that provides LZSS
4  * compression
5  *
6  * $Id: lzsscomprs.cpp 3785 2020-08-30 11:12:34Z scribe $
7  *
8  * Copyright 1996-2013 CrossWire Bible Society (http://www.crosswire.org)
9  * CrossWire Bible Society
10  * P. O. Box 2528
11  * Tempe, AZ 85280-2528
12  *
13  * This program is free software; you can redistribute it and/or modify it
14  * under the terms of the GNU General Public License as published by the
15  * Free Software Foundation version 2.
16  *
17  * This program is distributed in the hope that it will be useful, but
18  * WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * General Public License for more details.
21  *
22  */
23 
24 #include <stdlib.h>
25 #include <string.h>
26 #include <lzsscomprs.h>
27 
28 // The following are constant sizes used by the compression algorithm.
29 //
30 // N - This is the size of the ring buffer. It is set
31 // to 4K. It is important to note that a position
32 // within the ring buffer requires 12 bits.
33 //
34 // F - This is the maximum length of a character sequence
35 // that can be taken from the ring buffer. It is set
36 // to 18. Note that a length must be 3 before it is
37 // worthwhile to store a position/length pair, so the
38 // length can be encoded in only 4 bits. Or, put yet
39 // another way, it is not necessary to encode a length
40 // of 0-18, it is necessary to encode a length of
41 // 3-18, which requires 4 bits.
42 //
43 // THRESHOLD - It takes 2 bytes to store an offset and
44 // a length. If a character sequence only
45 // requires 1 or 2 characters to store
46 // uncompressed, then it is better to store
47 // it uncompressed than as an offset into
48 // the ring buffer.
49 //
50 // Note that the 12 bits used to store the position and the 4 bits
51 // used to store the length equal a total of 16 bits, or 2 bytes.
52 
53 #define N 4096
54 #define F 18
55 #define THRESHOLD 3
56 #define NOT_USED N
57 
58 
60 
62 public:
63  static unsigned char m_ring_buffer[N + F - 1];
64  static short int m_match_position;
65  static short int m_match_length;
66  static short int m_lson[N + 1];
67  static short int m_rson[N + 257];
68  static short int m_dad[N + 1];
69  void InitTree();
70  void InsertNode(short int Pos);
71  void DeleteNode(short int Node);
72 };
73 
74 /******************************************************************************
75  * LZSSCompress Statics
76  */
77 
78 // m_ring_buffer is a text buffer. It contains "nodes" of
79 // uncompressed text that can be indexed by position. That is,
80 // a substring of the ring buffer can be indexed by a position
81 // and a length. When decoding, the compressed text may contain
82 // a position in the ring buffer and a count of the number of
83 // bytes from the ring buffer that are to be moved into the
84 // uncompressed buffer.
85 //
86 // This ring buffer is not maintained as part of the compressed
87 // text. Instead, it is reconstructed dynamically. That is,
88 // it starts out empty and gets built as the text is decompressed.
89 //
90 // The ring buffer contain N bytes, with an additional F - 1 bytes
91 // to facilitate string comparison.
92 
93 unsigned char LZSSCompress::Private::m_ring_buffer[N + F - 1];
94 
95 // m_match_position and m_match_length are set by InsertNode().
96 //
97 // These variables indicate the position in the ring buffer
98 // and the number of characters at that position that match
99 // a given string.
100 
103 
104 // m_lson, m_rson, and m_dad are the Japanese way of referring to
105 // a tree structure. The dad is the parent and it has a right and
106 // left son (child).
107 //
108 // For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right
109 // and left children of node i.
110 //
111 // For i = 0 to N-1, m_dad[i] is the parent of node i.
112 //
113 // For i = 0 to 255, rson[N + i + 1] is the root of the tree for
114 // strings that begin with the character i. Note that this requires
115 // one byte characters.
116 //
117 // These nodes store values of 0...(N-1). Memory requirements
118 // can be reduces by using 2-byte integers instead of full 4-byte
119 // integers (for 32-bit applications). Therefore, these are
120 // defined as "short ints."
121 
122 short int LZSSCompress::Private::m_lson[N + 1];
123 short int LZSSCompress::Private::m_rson[N + 257];
124 short int LZSSCompress::Private::m_dad[N + 1];
125 
126 
127 /******************************************************************************
128  * LZSSCompress Constructor - Initializes data for instance of LZSSCompress
129  *
130  */
131 
133  p = new Private();
134 }
135 
136 
137 /******************************************************************************
138  * LZSSCompress Destructor - Cleans up instance of LZSSCompress
139  */
140 
142  delete p;
143 }
144 
145 
146 /******************************************************************************
147  * LZSSCompress::InitTree - This function initializes the tree nodes to
148  * "empty" states.
149  */
150 
152  int i;
153 
154  // For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right
155  // and left children of node i. These nodes need not be
156  // initialized. However, for debugging purposes, it is nice to
157  // have them initialized. Since this is only used for compression
158  // (not decompression), I don't mind spending the time to do it.
159  //
160  // For the same range of i, m_dad[i] is the parent of node i.
161  // These are initialized to a known value that can represent
162  // a "not used" state.
163 
164  for (i = 0; i < N; i++) {
165  m_lson[i] = NOT_USED;
166  m_rson[i] = NOT_USED;
167  m_dad[i] = NOT_USED;
168  }
169 
170  // For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
171  // for strings that begin with the character i. This is why
172  // the right child array is larger than the left child array.
173  // These are also initialzied to a "not used" state.
174  //
175  // Note that there are 256 of these, one for each of the possible
176  // 256 characters.
177 
178  for (i = N + 1; i <= (N + 256); i++) {
179  m_rson[i] = NOT_USED;
180  }
181 }
182 
183 
184 /******************************************************************************
185  * LZSSCompress::InsertNode - This function inserts a string from the ring
186  * buffer into one of the trees. It loads the
187  * match position and length member variables
188  * for the longest match.
189  *
190  * The string to be inserted is identified by
191  * the parameter Pos, A full F bytes are
192  * inserted. So,
193  * m_ring_buffer[Pos ... Pos+F-1]
194  * are inserted.
195  *
196  * If the matched length is exactly F, then an
197  * old node is removed in favor of the new one
198  * (because the old one will be deleted
199  * sooner).
200  *
201  * Note that Pos plays a dual role. It is
202  * used as both a position in the ring buffer
203  * and also as a tree node.
204  * m_ring_buffer[Pos] defines a character that
205  * is used to identify a tree node.
206  *
207  * ENT: pos - position in the buffer
208  */
209 
211 {
212  short int i;
213  short int p;
214  int cmp;
215  unsigned char * key;
216 
217 /*
218  ASSERT(Pos >= 0);
219  ASSERT(Pos < N);
220 */
221 
222  cmp = 1;
223  key = &(m_ring_buffer[Pos]);
224 
225  // The last 256 entries in m_rson contain the root nodes for
226  // strings that begin with a letter. Get an index for the
227  // first letter in this string.
228 
229  p = (short int) (N + 1 + key[0]);
230 
231  // Set the left and right tree nodes for this position to "not
232  // used."
233 
234  m_lson[Pos] = NOT_USED;
235  m_rson[Pos] = NOT_USED;
236 
237  // Haven't matched anything yet.
238 
239  m_match_length = 0;
240 
241  for ( ; ; ) {
242  if (cmp >= 0) {
243  if (m_rson[p] != NOT_USED) {
244  p = m_rson[p];
245  }
246  else {
247  m_rson[p] = Pos;
248  m_dad[Pos] = p;
249  return;
250  }
251  }
252  else {
253  if (m_lson[p] != NOT_USED) {
254  p = m_lson[p];
255  }
256  else {
257  m_lson[p] = Pos;
258  m_dad[Pos] = p;
259  return;
260  }
261  }
262 
263  // Should we go to the right or the left to look for the
264  // next match?
265 
266  for (i = 1; i < F; i++) {
267  cmp = key[i] - m_ring_buffer[p + i];
268  if (cmp != 0)
269  break;
270  }
271 
272  if (i > m_match_length) {
273  m_match_position = p;
274  m_match_length = i;
275 
276  if (i >= F)
277  break;
278  }
279  }
280 
281  m_dad[Pos] = m_dad[p];
282  m_lson[Pos] = m_lson[p];
283  m_rson[Pos] = m_rson[p];
284 
285  m_dad[ m_lson[p] ] = Pos;
286  m_dad[ m_rson[p] ] = Pos;
287 
288  if (m_rson[ m_dad[p] ] == p) {
289  m_rson[ m_dad[p] ] = Pos;
290  }
291  else {
292  m_lson[ m_dad[p] ] = Pos;
293  }
294 
295  // Remove "p"
296 
297  m_dad[p] = NOT_USED;
298 }
299 
300 
301 /******************************************************************************
302  * LZSSCompress::DeleteNode - This function removes the node "Node" from the
303  * tree.
304  *
305  * ENT: node - node to be removed
306  */
307 
309 {
310  short int q;
311 
312 /*
313  ASSERT(Node >= 0);
314  ASSERT(Node < (N+1));
315 */
316 
317  if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do
318  return;
319  }
320 
321  if (m_rson[Node] == NOT_USED) {
322  q = m_lson[Node];
323  }
324  else if (m_lson[Node] == NOT_USED) {
325  q = m_rson[Node];
326  }
327  else {
328  q = m_lson[Node];
329  if (m_rson[q] != NOT_USED) {
330  do {
331  q = m_rson[q];
332  } while (m_rson[q] != NOT_USED);
333 
334  m_rson[ m_dad[q] ] = m_lson[q];
335  m_dad[ m_lson[q] ] = m_dad[q];
336  m_lson[q] = m_lson[Node];
337  m_dad[ m_lson[Node] ] = q;
338  }
339 
340  m_rson[q] = m_rson[Node];
341  m_dad[ m_rson[Node] ] = q;
342  }
343 
344  m_dad[q] = m_dad[Node];
345 
346  if (m_rson[ m_dad[Node] ] == Node) {
347  m_rson[ m_dad[Node] ] = q;
348  }
349  else {
350  m_lson[ m_dad[Node] ] = q;
351  }
352 
353  m_dad[Node] = NOT_USED;
354 }
355 
356 
357 /******************************************************************************
358  * LZSSCompress::encode - This function "encodes" the input stream into the
359  * output stream.
360  * The getChars() and sendChars() functions are
361  * used to separate this method from the actual
362  * i/o.
363  * NOTE: must set zlen for parent class to know length of
364  * compressed buffer.
365  */
366 
368 {
369  short int i; // an iterator
370  short int r; // node number in the binary tree
371  short int s; // position in the ring buffer
372  unsigned short int len; // len of initial string
373  short int last_match_length; // length of last match
374  short int code_buf_pos; // position in the output buffer
375  unsigned char code_buf[17]; // the output buffer
376  unsigned char mask; // bit mask for byte 0 of out buf
377  unsigned char c; // character read from string
378 
379  // Start with a clean tree.
380 
381  p->InitTree();
382  direct = 0; // set direction needed by parent [Get|Send]Chars()
383 
384  // code_buf[0] works as eight flags. A "1" represents that the
385  // unit is an unencoded letter (1 byte), and a "0" represents
386  // that the next unit is a <position,length> pair (2 bytes).
387  //
388  // code_buf[1..16] stores eight units of code. Since the best
389  // we can do is store eight <position,length> pairs, at most 16
390  // bytes are needed to store this.
391  //
392  // This is why the maximum size of the code buffer is 17 bytes.
393 
394  code_buf[0] = 0;
395  code_buf_pos = 1;
396 
397  // Mask iterates over the 8 bits in the code buffer. The first
398  // character ends up being stored in the low bit.
399  //
400  // bit 8 7 6 5 4 3 2 1
401  // | |
402  // | first sequence in code buffer
403  // |
404  // last sequence in code buffer
405 
406  mask = 1;
407 
408  s = 0;
409  r = (short int) N - (short int) F;
410 
411  // Initialize the ring buffer with spaces...
412 
413  // Note that the last F bytes of the ring buffer are not filled.
414  // This is because those F bytes will be filled in immediately
415  // with bytes from the input stream.
416 
417  memset(p->m_ring_buffer, ' ', N - F);
418 
419  // Read F bytes into the last F bytes of the ring buffer.
420  //
421  // This function loads the buffer with X characters and returns
422  // the actual amount loaded.
423 
424  len = getChars((char *) &(p->m_ring_buffer[r]), F);
425 
426  // Make sure there is something to be compressed.
427 
428  if (len == 0)
429  return;
430 
431  // Insert the F strings, each of which begins with one or more
432  // 'space' characters. Note the order in which these strings
433  // are inserted. This way, degenerate trees will be less likely
434  // to occur.
435 
436  for (i = 1; i <= F; i++) {
437  p->InsertNode((short int) (r - i));
438  }
439 
440  // Finally, insert the whole string just read. The
441  // member variables match_length and match_position are set.
442 
443  p->InsertNode(r);
444 
445  // Now that we're preloaded, continue till done.
446 
447  do {
448 
449  // m_match_length may be spuriously long near the end of
450  // text.
451 
452  if (p->m_match_length > len) {
453  p->m_match_length = len;
454  }
455 
456  // Is it cheaper to store this as a single character? If so,
457  // make it so.
458 
459  if (p->m_match_length < THRESHOLD) {
460  // Send one character. Remember that code_buf[0] is the
461  // set of flags for the next eight items.
462 
463  p->m_match_length = 1;
464  code_buf[0] |= mask;
465  code_buf[code_buf_pos++] = p->m_ring_buffer[r];
466  }
467 
468  // Otherwise, we do indeed have a string that can be stored
469  // compressed to save space.
470 
471  else {
472  // The next 16 bits need to contain the position (12 bits)
473  // and the length (4 bits).
474 
475  code_buf[code_buf_pos++] = (unsigned char) p->m_match_position;
476  code_buf[code_buf_pos++] = (unsigned char) (
477  ((p->m_match_position >> 4) & 0xf0) |
478  (p->m_match_length - THRESHOLD) );
479  }
480 
481  // Shift the mask one bit to the left so that it will be ready
482  // to store the new bit.
483 
484  mask = (unsigned char) (mask << 1);
485 
486  // If the mask is now 0, then we know that we have a full set
487  // of flags and items in the code buffer. These need to be
488  // output.
489 
490  if (!mask) {
491  // code_buf is the buffer of characters to be output.
492  // code_buf_pos is the number of characters it contains.
493 
494  sendChars((char *) code_buf, code_buf_pos);
495 
496  // Reset for next buffer...
497 
498  code_buf[0] = 0;
499  code_buf_pos = 1;
500  mask = 1;
501  }
502 
503  last_match_length = p->m_match_length;
504 
505  // Delete old strings and read new bytes...
506 
507  for (i = 0; i < last_match_length; i++) {
508  // Get next character...
509 
510  if (getChars((char *) &c, 1) != 1)
511  break;
512 
513  // Delete "old strings"
514 
515  p->DeleteNode(s);
516 
517  // Put this character into the ring buffer.
518  //
519  // The original comment here says "If the position is near
520  // the end of the buffer, extend the buffer to make
521  // string comparison easier."
522  //
523  // That's a little misleading, because the "end" of the
524  // buffer is really what we consider to be the "beginning"
525  // of the buffer, that is, positions 0 through F.
526  //
527  // The idea is that the front end of the buffer is duplicated
528  // into the back end so that when you're looking at characters
529  // at the back end of the buffer, you can index ahead (beyond
530  // the normal end of the buffer) and see the characters
531  // that are at the front end of the buffer wihtout having
532  // to adjust the index.
533  //
534  // That is...
535  //
536  // 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234
537  // | | |
538  // position 0 end of buffer |
539  // |
540  // duplicate of front of buffer
541 
542  p->m_ring_buffer[s] = c;
543 
544  if (s < F - 1) {
545  p->m_ring_buffer[s + N] = c;
546  }
547 
548  // Increment the position, and wrap around when we're at
549  // the end. Note that this relies on N being a power of 2.
550 
551  s = (short int) ( (s + 1) & (N - 1) );
552  r = (short int) ( (r + 1) & (N - 1) );
553 
554  // Register the string that is found in
555  // m_ring_buffer[r..r+F-1].
556 
557  p->InsertNode(r);
558  }
559 
560  // If we didn't quit because we hit the last_match_length,
561  // then we must have quit because we ran out of characters
562  // to process.
563 
564  while (i++ < last_match_length) {
565  p->DeleteNode(s);
566 
567  s = (short int) ( (s + 1) & (N - 1) );
568  r = (short int) ( (r + 1) & (N - 1) );
569 
570  // Note that len hitting 0 is the key that causes the
571  // do...while() to terminate. This is the only place
572  // within the loop that len is modified.
573  //
574  // Its original value is F (or a number less than F for
575  // short strings).
576 
577  if (--len) {
578  p->InsertNode(r); /* buffer may not be empty. */
579  }
580  }
581 
582  // End of do...while() loop. Continue processing until there
583  // are no more characters to be compressed. The variable
584  // "len" is used to signal this condition.
585  } while (len > 0);
586 
587  // There could still be something in the output buffer. Send it
588  // now.
589 
590  if (code_buf_pos > 1) {
591  // code_buf is the encoded string to send.
592  // code_buf_ptr is the number of characters.
593 
594  sendChars((char *) code_buf, code_buf_pos);
595  }
596 
597 
598  // must set zlen for parent class to know length of compressed buffer
599  zlen = zpos;
600 }
601 
602 
603 /******************************************************************************
604  * LZSSCompress::decode - This function "decodes" the input stream into the
605  * output stream.
606  * The getChars() and sendChars() functions are
607  * used to separate this method from the actual
608  * i/o.
609  */
610 
612 {
613  int k;
614  int r; // node number
615  unsigned char c[F]; // an array of chars
616  unsigned char flags; // 8 bits of flags
617  int flag_count; // which flag we're on
618  short int pos; // position in the ring buffer
619  short int len; // number of chars in ring buffer
620  unsigned long totalLen = 0;
621 
622  direct = 1; // set direction needed by parent [Get|Send]Chars()
623 
624  // Initialize the ring buffer with a common string.
625  //
626  // Note that the last F bytes of the ring buffer are not filled.
627 
628  memset(p->m_ring_buffer, ' ', N - F);
629 
630  r = N - F;
631 
632  flags = (char) 0;
633  flag_count = 0;
634 
635  for ( ; ; ) {
636 
637  // If there are more bits of interest in this flag, then
638  // shift that next interesting bit into the 1's position.
639  //
640  // If this flag has been exhausted, the next byte must
641  // be a flag.
642 
643  if (flag_count > 0) {
644  flags = (unsigned char) (flags >> 1);
645  flag_count--;
646  }
647  else {
648  // Next byte must be a flag.
649 
650  if (getChars((char *) &flags, 1) != 1)
651  break;
652 
653  // Set the flag counter. While at first it might appear
654  // that this should be an 8 since there are 8 bits in the
655  // flag, it should really be a 7 because the shift must
656  // be performed 7 times in order to see all 8 bits.
657 
658  flag_count = 7;
659  }
660 
661  // If the low order bit of the flag is now set, then we know
662  // that the next byte is a single, unencoded character.
663 
664  if (flags & 1) {
665  if (getChars((char *) c, 1) != 1)
666  break;
667 
668  if (sendChars((char *) c, 1) != 1) {
669  break;
670  }
671  totalLen++;
672 
673  // Add to buffer, and increment to next spot. Wrap at end.
674 
675  p->m_ring_buffer[r] = c[0];
676  r = (short int) ( (r + 1) & (N - 1) );
677  }
678 
679  // Otherwise, we know that the next two bytes are a
680  // <position,length> pair. The position is in 12 bits and
681  // the length is in 4 bits.
682 
683  else {
684  // Original code:
685  // if ((i = getc(infile)) == EOF)
686  // break;
687  // if ((j = getc(infile)) == EOF)
688  // break;
689  // i |= ((j & 0xf0) << 4);
690  // j = (j & 0x0f) + THRESHOLD;
691  //
692  // I've modified this to only make one input call, and
693  // have changed the variable names to something more
694  // obvious.
695 
696  if (getChars((char *) c, 2) != 2)
697  break;
698 
699  // Convert these two characters into the position and
700  // length. Note that the length is always at least
701  // THRESHOLD, which is why we're able to get a length
702  // of 18 out of only 4 bits.
703 
704  pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
705 
706  len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
707 
708  // There are now "len" characters at position "pos" in
709  // the ring buffer that can be pulled out. Note that
710  // len is never more than F.
711 
712  for (k = 0; k < len; k++) {
713  c[k] = p->m_ring_buffer[(pos + k) & (N - 1)];
714 
715  // Add to buffer, and increment to next spot. Wrap at end.
716 
717  p->m_ring_buffer[r] = c[k];
718  r = (short int) ( (r + 1) & (N - 1) );
719  }
720 
721  // Add the "len" :characters to the output stream.
722 
723  if (sendChars((char *) c, len) != (unsigned int)len) {
724  break;
725  }
726  totalLen += len;
727  }
728  }
729  slen = totalLen;
730 }
731 
#define SWORD_NAMESPACE_START
Definition: defs.h:39
virtual ~LZSSCompress()
Definition: lzsscomprs.cpp:141
#define NOT_USED
Definition: lzsscomprs.cpp:56
void InsertNode(short int Pos)
Definition: lzsscomprs.cpp:210
static short int m_lson[N+1]
Definition: lzsscomprs.cpp:66
unsigned long zlen
Definition: swcomprs.h:39
unsigned long slen
Definition: swcomprs.h:39
void DeleteNode(short int Node)
Definition: lzsscomprs.cpp:308
virtual void encode(void)
Definition: lzsscomprs.cpp:367
#define THRESHOLD
Definition: lzsscomprs.cpp:55
static short int m_rson[N+257]
Definition: lzsscomprs.cpp:67
virtual void decode(void)
Definition: lzsscomprs.cpp:611
Private * p
Definition: lzsscomprs.h:35
static short int m_match_length
Definition: lzsscomprs.cpp:65
unsigned long zpos
Definition: swcomprs.h:39
static unsigned char m_ring_buffer[N+F-1]
Definition: lzsscomprs.cpp:63
virtual unsigned long getChars(char *buf, unsigned long len)
Definition: swcomprs.cpp:121
char direct
Definition: swcomprs.h:38
unsigned long pos
Definition: swcomprs.h:39
#define F
Definition: lzsscomprs.cpp:54
static short int m_match_position
Definition: lzsscomprs.cpp:64
#define SWORD_NAMESPACE_END
Definition: defs.h:40
static short int m_dad[N+1]
Definition: lzsscomprs.cpp:68
#define N
Definition: lzsscomprs.cpp:53
virtual unsigned long sendChars(char *buf, unsigned long len)
Definition: swcomprs.cpp:141