Lempel-Ziv
A. First Edition
This is the first trial of Lempel-Ziv compression algorithm and I only finished the encoding. The decoding is
a bit complicated and only half-finished. I am very hungry for my brunch.
How to compress a text file? Lempel-Ziv is said to be the algorithm used in "winzip".
I used my favourite
linked list to store each character as I am strongly disagree to store each string in memory which wastes not
only space but also searching running time, especially strings are clustered with similarity. For example,
strings like "ABCDE", "ABC", "ABCD" will only be stored with the largest string "ABCDE".
กก
E.Further improvement
กก
F.File listing
project client:
1. main.cpp
file name: main.cpp
#include <stdlib.h> #include <string.h> #include <stdio.h> const int MaxStringLength=128; const int MaxBufferLength=1024; const int MaxAlphabetNumber=3; //assume alphabet is not case sensitive struct Token { char value; int index; Token* next[MaxAlphabetNumber]; }; class Lempel { private: char buffer[MaxStringLength]; char codeBuf[MaxBufferLength]; Token tokens[MaxAlphabetNumber]; void initialize(); FILE* in, *out; int curIndex; Token* createToken(char ch); void doDisplay(Token* ptr, int level); void releaseAll(Token* ptr); char readChar(); bool doFindIndex(Token* ptr, int level, int index); bool findIndex(int index); void addStr(char* str); void findToken(char* str, Token*& result); void openFile(char* fileName, char* outFile); public: int readToken(); ~Lempel(); Lempel(); void encode(char* fileName); void decode(char* fileName); void display(); }; int main() { Lempel L; L.encode("nick.txt"); L.display(); return 0; } void Lempel::openFile(char* fileName, char* outFile) { if ((in=fopen(fileName, "r"))==NULL) { printf("wrong file name %s\n", fileName); exit(0); } if ((out=fopen(outFile, "w"))==NULL) { printf("unable write file %s\n", outFile); exit(0); } } void Lempel::findToken(char* str, Token*& result) { char* ptr=str; result=&tokens[*ptr-'A']; ptr++; while (*ptr!='\0') { result=result->next[*ptr-'A']; ptr++; } } void Lempel::addStr(char* str) { char temp[MaxStringLength]; char ch; Token* ptr; int len=strlen(str); strcpy(temp, str); ch=temp[len-1]; temp[len]='\0'; findToken(temp, ptr); ptr=createToken(ch); } bool Lempel::findIndex(int index) { int level; for (int i=0; i<MaxAlphabetNumber; i++) { level=0; if (doFindIndex(tokens+i, level, index)) { //the string is stored in buffer return true; } } return false; } bool Lempel::doFindIndex(Token* ptr, int level, int index) { buffer[level]=ptr->value; if (ptr->index==index) { buffer[level+1]='\0'; return true; } for (int i=0; i<MaxAlphabetNumber; i++) { if (ptr->next[i]!=NULL) { if (doFindIndex(ptr->next[i], level+1, index)) { return true; } } } return false; } void Lempel::decode(char* fileName) { openFile(fileName, "decoded.txt"); } void Lempel::doDisplay(Token* ptr, int level) { buffer[level]=ptr->value; buffer[level+1]='\0'; printf("%s:%d\n", buffer, ptr->index); for (int i=0; i<MaxAlphabetNumber; i++) { if (ptr->next[i]!=NULL) { doDisplay(ptr->next[i], level+1); } } } void Lempel::display() { int level; Token* curPtr; printf("\nNow display the table\n"); for (int i=0; i<MaxAlphabetNumber; i++) { curPtr=&tokens[i]; level=0; //buffer[level]=curPtr->value; doDisplay(curPtr, level); } } Lempel::Lempel() { initialize(); } char Lempel::readChar() { char ch; ch=fgetc(in); while (ch>('A'+MaxAlphabetNumber)||ch<'A') { ch=fgetc(in); if (feof(in)) { break; } } return ch; } void Lempel::releaseAll(Token* ptr) { for (int i=0; i<MaxAlphabetNumber; i++) { if (ptr->next[i]!=NULL) { releaseAll(ptr->next[i]); delete ptr->next[i]; ptr->next[i]=NULL; } } } Lempel::~Lempel() { Token* ptr; for (int i=0; i<MaxAlphabetNumber; i++) { ptr=&tokens[i]; releaseAll(ptr); } } void Lempel::initialize() { for (int i=0; i<MaxAlphabetNumber; i++) { tokens[i].value='A'+i; tokens[i].index=i; for (int j=0; j<MaxAlphabetNumber; j++) //memset(tokens[i].next, NULL, MaxAlphabetNumber); { tokens[i].next[j]=NULL; } } curIndex=i; } int Lempel::readToken() { char ch; Token* ptr, *nxt; ch=readChar(); if (feof(in)) { return -1; } ptr=&tokens[ch-'A']; do { ch=readChar(); //the last known token found if (feof(in)) { return ptr->index; } nxt=ptr->next[ch-'A']; if (nxt==NULL) { break; } else { ptr=nxt; } }while (true); ptr->next[ch-'A']=createToken(ch); fseek(in, -1, SEEK_CUR); return ptr->index; } Token* Lempel::createToken(char ch) { Token* result=new Token; result->value=ch; result->index=curIndex++; for (int j=0; j<MaxAlphabetNumber; j++) //memset(tokens[i].next, NULL, MaxAlphabetNumber); { result->next[j]=NULL; } //memset(result->next, NULL, MaxAlphabetNumber); return result; } void Lempel::encode(char* fileName) { //char outName[30]; int code; openFile(fileName, "encoded.txt"); while ((code=readToken())!=-1) { //fprintf(out, "%d,", code); printf("%d,", code); } fclose(in); fclose(out); }
How to run?
1. Copy a file "nick.txt" into the same path of this executable file. The contents of file is like this:
A B C AB BA ABA ABC CB BAB BABA ABCB BABAB BABC CBA
The space between character is not necessary.
กก