CFG Reader(Follow)

A. Sixth Edition
This is fifth?fourth? edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, 
I finished the follow set calculation and removed meta symbol---"{", "}" from last version because it is same
thing as "|".
B.The problem
What is Follow set?
Example: B -> A a
The variable A will have a follow set of Follow(A) = {a}.

program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier

 

C.The idea of program
 
This problem has a standard algorithm in text. So, will you just refer to the text if you are really interested in?
I highly suspect if there is anyone in my reader group would be interested in reading this or even want to have 
any idea of "what is First" and how to calculate. 
D.The major functions
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp  
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
 
 
file name: CFGReader.h
#include "Grammar.h"


class CFGReader
{
private:
	char buffer[BufferLength];
	FILE* stream;
	void newRule(const char* str);
	void readRule();
	void addRule(const char* str);
	int addToken(const char* str, bool isTerminal=true);
	void printRule(int index);
public:
	void readFromFile(const char* fileName);
	void optimize();
	void print();
};

 

file name: CFGReader.cpp 
#include <iostream>
#include "CFGReader.h"

using namespace std;

Grammar grammar;

const char* deliStr=" |\n";

//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?


void CFGReader::readFromFile(const char* fileName)
{
	if ((stream=fopen(fileName, "r"))==NULL)
	{
		cout<<"cannot open file "<<fileName<<endl;
	}
	else
	{		
		readRule();		
	}
}

void CFGReader::readRule()
{
	char* ptr=buffer;
	char* hold;
	int count=0;

	while (!feof(stream))
	{
		count=0;
		fgets(buffer, BufferLength-1, stream);
		ptr=buffer;	
		
		while ((ptr=strtok(ptr, " \n"))!=NULL)
		{
			if (strcmp(ptr, "|")!=0)
			{
				//test the first two token to see if old rule continues
				if (count==0)				
				{
					hold=ptr;
				}
				if (count==1)
				{
					if (strcmp("->", ptr)==0)
					{			
						/*
						curIndex=addToken(hold, false);//the index of cur token	
						grammar[curIndex]->terminal=false;
						newRule();	
						*/
						newRule(hold);
					}
					else
					{								
						addRule(hold);
						addRule(ptr);															
					}
				}
				if (count>=2)//always add
				{
					addRule(ptr);
				}
				count++;
			}
			else
			{
				//this is an alternative production rule
				newRule(NULL);						
			}
			ptr=NULL;
		}		
	}
}

void CFGReader::newRule(const char* str)
{
	grammar.newRule(str);
}

void CFGReader::optimize()
{
	grammar.optimize();
}


void CFGReader::addRule(const char* str)
{
	grammar.addRule(str);
}

int CFGReader::addToken(const char* str, bool isTerminal)
{
	return grammar.addToken(str, isTerminal);
}



void CFGReader::printRule(int index)
{
	grammar.printRule(index);
}

void CFGReader::print()
{
	grammar.print();
	grammar.printToken();
	//grammar.printAllRules();
}





  
file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=80;
const int MaxRHSCount=50;
const int MaxRuleCount=200;
const int MaxTokenLength=10;
const int MaxProduction=10;

struct Token
{
	//bool isMeta;
	bool terminal;
	bool isNull;
	char* name;
	int production[MaxProduction];//pointer to the production it gives
	int count;
	int firstCount;
	int followCount;
	int follow[MaxTokenCount];
	int first[MaxTokenCount];
};

class Grammar
{
private:
	Token* token[MaxTokenCount];
	int tokenCount;
	int curIndex;//the index of current token
	int curPos;//the position at production rule
	//MaxRuleCount>=MaxVariableCount!!!
	int production[MaxRuleCount][MaxRHSCount];
	int prodIndex;//pointing to current production, initialized to -1
	int checkRecursion(int tIndex, int curToken);
	void addBeginEnd();
	bool addIntoFirst(int target, int source);
	bool addFirst(int target, int source);
	void shiftRule(int ruleIndex, bool Left2Right, int offset);
	void addAtEnd(int ruleIndex, int toAdd);
	void addRuleForToken(int tIndex, int ruleIndex);
	int copyRule(int source, int target);//return the position of -1
	void replaceRule(int curToken, int ruleIndex);
	int findMetaSymbol(int& begin, int& end);
	void initialize();
	void doReplaceMetaSymbol(int ruleIndex, int begin, int end);
	void removeRuleFromToken(int tIndex, int ruleIndex);
	void checkEpsilon(int ruleIndex);
	void removeImmediate(int tIndex, int ruleIndex);
	void doAddRule(int tIndex);
	void commonLeftFactor();
	int findCommonFactor(int tokenIndex, int ruleIndex);
	void removeLeftRecursion();
	void doCommonFactor(int tokenIndex, int ruleIndex, int index);
	int forgeToken(int index);//create a new variable
	void epsilonRule(int ruleIndex);
	bool isRedundant(int tIndex);
	void replaceMetaSymbol();
	void calculateFirst();
	void calculateFollow();
	void calculateNull();
	bool Null(int tIndex);
	bool addFollow(int target, int source);
	bool addIntoFollow(int target, int source);
	bool addFollowIntoFollow(int target, int source);
    void removeRedundant();
	void removeToken(int tIndex);
	//	void calculateProperty();
public:
	Grammar();
	void optimize();
	void printRule(int index);
	void print();
	void printAllRules();
	void printToken();
	void addRule(const char* str);
	void newRule(const char* str);//this is an internal method, not suitable for outside
	Token* operator[](int index);
	int addToken(const char* str, bool isTerminal=true);
	Token* createToken(const char* theName, bool isTerminal);
	int tokenIndex(const char* str);
};
 
 
file name: Grammar.cpp 
#include <iostream>
#include "Grammar.h"

using namespace std;

const char* emptyStr="e";
const char* optionBegin="{";
const char* optionEnd="}";
const char* startStr="START";
const char* endStr="$";
int startSymbolIndex=-1;
int stackBottomIndex=-1;
int beginIndex=-1;
int endIndex=-1;
int emptyIndex=-1;

//leave the rule unremoved!!! as I have no way to do it!!!
void Grammar::removeToken(int tIndex)
{	
	delete[]token[tIndex]->name;
	delete token[tIndex];
	tokenCount--;
	for (int i=tIndex; i<tokenCount; i++)
	{
		token[i]=token[i+1];
	}
}
	

bool Grammar::isRedundant(int tIndex)
{
	int k, theRule, theToken;
	for (int i=0; i<tokenCount; i++)
	{
		for (int j=0; j<token[i]->count; j++)
		{
			k=0;
			theRule=token[i]->production[j];
			theToken=production[theRule][k];
			while (theToken!=-1)
			{
				if (theToken==tIndex)
				{
					return false;
				}
				k++;
				theToken=production[theRule][k];
			}
		}
	}
	return true;
}


//what is redundant? except the start variable
//the non-terminal never appears in other rules!
void Grammar::removeRedundant()
{
	int tIndex=1;
	bool findNew=false;
	while (tIndex<tokenCount)
	{
		findNew=false;
		if (!token[tIndex]->terminal)
		{
			if (isRedundant(tIndex))
			{
				removeToken(tIndex);
				findNew=true;
			}
		}
		if (findNew)
		{
			tIndex=1;
		}
		else
		{
			tIndex++;
		}
	}			
}

void Grammar::printAllRules()
{
/*
	cout<<"\nnow print all rules\n";
	for (int i=0; i<=prodIndex; i++)
	{
		cout<<"rule index "<<i<<": ";
		printRule(i);
		cout<<"\n";
	}
	*/
	int sum=0;
	for (int i=0; i<tokenCount; i++)
	{
		if (!token[i]->terminal)
		{
			sum+=token[i]->count;
		}
	}
	cout<<" total rules is:"<<prodIndex+1;
	cout<<"\n and the sum is "<<sum<<endl;

}



void Grammar::addBeginEnd()
{
	int begin=addToken(startStr, false);
	int end=addToken(endStr, true);
	prodIndex++;
	production[prodIndex][0]=0;
	production[prodIndex][1]=end;
	production[prodIndex][2]=-1;
	addRuleForToken(begin, prodIndex);
	startSymbolIndex=begin;
	stackBottomIndex=end;
}
/*
void Grammar::calculateProperty()
{
	calculateNull();
	calculateFirst();
	calculateFollow();
}
*/
void Grammar::printToken()
{
	for (int i=0; i<tokenCount; i++)
	{
		cout<<"name: "<<token[i]->name<<endl;
		cout<<"   isNull: "<<(token[i]->isNull?"true":"false")<<endl;
		cout<<"   isTerminal: "<<(token[i]->terminal?"true":"false")<<endl;
		cout<<"   first: ";
		for (int j=0; j<token[i]->firstCount; j++)
		{
			if (j!=0)
			{
				cout<<",";
			}
			cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name;			
		}
		cout<<"\n   follow: ";
		for (j=0; j<token[i]->followCount; j++)
		{
			if (j!=0)
			{
				cout<<",";
			}
			cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name;
		}
		cout<<endl;
	}
}

//must use while loop to discover new set!!!!!
void Grammar::calculateFirst()
{
	int i;
	bool addNew;
	do
	{
		i=0;
		addNew=false;
		while (i<tokenCount)
		{		
			//the terminal contains itself
			if (token[i]->terminal)
			{
				//addFirst don't judge if it is nullable!!
				addFirst(i, i);			
				//token[i]->first[token[i]->firstCount++]=i;
			}
			else
			{
				//for all its rules
				for (int j=0; j<token[i]->count; j++)
				{
					int theToken, k=0;
					theToken=production[token[i]->production[j]][k];
					//for each token in each rule
					do 
					{
						//add non epsilon set
						if (addIntoFirst(i, theToken))
						{
							addNew=true;
						}
						//if it is not null, means it is end
						if (!token[theToken]->isNull)
						{
							break;
						}
						//if it is nullable, continue
						k++;
						theToken=production[token[i]->production[j]][k];
					}while (theToken!=-1);
					//it means all token in this rule is nullable, so
					if (theToken==-1)
					{
						//it is nullable
						//addEpsilonIntoFirst(i);
						addFirst(i, emptyIndex);
					}
				}
			}
			i++;
		}		
	}while (addNew);
}

bool Grammar::addFollowIntoFollow(int target, int source)
{
	bool addNew=false;
	for (int i=0; i<token[source]->followCount; i++)
	{
		if (addFollow(target, token[source]->follow[i]))
		{
			addNew=true;
		}
	}
	return addNew;
}

bool Grammar::addIntoFollow(int target, int source)
{
	bool addNew=false;
	if (source==-1)
	{
		return false;
	}
	for (int i=0; i<token[source]->firstCount; i++)
	{
		//add non-epsilon
		if (!token[token[source]->first[i]]->isNull)
		{
			if (addFollow(target, token[source]->first[i]))
			{
				addNew=true;
			}
		}
	}
	return addNew;
}


void Grammar::calculateFollow()
{		
	int i;
	bool addNew, started;
	//token[startSymbolIndex]->follow[0]=stackBottomIndex;
	//token[startSymbolIndex]->followCount=1;
	do
	{
		i=0;
		addNew=false;
		while (i<tokenCount)
		{		
			//the terminal contains itself
			if (!token[i]->terminal)
			{
				for (int tIndex=0; tIndex<tokenCount; tIndex++)
				{
					//for all its rules
					if (token[tIndex]->terminal)
					{
						continue;
					}
					//for each its rule
					for (int j=0; j<token[tIndex]->count; j++)
					{
						int theToken, k=0, theRule=token[tIndex]->production[j];
						started=false;
						theToken=production[theRule][k];
						//for each token in each rule
						do 
						{
							//the token appears here
							if (theToken==i)
							{
								started=true;
								//add non epsilon set, including -1 situation!!!
								if (addIntoFollow(i, production[theRule][k+1]))
								{
									addNew=true;
								}
							}
							//if it is not null, means it is end
							if (started&&!token[theToken]->isNull)
							{
								break;
							}
							//if it is nullable, continue
							k++;
							theToken=production[theRule][k];
						}while (theToken!=-1);
						//it means all token in this rule is nullable, so
						if (started&&theToken==-1)
						{
							//it is nullable
							//add current variable Follow(j) into Follow(i);
							if (addFollowIntoFollow(i, tIndex))
							{
								addNew=true;
							}
						}
					}
				}					
			}
			i++;
		}		
	}while (addNew);
}

void Grammar::addRuleForToken(int tIndex, int ruleIndex)
{
	token[tIndex]->production[token[tIndex]->count++]=ruleIndex;
}

void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	if (left2Right)
	{
		for (int i=end; i>=0; i--)
		{
			production[ruleIndex][i+offset]=production[ruleIndex][i];
		}
	}
	else
	{
		for (int i=0; i<=end-offset; i++)
		{
			production[ruleIndex][i]=production[ruleIndex][i+offset];
		}
		checkEpsilon(ruleIndex);
	}
}

void Grammar::checkEpsilon(int ruleIndex)
{
	if (production[ruleIndex][0]==-1)
	{
		epsilonRule(ruleIndex);
	}
}


bool Grammar::addFollow(int target, int source)
{
	//check if it is already there
	for (int i=0; i<token[target]->followCount; i++)
	{
		if (token[target]->follow[i]==source)
		{
			return false;
		}
	}
	//add at end
	token[target]->follow[token[target]->followCount++]=source;
	return true;
}

bool Grammar::addFirst(int target, int source)
{
	//check if it is already there
	for (int i=0; i<token[target]->firstCount; i++)
	{
		if (token[target]->first[i]==source)
		{
			return false;
		}
	}
	//add at end
	token[target]->first[token[target]->firstCount++]=source;
	return true;
}

//add non epsilon into it.
bool Grammar::addIntoFirst(int target, int source)
{
	bool addNew=false;
	if (token[source]->terminal)
	{
		return addFirst(target, source);
	}
	for (int i=0; i<token[source]->firstCount; i++)
	{
		//add non epsilon
		if (!token[token[source]->first[i]]->isNull)
		{
			if (addFirst(target, token[source]->first[i]))
			{
				addNew=true;
			}
		}
	}
	return addNew;
}


bool Grammar::Null(int tIndex)
{
	if (token[tIndex]->terminal)
	{
		return token[tIndex]->isNull;		
	}
	for (int i=0; i<token[tIndex]->count; i++)
	{
		int j=0, theToken;
		theToken=production[token[tIndex]->production[i]][j];
		do
		{
			if (theToken==tIndex||!Null(theToken))
			{
				break;
			}
			j++;
			theToken=production[token[tIndex]->production[i]][j];
		}while (theToken!=-1);
		if (theToken==-1)
		{			
			return true;
		}
	}
	return false;
}

void Grammar::calculateNull()
{
	for (int i=0; i<tokenCount; i++)
	{
		token[i]->isNull=Null(i);	
	}
}
	
int Grammar::findMetaSymbol(int& begin, int& end)
{
	int theRule, theToken, k;
	begin=end=-1;
	for (int i=0; i<tokenCount; i++)
	{
		for (int j=0; j<token[i]->count; j++)
		{
			k=0;
			theRule=token[i]->production[j];
			theToken=production[theRule][k];
			while (theToken!=-1)
			{
				if (theToken==beginIndex)
				{
					begin=k;
				}
				if (theToken==endIndex)
				{
					end=k;
					return theRule;
				}
				k++;
				theToken=production[theRule][k];
			}
		}
	}
	return -1;
}

	
void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end)
{
	int newTokenIndex=forgeToken(0), i=0;

	addRuleForToken(newTokenIndex, ++prodIndex);
	//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	//token[newTokenIndex]->terminal=false;

	copyRule(ruleIndex, prodIndex);
	//shrink 
	while (production[ruleIndex][i+end+1]!=-1)
	{
		production[ruleIndex][begin+i]=production[ruleIndex][i+end+1];
		i++;
	}
	production[ruleIndex][begin+i]=-1;
	addAtEnd(ruleIndex, newTokenIndex);
	/*
	production[ruleIndex][begin]=newTokenIndex;
	production[ruleIndex][begin+1]=-1;
	*/

	shiftRule(prodIndex, false, begin+1);
	
	production[prodIndex][end-begin-1]=newTokenIndex;
	production[prodIndex][end-begin]=-1;
	epsilonRule(++prodIndex);
	addRuleForToken(newTokenIndex, prodIndex);
}
		
void Grammar::replaceMetaSymbol()
{
	int begin, end, ruleIndex;
	while ((ruleIndex=findMetaSymbol(begin, end))!=-1)
	{
		doReplaceMetaSymbol(ruleIndex, begin, end);
	}	
	for (int i=0; i<tokenCount; i++)
	{
		//if (token[i]->isMeta)
		if (i==beginIndex||i==endIndex)
		{
			removeToken(i);
		}
	}
}

void Grammar::addAtEnd(int ruleIndex, int toAdd)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	production[ruleIndex][end++]=toAdd;
	production[ruleIndex][end]=-1;
}

//left-recursion:  A -> A a | b | c
//change to: A  -> b A' | c A'
//			 A' -> a A' | empty
void Grammar::removeImmediate(int tIndex, int ruleIndex)
{
	int newIndex=forgeToken(tIndex);
	int holdRuleIndex=token[tIndex]->production[ruleIndex];
	//sequence matters!

	//change to: A -> b A'
	for (int i=0; i<token[tIndex]->count; i++)
	{
		if (i!=ruleIndex)
		{
			addAtEnd(token[tIndex]->production[i], newIndex);
		}
	}
	
	//shift
	removeRuleFromToken(tIndex, ruleIndex);
	
	addRuleForToken(newIndex, holdRuleIndex);
	//token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex;

	shiftRule(holdRuleIndex, false, 1);
	addAtEnd(holdRuleIndex, newIndex);
	
	//add epsilon rule for new variable
	epsilonRule(++prodIndex);
	addRuleForToken(newIndex, prodIndex);
}


int Grammar::forgeToken(int index)
{
	char str[MaxTokenLength+2], ch;
	int len=strlen(token[index]->name);
	int temp=0, i=0;
	strcpy(str, token[index]->name);
	ch=str[len-1];//get last char of name
	if (ch>='0'&&ch<'9')
	{
		str[len-1]=ch+i+1;
	}
	else
	{
		str[len]='0'+i;
		str[len+1]='\0';
	}
	//I won't bother to check repetitation of name
	while (tokenIndex(str)!=-1)
	{
		i++;
		if (ch>='0'&&ch<'9')
		{
			str[len-1]=ch+i+1;
		}
		else
		{
			str[len]='0'+i;
			str[len+1]='\0';
		}
	}

	return addToken(str, false);//it is non-terminal for sure	
}

int Grammar::copyRule(int source, int target)
{
	int i=0;
	while (production[source][i]!=-1) 
	{
		production[target][i]=production[source][i];
		i++;
	}
	production[target][i]=-1;
	return i;
}

void Grammar::doAddRule(int tIndex)
{	
	token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}

void Grammar::addRule(const char* str)
{
	int index;
	index=addToken(str);
	production[prodIndex][curPos++]=index;
	production[prodIndex][curPos]=-1;//set end
}


//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
	int index;
	index=tokenIndex(str);
	if (index==-1)
	{
		index=tokenCount;
	}
	else
	{		
		return index;
	}
	token[index]=createToken(str, isTerminal);
	if (strcmp(str, optionBegin)==0)
	{
		beginIndex=index;
	}
	if (strcmp(str, optionEnd)==0)
	{
		endIndex=index;
	}
	tokenCount++;
	if (strcmp(str, emptyStr)==0)
	{
		token[index]->isNull=true;
		emptyIndex=index;
	}
	return index;
}

void Grammar::newRule(const char* str)
{
	if (str!=NULL)
	{
		curIndex=addToken(str, false);
	}
	//add one pointer
	token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
	token[curIndex]->terminal=false;
	curPos=0;//reset to restart;
}

Token* Grammar::createToken(const char* theName, bool isTerminal)
{
	Token* ptr=new Token;
	ptr->name=new char[strlen(theName)+1];
	strcpy(ptr->name, theName);
	ptr->terminal=isTerminal;
	ptr->count=ptr->firstCount=ptr->followCount=0;
	ptr->isNull=false;
	return ptr;
}

int Grammar::tokenIndex(const char* str)
{
	for (int i=0; i<tokenCount; i++)
	{
		if (strcmp(str, token[i]->name)==0)
		{
			return i;
		}
	}
	return -1;
}

int Grammar::checkRecursion(int tIndex, int curToken)
{
	for (int i=0; i<token[curToken]->count; i++)
	{
		//token[tIndex]->production[i]=ruleIndex
		//production[ruleIndex][0] is the first left-recursion one
		if (production[token[curToken]->production[i]][0]<=curToken)
		{
			return i;
		}
	}
	return -1;
}

void Grammar::printRule(int index)
{
	int nodeIndex=0;
	while (production[index][nodeIndex]!=-1)
	{
		cout<<token[production[index][nodeIndex]]->name<<" ";
		nodeIndex++;
	}
}



void Grammar::initialize()
{
	tokenCount=curIndex=curPos=0;
	prodIndex=-1;//in order to ++ blindly
}

Grammar::Grammar()
{
	initialize();
}

void Grammar::removeLeftRecursion()
{
	int tIndex=0, curToken=0;
	bool newChange=false;
	while (tIndex<tokenCount)
	{		
		if (!token[tIndex]->terminal)
		{
			for (int i=0; i<token[tIndex]->count; i++)
			{
				curToken=production[token[tIndex]->production[i]][0];
				if (curToken<=tIndex&&!token[curToken]->terminal)
				{
					if (curToken!=tIndex)
					{
						replaceRule(tIndex, i);
					}
					else
					{
						removeImmediate(tIndex, i);
					}
					newChange=true;				
				}
			}
		}
		//whenever there is some new findings, restart
		if (!newChange)
		{
			tIndex++;
		}
		else
		{
			tIndex=0;
			newChange=false;
		}
	}
}

void Grammar::replaceRule(int tIndex, int ruleIndex)
{
	int pos, j, targetEnd, sourceEnd, curRule;
	curRule=token[tIndex]->production[ruleIndex];
	int curToken=production[curRule][0];
	for (int i=token[curToken]->count-1; i>=0; i--)
	{
		if (i>0)
		{
			doAddRule(tIndex);
			pos=copyRule(token[curToken]->production[i], prodIndex);
			j=0;
			while (production[curRule][j+1]!=-1)
			{
				production[prodIndex][pos+j]=production[curRule][j+1];
				j++;
			}
			production[prodIndex][pos+j]=-1;
			//addRuleForToken(curToken, prodIndex);
		}
		else
		{
			targetEnd=sourceEnd=0;
			//curRule=token[tIndex]->production[ruleIndex];
			while (true)
			{
				if (production[token[curToken]->production[0]][sourceEnd]==-1&&
				production[curRule][targetEnd]==-1)
				{
					break;
				}
				if (production[token[curToken]->production[0]][sourceEnd]!=-1)
				{
					sourceEnd++;
				}
				if (production[curRule][targetEnd]!=-1)
				{
					targetEnd++;
				}
			}
			j=targetEnd+sourceEnd-1;
			while (j>=0)
			{
				if (j>=sourceEnd)
				{
					production[curRule][j]=production[curRule][j-sourceEnd+1];
				}
				else
				{
					production[curRule][j]=production[token[curToken]->production[0]][j];
				}
				j--;
			}
		}
	}
}


//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
	replaceMetaSymbol();
	removeLeftRecursion();
	commonLeftFactor();
	removeRedundant();
	addBeginEnd();
	calculateNull();
	calculateFirst();
	calculateFollow();
}

int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
	for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
	{	
		//if the two rule has the same first token
		if (production[token[tIndex]->production[ruleIndex]][0]==
			production[token[tIndex]->production[i]][0])
		{
			/*
			//calculate if there is epsilon
			if (emptyIndex==-1)
			{
				emptyIndex=tokenIndex(emptyStr);
			}
			//if it is not epsilon
			if (production[token[tIndex]->production[i]][0]!=emptyIndex)
			{
				return i;
			}
			*/
			return i;
		}
	}
	return -1;
}

void Grammar::epsilonRule(int ruleIndex)
{
	production[ruleIndex][0]=addToken(emptyStr);
	production[ruleIndex][1]=-1;
}

//sample:  x -> Aa 
//         x -> Ab
//changed to:  x -> Ax' //this is to change the old rule
//             x' -> b //this is to change the old rule
//             x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
	int newTokenIndex=forgeToken(tIndex);//create a new variable
	//move the second and after part to the new rule of new variable
	//doing: x' -> a
	//curPos=0;
	//prepare to add one new rule 	
	addRuleForToken(newTokenIndex, ++prodIndex);
	//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	token[newTokenIndex]->terminal=false;

	copyRule(token[tIndex]->production[ruleIndex], prodIndex);
	shiftRule(prodIndex, false, 1);
	/*
	do
	{
		
		//do copying
		production[prodIndex][curPos]=
			production[token[tIndex]->production[ruleIndex]][curPos+1];
		curPos++;
	//even the -1 at end is copied
	}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
	*/

	//in order to show an empty string, in case the string is "epsilon"

	
	/*
	if (curPos==1)
	{
		epsilonRule(prodIndex);
	}
	*/


	//replace x -> Aa with x -> Ax'
	production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
	production[token[tIndex]->production[ruleIndex]][2]=-1;//end
	
	//doing: x' -> b
	//curPos=0;
	//prepare to add one new rule 
	//pointing new token to where old rule lies
	addRuleForToken(newTokenIndex, token[tIndex]->production[index]);
	/*
	token[newTokenIndex]->production[token[newTokenIndex]->count++]=
		token[tIndex]->production[index];
		*/

	shiftRule(token[tIndex]->production[index], false, 1);

	removeRuleFromToken(tIndex, index);
}

void Grammar::removeRuleFromToken(int tIndex, int ruleIndex)
{
	token[tIndex]->count--;
	for (int i=ruleIndex; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}	
}


void Grammar::commonLeftFactor()
{
	int index=-1, tIndex=0, ruleIndex=0;
	bool flag;
	//whenever a newrule is done, restart!
	while (tIndex<tokenCount)
	{
		ruleIndex=0;
		flag=false;		
		while (ruleIndex<token[tIndex]->count)
		{			
			index=findCommonFactor(tIndex, ruleIndex);
			if (index!=-1)
			{
				doCommonFactor(tIndex, ruleIndex, index);
				//restart
				flag=true;
				break;
			}
			else
			{
				ruleIndex++;
			}
		}
		if (flag)
		{
			tIndex=0;		
		}
		else
		{
			tIndex++;
		}
	}
}

Token* Grammar::operator[](int index)
{
	if (index>=0&&index<tokenCount)
	{
		return token[index];
	}
	else
	{
		return NULL;
	}
}

void Grammar::print()
{
	for (int i=0; i<tokenCount; i++)
	{		
		if (!token[i]->terminal)
		{
			cout<<token[i]->name<<" ==> ";
			for (int j=0; j<token[i]->count; j++)
			{			
				printRule(token[i]->production[j]);	
				if (j!=token[i]->count-1)
				{
					cout<<" | ";
				}
			}
			cout<<"\n";
		}		
	}
}



file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"

using namespace std;

int main()
{
	CFGReader R;
	R.readFromFile("c:\\ruleTest.txt");
    R.optimize();
	R.print();
	return 0;
}
 
 
 
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->" and "{", "}" is meta symbol 
equivalent to Kaleen star. 
M' -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml M | e
M -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; i ( Lp ) ; | B | read Ln ; | write Ln ; | e ;
Lp -> e | Ln
Ln -> L { , L }
L -> i Ar
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >=




Here is the result:
M' ==> program i ; Dl B 
Dl ==> Dv Ml 
B ==> Sl end S ; 
Dv ==> V )  | module 
Ml ==> module Sl0 
Vl ==> Il START 
V ==> : T integer ; 
Il ==> i $ 
T ==> Ad char ;  | , char 
Ad ==> module  | [ n ] begin 
Sl ==> L Sl0 
S ==> := E if ;  | C then else L loop L  | exit end S ;  | Lp ; i Vl read variables ;  | Sl end S ;  | Ln write ;  | } write ;  | module ; 
L ==> i R 
E ==> Om Il0 
C ==> Om Il0 >= if 
Lp ==> module  | write 
Ln ==> i R Sl0 
R ==> i R  | ]  | Vl if variables  | <= 
Om ==> <  | > 
Or ==> M'0  | M'1  | M'2  | Ml0  | Vl0 
M'0 ==> * Om Il0  | module 
M'1 ==> array := Sl0  | module 
M'2 ==> = Or START  | module 
Ml0 ==> ( i Vl ) variables Dv B Sl0  | module 
Vl0 ==> i $ T integer ; START  | module 
Il0 ==> array i $  | module 
Sl0 ==> module  | end 
START ==> M' $ 
name: M'
   isNull: false
   isTerminal: false
   first: [0]=program
   follow: [0]=$
name: program
   isNull: false
   isTerminal: true
   first: [0]=program
   follow: 
name: i
   isNull: false
   isTerminal: true
   first: [0]=i
   follow: 
name: ;
   isNull: false
   isTerminal: true
   first: [0]=;
   follow: 
name: Dl
   isNull: false
   isTerminal: false
   first: [0]=module,[1]=:
   follow: [0]=i
name: B
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=module,[1]=end
name: Dv
   isNull: false
   isTerminal: false
   first: [0]=module,[1]=:
   follow: [0]=module,[1]=i
name: Ml
   isNull: false
   isTerminal: false
   first: [0]=module
   follow: 
name: e
   isNull: true
   isTerminal: true
   first: [0]=e
   follow: 
name: module
   isNull: false
   isTerminal: true
   first: [0]=module
   follow: 
name: (
   isNull: false
   isTerminal: true
   first: [0]=(
   follow: 
name: Vl
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=read,[1]=if,[2]=)
name: )
   isNull: false
   isTerminal: true
   first: [0]=)
   follow: 
name: variables
   isNull: false
   isTerminal: true
   first: [0]=variables
   follow: 
name: V
   isNull: false
   isTerminal: false
   first: [0]=:
   follow: [0]=)
name: Il
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=program
name: :
   isNull: false
   isTerminal: true
   first: [0]=:
   follow: 
name: T
   isNull: false
   isTerminal: false
   first: [0]=,,[1]=module,[2]=[
   follow: [0]=integer
name: integer
   isNull: false
   isTerminal: true
   first: [0]=integer
   follow: 
name: Ad
   isNull: false
   isTerminal: false
   first: [0]=module,[1]=[
   follow: [0]=char
name: char
   isNull: false
   isTerminal: true
   first: [0]=char
   follow: 
name: ,
   isNull: false
   isTerminal: true
   first: [0]=,
   follow: 
name: array
   isNull: false
   isTerminal: true
   first: [0]=array
   follow: 
name: [
   isNull: false
   isTerminal: true
   first: [0]=[
   follow: 
name: n
   isNull: false
   isTerminal: true
   first: [0]=n
   follow: 
name: ]
   isNull: false
   isTerminal: true
   first: [0]=]
   follow: 
name: begin
   isNull: false
   isTerminal: true
   first: [0]=begin
   follow: 
name: Sl
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=end
name: end
   isNull: false
   isTerminal: true
   first: [0]=end
   follow: 
name: S
   isNull: false
   isTerminal: false
   first: [0]=:=,[1]=exit,[2]=},[3]=module,[4]=write,[5]=i,[6]=<,[7]=>
   follow: [0]=;
name: L
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=module,[1]=end,[2]=loop
name: :=
   isNull: false
   isTerminal: true
   first: [0]=:=
   follow: 
name: E
   isNull: false
   isTerminal: false
   first: [0]=<,[1]=>
   follow: [0]=if
name: if
   isNull: false
   isTerminal: true
   first: [0]=if
   follow: 
name: C
   isNull: false
   isTerminal: false
   first: [0]=<,[1]=>
   follow: [0]=then
name: then
   isNull: false
   isTerminal: true
   first: [0]=then
   follow: 
name: else
   isNull: false
   isTerminal: true
   first: [0]=else
   follow: 
name: loop
   isNull: false
   isTerminal: true
   first: [0]=loop
   follow: 
name: exit
   isNull: false
   isTerminal: true
   first: [0]=exit
   follow: 
name: Lp
   isNull: false
   isTerminal: false
   first: [0]=module,[1]=write
   follow: [0]=;
name: read
   isNull: false
   isTerminal: true
   first: [0]=read
   follow: 
name: Ln
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=write
name: write
   isNull: false
   isTerminal: true
   first: [0]=write
   follow: 
name: }
   isNull: false
   isTerminal: true
   first: [0]=}
   follow: 
name: +
   isNull: false
   isTerminal: true
   first: [0]=+
   follow: 
name: -
   isNull: false
   isTerminal: true
   first: [0]=-
   follow: 
name: R
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=],[2]=<=
   follow: [0]=module,[1]=end
name: Om
   isNull: false
   isTerminal: false
   first: [0]=<,[1]=>
   follow: [0]=array,[1]=module
name: *
   isNull: false
   isTerminal: true
   first: [0]=*
   follow: 
name: /
   isNull: false
   isTerminal: true
   first: [0]=/
   follow: 
name: c
   isNull: false
   isTerminal: true
   first: [0]=c
   follow: 
name: Or
   isNull: false
   isTerminal: false
   first: [0]=*,[1]=module,[2]=array,[3]==,[4]=(,[5]=i
   follow: [0]=program
name: =
   isNull: false
   isTerminal: true
   first: [0]==
   follow: 
name: <
   isNull: false
   isTerminal: true
   first: [0]=<
   follow: 
name: >
   isNull: false
   isTerminal: true
   first: [0]=>
   follow: 
name: <=
   isNull: false
   isTerminal: true
   first: [0]=<=
   follow: 
name: >=
   isNull: false
   isTerminal: true
   first: [0]=>=
   follow: 
name: M'0
   isNull: false
   isTerminal: false
   first: [0]=*,[1]=module
   follow: 
name: M'1
   isNull: false
   isTerminal: false
   first: [0]=array,[1]=module
   follow: 
name: M'2
   isNull: false
   isTerminal: false
   first: [0]==,[1]=module
   follow: 
name: Ml0
   isNull: false
   isTerminal: false
   first: [0]=(,[1]=module
   follow: 
name: Vl0
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=module
   follow: 
name: Il0
   isNull: false
   isTerminal: false
   first: [0]=array,[1]=module
   follow: [0]=>=
name: Sl0
   isNull: false
   isTerminal: false
   first: [0]=module,[1]=end
   follow: 
name: START
   isNull: false
   isTerminal: false
   first: [0]=program
   follow: 
name: $
   isNull: false
   isTerminal: true
   first: [0]=$
   follow: 
 







                                 back.gif (341 bytes)       up.gif (335 bytes)         next.gif (337 bytes)