CFG Reader(NT-Table)

A. Seventh Edition
This is seventh edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, 
I finished the establishing NT-table and mapping of token-type from "scanner" with terminals in my grammar.
B.The problem

M -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml Mo | e
Mo -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
L -> i Ar
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B | read Ln ; | write Lo ; | e ;
Lp -> e | Ln
Ln -> L { , L }
Lo -> Lr { , Lr }
Lr -> i Ar | n | c
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >= | !=


 

C.The idea of program
 
I cannot remove Tokens! I cannot discover INDIRECT common-left-factors. I cannot remove redundant variables. 
D.The major functions
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp  
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
 
 
file name: CFGReader.h
#include "Grammar.h"


class CFGReader
{
private:
	char buffer[BufferLength];
	FILE* stream;
	void newRule(const char* str);
	void readRule();
	void addRule(const char* str);
	int addToken(const char* str, bool isTerminal=true);
	void printRule(int index);
public:
	void readFromFile(const char* fileName);
	void optimize();
	void print();
};


file name: CFGReader.cpp 
#include <iostream>
#include "CFGReader.h"

using namespace std;

Grammar grammar;

const char* deliStr=" |\n";

//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?


void CFGReader::readFromFile(const char* fileName)
{
	if ((stream=fopen(fileName, "r"))==NULL)
	{
		cout<<"cannot open file "<<fileName<<endl;
	}
	else
	{		
		readRule();		
	}
}

void CFGReader::readRule()
{
	char* ptr=buffer;
	char* hold;
	int count=0;

	while (!feof(stream))
	{
		count=0;
		fgets(buffer, BufferLength-1, stream);
		ptr=buffer;	
		
		while ((ptr=strtok(ptr, " \n"))!=NULL)
		{
			if (strcmp(ptr, "|")!=0)
			{
				//test the first two token to see if old rule continues
				if (count==0)				
				{
					hold=ptr;
				}
				if (count==1)
				{
					if (strcmp("->", ptr)==0)
					{			
						/*
						curIndex=addToken(hold, false);//the index of cur token	
						grammar[curIndex]->terminal=false;
						newRule();	
						*/
						newRule(hold);
					}
					else
					{								
						addRule(hold);
						addRule(ptr);															
					}
				}
				if (count>=2)//always add
				{
					addRule(ptr);
				}
				count++;
			}
			else
			{
				//this is an alternative production rule
				newRule(NULL);						
			}
			ptr=NULL;
		}		
	}
}

void CFGReader::newRule(const char* str)
{
	grammar.newRule(str);
}

void CFGReader::optimize()
{
	grammar.optimize();
}


void CFGReader::addRule(const char* str)
{
	grammar.addRule(str);
}

int CFGReader::addToken(const char* str, bool isTerminal)
{
	return grammar.addToken(str, isTerminal);
}

void CFGReader::printRule(int index)
{
	grammar.printRule(index);
}

void CFGReader::print()
{
	grammar.print();
	grammar.printToken(true);
	grammar.buildTable();
	grammar.printTable();
	
	//grammar.printAllRules();
}




  
file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=80;
const int MaxRHSCount=80;
const int MaxRuleCount=200;
const int MaxTokenLength=10;
const int MaxProduction=10;



struct Token
{
	//bool isMeta;
	bool terminal;
	bool isNull;
	char* name;
	int production[MaxProduction];//pointer to the production it gives
	int count;
	int firstCount;
	int followCount;
	int follow[MaxTokenCount];
	int first[MaxTokenCount];
};

class Grammar
{
private:
	int table[MaxTokenCount][MaxTokenCount];
	int variables[MaxTokenCount];
	int terminals[MaxTokenCount];
	int tCount;
	int nCount;
	Token* token[MaxTokenCount];
	int tokenCount;
	int curIndex;//the index of current token
	int curPos;//the position at production rule
	//MaxRuleCount>=MaxVariableCount!!!
	int production[MaxRuleCount][MaxRHSCount];
	int prodIndex;//pointing to current production, initialized to -1
	int checkRecursion(int tIndex, int curToken);
	void grammarError(int theVar, int theToken, int rule1, int rule2);
	void addFirstIntoTable(int theVariable, int theFirst, int theRule);
	void addFollowIntoTable(int theVariable, int theRule);
	void addBeginEnd();
	bool addIntoFirst(int target, int source);
	bool addFirst(int target, int source);
	void shiftRule(int ruleIndex, bool Left2Right, int offset);
	void addAtEnd(int ruleIndex, int toAdd);
	void addRuleForToken(int tIndex, int ruleIndex);
	int copyRule(int source, int target);//return the position of -1
	void replaceRule(int curToken, int ruleIndex);
	int findMetaSymbol(int& begin, int& end);
	void initialize();
	void doReplaceMetaSymbol(int ruleIndex, int begin, int end);
	void removeRuleFromToken(int tIndex, int ruleIndex);
	void checkEpsilon(int ruleIndex);
	void removeImmediate(int tIndex, int ruleIndex);
	void doAddRule(int tIndex);
	void commonLeftFactor();
	int findCommonFactor(int tokenIndex, int ruleIndex);
	void removeLeftRecursion();
	void doCommonFactor(int tokenIndex, int ruleIndex, int index);
	int forgeToken(int index);//create a new variable
	void epsilonRule(int ruleIndex);
	bool isRedundant(int tIndex);
	void replaceMetaSymbol();
	void calculateFirst();
	void calculateFollow();
	void calculateNull();
	bool Null(int tIndex);
	bool addFollow(int target, int source);
	bool addIntoFollow(int target, int source);
	bool addFollowIntoFollow(int target, int source);
    void removeRedundant();
	void removeToken(int tIndex);
	//	void calculateProperty();
public:
	Grammar();
	void buildTable();
	void optimize();
	void printRule(int index);
	void print();
	void printTable();
	void printAllRules();
	void printToken(bool onlyVar=false);
	int variableCount();
	int terminalCount();
	void addRule(const char* str);
	void newRule(const char* str);//this is an internal method, not suitable for outside
	Token* operator[](int index);
	int addToken(const char* str, bool isTerminal=true);
	Token* createToken(const char* theName, bool isTerminal);
	int tokenIndex(const char* str);
};
file name: Grammar.cpp 
#include <iostream>
#include "Grammar.h"

using namespace std;

const int TokenTypeCount=38;
const char* emptyStr="e";
const char* optionBegin="{";
const char* optionEnd="}";
const char* startStr="START";
const char* endStr="$";
int startSymbolIndex=-1;
int stackBottomIndex=-1;
int beginIndex=-1;
int endIndex=-1;
int emptyIndex=-1;

char* terminalStr[TokenTypeCount]=
{
	//GENERAL TYPE 5
	"i", "n", "c", "COMMENT", "ERROR",
	//THE FOLLOWING ARE SYMBOL TYPE	18
	"(", ")", ";", "+", "-", "*", 
    "/", ":=", "<", ">", "=", "<=",
	">=", "!=", "[", "]", ",", 
	":", 
	//THE FOLLOWING ARE RESERVED TYPE 15
	"begin", "end", "program", "variables","integer", "array", "char", 
	"module", "if", "then", "else", "loop", "exit", "read", "write"
};

//these are "hash-table-like" tables
int token2type[MaxTokenCount];
int type2token[MaxTokenCount];

int matchTokens(const char* str)
{
	for (int i=0; i<TokenTypeCount; i++)
	{
		if (strcmp(terminalStr[i], str)==0)
		{
			return i;
		}
	}
	return -1;
}

void Grammar::printTable()
{
	/*
	cout<<"         ";
	for (int i=0; i<tokenCount; i++)
	{
		cout<<"| "<<i;
	}
	cout<<endl;
	*/
	
	for (int r=0; r<tokenCount; r++)
	{
		if (token[r]->terminal)
		{
			continue;
		}
		cout<<token[r]->name<<":";
		for (int c=0; c<tokenCount; c++)
		{
			if (table[r][c]!=-1)
			{
				cout<<token[c]->name<<"="<<table[r][c]<<",";
			}
		}
		cout<<endl;
	}
}



void Grammar::buildTable()
{
	int typeIndex;
	for (int r=0; r<tokenCount; r++)
	{	
		if (token[r]->terminal)
		{
			typeIndex=matchTokens(token[r]->name);
			if (typeIndex!=-1)
			{
				type2token[typeIndex]=r;
				token2type[r]=typeIndex;
			}	
		}
		//initialize
		for (int c=0; c<MaxTokenCount; c++)
		{
			table[r][c]=-1;
		}
	}

	for (int i=0; i<tokenCount; i++)
	{	
		if (token[i]->terminal)
		{
			continue;
		}
		for (int j=0; j<token[i]->count; j++)
		{
			int k=0, theRule=token[i]->production[j];
			int theToken=production[theRule][k];
			while (theToken!=-1)
			{
				addFirstIntoTable(i, theToken, theRule);
				if (!token[theToken]->isNull)
				{
					break;
				}
				k++;
				theToken=production[theRule][k];
			}
			if (theToken==-1)
			{
				addFollowIntoTable(i, theRule);
			}
		}
	}

}
	/*
	nCount=tCount=0;

	for (int r=0; r<MaxTokenCount; r++)
	{
		if (r<tokenCount)
		{
			if (token[r]->terminal)
			{
				typeIndex=matchTokens(token[r]->name);
				if (typeIndex!=-1)
				{
					type2token[typeIndex]=r;
					token2type[r]=typeIndex;
				}
				//I am making a invertable table!!!
				tArray[r]=tCount++;
			}
			else
			{
				//it is an invertable table!!!
				nArray[r]=nCount++;
			}
		}

		for (int c=0; c<MaxTokenCount; c++)
		{
			table[r][c]=-1;
		}
	}

	for (int i=0; i<tCount; i++)
	{
		
		for (int j=0; j<token[i]->count; j++)
		{
			int k=0, theRule=token[i]->production[j];
			int theToken=production[theRule][k];
			while (theToken!=-1)
			{
				addFirstIntoTable(theToken, j);
				if (!token[theToken]->isNull)
				{
					break;
				}
				k++;
				theToken=production[theRule][k];
			}
			if (theToken==-1)
			{
				addFollowIntoTable(j);
			}
		}
	}

}
*/

void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2)
{
	cout<<"error! the conflict of grammar at ";
	cout<<token[theVar]->name<<" for token of "
		<<token[theToken]->name
		<<" between rules of \n";
	printRule(rule1);
	cout<<"     and      ";
	printRule(rule2);
	cout<<endl;
}

void Grammar::addFollowIntoTable(int theVariable, int theRule)
{
	int temp;
	for (int i=0; i<token[theVariable]->followCount; i++)
	{
		temp=token[theVariable]->follow[i];
		if (table[theVariable][temp]!=theRule)
		{
			if (table[theVariable][temp]!=-1)
			{
				grammarError(theVariable, temp, theRule, table[theVariable][temp]);
			}
			table[theVariable][temp]=theRule;
		}
	}
}

void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule)
{
	for (int i=0; i<token[theFirst]->firstCount; i++)
	{
		if (table[theVariable][token[theFirst]->first[i]]!=theRule)
		{
			if (table[theVariable][token[theFirst]->first[i]]!=-1)
			{
				grammarError(theVariable, token[theFirst]->first[i], theRule,
					table[theVariable][token[theFirst]->first[i]]);
			}
			table[theVariable][token[theFirst]->first[i]]=theRule;
		}
	}
}


int Grammar::variableCount()
{
	int result=0;
	for (int i=0; i<tokenCount; i++)
	{
		if (!token[i]->terminal)
		{
			result++;
		}
	}
	return result;
}

int Grammar::terminalCount()
{
	int result=0;
	for (int i=0; i<tokenCount; i++)
	{
		if (token[i]->terminal)
		{
			result++;
		}
	}
	return result;
}



//leave the rule unremoved!!! as I have no way to do it!!!
void Grammar::removeToken(int tIndex)
{	
	delete[]token[tIndex]->name;
	delete token[tIndex];
	tokenCount--;
	for (int i=tIndex; i<tokenCount; i++)
	{
		token[i]=token[i+1];
	}
}
	

bool Grammar::isRedundant(int tIndex)
{
	int k, theRule, theToken;
	for (int i=0; i<tokenCount; i++)
	{
		for (int j=0; j<token[i]->count; j++)
		{
			k=0;
			theRule=token[i]->production[j];
			theToken=production[theRule][k];
			while (theToken!=-1)
			{
				if (theToken==tIndex)
				{
					return false;
				}
				k++;
				theToken=production[theRule][k];
			}
		}
	}
	return true;
}


//what is redundant? except the start variable
//the non-terminal never appears in other rules!
void Grammar::removeRedundant()
{
	int tIndex=1;
	bool findNew=false;
	while (tIndex<tokenCount)
	{
		findNew=false;
		if (!token[tIndex]->terminal)
		{
			if (isRedundant(tIndex))
			{
				removeToken(tIndex);
				findNew=true;
			}
		}
		if (findNew)
		{
			tIndex=1;
		}
		else
		{
			tIndex++;
		}
	}			
}

void Grammar::printAllRules()
{
/*
	cout<<"\nnow print all rules\n";
	for (int i=0; i<=prodIndex; i++)
	{
		cout<<"rule index "<<i<<": ";
		printRule(i);
		cout<<"\n";
	}
	*/
	int sum=0;
	for (int i=0; i<tokenCount; i++)
	{
		if (!token[i]->terminal)
		{
			sum+=token[i]->count;
		}
	}
	cout<<" total rules is:"<<prodIndex+1;
	cout<<"\n and the sum is "<<sum<<endl;

}



void Grammar::addBeginEnd()
{
	int begin=addToken(startStr, false);
	int end=addToken(endStr, true);
	prodIndex++;
	production[prodIndex][0]=0;
	production[prodIndex][1]=end;
	production[prodIndex][2]=-1;
	addRuleForToken(begin, prodIndex);
	startSymbolIndex=begin;
	stackBottomIndex=end;
}
/*
void Grammar::calculateProperty()
{
	calculateNull();
	calculateFirst();
	calculateFollow();
}
*/
void Grammar::printToken(bool onlyVar)
{
	for (int i=0; i<tokenCount; i++)
	{
		if (onlyVar)
		{
			if (token[i]->terminal)
			{
				continue;
			}
		}
		cout<<"name: "<<token[i]->name<<endl;
		cout<<"   isNull: "<<(token[i]->isNull?"true":"false")<<endl;
		cout<<"   isTerminal: "<<(token[i]->terminal?"true":"false")<<endl;
		cout<<"   first: ";
		for (int j=0; j<token[i]->firstCount; j++)
		{
			if (j!=0)
			{
				cout<<",";
			}
			cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name;			
		}
		cout<<"\n   follow: ";
		for (j=0; j<token[i]->followCount; j++)
		{
			if (j!=0)
			{
				cout<<",";
			}
			cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name;
		}
		cout<<endl;
	}
}

//must use while loop to discover new set!!!!!
void Grammar::calculateFirst()
{
	int i;
	bool addNew;
	do
	{
		i=0;
		addNew=false;
		while (i<tokenCount)
		{		
			//the terminal contains itself
			if (token[i]->terminal)
			{
				//addFirst don't judge if it is nullable!!
				addFirst(i, i);			
				//token[i]->first[token[i]->firstCount++]=i;
			}
			else
			{
				//for all its rules
				for (int j=0; j<token[i]->count; j++)
				{
					int theToken, k=0;
					theToken=production[token[i]->production[j]][k];
					//for each token in each rule
					do 
					{
						//add non epsilon set
						if (addIntoFirst(i, theToken))
						{
							addNew=true;
						}
						//if it is not null, means it is end
						if (!token[theToken]->isNull)
						{
							break;
						}
						//if it is nullable, continue
						k++;
						theToken=production[token[i]->production[j]][k];
					}while (theToken!=-1);
					//it means all token in this rule is nullable, so
					if (theToken==-1)
					{
						//it is nullable
						//addEpsilonIntoFirst(i);
						addFirst(i, emptyIndex);
					}
				}
			}
			i++;
		}		
	}while (addNew);
}

bool Grammar::addFollowIntoFollow(int target, int source)
{
	bool addNew=false;
	for (int i=0; i<token[source]->followCount; i++)
	{
		if (addFollow(target, token[source]->follow[i]))
		{
			addNew=true;
		}
	}
	return addNew;
}

bool Grammar::addIntoFollow(int target, int source)
{
	bool addNew=false;
	if (source==-1)
	{
		return false;
	}
	for (int i=0; i<token[source]->firstCount; i++)
	{
		//add non-epsilon
		if (!token[token[source]->first[i]]->isNull)
		{
			if (addFollow(target, token[source]->first[i]))
			{
				addNew=true;
			}
		}
	}
	return addNew;
}


void Grammar::calculateFollow()
{		
	int i;
	bool addNew, started;
	//token[startSymbolIndex]->follow[0]=stackBottomIndex;
	//token[startSymbolIndex]->followCount=1;
	do
	{
		i=0;
		addNew=false;
		while (i<tokenCount)
		{		
			//the terminal contains itself
			if (!token[i]->terminal)
			{
				for (int tIndex=0; tIndex<tokenCount; tIndex++)
				{
					//for all its rules
					if (token[tIndex]->terminal)
					{
						continue;
					}
					//for each its rule
					for (int j=0; j<token[tIndex]->count; j++)
					{
						int theToken, k=0, theRule=token[tIndex]->production[j];
						started=false;
						theToken=production[theRule][k];
						//for each token in each rule
						do 
						{
							//the token appears here
							if (theToken==i)
							{
								started=true;
								//add non epsilon set, including -1 situation!!!
								if (addIntoFollow(i, production[theRule][k+1]))
								{
									addNew=true;
								}
							}
							//if it is not null, means it is end
							if (started&&!token[theToken]->isNull)
							{
								break;
							}
							//if it is nullable, continue
							k++;
							theToken=production[theRule][k];
						}while (theToken!=-1);
						//it means all token in this rule is nullable, so
						if (started&&theToken==-1)
						{
							//it is nullable
							//add current variable Follow(j) into Follow(i);
							if (addFollowIntoFollow(i, tIndex))
							{
								addNew=true;
							}
						}
					}
				}					
			}
			i++;
		}		
	}while (addNew);
}

void Grammar::addRuleForToken(int tIndex, int ruleIndex)
{
	token[tIndex]->production[token[tIndex]->count++]=ruleIndex;
}

void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	if (left2Right)
	{
		for (int i=end; i>=0; i--)
		{
			production[ruleIndex][i+offset]=production[ruleIndex][i];
		}
	}
	else
	{
		for (int i=0; i<=end-offset; i++)
		{
			production[ruleIndex][i]=production[ruleIndex][i+offset];
		}
		checkEpsilon(ruleIndex);
	}
}

void Grammar::checkEpsilon(int ruleIndex)
{
	if (production[ruleIndex][0]==-1)
	{
		epsilonRule(ruleIndex);
	}
}


bool Grammar::addFollow(int target, int source)
{
	//check if it is already there
	for (int i=0; i<token[target]->followCount; i++)
	{
		if (token[target]->follow[i]==source)
		{
			return false;
		}
	}
	//add at end
	token[target]->follow[token[target]->followCount++]=source;
	return true;
}

bool Grammar::addFirst(int target, int source)
{
	//check if it is already there
	for (int i=0; i<token[target]->firstCount; i++)
	{
		if (token[target]->first[i]==source)
		{
			return false;
		}
	}
	//add at end
	token[target]->first[token[target]->firstCount++]=source;
	return true;
}

//add non epsilon into it.
bool Grammar::addIntoFirst(int target, int source)
{
	bool addNew=false;
	if (token[source]->terminal)
	{
		return addFirst(target, source);
	}
	for (int i=0; i<token[source]->firstCount; i++)
	{
		//add non epsilon
		if (!token[token[source]->first[i]]->isNull)
		{
			if (addFirst(target, token[source]->first[i]))
			{
				addNew=true;
			}
		}
	}
	return addNew;
}


bool Grammar::Null(int tIndex)
{
	if (token[tIndex]->terminal)
	{
		return token[tIndex]->isNull;		
	}
	for (int i=0; i<token[tIndex]->count; i++)
	{
		int j=0, theToken;
		theToken=production[token[tIndex]->production[i]][j];
		do
		{
			if (theToken==tIndex||!Null(theToken))
			{
				break;
			}
			j++;
			theToken=production[token[tIndex]->production[i]][j];
		}while (theToken!=-1);
		if (theToken==-1)
		{			
			return true;
		}
	}
	return false;
}

void Grammar::calculateNull()
{
	for (int i=0; i<tokenCount; i++)
	{
		token[i]->isNull=Null(i);	
	}
}
	
int Grammar::findMetaSymbol(int& begin, int& end)
{
	int theRule, theToken, k;
	begin=end=-1;
	for (int i=0; i<tokenCount; i++)
	{
		for (int j=0; j<token[i]->count; j++)
		{
			k=0;
			theRule=token[i]->production[j];
			theToken=production[theRule][k];
			while (theToken!=-1)
			{
				if (theToken==beginIndex)
				{
					begin=k;
				}
				if (theToken==endIndex)
				{
					end=k;
					return theRule;
				}
				k++;
				theToken=production[theRule][k];
			}
		}
	}
	return -1;
}

	
void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end)
{
	int newTokenIndex=forgeToken(0), i=0;

	addRuleForToken(newTokenIndex, ++prodIndex);
	//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	//token[newTokenIndex]->terminal=false;

	copyRule(ruleIndex, prodIndex);
	//shrink 
	while (production[ruleIndex][i+end+1]!=-1)
	{
		production[ruleIndex][begin+i]=production[ruleIndex][i+end+1];
		i++;
	}
	production[ruleIndex][begin+i]=-1;
	addAtEnd(ruleIndex, newTokenIndex);
	/*
	production[ruleIndex][begin]=newTokenIndex;
	production[ruleIndex][begin+1]=-1;
	*/

	shiftRule(prodIndex, false, begin+1);
	
	production[prodIndex][end-begin-1]=newTokenIndex;
	production[prodIndex][end-begin-1]=-1;
	epsilonRule(++prodIndex);
	addRuleForToken(newTokenIndex, prodIndex);
}
		
void Grammar::replaceMetaSymbol()
{
	int begin, end, ruleIndex;
	while ((ruleIndex=findMetaSymbol(begin, end))!=-1)
	{
		doReplaceMetaSymbol(ruleIndex, begin, end);
	}
	/*
	for (int i=0; i<tokenCount; i++)
	{
		//if (token[i]->isMeta)
		if (i==beginIndex||i==endIndex)
		{
			removeToken(i);
		}
	}
	*/
}

void Grammar::addAtEnd(int ruleIndex, int toAdd)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	production[ruleIndex][end++]=toAdd;
	production[ruleIndex][end]=-1;
}

//left-recursion:  A -> A a | b | c
//change to: A  -> b A' | c A'
//			 A' -> a A' | empty
void Grammar::removeImmediate(int tIndex, int ruleIndex)
{
	int newIndex=forgeToken(tIndex);
	int holdRuleIndex=token[tIndex]->production[ruleIndex];
	//sequence matters!

	//change to: A -> b A'
	for (int i=0; i<token[tIndex]->count; i++)
	{
		if (i!=ruleIndex)
		{
			addAtEnd(token[tIndex]->production[i], newIndex);
		}
	}
	
	//shift
	removeRuleFromToken(tIndex, ruleIndex);
	
	addRuleForToken(newIndex, holdRuleIndex);
	//token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex;

	shiftRule(holdRuleIndex, false, 1);
	addAtEnd(holdRuleIndex, newIndex);
	
	//add epsilon rule for new variable
	epsilonRule(++prodIndex);
	addRuleForToken(newIndex, prodIndex);
}


int Grammar::forgeToken(int index)
{
	char str[MaxTokenLength+2], ch;
	int len=strlen(token[index]->name);
	int temp=0, i=0;
	strcpy(str, token[index]->name);
	ch=str[len-1];//get last char of name
	if (ch>='0'&&ch<'9')
	{
		str[len-1]=ch+i+1;
	}
	else
	{
		str[len]='0'+i;
		str[len+1]='\0';
	}
	//I won't bother to check repetitation of name
	while (tokenIndex(str)!=-1)
	{
		i++;
		if (ch>='0'&&ch<'9')
		{
			str[len-1]=ch+i+1;
		}
		else
		{
			str[len]='0'+i;
			str[len+1]='\0';
		}
	}

	return addToken(str, false);//it is non-terminal for sure	
}

int Grammar::copyRule(int source, int target)
{
	int i=0;
	while (production[source][i]!=-1) 
	{
		production[target][i]=production[source][i];
		i++;
	}
	production[target][i]=-1;
	return i;
}

void Grammar::doAddRule(int tIndex)
{	
	token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}

void Grammar::addRule(const char* str)
{
	int index;
	index=addToken(str);
	production[prodIndex][curPos++]=index;
	production[prodIndex][curPos]=-1;//set end
}


//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
	int index;
	index=tokenIndex(str);
	if (index==-1)
	{
		index=tokenCount;
	}
	else
	{		
		return index;
	}
	token[index]=createToken(str, isTerminal);
	if (strcmp(str, optionBegin)==0)
	{
		beginIndex=index;
	}
	if (strcmp(str, optionEnd)==0)
	{
		endIndex=index;
	}
	tokenCount++;
	if (strcmp(str, emptyStr)==0)
	{
		token[index]->isNull=true;
		emptyIndex=index;
	}
	return index;
}

void Grammar::newRule(const char* str)
{
	if (str!=NULL)
	{
		curIndex=addToken(str, false);
	}
	//add one pointer
	token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
	token[curIndex]->terminal=false;
	curPos=0;//reset to restart;
}

Token* Grammar::createToken(const char* theName, bool isTerminal)
{
	Token* ptr=new Token;
	ptr->name=new char[strlen(theName)+1];
	strcpy(ptr->name, theName);
	ptr->terminal=isTerminal;
	ptr->count=ptr->firstCount=ptr->followCount=0;
	ptr->isNull=false;
	return ptr;
}

int Grammar::tokenIndex(const char* str)
{
	for (int i=0; i<tokenCount; i++)
	{
		if (strcmp(str, token[i]->name)==0)
		{
			return i;
		}
	}
	return -1;
}

int Grammar::checkRecursion(int tIndex, int curToken)
{
	for (int i=0; i<token[curToken]->count; i++)
	{
		//token[tIndex]->production[i]=ruleIndex
		//production[ruleIndex][0] is the first left-recursion one
		if (production[token[curToken]->production[i]][0]<=curToken)
		{
			return i;
		}
	}
	return -1;
}

void Grammar::printRule(int index)
{
	int nodeIndex=0;
	cout<<" ~"<<index<<"~ ";
	while (production[index][nodeIndex]!=-1)
	{
		cout<<token[production[index][nodeIndex]]->name<<" ";
		nodeIndex++;
	}
}



void Grammar::initialize()
{
	tokenCount=curIndex=curPos=0;
	prodIndex=-1;//in order to ++ blindly
}

Grammar::Grammar()
{
	initialize();
}

void Grammar::removeLeftRecursion()
{
	int tIndex=0, curToken=0;
	bool newChange=false;
	while (tIndex<tokenCount)
	{		
		if (!token[tIndex]->terminal)
		{
			for (int i=0; i<token[tIndex]->count; i++)
			{
				curToken=production[token[tIndex]->production[i]][0];
				if (curToken<=tIndex&&!token[curToken]->terminal)
				{
					if (curToken!=tIndex)
					{
						replaceRule(tIndex, i);
					}
					else
					{
						removeImmediate(tIndex, i);
					}
					newChange=true;				
				}
			}
		}
		//whenever there is some new findings, restart
		if (!newChange)
		{
			tIndex++;
		}
		else
		{
			tIndex=0;
			newChange=false;
		}
	}
}

void Grammar::replaceRule(int tIndex, int ruleIndex)
{
	int pos, j, targetEnd, sourceEnd, curRule;
	curRule=token[tIndex]->production[ruleIndex];
	int curToken=production[curRule][0];
	for (int i=token[curToken]->count-1; i>=0; i--)
	{
		if (i>0)
		{
			doAddRule(tIndex);
			pos=copyRule(token[curToken]->production[i], prodIndex);
			j=0;
			while (production[curRule][j+1]!=-1)
			{
				production[prodIndex][pos+j]=production[curRule][j+1];
				j++;
			}
			production[prodIndex][pos+j]=-1;
			//addRuleForToken(curToken, prodIndex);
		}
		else
		{
			targetEnd=sourceEnd=0;
			//curRule=token[tIndex]->production[ruleIndex];
			while (true)
			{
				if (production[token[curToken]->production[0]][sourceEnd]==-1&&
				production[curRule][targetEnd]==-1)
				{
					break;
				}
				if (production[token[curToken]->production[0]][sourceEnd]!=-1)
				{
					sourceEnd++;
				}
				if (production[curRule][targetEnd]!=-1)
				{
					targetEnd++;
				}
			}
			j=targetEnd+sourceEnd-1;
			while (j>=0)
			{
				if (j>=sourceEnd)
				{
					production[curRule][j]=production[curRule][j-sourceEnd+1];
				}
				else
				{
					production[curRule][j]=production[token[curToken]->production[0]][j];
				}
				j--;
			}
		}
	}
}


//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
	replaceMetaSymbol();
	removeLeftRecursion();
	commonLeftFactor();
	//removeRedundant();
	addBeginEnd();
	calculateNull();
	calculateFirst();
	calculateFollow();
}

int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
	for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
	{	
		//if the two rule has the same first token
		if (production[token[tIndex]->production[ruleIndex]][0]==
			production[token[tIndex]->production[i]][0])
		{
			/*
			//calculate if there is epsilon
			if (emptyIndex==-1)
			{
				emptyIndex=tokenIndex(emptyStr);
			}
			//if it is not epsilon
			if (production[token[tIndex]->production[i]][0]!=emptyIndex)
			{
				return i;
			}
			*/
			return i;
		}
	}
	return -1;
}

void Grammar::epsilonRule(int ruleIndex)
{
	production[ruleIndex][0]=addToken(emptyStr);
	production[ruleIndex][1]=-1;
}

//sample:  x -> Aa 
//         x -> Ab
//changed to:  x -> Ax' //this is to change the old rule
//             x' -> b //this is to change the old rule
//             x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
	int newTokenIndex=forgeToken(tIndex);//create a new variable
	//move the second and after part to the new rule of new variable
	//doing: x' -> a
	//curPos=0;
	//prepare to add one new rule 	
	addRuleForToken(newTokenIndex, ++prodIndex);
	//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	token[newTokenIndex]->terminal=false;

	copyRule(token[tIndex]->production[ruleIndex], prodIndex);
	shiftRule(prodIndex, false, 1);
	/*
	do
	{
		
		//do copying
		production[prodIndex][curPos]=
			production[token[tIndex]->production[ruleIndex]][curPos+1];
		curPos++;
	//even the -1 at end is copied
	}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
	*/

	//in order to show an empty string, in case the string is "epsilon"

	
	/*
	if (curPos==1)
	{
		epsilonRule(prodIndex);
	}
	*/


	//replace x -> Aa with x -> Ax'
	production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
	production[token[tIndex]->production[ruleIndex]][2]=-1;//end
	
	//doing: x' -> b
	//curPos=0;
	//prepare to add one new rule 
	//pointing new token to where old rule lies
	addRuleForToken(newTokenIndex, token[tIndex]->production[index]);
	/*
	token[newTokenIndex]->production[token[newTokenIndex]->count++]=
		token[tIndex]->production[index];
		*/

	shiftRule(token[tIndex]->production[index], false, 1);

	removeRuleFromToken(tIndex, index);
}

void Grammar::removeRuleFromToken(int tIndex, int ruleIndex)
{
	token[tIndex]->count--;
	for (int i=ruleIndex; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}	
}


void Grammar::commonLeftFactor()
{
	int index=-1, tIndex=0, ruleIndex=0;
	bool flag;
	//whenever a newrule is done, restart!
	while (tIndex<tokenCount)
	{
		ruleIndex=0;
		flag=false;		
		while (ruleIndex<token[tIndex]->count)
		{			
			index=findCommonFactor(tIndex, ruleIndex);
			if (index!=-1)
			{
				doCommonFactor(tIndex, ruleIndex, index);
				//restart
				flag=true;
				break;
			}
			else
			{
				ruleIndex++;
			}
		}
		if (flag)
		{
			tIndex=0;		
		}
		else
		{
			tIndex++;
		}
	}
}

Token* Grammar::operator[](int index)
{
	if (index>=0&&index<tokenCount)
	{
		return token[index];
	}
	else
	{
		return NULL;
	}
}

void Grammar::print()
{
	for (int i=0; i<tokenCount; i++)
	{		
		if (!token[i]->terminal)
		{		
			cout<<token[i]->name<<" ==> ";
			for (int j=0; j<token[i]->count; j++)
			{			
				printRule(token[i]->production[j]);	
				if (j!=token[i]->count-1)
				{
					cout<<" | ";
				}
			}
			cout<<"\n";
		}		
	}
}




file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"

using namespace std;

int main()
{
	CFGReader R;
	R.readFromFile("c:\\ruleTest.txt");
    R.optimize();
	R.print();

	return 0;
}
 
 
 
 
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->" and "{", "}" is meta symbol 
equivalent to Kaleen star. 
M -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml Mo | e
Mo -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
L -> i Ar
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B | read Ln ; | write Lo ; | e ;
Lp -> e | Ln
Ln -> L { , L }
Lo -> Lr { , Lr }
Lr -> i Ar | n | c
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >= | !=


Here is the result:
M ==>  ~0~ program i ; Dl B
Dl ==>  ~1~ Dv Ml
B ==>  ~17~ begin Sl end ;
Dv ==>  ~5~ variables Vl  |  ~6~ e
Ml ==>  ~3~ e Ml0
Mo ==>  ~4~ module i ( Vl ) Dv B
Vl ==>  ~8~ V Vl0
V ==>  ~9~ Il : T ;
Il ==>  ~12~ i Il0
T ==>  ~10~ integer Ad ;  |  ~11~ char Ad
Ad ==>  ~15~ e  |  ~16~ array [ n ]
L ==>  ~14~ i Ar
Ar ==>  ~36~ e  |  ~37~ [ E ]
Sl ==>  ~18~ S Sl0
S ==>  ~20~ i S0  |  ~21~ if C then S else S  |  ~22~ loop Sl end ;  |  ~23~ exit ;  |  ~25~ begin S
l end ;  |  ~26~ read Ln ;  |  ~27~ write Lo ;  |  ~28~ e ;
E ==>  ~38~ F M0
C ==>  ~48~ F M0 Or E
Lp ==>  ~29~ e  |  ~30~ Ln
Ln ==>  ~31~ i Ar M1
Lo ==>  ~32~ Lr M2
Lr ==>  ~33~ i Ar  |  ~34~ n  |  ~35~ c
F ==>  ~41~ R M3
Oa ==>  ~39~ +  |  ~40~ -
R ==>  ~44~ i Ar  |  ~45~ n  |  ~46~ ( E )  |  ~47~ c
Om ==>  ~42~ *  |  ~43~ /
Or ==>  ~49~ =  |  ~50~ <  |  ~51~ >  |  ~52~ <=  |  ~53~ >=  |  ~54~ !=
M0 ==>  ~55~ + F  |  ~56~ e  |  ~66~ - F
M1 ==>  ~57~ , L  |  ~58~ e
M2 ==>  ~59~ , Lr  |  ~60~ e
M3 ==>  ~61~ * R  |  ~62~ e  |  ~67~ / R
Ml0 ==>  ~2~ module i ( Vl ) Dv B Ml0  |  ~63~ e
Vl0 ==>  ~7~ i Il0 : T ; Vl0  |  ~64~ e
Il0 ==>  ~13~ , i Il0  |  ~65~ e
Sl0 ==>  ~68~ e  |  ~19~ Sl
S0 ==>  ~69~ Ar := E ;  |  ~24~ ( Lp ) ;
START ==>  ~70~ M $
name: M
   isNull: false
   isTerminal: false
   first: [0]=program
   follow: [0]=$
name: Dl
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=variables,[2]=module
   follow: [0]=begin
name: B
   isNull: false
   isTerminal: false
   first: [0]=begin
   follow: [0]=module
name: Dv
   isNull: true
   isTerminal: false
   first: [0]=variables,[1]=e
   follow: [0]=module,[1]=begin
name: Ml
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=module
   follow: [0]=begin
name: Mo
   isNull: false
   isTerminal: false
   first: [0]=module
   follow:
name: Vl
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=)
name: V
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=i
name: Il
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=:
name: T
   isNull: false
   isTerminal: false
   first: [0]=integer,[1]=char
   follow: [0]=;
name: Ad
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=array
   follow: [0]=;
name: L
   isNull: false
   isTerminal: false
   first: [0]=i
   follow:
name: Ar
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=[
   follow: [0]=,,[1]=:=,[2]=;,[3]=*,[4]=/
name: Sl
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;
   follow: [0]=end
name: S
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=e,[8]=;
   follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=else
name: E
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=],[1]=),[2]=;
name: C
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=then
name: Lp
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i
   follow: [0]=)
name: Ln
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=;
name: Lo
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=;
name: Lr
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=,
name: F
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=+,[1]=-
name: Oa
   isNull: false
   isTerminal: false
   first: [0]=+,[1]=-
   follow:
name: R
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=*,[1]=/
name: Om
   isNull: false
   isTerminal: false
   first: [0]=*,[1]=/
   follow:
name: Or
   isNull: false
   isTerminal: false
   first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!=
   follow: [0]=i,[1]=n,[2]=(,[3]=c
name: M0
   isNull: true
   isTerminal: false
   first: [0]=+,[1]=e,[2]=-
   follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!=
name: M1
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;
name: M2
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;
name: M3
   isNull: true
   isTerminal: false
   first: [0]=*,[1]=e,[2]=/
   follow: [0]=+,[1]=-
name: Ml0
   isNull: true
   isTerminal: false
   first: [0]=module,[1]=e
   follow: [0]=begin
name: Vl0
   isNull: true
   isTerminal: false
   first: [0]=i,[1]=e
   follow: [0]=)
name: Il0
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=:
name: Sl0
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=;
   follow: [0]=end
name: S0
   isNull: false
   isTerminal: false
   first: [0]=[,[1]=:=,[2]=(
   follow:
name: START
   isNull: false
   isTerminal: false
   first: [0]=program
   follow:
M:program=0,
Dl:e=1,module=1,variables=1,begin=1,
B:begin=17,
Dv:e=6,module=6,variables=5,begin=6,
Ml:e=3,module=3,begin=3,
Mo:module=4,
Vl:i=8,
V:i=9,
Il:i=12,
T:integer=10,char=11,
Ad:;=15,e=15,array=16,
L:i=14,
Ar:;=36,e=36,,=36,[=37,:==36,*=36,/=36,
Sl:i=18,;=18,e=18,begin=18,if=18,loop=18,exit=18,read=18,write=18,
S:i=20,;=28,e=28,begin=25,if=21,loop=22,exit=23,read=26,write=27,
E:i=38,(=38,n=38,c=38,
C:i=48,(=48,n=48,c=48,
Lp:i=30,e=29,)=29,
Ln:i=31,
Lo:i=32,n=32,c=32,
Lr:i=33,n=34,c=35,
F:i=41,(=41,n=41,c=41,
Oa:+=39,-=40,
R:i=44,(=46,n=45,c=47,
Om:*=42,/=43,
Or:==49,<=50,>=51,<==52,>==53,!==54,
M0:;=56,e=56,)=56,]=56,+=55,-=66,==56,<=56,>=56,<==56,>==56,!==56,
M1:;=58,e=58,,=57,
M2:;=60,e=60,,=59,
M3:e=62,+=62,-=62,*=61,/=67,
Ml0:e=63,module=2,begin=63,
Vl0:i=7,e=64,)=64,
Il0:e=65,:=65,,=13,
Sl0:i=19,;=19,e=68,begin=19,end=68,if=19,loop=19,exit=19,read=19,write=19,
S0:e=69,(=24,[=69,:==69,
START:program=70,
Press any key to continue






                                 back.gif (341 bytes)       up.gif (335 bytes)         next.gif (337 bytes)