CFG Reader(removal-left-recursion1)
A. Third Edition
This is third edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I finished
the first part of removal-left-recursion.
What is RLR?
Example: A -> A a | b
Here you see the variable A which appears the left-most of rule and this will produce infinite loop for Recursive-
Descent parsing. For table-driven parsing, it is the same problem. So, how to remove?
A general algorithm is to index all variables and make sure all the left-most variable in any rule has no small or
equal index number than the left-hand-side.
For example,
1) A -> b a | b b
2) B -> A a | b
on 2) LHS is B which has a bigger index than left-most variable A in "B -> A a". This should be replaced with
whatever of A represents:
1) A -> b a | b b
2) B -> b a a | b b a | b
This is just what I have done in this edition.
In next edition I will remove the immediate left recursion.
program -> stmt-sequence
  stmt-sequence -> stmt-sequence ; statement | statement
  statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
  if-stmt -> if boolean-exp then stmt-sequence end
  | if boolean-exp then stmt-sequence else stmt-sequence end
  repeat-stmt -> repeat stmt-sequence until boolean-exp
  assign-stmt -> identifier := integer-exp
  read-stmt -> read identifier
  write-stmt -> write integer-exp
  boolean-exp -> integer-exp comparison-op integer-exp
  comparison-op -> < | =
  integer-exp -> integer-exp addop term | term
  addop -> + | -
  term -> term mulop factor | factor
  mulop -> * | /
  factor -> ( integer-exp ) | number | identifier
1) A -> b a
2) B -> A a | b
Modified:
1) A -> b a
2) B -> b a a | b
Case 2: The replaced variable has more than one rules, say n. We must invent n-1 rules at end. These
new n-1 rules are all beginning with the replaced plus the original rule minus the first to-be-replaced
variable. Then for the original rule, just modify it like case 1.
i.e.
1) A -> b a | b b
2) B -> A a | b
after replaced
1) A -> b a | b b
2) B -> b a a | b
| B -> b b a //this should be newly-invented rule placed at end.
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
  file name: CFGReader.h
#include "Grammar.h"
class CFGReader
{
private:
	char buffer[BufferLength];
	FILE* stream;
	void newRule(const char* str);
	void readRule();
	void addRule(const char* str);
	int addToken(const char* str, bool isTerminal=true);
	void printRule(int index);
public:
	void readFromFile(const char* fileName);
	void optimize();
	void print();
};
file name: CFGReader.cpp 
#include <iostream>
#include "CFGReader.h"
using namespace std;
Grammar grammar;
const char* deliStr=" |\n";
//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?
void CFGReader::readFromFile(const char* fileName)
{
	if ((stream=fopen(fileName, "r"))==NULL)
	{
		cout<<"cannot open file "<<fileName<<endl;
	}
	else
	{		
		readRule();		
	}
}
void CFGReader::readRule()
{
	char* ptr=buffer;
	char* hold;
	int count=0;
	while (!feof(stream))
	{
		count=0;
		fgets(buffer, BufferLength-1, stream);
		ptr=buffer;	
		
		while ((ptr=strtok(ptr, " \n"))!=NULL)
		{
			if (strcmp(ptr, "|")!=0)
			{
				//test the first two token to see if old rule continues
				if (count==0)				
				{
					hold=ptr;
				}
				if (count==1)
				{
					if (strcmp("->", ptr)==0)
					{			
						/*
						curIndex=addToken(hold, false);//the index of cur token	
						grammar[curIndex]->terminal=false;
						newRule();	
						*/
						newRule(hold);
					}
					else
					{								
						addRule(hold);
						addRule(ptr);															
					}
				}
				if (count>=2)//always add
				{
					addRule(ptr);
				}
				count++;
			}
			else
			{
				//this is an alternative production rule
				newRule(NULL);						
			}
			ptr=NULL;
		}		
	}
}
void CFGReader::newRule(const char* str)
{
	grammar.newRule(str);
}
void CFGReader::optimize()
{
	grammar.optimize();
}
void CFGReader::addRule(const char* str)
{
	grammar.addRule(str);
}
int CFGReader::addToken(const char* str, bool isTerminal)
{
	return grammar.addToken(str, isTerminal);
}
void CFGReader::printRule(int index)
{
	grammar.printRule(index);
}
void CFGReader::print()
{
	grammar.print();
}
  file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=50;
const int MaxRHSCount=8;
const int MaxRuleCount=50;
const int MaxTokenLength=10;
const int MaxProduction=10;
struct Token
{
	bool terminal;
	char* name;
	int production[MaxProduction];//pointer to the production it gives
	int count;
	int firstCount;
	int followCount;
	int follow[MaxTokenCount];
	int first[MaxTokenCount];
};
class Grammar
{
private:
	Token* token[MaxTokenCount];
	int tokenCount;
	int curIndex;//the index of current token
	int curPos;//the position at production rule
	//MaxRuleCount>=MaxVariableCount!!!
	int production[MaxRuleCount][MaxRHSCount];
	int prodIndex;//pointing to current production, initialized to -1
	int checkRecursion(int tIndex, int curToken);
	int copyRule(int source, int target);//return the position of -1
	void replaceRule(int curToken, int ruleIndex);
	void initialize();
	void doAddRule(int tIndex);
	void commonLeftFactor();
	int findCommonFactor(int tokenIndex, int ruleIndex);
	void removeLeftRecursion();
	void doCommonFactor(int tokenIndex, int ruleIndex, int index);
	int forgeToken(int index);//create a new variable
	void epsilonRule(int ruleIndex);
public:
	Grammar();
	void optimize();
	void printRule(int index);
	void print();
	void addRule(const char* str);
	void newRule(const char* str);//this is an internal method, not suitable for outside
	Token* operator[](int index);
	int addToken(const char* str, bool isTerminal=true);
	Token* createToken(const char* theName, bool isTerminal);
	int tokenIndex(const char* str);
};
file name: Grammar.cpp 
#include <iostream>
#include "Grammar.h"
using namespace std;
const char* emptyStr="empty";
int emptyIndex=-1;
int Grammar::forgeToken(int index)
{
	char str[MaxTokenLength+2], ch;
	int len=strlen(token[index]->name);
	int temp=0, i=0;
	strcpy(str, token[index]->name);
	ch=str[len-1];//get last char of name
	if (ch>='0'&&ch<'9')
	{
		str[len-1]=ch+i+1;
	}
	else
	{
		str[len]='0'+i;
		str[len+1]='\0';
	}
	//I won't bother to check repetitation of name
	while (tokenIndex(str)!=-1)
	{
		i++;
		if (ch>='0'&&ch<'9')
		{
			str[len-1]=ch+i+1;
		}
		else
		{
			str[len]='0'+i;
			str[len+1]='\0';
		}
	}
	return addToken(str, false);//it is non-terminal for sure	
}
int Grammar::copyRule(int source, int target)
{
	int i=0;
	while (production[source][i]!=-1) 
	{
		production[target][i]=production[source][i];
		i++;
	}
	production[target][i]=-1;
	return i;
}
void Grammar::doAddRule(int tIndex)
{	
	token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}
void Grammar::addRule(const char* str)
{
	int index;
	index=addToken(str);
	production[prodIndex][curPos++]=index;
	production[prodIndex][curPos]=-1;//set end
}
//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
	int index;
	index=tokenIndex(str);
	if (index==-1)
	{
		index=tokenCount;
	}
	else
	{		
		return index;
	}
	token[index]=createToken(str, isTerminal);	
	tokenCount++;
	return index;
}
void Grammar::newRule(const char* str)
{
	if (str!=NULL)
	{
		curIndex=addToken(str, false);
	}
	//add one pointer
	token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
	token[curIndex]->terminal=false;
	curPos=0;//reset to restart;
}
Token* Grammar::createToken(const char* theName, bool isTerminal)
{
	Token* ptr=new Token;
	ptr->name=new char[strlen(theName)+1];
	strcpy(ptr->name, theName);
	ptr->terminal=isTerminal;
	ptr->count=ptr->firstCount=ptr->followCount=0;
	return ptr;
}
int Grammar::tokenIndex(const char* str)
{
	for (int i=0; i<tokenCount; i++)
	{
		if (strcmp(str, token[i]->name)==0)
		{
			return i;
		}
	}
	return -1;
}
int Grammar::checkRecursion(int tIndex, int curToken)
{
	for (int i=0; i<token[curToken]->count; i++)
	{
		//token[tIndex]->production[i]=ruleIndex
		//production[ruleIndex][0] is the first left-recursion one
		if (production[token[curToken]->production[i]][0]<=curToken)
		{
			return i;
		}
	}
	return -1;
}
void Grammar::printRule(int index)
{
	int nodeIndex=0;
	while (production[index][nodeIndex]!=-1)
	{
		cout<<token[production[index][nodeIndex]]->name<<" ";
		nodeIndex++;
	}
}
void Grammar::initialize()
{
	tokenCount=curIndex=curPos=0;
	prodIndex=-1;//in order to ++ blindly
}
Grammar::Grammar()
{
	initialize();
}
void Grammar::removeLeftRecursion()
{
	int tIndex=0, curToken=0;
	bool newChange=false;
	while (tIndex<tokenCount)
	{		
		if (!token[tIndex]->terminal)
		{
			for (int i=0; i<token[tIndex]->count; i++)
			{
				curToken=production[token[tIndex]->production[i]][0];
				if (curToken<=tIndex&&!token[curToken]->terminal)
				{
					if (curToken!=tIndex)
					{
						replaceRule(tIndex, i);
					}
					else
					{
						//generateRule(tIndex, i);
					}
					newChange=true;				
				}
			}
		}
		//whenever there is some new findings, restart
		if (!newChange)
		{
			tIndex++;
		}
		else
		{
			tIndex=0;
			newChange=false;
		}
	}
}
void Grammar::replaceRule(int tIndex, int ruleIndex)
{
	int pos, j, targetEnd, sourceEnd, curRule;
	int curToken=production[token[tIndex]->production[ruleIndex]][0];
	for (int i=token[curToken]->count-1; i>=0; i--)
	{
		if (i>0)
		{
			doAddRule(tIndex);
			pos=copyRule(token[curToken]->production[i], prodIndex);
			j=0;
			while (production[token[tIndex]->production[ruleIndex]][j+1]!=-1)
			{
				production[prodIndex][pos+j]=
					production[token[tIndex]->production[ruleIndex]][j+1];
				j++;
			}
			production[prodIndex][pos+j]=-1;
		}
		else
		{
			targetEnd=sourceEnd=0;
			curRule=token[tIndex]->production[ruleIndex];
			while (true)
			{
				if (production[token[curToken]->production[0]][sourceEnd]==-1&&
				production[curRule][targetEnd]==-1)
				{
					break;
				}
				if (production[token[curToken]->production[0]][sourceEnd]!=-1)
				{
					sourceEnd++;
				}
				if (production[curRule][targetEnd]!=-1)
				{
					targetEnd++;
				}
			}
			j=targetEnd+sourceEnd-1;
			while (j>=0)
			{
				if (j>=sourceEnd)
				{
					production[curRule][j]=production[curRule][j-sourceEnd+1];
				}
				else
				{
					production[curRule][j]=production[token[curToken]->production[0]][j];
				}
				j--;
			}
		}
	}
}
//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
	commonLeftFactor();
	removeLeftRecursion();
}
int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
	for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
	{	
		//if the two rule has the same first token
		if (production[token[tIndex]->production[ruleIndex]][0]==
			production[token[tIndex]->production[i]][0])
		{
			/*
			//calculate if there is epsilon
			if (emptyIndex==-1)
			{
				emptyIndex=tokenIndex(emptyStr);
			}
			//if it is not epsilon
			if (production[token[tIndex]->production[i]][0]!=emptyIndex)
			{
				return i;
			}
			*/
			return i;
		}
	}
	return -1;
}
void Grammar::epsilonRule(int ruleIndex)
{
	production[ruleIndex][0]=addToken(emptyStr);
	production[ruleIndex][1]=-1;
}
//sample:  x -> Aa 
//         x -> Ab
//changed to:  x -> Ax' //this is to change the old rule
//             x' -> b //this is to change the old rule
//             x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
	int newTokenIndex=forgeToken(tIndex);//create a new variable
	//move the second and after part to the new rule of new variable
	//doing: x' -> a
	curPos=0;
	//prepare to add one new rule 	
	token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	token[newTokenIndex]->terminal=false;
	do
	{
		//do copying
		production[prodIndex][curPos]=
			production[token[tIndex]->production[ruleIndex]][curPos+1];
		curPos++;
	//even the -1 at end is copied
	}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
	//in order to show an empty string, in case the string is "epsilon"
	if (curPos==1)
	{
		epsilonRule(prodIndex);
	}
	//replace x -> Aa with x -> Ax'
	production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
	production[token[tIndex]->production[ruleIndex]][2]=-1;//end
	
	//doing: x' -> b
	curPos=0;
	//prepare to add one new rule 
	//pointing new token to where old rule lies
	token[newTokenIndex]->production[token[newTokenIndex]->count++]=
		token[tIndex]->production[index];
	do 
	{
		//left-shift productions
		production[token[tIndex]->production[index]][curPos]=
			production[token[tIndex]->production[index]][curPos+1];
		curPos++;
	}while (production[token[tIndex]->production[index]][curPos]!=-1);
	
	//add epsilon
	if (curPos==1)
	{
		epsilonRule(token[tIndex]->production[index]);
	}
	//remove from token's production array the rule-index----"index"
	//shrink the array by one
	token[tIndex]->count--;
	for (int i=index; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}
}
void Grammar::commonLeftFactor()
{
	int index=-1, tIndex=0, ruleIndex=0;
	bool flag;
	//whenever a newrule is done, restart!
	while (tIndex<tokenCount)
	{
		ruleIndex=0;
		flag=false;		
		while (ruleIndex<token[tIndex]->count)
		{			
			index=findCommonFactor(tIndex, ruleIndex);
			if (index!=-1)
			{
				doCommonFactor(tIndex, ruleIndex, index);
				//restart
				flag=true;
				break;
			}
			else
			{
				ruleIndex++;
			}
		}
		if (flag)
		{
			tIndex=0;		
		}
		else
		{
			tIndex++;
		}
	}
}
Token* Grammar::operator[](int index)
{
	if (index>=0&&index<tokenCount)
	{
		return token[index];
	}
	else
	{
		return NULL;
	}
}
void Grammar::print()
{
	for (int i=0; i<tokenCount; i++)
	{		
		if (!token[i]->terminal)
		{
			cout<<token[i]->name<<" ==> ";
			for (int j=0; j<token[i]->count; j++)
			{			
				printRule(token[i]->production[j]);	
				if (j!=token[i]->count-1)
				{
					cout<<" | ";
				}
			}
			cout<<"\n";
		}		
	}
}
file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"
using namespace std;
int main()
{
	CFGReader R;
	R.readFromFile("c:\\ruleTest.txt");
    R.optimize();
	R.print();
	return 0;
}
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->".
A -> a | b B -> A a | a b
Here is the result:
A ==> a | b B ==> a a | a b | b a Press any key to continue