#include "stdafx.h"

// Skips the run of space characters at the front of buffer[0..size).
// Returns the index of the first character that is not a space, or -1 when
// the range is empty or holds nothing but spaces.
int skipSpaceW(LPCTSTR buffer, int size)
{
	for (int pos = 0; pos < size; ++pos)
	{
		if (buffer[pos] != _T(' '))
		{
			return pos;
		}
	}
	return -1;
}

// Narrow-character counterpart of skipSpaceW.
// Returns the index of the first non-space character in buffer[0..size), or
// -1 when the range is empty or holds nothing but spaces.
int skipSpaceA(LPCSTR buffer, int size)
{
	for (int pos = 0; pos < size; ++pos)
	{
		if (buffer[pos] != ' ')
		{
			return pos;
		}
	}
	return -1;
}


// Case-insensitive search for ch in buffer[0..size) that ignores everything
// between a pair of double quotes.
// Returns the index of the first match outside a quoted region, or -1.
// The match test runs before the quote test, so searching for '"' itself
// reports the opening quote.
int findFirstCharW(LPCTSTR buffer, int size, TCHAR ch)
{
	bool quoted = false;
	for (int pos = 0; pos < size; ++pos)
	{
		const TCHAR current = buffer[pos];
		if (quoted)
		{
			// Inside quotes only the closing double quote is of interest.
			if (current == _T('"'))
			{
				quoted = false;
			}
			continue;
		}
		if (towupper(current) == towupper(ch))
		{
			return pos;
		}
		if (current == _T('"'))
		{
			quoted = true;
		}
	}
	return -1;
}

// Case-insensitive search for subStr inside buffer[0..size).
// Candidate positions come from findFirstCharW on the first character of
// subStr, so candidates inside double-quoted regions are skipped.
// Returns the index where subStr starts, or -1 when it does not occur or the
// remaining input is shorter than subStr.
int findFirstStringW(LPCTSTR buffer, int size, LPCTSTR subStr)
{
	const int needleLen = _tcslen(subStr);
	int pos = 0;
	while (pos < size)
	{
		const int hit = findFirstCharW(buffer + pos, size - pos, subStr[0]);
		if (hit < 0)
		{
			break;
		}
		pos += hit;
		if (size - pos < needleLen)
		{
			// Not enough characters left for a full match.
			break;
		}
		if (_tcsnicmp(buffer + pos, subStr, needleLen) == 0)
		{
			return pos;
		}
		// First character matched but the rest did not: resume one further on.
		++pos;
	}
	return -1;
}

// Checks whether the first non-space character of buffer[0..size) equals ch
// (case-insensitively).
// Returns that character's index on a match; -1 when the first non-space
// character is something else or the range contains only spaces.
int findNextCharW(LPCTSTR buffer, int size, TCHAR ch)
{
	int pos = 0;
	while (pos < size && buffer[pos] == _T(' '))
	{
		++pos;
	}
	if (pos >= size)
	{
		return -1;
	}
	return (towupper(buffer[pos]) == towupper(ch)) ? pos : -1;
}

// Checks whether buffer[0..size), after optional leading spaces, begins with
// subStr (case-insensitively).
// Returns the index where subStr starts, or -1 when the next token is not
// subStr or too few characters remain.
int findNextStringW(LPCTSTR buffer, int size, LPCTSTR subStr)
{
	const int pos = findNextCharW(buffer, size, subStr[0]);
	if (pos < 0)
	{
		return -1;
	}
	const int needleLen = _tcslen(subStr);
	if (size - pos < needleLen)
	{
		return -1;
	}
	return (_tcsnicmp(buffer + pos, subStr, needleLen) == 0) ? pos : -1;
}
// Locates the first word in buffer[0..size); a surrounding double quote is
// not part of the reported range.
//   start - receives the index of the word's first character
//   end   - receives the index of the terminating character (exclusive bound)
// An unquoted word ends at a space or '>'; a quoted word ends at the next
// double quote, so spaces inside the quotes (including trailing ones) stay in
// the word.
// Returns false when no word is found, when a second quote follows the
// opening quote before any content (end is then set to that quote's index),
// or when the input ends before a terminator is seen.
bool findFirstWordW(LPCTSTR buffer, int size, int& start, int& end)
{
	bool quoted = false;
	start = 0;
	end = 0;

	// Phase 1: step over leading spaces and the opening quote.
	for (; start < size; ++start)
	{
		const TCHAR c = buffer[start];
		if (c == _T('"'))
		{
			if (quoted)
			{
				// Closing quote reached with nothing in between.
				end = start;
				return false;
			}
			quoted = true;
		}
		else if (c != _T(' '))
		{
			break;
		}
	}
	if (start >= size)
	{
		return false;
	}

	// Phase 2: advance to the terminator that matches the quoting mode.
	for (end = start; end < size; ++end)
	{
		const TCHAR c = buffer[end];
		const bool terminator = quoted
			? (c == _T('"'))
			: (c == _T(' ') || c == _T('>'));
		if (terminator)
		{
			break;
		}
	}
	return end < size;
}


// Case-insensitive search for ch in buffer[0..size) that ignores everything
// inside a quoted region. A region is opened by '"' or '\'' and is closed only
// by the same character that opened it, so an apostrophe inside a
// double-quoted value (e.g. "it's") no longer ends the region early.
// Returns the index of the first match outside a quoted region, or -1.
// The match test runs before the quote test, so searching for a quote
// character itself reports the opening quote.
int findFirstCharA(LPCSTR buffer, int size, char ch)
{
	// toupper() requires a value representable as unsigned char (or EOF);
	// passing a negative plain char (any byte >= 0x80) is undefined behaviour,
	// hence the casts below.
	const int wanted = toupper(static_cast<unsigned char>(ch));

	// Quote character that opened the current region; '\0' while outside.
	char openQuote = '\0';

	for (int pos = 0; pos < size; ++pos)
	{
		const char current = buffer[pos];
		if (openQuote != '\0')
		{
			if (current == openQuote)
			{
				openQuote = '\0';
			}
			continue;
		}
		if (toupper(static_cast<unsigned char>(current)) == wanted)
		{
			return pos;
		}
		if (current == '"' || current == '\'')
		{
			openQuote = current;
		}
	}
	return -1;
}

// Narrow-character, case-insensitive search for subStr in buffer[0..size).
// Candidate positions come from findFirstCharA on the first character of
// subStr, so candidates inside quoted regions are skipped.
// Returns the index where subStr starts, or -1 when it does not occur or the
// remaining input is shorter than subStr.
int findFirstStringA(LPCSTR buffer, int size, LPCSTR subStr)
{
	const int needleLen = strlen(subStr);
	int pos = 0;
	while (pos < size)
	{
		const int hit = findFirstCharA(buffer + pos, size - pos, subStr[0]);
		if (hit < 0)
		{
			break;
		}
		pos += hit;
		if (size - pos < needleLen)
		{
			// Not enough characters left for a full match.
			break;
		}
		if (strnicmp(buffer + pos, subStr, needleLen) == 0)
		{
			return pos;
		}
		// First character matched but the rest did not: resume one further on.
		++pos;
	}
	return -1;
}

// Checks whether the first non-space character of buffer[0..size) equals ch
// (case-insensitively).
// Returns that character's index on a match; -1 when the first non-space
// character is '>' or anything other than ch, or when the range contains only
// spaces.
int findNextCharA(LPCSTR buffer, int size, char ch)
{
	int pos = 0;
	while (pos < size && buffer[pos] == ' ')
	{
		++pos;
	}
	if (pos >= size)
	{
		return -1;
	}

	const char current = buffer[pos];
	if (current == '>')
	{
		// A '>' is always reported as "not found", even when ch is '>'.
		return -1;
	}

	// toupper() is undefined for negative arguments other than EOF, so bytes
	// >= 0x80 must be converted to unsigned char before the call.
	const bool same = toupper(static_cast<unsigned char>(current))
		== toupper(static_cast<unsigned char>(ch));
	return same ? pos : -1;
}

// Checks whether buffer[0..size), after optional leading spaces, begins with
// subStr (case-insensitively).
// Returns the index where subStr starts, or -1 when the next token is not
// subStr or too few characters remain.
int findNextStringA(LPCSTR buffer, int size, LPCSTR subStr)
{
	const int pos = findNextCharA(buffer, size, subStr[0]);
	if (pos < 0)
	{
		return -1;
	}
	const int needleLen = strlen(subStr);
	if (size - pos < needleLen)
	{
		return -1;
	}
	return (strnicmp(buffer + pos, subStr, needleLen) == 0) ? pos : -1;
}
// Locates the first word in buffer[0..size); surrounding quotes are not part
// of the reported range.
//   start - receives the index of the word's first character
//   end   - receives the index of the terminating character (exclusive bound)
// Leading spaces and quote characters ('"' or '\'') are skipped. If a quote
// was skipped, the word ends at the next occurrence of that same quote
// character (the most recent one seen), so the other quote character may
// appear inside the value, e.g. "javascript:f('x')". Without a leading quote
// the word ends at a space or '>'.
// Returns false when no word is found or the input ends before a terminator
// is seen.
bool findFirstWordA(LPCSTR buffer, int size, int& start, int& end)
{
	start = 0;
	end = 0;

	// Quote character the word must be closed with; '\0' for an unquoted word.
	char quote = '\0';

	// Phase 1: step over leading spaces and quote characters.
	for (; start < size; ++start)
	{
		const char c = buffer[start];
		if (c == '"' || c == '\'')
		{
			quote = c;
		}
		else if (c != ' ')
		{
			break;
		}
	}
	if (start >= size)
	{
		return false;
	}

	// Phase 2: advance to the terminator that matches the quoting mode.
	for (end = start; end < size; ++end)
	{
		const char c = buffer[end];
		const bool terminator = (quote != '\0')
			? (c == quote)
			: (c == ' ' || c == '>');
		if (terminator)
		{
			break;
		}
	}
	return end < size;
}



// Scans buffer[0..size) for the first "<img " tag and copies the value that
// follows its "src" / "=" into nameBuffer.nameBuffer (NUL-terminated).
// Returns the offset just past the character that terminated the value, so a
// caller can resume scanning from there. Returns -1 when no further '<' is
// found, HTML_PARSE_ERROR when "src", '=' or the value cannot be located, and
// HTML_PATH_MAX_ERROR when the value is longer than MaxDomainNameLength.
// Note: the "src" search covers the whole remaining input, not only the
// current tag.
int HtmlParser::findFirstImageW(LPCTSTR buffer, int size, DomainNameType& nameBuffer)
{
	int cursor = 0;
	while (cursor < size)
	{
		// Next tag opener.
		int step = findFirstCharW(buffer + cursor, size - cursor, _T('<'));
		if (step < 0)
		{
			return -1;
		}
		cursor += step + 1;

		// Only tags that start with "img " are of interest; try the next '<' otherwise.
		step = findNextStringW(buffer + cursor, size - cursor, _T("img "));
		if (step < 0)
		{
			continue;
		}
		cursor += step + 4;

		step = findFirstStringW(buffer + cursor, size - cursor, _T("src"));
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 3;

		step = findNextCharW(buffer + cursor, size - cursor, _T('='));
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 1;

		int wordBegin;
		int wordEnd;
		if (!findFirstWordW(buffer + cursor, size - cursor, wordBegin, wordEnd))
		{
			return HTML_PARSE_ERROR;
		}
		cursor += wordBegin;

		const int length = wordEnd - wordBegin;
		if (length > MaxDomainNameLength)
		{
			return HTML_PATH_MAX_ERROR;
		}

		_tcsncpy(nameBuffer.nameBuffer, buffer + cursor, length);
		nameBuffer.nameBuffer[length] = _T('\0');

		// Step over the value and its terminating character.
		return cursor + length + 1;
	}
	return -1;
}

// Scans buffer[0..size) for the first "<a " tag and copies the value that
// follows its "href" / "=" into nameBuffer.nameBuffer (NUL-terminated).
// Returns the offset just past the character that terminated the value, so a
// caller can resume scanning from there. Returns -1 when no further '<' is
// found, HTML_PARSE_ERROR when "href", '=' or the value cannot be located,
// and HTML_PATH_MAX_ERROR when the value is longer than MaxDomainNameLength.
// Note: the "href" search covers the whole remaining input, not only the
// current tag.
int HtmlParser::findFirstLinkW(LPCTSTR buffer, int size, DomainNameType& nameBuffer)
{
	int cursor = 0;
	while (cursor < size)
	{
		// Next tag opener.
		int step = findFirstCharW(buffer + cursor, size - cursor, _T('<'));
		if (step < 0)
		{
			return -1;
		}
		cursor += step + 1;

		// Only tags that start with "a " are of interest; try the next '<' otherwise.
		step = findNextStringW(buffer + cursor, size - cursor, _T("a "));
		if (step < 0)
		{
			continue;
		}
		cursor += step + 2;

		step = findFirstStringW(buffer + cursor, size - cursor, _T("href"));
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 4;

		step = findNextCharW(buffer + cursor, size - cursor, _T('='));
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 1;

		int wordBegin;
		int wordEnd;
		if (!findFirstWordW(buffer + cursor, size - cursor, wordBegin, wordEnd))
		{
			return HTML_PARSE_ERROR;
		}
		cursor += wordBegin;

		const int length = wordEnd - wordBegin;
		if (length > MaxDomainNameLength)
		{
			return HTML_PATH_MAX_ERROR;
		}

		_tcsncpy(nameBuffer.nameBuffer, buffer + cursor, length);
		nameBuffer.nameBuffer[length] = _T('\0');

		// Step over the value and its terminating character.
		return cursor + length + 1;
	}
	return -1;
}

// Repeatedly calls findFirstImageW over buffer[0..size) and records every
// extracted image name in nameSet; names whose insert reports `second` as true
// (presumably "not seen before" - confirm against NameResultPair) are also
// appended to nameQueue.
// Scanning stops at the first negative result from findFirstImageW.
// Returns the number of names appended to nameQueue.
int HtmlParser::doParseBufferImageW(LPCTSTR buffer, int size)
{
	DomainNameType imageName;
	int added = 0;
	int consumed = 0;

	while (consumed < size)
	{
		const int advance = findFirstImageW(buffer + consumed, size - consumed, imageName);
		if (advance < 0)
		{
			break;
		}

		NameResultPair inserted = nameSet.insert(imageName);
		if (inserted.second)
		{
			nameQueue.push_back(imageName);
			++added;
		}

		consumed += advance;
	}
	return added;
}

// Repeatedly calls findFirstLinkW over buffer[0..size) and records every
// extracted link in linkSet; links whose insert reports `second` as true
// (presumably "not seen before" - confirm against NameResultPair) are also
// appended to linkQueue.
// Scanning stops at the first negative result from findFirstLinkW.
// Returns the number of links appended to linkQueue.
int HtmlParser::doParseBufferLinkW(LPCTSTR buffer, int size)
{
	DomainNameType linkName;
	int added = 0;
	int consumed = 0;

	while (consumed < size)
	{
		const int advance = findFirstLinkW(buffer + consumed, size - consumed, linkName);
		if (advance < 0)
		{
			break;
		}

		NameResultPair inserted = linkSet.insert(linkName);
		if (inserted.second)
		{
			linkQueue.push_back(linkName);
			++added;
		}

		consumed += advance;
	}
	return added;
}

// Reports whether the first character of buffer, after being passed through
// GB2312ConvertToUnicode, is '<'.
bool HtmlParser::isGB2312(LPCTSTR buffer)
{
	const bool startsWithTag = (GB2312ConvertToUnicode(buffer[0]) == _T('<'));
	return startsWithTag;
}


// Replaces each of the first `size` characters of buffer, in place, with the
// result of GB2312ConvertToUnicode for that character.
void HtmlParser::convertGB2312Buffer(LPTSTR buffer, int size)
{
	int index = 0;
	while (index < size)
	{
		buffer[index] = GB2312ConvertToUnicode(buffer[index]);
		++index;
	}
}

// Reports whether buffer, read as narrow characters, starts with '<' once
// leading spaces, line feeds (10) and carriage returns (13) are skipped.
// There is no length parameter: the scan stops at the first byte that is not
// one of those three characters.
bool HtmlParser::isAnsi(LPCSTR buffer)
{
	LPCSTR cursor = buffer;
	while (*cursor == ' ' || *cursor == 10 || *cursor == 13)
	{
		++cursor;
	}
	return *cursor == '<';
}

// Entry point for parsing one downloaded buffer.
// Returns false without parsing when imageFileFormat recognises the data as
// an image; otherwise runs the link and image extractors and returns true.
// When isAnsi accepts the data it is parsed as narrow characters with `size`
// passed through unchanged; otherwise it is parsed as TCHARs with the count
// size / sizeof(TCHAR). The extractors' counts are not used here.
// (A GB2312-to-Unicode conversion step for the TCHAR path exists only as
// disabled code in the original and is not performed.)
bool HtmlParser::parseBuffer(LPTSTR buffer, int size)
{
	if (imageFileFormat((LPBYTE)buffer) != Image_File_Format_UNKNOWN)
	{
		return false;
	}

	if (!isAnsi((LPCSTR)buffer))
	{
		doParseBufferLinkW(buffer, size/sizeof(TCHAR));
		doParseBufferImageW(buffer, size/sizeof(TCHAR));
		return true;
	}

	doParseBufferLinkA((LPSTR)buffer, size);
	doParseBufferImageA((LPSTR)buffer, size);
	return true;
}

// Narrow-character variant of the image scanner: finds the first "<img " tag
// in buffer[0..size) and stores the value that follows its "src" / "=" in
// nameBuffer.nameBuffer (NUL-terminated TCHAR string).
// Returns the offset just past the character that terminated the value.
// Returns -1 when no further '<' is found, HTML_PARSE_ERROR when "src", '='
// or the value cannot be located, and HTML_PATH_MAX_ERROR when the value is
// longer than MaxDomainNameLength.
// Note: the "src" search covers the whole remaining input, not only the
// current tag.
int HtmlParser::findFirstImageA(LPCSTR buffer, int size, DomainNameType& nameBuffer)
{
	int cursor = 0;
	while (cursor < size)
	{
		// Next tag opener.
		int step = findFirstCharA(buffer + cursor, size - cursor, '<');
		if (step < 0)
		{
			return -1;
		}
		cursor += step + 1;

		// Only tags that start with "img " are of interest; try the next '<' otherwise.
		step = findNextStringA(buffer + cursor, size - cursor, "img ");
		if (step < 0)
		{
			continue;
		}
		cursor += step + 4;

		step = findFirstStringA(buffer + cursor, size - cursor, "src");
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 3;

		step = findNextCharA(buffer + cursor, size - cursor, '=');
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 1;

		int wordBegin;
		int wordEnd;
		if (!findFirstWordA(buffer + cursor, size - cursor, wordBegin, wordEnd))
		{
			return HTML_PARSE_ERROR;
		}
		cursor += wordBegin;

		const int length = wordEnd - wordBegin;
		if (length > MaxDomainNameLength)
		{
			return HTML_PATH_MAX_ERROR;
		}

		// Each narrow character is formatted with %C into the TCHAR name
		// buffer, one position at a time.
		for (int i = 0; i < length; ++i)
		{
			_stprintf(nameBuffer.nameBuffer + i, _T("%C"), buffer[cursor + i]);
		}
		nameBuffer.nameBuffer[length] = _T('\0');

		// Step over the value and its terminating character.
		return cursor + length + 1;
	}
	return -1;
}

// Narrow-character variant of the link scanner: finds the first "<a " tag in
// buffer[0..size) and stores the value that follows its "href" / "=" in
// nameBuffer.nameBuffer (NUL-terminated TCHAR string).
// Returns the offset just past the character that terminated the value.
// Returns -1 when no further '<' is found, HTML_PARSE_ERROR when '=' or the
// value cannot be located, and HTML_PATH_MAX_ERROR when the value is longer
// than MaxDomainNameLength. Unlike the other scanners, a missing "href" does
// not abort: scanning continues with the next '<'.
// Note: the "href" search covers the whole remaining input, not only the
// current tag.
int HtmlParser::findFirstLinkA(LPCSTR buffer, int size, DomainNameType& nameBuffer)
{
	int cursor = 0;
	while (cursor < size)
	{
		// Next tag opener.
		int step = findFirstCharA(buffer + cursor, size - cursor, '<');
		if (step < 0)
		{
			return -1;
		}
		cursor += step + 1;

		// Only tags that start with "a " are of interest; try the next '<' otherwise.
		step = findNextStringA(buffer + cursor, size - cursor, "a ");
		if (step < 0)
		{
			continue;
		}
		cursor += step + 2;

		step = findFirstStringA(buffer + cursor, size - cursor, "href");
		if (step < 0)
		{
			// No "href" ahead: keep looking at later tags instead of failing.
			continue;
		}
		cursor += step + 4;

		step = findNextCharA(buffer + cursor, size - cursor, '=');
		if (step < 0)
		{
			return HTML_PARSE_ERROR;
		}
		cursor += step + 1;

		int wordBegin;
		int wordEnd;
		if (!findFirstWordA(buffer + cursor, size - cursor, wordBegin, wordEnd))
		{
			return HTML_PARSE_ERROR;
		}
		cursor += wordBegin;

		const int length = wordEnd - wordBegin;
		if (length > MaxDomainNameLength)
		{
			return HTML_PATH_MAX_ERROR;
		}

		// Each narrow character is formatted with %C into the TCHAR name
		// buffer, one position at a time.
		for (int i = 0; i < length; ++i)
		{
			_stprintf(nameBuffer.nameBuffer + i, _T("%C"), buffer[cursor + i]);
		}
		nameBuffer.nameBuffer[length] = _T('\0');

		// Step over the value and its terminating character.
		return cursor + length + 1;
	}
	return -1;
}

// Repeatedly calls findFirstImageA over buffer[0..size) and records every
// extracted image name in nameSet; names whose insert reports `second` as true
// (presumably "not seen before" - confirm against NameResultPair) are also
// appended to nameQueue.
// Scanning stops at the first negative result from findFirstImageA.
// Returns the number of names appended to nameQueue.
int HtmlParser::doParseBufferImageA(LPCSTR buffer, int size)
{
	DomainNameType imageName;
	int added = 0;
	int consumed = 0;

	while (consumed < size)
	{
		const int advance = findFirstImageA(buffer + consumed, size - consumed, imageName);
		if (advance < 0)
		{
			break;
		}

		NameResultPair inserted = nameSet.insert(imageName);
		if (inserted.second)
		{
			nameQueue.push_back(imageName);
			++added;
		}

		consumed += advance;
	}
	return added;
}

// Repeatedly calls findFirstLinkA over buffer[0..size) and records every
// extracted link in linkSet; links whose insert reports `second` as true
// (presumably "not seen before" - confirm against NameResultPair) are also
// appended to linkQueue and printed.
// Scanning stops at the first negative result from findFirstLinkA.
// Returns the number of links appended to linkQueue.
int HtmlParser::doParseBufferLinkA(LPCSTR buffer, int size)
{
	DomainNameType linkName;
	int added = 0;
	int consumed = 0;

	while (consumed < size)
	{
		const int advance = findFirstLinkA(buffer + consumed, size - consumed, linkName);
		if (advance < 0)
		{
			break;
		}

		NameResultPair inserted = linkSet.insert(linkName);
		if (inserted.second)
		{
			linkQueue.push_back(linkName);
			++added;
			_tprintf(_T("new link[%s]\n"), linkName.nameBuffer);
		}

		consumed += advance;
	}
	return added;
}


// Extracts the host name of the URL `www` into currentHost.nameBuffer and
// stores its length in currentHostNameLength.
// Returns true when InternetCrackUrl succeeds, false otherwise (currentHost
// is then left as InternetCrackUrl left it).
bool HtmlParser::retrieveCurrentHost(LPCTSTR www)
{
	URL_COMPONENTS url_component;
	memset(&url_component, 0, sizeof(URL_COMPONENTS));
	// The memset leaves dwStructSize at 0; InternetCrackUrl expects the
	// structure size here.
	url_component.dwStructSize = sizeof(URL_COMPONENTS);
	url_component.dwHostNameLength = MaxDomainNameLength;
	url_component.lpszHostName = currentHost.nameBuffer;

	if (!InternetCrackUrl(www, static_cast<DWORD>(_tcslen(www)), ICU_DECODE, &url_component))
	{
		return false;
	}

	currentHostNameLength = url_component.dwHostNameLength;

	// Drop a trailing path separator. The last character of the host lives at
	// index length - 1; index `length` is the terminator slot.
	if (currentHostNameLength > 0)
	{
		const TCHAR last = currentHost.nameBuffer[currentHostNameLength - 1];
		if (last == _T('\\') || last == _T('/'))
		{
			currentHostNameLength -- ;
		}
	}
	currentHost.nameBuffer[currentHostNameLength] = _T('\0');
	_tprintf(_T("the host name is [%s]\n"), currentHost.nameBuffer);
	return true;
}


// Decides whether the URL `www` points to a different site than currentHost.
// The two host names are compared character by character from their ends
// (case-sensitively); once two '.' separators have matched, the hosts are
// treated as the same site.
// Returns true when the hosts differ inside that trailing part, false when
// they agree. When one host is exhausted before two dots were matched, the
// hosts agree only if the names are identical or the longer one continues
// with a '.' (i.e. it is a sub-domain of the shorter one).
// Also returns false when InternetCrackUrl cannot parse `www`.
bool HtmlParser::isOutsideHostName(LPCTSTR www)
{
	URL_COMPONENTS url_component;
	DomainNameType  theHost;
	memset(&url_component, 0, sizeof(URL_COMPONENTS));
	// The memset leaves dwStructSize at 0; InternetCrackUrl expects the
	// structure size here.
	url_component.dwStructSize = sizeof(URL_COMPONENTS);
	url_component.dwHostNameLength = MaxDomainNameLength;
	url_component.lpszHostName = theHost.nameBuffer;

	if (!InternetCrackUrl(www, static_cast<DWORD>(_tcslen(www)), ICU_DECODE, &url_component))
	{
		return false;
	}

	url_component.lpszHostName[url_component.dwHostNameLength] = _T('\0');
	_tprintf(_T("www url[%s]hostname[%s],link[%s]\n"), www, currentHost.nameBuffer, 
		url_component.lpszHostName);

	// Walk both names backwards from their own last character so that hosts
	// of different lengths are compared label against label, and neither
	// buffer is read beyond its own string.
	int ownIndex = currentHostNameLength - 1;
	int otherIndex = static_cast<int>(url_component.dwHostNameLength) - 1;
	int dots = 0;

	while (ownIndex >= 0 && otherIndex >= 0)
	{
		const TCHAR own = currentHost.nameBuffer[ownIndex];
		if (own != theHost.nameBuffer[otherIndex])
		{
			return true;
		}
		if (own == _T('.'))
		{
			dots ++ ;
			if (dots == 2)
			{
				return false;
			}
		}
		ownIndex --;
		otherIndex --;
	}

	if (ownIndex < 0 && otherIndex < 0)
	{
		// Identical host names.
		return false;
	}

	// One name is a suffix of the other; it is the same site only when the
	// longer name continues with a label separator.
	const TCHAR next = (ownIndex >= 0)
		? currentHost.nameBuffer[ownIndex]
		: theHost.nameBuffer[otherIndex];
	return next != _T('.');
}








