/*====================================================================*
 *
 *   xmlscan.c - markup scanner;
 *
 *   node.h
 *
 *   scan XML source and create a parse tree;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

#ifndef XMLSCAN_SOURCE
#define XMLSCAN_SOURCE

/*====================================================================*
 *  system header files;
 *--------------------------------------------------------------------*/

#include <string.h>
#include <ctype.h>

/*====================================================================*
 *  custom header files;
 *--------------------------------------------------------------------*/

#include "../nodes/node.h"
#include "../tools/number.h"
#include "../tools/error.h"

/*====================================================================*
 *
 *   char * advance (char * string, unsigned * line);
 *
 *   discard whitespace and count newlines up to the next meaningful
 *   character;
 *
 *   this function is critical to the XML parsing engine because it
 *   ensures that node strings are NUL terminated and line counts
 *   are accurate;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * advance (char * string, unsigned * lineno)

{
	while (isspace (*string))
	{
		if (*string == '\n')
		{
			(*lineno)++;
		}
		*string++ = (char)(0);
	}
	return (string);
}

/*====================================================================*
 *
 *   char * discard (char * string, unsigned * line);
 *
 *   discard current character; advance to next character;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * discard (char * string, unsigned * lineno)

{
	*string++ = (char)(0);
	string = advance (string, lineno);
	return (string);
}

/*====================================================================*
 *
 *   char * nmtoken (char * string);
 *
 *   collect nmtoken as per w3c xml 1.0 specification;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * nmtoken (char * string)

{
	while (isalnum (*string) || (*string == '-') || (*string == '_') || (*string == '.') || (*string == ':'))
	{
		string++;
	}
	return (string);
}

/*====================================================================*
 *
 *   char * content (char * string, char quote, unsigned * line);
 *
 *   collect literal string; discard quotes; preserve whitespace;
 *   count newlines;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * content (char * string, char quote, unsigned * lineno)

{
	if (*string == quote)
	{
		*string++ = (char)(0);
	}
	while (*string)
	{
		if (*string == quote)
		{
			break;
		}
		if (*string++ == '\n')
		{
			(*lineno)++;
		}
	}
	if (*string == quote)
	{
		*string++ = (char)(0);
	}
	return (string);
}

/*====================================================================*
 *
 *   char * collect (char * string);
 *
 *   collect entity; an entity consists of non-blank characters
 *   excluding common tag punctuation;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * collect (char * string)

{
	while (*string)
	{
		if (*string == '<')
		{
			break;
		}
		if (*string == '=')
		{
			break;
		}
		if (*string == '/')
		{
			break;
		}
		if (*string == '?')
		{
			break;
		}
		if (*string == '>')
		{
			break;
		}
		if (isspace (*string))
		{
			break;
		}
		string++;
	}
	return (string);
}

/*====================================================================*
 *
 *   static char * comment (char * string, unsigned * line);
 *
 *   collect comment;
 *   preserve delimiters;
 *   preserve whitespace;
 *   count newlines;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * comment (char * string, unsigned * lineno)

{
	string++;
	if (*string == '-')
	{
		while (*string == '-')
		{
			string++;
		}
		while ((*string) && (*string != '-'))
		{
			while ((*string) && (*string != '-'))
			{
				if (*string == '\n')
				{
					(*lineno)++;
				}
				string++;
			}
			string++;
		}
		while (*string == '-')
		{
			string++;
		}
	}
	return (string);
}

/*====================================================================*
 *
 *   char * literal (char * string, char quote, unsigned * line);
 *
 *   collect literal;
 *   preserve delimiters;
 *   preserve whitespace;
 *   count newlines;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * literal (char *string, char quote, unsigned * lineno)

{
	if (*string == quote)
	{
		*string++ = (char)(0);
	}
	while (*string)
	{
		if (*string == quote)
		{
			break;
		}
		if (*string == '\n')
		{
			(*lineno)++;
		}
		string++;
	}
	if (*string == quote)
	{
		*string++ = (char)(0);
	}
	return (string);
}

/*====================================================================*
 *
 *   char * context (char * string, signed c, unsigned *line);
 *
 *   collect context;
 *   preserve delimiters;
 *   preserve whitespace;
 *   count newlines;
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

static char * context (char *string, signed c, unsigned * lineno)

{
	string++;
	while (*string)
	{
		if (*string == (char)(c))
		{
			string++;
			break;
		}
		if (*string == '{')
		{
			string = context (string, '}', lineno);
			continue;
		}
		if (*string == '(')
		{
			string = context (string, ')', lineno);
			continue;
		}
		if (*string == '[')
		{
			string = context (string, ']', lineno);
			continue;
		}
		if ((*string == '\"') || (*string == '\''))
		{
			string = literal (string, *string, lineno);
			continue;
		}
		if (*string == '\n')
		{
			(*lineno)++;
		}
		string++;
	}
	return (string);
}

/*====================================================================*
 *
 *   void xmlscan (NODE * node);
 *
 *   node.h
 *
 *   Motley Tools by Charles Maier <cmaier@cmassoc.net>;
 *   Copyright (c) 2001-2006 by Charles Maier Associates;
 *   Licensed under the Internet Software Consortium License;
 *
 *--------------------------------------------------------------------*/

signed xmlscan (NODE * node)

{
	NODE * section = node;
	NODE * element;
	NODE * attribute;
	NODE * value;
	char prefix = (char)(0);
	char suffix = (char)(0);
	char * string = node->text;
	unsigned lineno = 1;
	if (!section)
	{
		error (1, EFAULT, "section is null");
	}
	if (!string)
	{
		error (1, EFAULT, "string is null");
	}
	while (*string)
	{
		if (*string == '<')
		{
			prefix = '<';
			suffix = '>';
			string = discard (string, &lineno);
			if ((*string == '/') || (*string == '?') || (*string == '!'))
			{
				prefix = *string;
				string = discard (string, &lineno);
			}
			element = xmlnode (section);
			element->line = lineno;
			element->type = NODE_ELEM;
			element->text = string;
			if (isalpha (*string))
			{
				string = nmtoken (string);
			}
			else if (*string == '-')
			{
				string = comment (string, &lineno);
			}
			else if (*string == '[')
			{
				string = context (string, ']', &lineno);
			}
			else
			{
				string = collect (string);
			}
			string = advance (string, &lineno);
			while ((*string) && (*string != '<') && (*string != '/') && (*string != '?') && (*string != '>'))
			{
				attribute = xmlnode (element);
				attribute->line = lineno;
				attribute->type = NODE_ATTR;
				attribute->text = string;
				if (isalpha (*string))
				{
					string = nmtoken (string);
				}
				else if (*string == '-')
				{
					string = comment (string, &lineno);
				}
				else if (*string == '[')
				{
					string = context (string, ']', &lineno);
				}
				else if ((*string == '\"') || (*string == '\''))
				{
					string = content (string, *string, &lineno);
					attribute->text++;
				}
				else
				{
					string = collect (string);
				}
				string = advance (string, &lineno);
				if (*string == '=')
				{
					string = discard (string, &lineno);
					value = xmlnode (attribute);
					value->line = lineno;
					value->type = NODE_VALU;
					value->text = string;
					if ((*string == '\"') || (*string == '\''))
					{
						string = content (string, *string, &lineno);
						value->text++;
					}
					else
					{
						string = collect (string);
					}
					string = advance (string, &lineno);
				}
			}
			if ((*string == '/') || (*string == '?'))
			{
				suffix = *string;
				string = discard (string, &lineno);
			}
		}
		else if (*string == '>')
		{
			string = discard (string, &lineno);
			if (prefix == '!')
			{
				element->type = NODE_SGML;
			}
			else if (prefix == '?')
			{
				element->type = NODE_INST;
			}
			else if (suffix == '?')
			{
			}
			else if (prefix == '/')
			{
				element->type = NODE_ETAG;
				if (element->below)
				{
					error (1, 0, "Element </%s> on line %d has attributes or content.", element->text, element->line);
				}
				if (strcmp (section->text, element->text))
				{
					error (1, 0, "Element <%s> on line %d teminated by </%s> on line %d", section->text, section->line, element->text, element->line);
				}
				if (section->above)
				{
					section = section->above;
				}
			}
			else if (suffix == '/')
			{
			}
			else
			{
				section = element;
			}
		}
		else
		{
			signed space = 0;
			char * output = string;
			NODE * segment = xmlnode (section);
			segment->line = lineno;
			segment->type = NODE_DATA;
			segment->text = string;
			while (*string)
			{
				if (*string == '<')
				{
					break;
				}
				if (isspace (*string))
				{
					string = advance (string, &lineno);
					space++;
					continue;
				}
				if (space)
				{
					*output++ = ' ';
					space--;
				}
				*output++ = *string++;
			}
			if (output < string)
			{
				*output = (char)(0);
			}
		}
	}
	return (0);
}

#endif