Newer
Older
GameEngine / src / Editor / XmlTokenizer.cpp
@John Ryland John Ryland on 22 Aug 5 KB save WIP
/*
	GameEngine and Editor
	by John Ryland
	Copyright (c) 2023
*/

////////////////////////////////////////////////////////////////////////////////////
//	XML Tokenizer

#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdint>
#include "XmlTokenizer.h"

// Character classes of interest when scanning XML markup.
// Values are sequential starting from zero, in declaration order.
enum XmlCharacterType
{
	XC_Less     = 0,	// '<'   opens a tag
	XC_Greater  = 1,	// '>'   closes a tag
	XC_Not      = 2,	// '!'   as in '<!--'
	XC_Question = 3,	// '?'   as in '<?'
	XC_Slash    = 4,	// '/'   as in '</' and '/>'
	XC_AlNum    = 5,	// 0-9, a-z, A-Z
	XC_Space    = 6,	// space, tab, CR, LF
	XC_Assign   = 7,	// '='
	XC_Quote    = 8,	// single quote (')
	XC_Quotes   = 9,	// double quote (")
	XC_Other    = 10	// anything not listed above
};

// Shorthand for the XmlTokenize state machine. These expand in place inside its
// per-character loop and use that function's locals by name: state, start, i,
// len, data, consume and user. Each use must sit inside its own { } because the
// expansions are several statements; they can not be wrapped in do { } while (0)
// as the 'continue' has to act on the tokenizer's loop.
//   XML_CONT(st)  - switch to state 'st' and move on to the next character
//   XML_NEXT(st)  - as XML_CONT, but also begin the next token after the current character
//   XML_END       - hand the consumer an XT_ElementEnd token (empty string, zero length)
//   XML_EMIT(typ) - hand the consumer a token of type 'typ' covering data[start] for len bytes
#define XML_CONT(st)		state = (st); continue
#define XML_NEXT(st)		start = i + 1; state = (st); continue
#define XML_END 			consume({ XT_ElementEnd, "", 0 }, user)
#define XML_EMIT(typ)		consume({ (typ), &data[start], len }, user)

void XmlTokenize(const char *data, size_t size, const XmlTokenConsumer& consume, void* user)
{
	int state = 0;
	size_t start = 2; // Skip BOM (byte order mark)
	for (size_t i = start; i < size; i++)
	{
		size_t len = i - 1 - start + 1;
		uint8_t ch = data[i];
		if (state == 0 && ch == '<')    { XML_EMIT(XT_Text); XML_CONT(1); }              // '<'
		if (state == 0)                 { XML_CONT(0); }                                 // '[~<]*'
		if (state == 1 && ch == '!')    { XML_CONT(12); }                                // '<!'
		if (state == 1 && ch == '?')    { XML_CONT(1); }                                 // '<?'      (needs to parse '<?blah?>')
		if (state == 1 && ch == '/')    { XML_CONT(10); }                                // '</'
		if (state == 1 && isalnum(ch))  { start = i; XML_CONT(2); }                      // '<alnum'  (need to parse ':' too for namespaces)
		if (state == 2 && isalnum(ch))  { XML_CONT(2); }                                 // '<alnum*'
		if (state == 2 && isspace(ch))  { XML_EMIT(XT_ElementBegin); XML_NEXT(3); }      // '<alnum* '
		if (state == 2 && ch == '/')    { XML_CONT(8); }                                 // '<alnum*/'
		if (state == 2 && ch == '>')    { XML_EMIT(XT_ElementBegin); XML_NEXT(0); }      // '<alnum>*'
		if (state == 3 && isspace(ch))  { XML_CONT(3); }                                 // '<alnum* *'
		if (state == 3 && isalnum(ch))  { start = i; XML_CONT(4); }                      // '<alnum* *alnum'
		if (state == 3 && ch == '/')    { XML_CONT(7); }                                 // '<alnum /'
		if (state == 3 && ch == '>')    { XML_NEXT(0); }                                 // '<alnum >'
		if (state == 4 && isalnum(ch))  { XML_CONT(4); }                                 // '<alnum* *alnum*'
		if (state == 4 && ch == '=')    { XML_EMIT(XT_AttributeName); XML_NEXT(5); }     // '<alnum* *alnum*='
		if (state == 5 && ch == '\"')   { XML_CONT(9); }                                 // '<alnum* *alnum*="'

		// should be parsing only   <tag attrib='value with spaces'   or  <tag attrib="value with spaces"    not  <tag attrib=value
		if (state == 5 && isalnum(ch))  { XML_CONT(5); }                                 // '<alnum* *alnum*=alnum*'
		if (state == 5 && isspace(ch))  { XML_EMIT(XT_AttributeValue); XML_NEXT(3); }    // '<alnum* *alnum*=alnum* '
		if (state == 5 && ch == '/')    { XML_CONT(6); }                                 // '<alnum* *alnum*=alnum*/'
		if (state == 5 && ch == '>')    { XML_EMIT(XT_AttributeValue); XML_NEXT(0); }    // '<alnum* *alnum*=alnum*>'
		if (state == 5 && ispunct(ch))  { XML_CONT(5); }

		if (state == 6 && ch == '>')    { XML_EMIT(XT_AttributeValue); XML_END; XML_NEXT(0); }
		if (state == 7 && ch == '>')    { XML_END; XML_NEXT(0); }                            // '<alnum />'
		if (state == 8 && ch == '>')    { XML_EMIT(XT_ElementBegin); XML_END; XML_NEXT(0); } // '<alnum*/>'
		if (state == 9 && ch == '\"')   { XML_CONT(5); }                                 // '<alnum* *alnum*=".*"'
		if (state == 9)                 { XML_CONT(9); }                                 // '<alnum* *alnum*=".*'
		if (state == 10 && isspace(ch)) { XML_CONT(10); }                                // '</ '
		if (state == 10 && isalnum(ch)) { XML_CONT(11); }                                // '</alnum'
		if (state == 11 && isalnum(ch)) { XML_CONT(11); }                                // '</alnum*'
		if (state == 11 && isspace(ch)) { XML_CONT(11); }                                // '</alnum* '
		if (state == 11 && ch == '>')   { XML_END; XML_NEXT(0); }                        // '</alnum*>'
		if (state == 12 && ch == '-')   { XML_CONT(13); }                                // '<!-'
		if (state == 13 && ch == '-')   { XML_NEXT(14); }                                // '<!--'
		if (state == 14 && ch == '-')   { XML_CONT(15); }                                // '<!--[~-]*-'
		if (state == 14)                { XML_CONT(14); }                                // '<!--[~-]*
		if (state == 15 && ch == '-')   { XML_CONT(16); }                                // '<!--[~-]*--'
		if (state == 15)                { XML_CONT(14); }                                // '<!--[~-]*-[~-]'
		if (state == 16 && ch != '>')   { XML_CONT(14); }                                // '<!--.*--[~>]'
		if (state == 16)                { len -= 2; XML_EMIT(XT_Comment); XML_NEXT(0); } // '<!--.*-->'
		printf("unexpected token\n"); break;
	}
}