/*
GameEngine and Editor
by John Ryland
Copyright (c) 2023
*/
////////////////////////////////////////////////////////////////////////////////////
// XML Tokenizer
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdint>
#include "XmlTokenizer.h"
// Classification of individual input characters into the categories that are
// significant to XML markup. Each enumerator's comment lists the character(s)
// it stands for.
// NOTE(review): nothing in the visible part of this file references this enum;
// the tokenizer below tests characters directly - confirm whether it is still needed.
enum XmlCharacterType
{
XC_Less, // '<'
XC_Greater, // '>'
XC_Not, // '!'
XC_Question, // '?'
XC_Slash, // '/'
XC_AlNum, // 0-9,a-z,A-Z
XC_Space, // ' ',tab,CR,LF
XC_Assign, // '='
XC_Quote, // single quote (')
XC_Quotes, // double quote (")
XC_Other
};
// Helper macros for the tokenizer's state machine. They are not self-contained:
// their expansions use the names `state`, `start`, `i`, `data`, `len`, `consume`
// and `user` from the scope in which they are expanded.
//
// XML_CONT(st): store the next state and `continue` the enclosing loop.
// It expands to two statements without surrounding braces, so it must only be
// used inside a braced block. It must not be wrapped in do { } while (0): the
// `continue` would then apply to that wrapper instead of the tokenizer loop.
#define XML_CONT(st) state = st; continue
// XML_NEXT(st): start the next token at the character after the current one
// (start = i + 1), then behave like XML_CONT(st).
#define XML_NEXT(st) start = i + 1; XML_CONT(st)
// XML_END: hand the consumer an XT_ElementEnd token with an empty string and length 0.
#define XML_END consume({ XT_ElementEnd, "", 0 }, user)
// XML_EMIT(typ): hand the consumer a token of type `typ` that points at
// &data[start] and spans `len` bytes (a view into the input, not a copy).
#define XML_EMIT(typ) consume({ typ, &data[start], len }, user)
// Splits an XML buffer into tokens using a small hand-written state machine and
// reports each token to `consume`.
//
// Parameters:
//   data    - input bytes; only indices below `size` are read.
//   size    - number of bytes in `data`.
//   consume - callback invoked as consume(token, user) for every token. Tokens
//             are built as { type, pointer, length }; apart from XT_ElementEnd
//             (which carries "" and 0) the pointer refers into `data`.
//   user    - opaque pointer passed through to `consume` unchanged.
//
// Emitted token types: XT_Text, XT_ElementBegin, XT_AttributeName,
// XT_AttributeValue, XT_ElementEnd and XT_Comment.
//  - XT_Text is emitted whenever a '<' is met in text state, so it may be
//    zero-length; text after the last '<...>' construct is not emitted.
//  - XT_AttributeValue spans everything after the '=', so surrounding quote
//    characters are part of the reported value.
//  - Closing tags ('</name>') and self-closing tags ('/>') both produce
//    XT_ElementEnd without a name.
//
// On input the state machine has no transition for, "unexpected token" is
// printed to stdout and tokenizing stops.
//
// States:
//    0 text                      1 after '<'
//    2 element name              3 inside tag, between attributes
//    4 attribute name            5 attribute value (after '=')
//    6 '/' right after a value   7 '/' after whitespace in a tag
//    8 '/' right after the name  9 inside a "double-quoted" value
//   10 after '</'               11 closing tag name
//   12 after '<!'               13 after '<!-'
//   14 inside comment           15 comment, one '-' seen
//   16 comment, '--' seen       17 inside a 'single-quoted' value
void XmlTokenize(const char *data, size_t size, const XmlTokenConsumer& consume, void* user)
{
    int state = 0;
    // NOTE(review): the first two bytes are skipped unconditionally, whether or
    // not a byte order mark is present (a UTF-8 BOM would be three bytes).
    // Kept as-is because changing it alters results for existing inputs.
    size_t start = 2;  // Skip BOM (byte order mark)
    for (size_t i = start; i < size; i++)
    {
        // Length of the pending token: [start, i), i.e. excluding the current character.
        size_t len = i - start;
        // Unsigned so the <cctype> classification calls receive a valid value.
        uint8_t ch = data[i];
        if (state == 0 && ch == '<')     { XML_EMIT(XT_Text); XML_CONT(1); }                 // '<'
        if (state == 0)                  { XML_CONT(0); }                                    // '[~<]*'
        if (state == 1 && ch == '!')     { XML_CONT(12); }                                   // '<!'
        if (state == 1 && ch == '?')     { XML_CONT(1); }                                    // '<?'      (needs to parse '<?blah?>')
        if (state == 1 && ch == '/')     { XML_CONT(10); }                                   // '</'
        if (state == 1 && isalnum(ch))   { start = i; XML_CONT(2); }                         // '<alnum'  (need to parse ':' too for namespaces)
        if (state == 2 && isalnum(ch))   { XML_CONT(2); }                                    // '<alnum*'
        if (state == 2 && isspace(ch))   { XML_EMIT(XT_ElementBegin); XML_NEXT(3); }         // '<alnum* '
        if (state == 2 && ch == '/')     { XML_CONT(8); }                                    // '<alnum*/'
        if (state == 2 && ch == '>')     { XML_EMIT(XT_ElementBegin); XML_NEXT(0); }         // '<alnum>*'
        if (state == 3 && isspace(ch))   { XML_CONT(3); }                                    // '<alnum* *'
        if (state == 3 && isalnum(ch))   { start = i; XML_CONT(4); }                         // '<alnum* *alnum'
        if (state == 3 && ch == '/')     { XML_CONT(7); }                                    // '<alnum /'
        if (state == 3 && ch == '>')     { XML_NEXT(0); }                                    // '<alnum >'
        if (state == 4 && isalnum(ch))   { XML_CONT(4); }                                    // '<alnum* *alnum*'
        if (state == 4 && ch == '=')     { XML_EMIT(XT_AttributeName); XML_NEXT(5); }        // '<alnum* *alnum*='
        if (state == 5 && ch == '\"')    { XML_CONT(9); }                                    // '<alnum* *alnum*="'
        if (state == 5 && ch == '\'')    { XML_CONT(17); }                                   // '<alnum* *alnum*='' (single-quoted value)
        // Unquoted values (<tag attrib=value) are still tolerated below.
        if (state == 5 && isalnum(ch))   { XML_CONT(5); }                                    // '<alnum* *alnum*=alnum*'
        if (state == 5 && isspace(ch))   { XML_EMIT(XT_AttributeValue); XML_NEXT(3); }       // '<alnum* *alnum*=alnum* '
        if (state == 5 && ch == '/')     { XML_CONT(6); }                                    // '<alnum* *alnum*=alnum*/'
        if (state == 5 && ch == '>')     { XML_EMIT(XT_AttributeValue); XML_NEXT(0); }       // '<alnum* *alnum*=alnum*>'
        if (state == 5 && ispunct(ch))   { XML_CONT(5); }
        // In states 6 and 8 the pending span ends with the '/' of '/>'; drop it
        // so the emitted value / element name does not carry a trailing slash.
        if (state == 6 && ch == '>')     { len -= 1; XML_EMIT(XT_AttributeValue); XML_END; XML_NEXT(0); } // '<alnum* *alnum*=alnum*/>'
        if (state == 7 && ch == '>')     { XML_END; XML_NEXT(0); }                           // '<alnum />'
        if (state == 8 && ch == '>')     { len -= 1; XML_EMIT(XT_ElementBegin); XML_END; XML_NEXT(0); }   // '<alnum*/>'
        if (state == 9 && ch == '\"')    { XML_CONT(5); }                                    // '<alnum* *alnum*=".*"'
        if (state == 9)                  { XML_CONT(9); }                                    // '<alnum* *alnum*=".*'
        if (state == 17 && ch == '\'')   { XML_CONT(5); }                                    // '<alnum* *alnum*='.*''
        if (state == 17)                 { XML_CONT(17); }                                   // '<alnum* *alnum*='.*'
        if (state == 10 && isspace(ch))  { XML_CONT(10); }                                   // '</ '
        if (state == 10 && isalnum(ch))  { XML_CONT(11); }                                   // '</alnum'
        if (state == 11 && isalnum(ch))  { XML_CONT(11); }                                   // '</alnum*'
        if (state == 11 && isspace(ch))  { XML_CONT(11); }                                   // '</alnum* '
        if (state == 11 && ch == '>')    { XML_END; XML_NEXT(0); }                           // '</alnum*>'
        if (state == 12 && ch == '-')    { XML_CONT(13); }                                   // '<!-'
        if (state == 13 && ch == '-')    { XML_NEXT(14); }                                   // '<!--'
        if (state == 14 && ch == '-')    { XML_CONT(15); }                                   // '<!--[~-]*-'
        if (state == 14)                 { XML_CONT(14); }                                   // '<!--[~-]*
        if (state == 15 && ch == '-')    { XML_CONT(16); }                                   // '<!--[~-]*--'
        if (state == 15)                 { XML_CONT(14); }                                   // '<!--[~-]*-[~-]'
        // Another '-' still leaves "--" as the last two characters, so stay
        // ready for the closing '>' (handles '--->').
        if (state == 16 && ch == '-')    { XML_CONT(16); }                                   // '<!--.*---'
        if (state == 16 && ch != '>')    { XML_CONT(14); }                                   // '<!--.*--[~>]'
        // The pending span ends with the "--" of "-->"; exclude it from the comment text.
        if (state == 16)                 { len -= 2; XML_EMIT(XT_Comment); XML_NEXT(0); }    // '<!--.*-->'
        printf("unexpected token\n"); break;
    }
}