// C RunTime Header Files
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <climits>
#include <cassert>

// C++ Headers
#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#include <map>
#include <string>

// Project Headers
#include "Graphics.h"
#include "Common.h"
#include "MemoryMapping.h"
#include "Window.h"
#include "TrueType.h"
//#include "Font.h"


BEGIN_NAMESPACE


// Forward declaration of the printf-style logger defined later in this file.
// It is declared here so the Debug() macro below can expand to a call to it.
void LogMessage(LogLevel a_level, const char* a_utf8TextFmt, ...);


// Debug() is a shorthand for quickly adding debug output. In release builds (when NDEBUG is defined,
// the same switch the C standard uses to disable assert) the calls expand to a no-op expression and
// shouldn't produce any code.
// The format string is taken as part of the variadic pack so that Debug("text") with no extra
// arguments expands to a valid call (no dangling comma after the format string).
#ifdef NDEBUG
#  define Debug(...)           ((void)0)
#else
#  define Debug(...)           LogMessage(DEBUG, __VA_ARGS__)
#endif


// Integer 2D point / vector.
// Deliberately kept as a plain aggregate (no constructors) so that brace
// initialisation of the form { x, y } keeps working.
struct vec2i
{
  int x, y;

  // Scales both components in place and returns a reference to this object,
  // so that chained compound assignments operate on the same vec2i.
  // The product is computed in the promoted type of (int * T) and then
  // truncated back to int.
  template <typename T>
  vec2i& operator*=(T scale) {
    x = static_cast<int>(x * scale);
    y = static_cast<int>(y * scale);
    return *this;
  }

  // Returns a scaled copy; const so it can be used on const vec2i references.
  template <typename T>
  vec2i operator*(T scale) const {
    vec2i ret;
    ret.x = static_cast<int>(x * scale);
    ret.y = static_cast<int>(y * scale);
    return ret;
  }
};


// Blends two packed 8-bit-per-channel colors, channel by channel (all four bytes,
// including the top byte):
//   result = (a_color1 * a_alpha + a_color2 * (255 - a_alpha)) / 255
// a_alpha == 255 returns a_color1 unchanged and a_alpha == 0 returns a_color2 unchanged.
// The division is by 255 with rounding to nearest; dividing by 256 (a plain >> 8) would
// darken every channel slightly, e.g. 0xFF blended at full alpha would come out as 0xFE.
static uint32_t blendColors(uint32_t a_color1, uint32_t a_color2, uint8_t a_alpha)
{
  const unsigned alpha1 = a_alpha;
  const unsigned alpha2 = 255u - alpha1;
  uint32_t result = 0;
  for (unsigned shift = 0; shift < 32; shift += 8)
  {
    const unsigned c1 = (a_color1 >> shift) & 0xff;
    const unsigned c2 = (a_color2 >> shift) & 0xff;
    // Maximum numerator is 255 * 255 + 127, so this cannot overflow and the result is <= 255
    const unsigned mixed = (c1 * alpha1 + c2 * alpha2 + 127u) / 255u;
    result |= uint32_t(mixed) << shift;
  }
  return result;
}


// Floating point variant of the color blend: mixes the red, green and blue channels as
//   a_color1 * a_alpha + a_color2 * (1 - a_alpha)
// truncates each result to an integer, and always returns a fully opaque pixel
// (top byte forced to 0xff). The top bytes of the inputs are ignored.
static uint32_t blendColorsF(uint32_t a_color1, uint32_t a_color2, float a_alpha)
{
  const float weight2 = 1.0f - a_alpha;
  uint32_t packed = 0xff000000;
  // Walk the channels from red (bits 16..23) down to blue (bits 0..7)
  for (int shift = 16; shift >= 0; shift -= 8)
  {
    const float c1 = float((a_color1 >> shift) & 0xff);
    const float c2 = float((a_color2 >> shift) & 0xff);
    const float mixed = c1 * a_alpha + c2 * weight2;
    packed |= uint32_t(int(mixed) & 0xff) << shift;
  }
  return packed;
}



/*
  Another approach is to have a ROP (raster operation) enum.
  A ROP defines, for each pixel a function touches, how the src, dst and static colors are combined.
  The set of operations could be modelled on the Porter-Duff operators.
  The ROP value is then passed as a template parameter, and the drawing functions are all templated on it.
  At the call site, the implementation is picked based on the ROP currently set on the painter.
*/

/*
// Clipping should be outside the loop
template <typename functor>
void ForEachPixel(PixelBuffer* a_target, uint32_t a_color, int a_x, int a_y, int a_width, int a_height)
{
	for (int j = 0; j < a_height; j++)
		for (int i = 0; i < a_width; i++)
		{
			int x = i + a_x;
			int y = j + a_y;
			if ( x >= 0 && x < a_target->m_width && y >= 0 && y < a_target->m_height ) {
				uint32_t *dst = &(a_target->m_pixels[y*a_target->m_strideBytes/4 + x]);
				functor.func(); // TODO: This needs parameters
			}
		}
}
*/


// Clipping rectangle used by the drawing routines in this file.
// The member order matters: SetupClip builds it with positional brace initialisation.
struct ClipRect
{
  bool valid;   // false when the target was rejected; callers return without drawing
  int x1, x2;   // inclusive horizontal range (PixelClipTest checks x1 <= x <= x2)
  int y1, y2;   // inclusive vertical range (PixelClipTest checks y1 <= y <= y2)
};


// Returns true when (x, y) lies inside clip. All four edges are inclusive.
inline bool PixelClipTest(const ClipRect& clip, int x, int y)
{
  if (x < clip.x1 || x > clip.x2)
    return false;
  return !(y < clip.y1 || y > clip.y2);
}


// Builds the clip rectangle for a_target.
// Returns an invalid (and empty) rectangle, after printing a message, when the target is null,
// has no pixel storage, or has a width/height below 1.
// When retina is true and the target is flagged as retina, the clip extent is the target size
// divided by c_retinaScale; otherwise it is the full target size.
inline ClipRect SetupClip(PixelBuffer* a_target, bool retina = true)//false)
{
  const bool usable = a_target && a_target->m_pixels && a_target->m_width >= 1 && a_target->m_height >= 1;
  if (!usable)
  {
    printf("Invalid target\n");
    //abort();
    return { false, 1, 0, 1, 0 };
  }

  int clipW = a_target->m_width;
  int clipH = a_target->m_height;
  if (retina && a_target->m_isRetina)
  {
    clipW = a_target->m_width / c_retinaScale;
    clipH = a_target->m_height / c_retinaScale;
  }
  return { true, 0, clipW - 1, 0, clipH - 1 };
}


// Alpha-blends a filled rectangle of a_color over the target, using the top byte of a_color
// as the opacity (255 = fully covers the destination, 0 = leaves it untouched).
//
// The blend operands follow the same order as the other blended draws in this file
// (source color first, destination second). Previously they were reversed, which made the
// alpha act as transparency: an opaque color drew nothing.
//
// The rectangle is intersected with the clip rectangle once, up front, so the loops only
// visit pixels that are actually written.
void DrawRectangleAlpha(PixelBuffer* a_target, uint32_t a_color, int a_x, int a_y, int a_width, int a_height)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
  const uint8_t alpha = (a_color >> 24) & 0xff;

  // Inclusive bounds of the visible part of the rectangle
  const int xFirst = (a_x < clip.x1) ? clip.x1 : a_x;
  const int yFirst = (a_y < clip.y1) ? clip.y1 : a_y;
  const int xLast = (a_x + a_width - 1 > clip.x2) ? clip.x2 : a_x + a_width - 1;
  const int yLast = (a_y + a_height - 1 > clip.y2) ? clip.y2 : a_y + a_height - 1;

  for (int y = yFirst; y <= yLast; y++)
  {
    uint32_t* row = &(a_target->m_pixels[y*a_target->m_strideBytes/4]);
    for (int x = xFirst; x <= xLast; x++)
      row[x] = blendColors(a_color, row[x], alpha);
  }
}


// Fills a rectangle with a_color (no blending). When a_setAlpha is true the top byte of the
// written color is forced to 0xFF; otherwise a_color is written exactly as given.
//
// The rectangle is intersected with the clip rectangle once, up front, instead of testing
// every pixel, so large or mostly off-screen rectangles only cost their visible area.
void DrawRectangle(PixelBuffer* a_target, uint32_t a_color, int a_x, int a_y, int a_width, int a_height, bool a_setAlpha)
{
  ClipRect clip = SetupClip(a_target, true); if (!clip.valid) return;
  if (a_setAlpha)
    a_color |= 0xFF000000;

  // Inclusive bounds of the visible part of the rectangle
  const int xFirst = (a_x < clip.x1) ? clip.x1 : a_x;
  const int yFirst = (a_y < clip.y1) ? clip.y1 : a_y;
  const int xLast = (a_x + a_width - 1 > clip.x2) ? clip.x2 : a_x + a_width - 1;
  const int yLast = (a_y + a_height - 1 > clip.y2) ? clip.y2 : a_y + a_height - 1;

  for (int y = yFirst; y <= yLast; y++)
  {
    uint32_t* row = &(a_target->m_pixels[y*a_target->m_strideBytes/4]);
    for (int x = xFirst; x <= xLast; x++)
      row[x] = a_color;
  }
}


// Draws a filled ellipse inscribed in the a_width x a_height box whose top-left is (a_x, a_y).
// The top byte of a_color is the opacity: 0 draws nothing, 255 takes the opaque path (solid
// interior plus a blended rim), anything else blends the interior with that opacity and writes
// it to the top byte of the destination.
//
// NOTE: This isn't highly optimized at the moment, could be made to be a lot more efficient
// such as clipping outside the loops, using shifts instead of divides, not needing to iterate over
// the entire area to draw the ellipse etc.
// It does draw the ellipse filled, not just the outline, but that could be done more efficiently by
// scanning from top to bottom and algebraically calculating the first x pos for each scan line, then
// can do x2 = width - x1, then fill from x1 to x2
// Currently a_smoothEdge is not used
void DrawEllipse(PixelBuffer* a_target, uint32_t a_color, int a_x, int a_y, int a_width, int a_height, bool a_smoothEdge)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
  (void)a_smoothEdge;
  const int alpha = (a_color >> 24) & 0xff;
  if ( alpha == 0 )
    return;
  const bool opaque = (alpha == 255);
  const int centerX = a_width/2;
  const int centerY = a_height/2;
  // TODO: convert to shifts instead of dividing by scale
  int scale = 64;  // This is to stop overflows with 32bit ints
  if ( a_width < 256 && a_height < 256 )
    scale = 1;

  // Loop invariants. The point (i, j) is inside the ellipse when a*a + b*b < c*c, where
  // a and b are the offsets from the centre cross-multiplied by the other dimension.
  // fullR is the squared threshold for the full-size ellipse, solidR the (smaller) one for
  // the ellipse shrunk by one pixel; the opaque path blends the rim between the two.
  const int c1 = a_width * a_height / (2 * scale);
  const int c2 = (a_width-1) * (a_height-1) / (2 * scale);
  const int fullR = c1*c1;
  const int solidR = c2*c2;
  const int rimSpan = solidR - fullR;   // <= 0

  for (int j = 0; j < a_height; j++)
    for (int i = 0; i < a_width; i++)
    {
      const int x = i + a_x;
      const int y = j + a_y;
      if (!PixelClipTest(clip, x, y))
        continue;

      uint32_t *dst = &(a_target->m_pixels[y*a_target->m_strideBytes/4 + x]);
      const int a = (i - centerX) * a_height / scale;
      const int b = (j - centerY) * a_width / scale;
      const int dist = a*a + b*b;

      if ( opaque )
      {
        if ( dist < solidR )
          *dst = a_color;
        else if ( dist < fullR && rimSpan != 0 )
        {
          // The rim opacity is only computed here, where it is needed. Computing it for every
          // pixel divided by zero when both thresholds were equal (e.g. a 1x1 ellipse).
          // It runs from 255 at the solid edge down towards 0 at the outer edge.
          const int rimAlpha = ((dist - fullR) * 255) / rimSpan;
          *dst = blendColors(a_color, *dst, rimAlpha);
        }
      }
      else if ( dist < fullR )
      {
        *dst = (uint32_t(alpha) << 24) | (blendColors(a_color, *dst, alpha) & 0xffffff);
      }
    }
}


// Fills the a_width x a_height area at (a_x, a_y) with a_gradient.
// For every visible pixel a position "dist" in [0, 1] is derived from the gradient type
// (radial: distance from the centre divided by m_distance; linear: position between m_y1 and
// m_y2; conical: not implemented, dist stays 0). The first pair of adjacent gradient stops
// whose positions bracket dist is then used to pick the color. If no pair brackets dist
// (including when there are fewer than two stops) the pixel is left untouched.
void DrawGradient(PixelBuffer* a_target, const Gradient& a_gradient, int a_x, int a_y, int a_width, int a_height)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
  for ( int j = 0; j < a_height; j++ )
    for ( int i = 0; i < a_width; i++ )
    {
      int x = i + a_x;
      int y = j + a_y;
      if (!PixelClipTest(clip, x, y))
        continue;
      // if ( x >= 0 && x < a_target->m_width && y >= 0 && y < a_target->m_height )

      float dist = 0.0;

      if (a_gradient.m_type == RADIAL_GRADIENT)
      {
        // Note: the centre is compared against the offsets (i, j) inside the drawn area,
        // not against the target coordinates (x, y).
        int centerX = a_gradient.m_data.m_radial.m_centerX;
        int centerY = a_gradient.m_data.m_radial.m_centerY;
        //float errLUT[16] = { 1.1, -1.1, 2.2, -2.2, 3.3, -3.3, 4.4, -4.4, 5.5, -5.5, 6.6, -6.6, 7.7, -7.7, 8.8, -8.8 };
        //int dist = (int)(errLUT[rand() % 4] + sqrt(float((centerX - i)*(centerX - i) + (centerY - j)*(centerY - j))) * 256 / a_gradient.m_data.m_radial.m_distance);
        dist = sqrt(float((centerX - i)*(centerX - i) + (centerY - j)*(centerY - j))) / a_gradient.m_data.m_radial.m_distance;
        /*
        float distf = sqrt(float((centerX - i)*(centerX - i) + (centerY - j)*(centerY - j))) * 256 / a_gradient.m_data.m_radial.m_distance;
        float frac = distf - floor(distf) - 0.51;
        float frac2 = frac + 0.7;
        float frac3 = float(j*8) / a_height + float(i-90) / a_width;
        if ( frac < 0.1 && frac > -0.1 )
          frac = ( frac < 0.0 ) ? -0.081 : 0.08;
        int dist = (int)(float(frac * 64) - float(frac2 * frac3) + distf);
        //dist += ;
        */
        //dist = (dist > 255) ? 255 : ((dist < 0) ? 0 : dist);
      }
      else if (a_gradient.m_type == CONICAL_GRADIENT)
      {
        // TODO: implement me
      }
      else if (a_gradient.m_type == LINEAR_GRADIENT)
      {
        /*
        m = dy / dx
        y = mx + c;
        c = y - mx;
        m = (y - c) / x;
        */
        // TODO: only the vertical component (m_y1 .. m_y2) is used at the moment

        // NOTE(review): between y1 and y2 the value falls from ~1 to ~0, but the clamped ends
        // give 0 at/above y1 and 1 at/below y2, so the gradient is discontinuous at both ends.
        // One of the two conventions is presumably unintended - confirm which before changing.
        int y1 = a_gradient.m_data.m_linear.m_y1;
        int y2 = a_gradient.m_data.m_linear.m_y2;
        if ( y <= y1 )
          dist = 0;
        else if ( y >= y2 )
          dist = 1.0;
        else {
          dist = float(y2 - y) / (y2 - y1);   // y1 < y < y2 here, so the divisor is non-zero
        }
      }

      uint32_t *dst = &(a_target->m_pixels[y*a_target->m_strideBytes/4 + x]);

      // TODO: this is where flags to do repeat or reflect etc of the gradient
      // pattern should be added
      dist = (dist > 1.0f) ? 1.0f : ((dist < 0.0f) ? 0.0f : dist);

#define ENABLE_GRADIENT_STOPS 1
#if ENABLE_GRADIENT_STOPS
      // Apply gradient stops - perhaps there are more intelligent ways to do this
      // that remember the last stop.
      // The loop condition is written as "stop + 1 < stopCount" because size() is unsigned:
      // "size() - 1" wraps around for an empty stop list and would read out of bounds.
      const size_t stopCount = a_gradient.m_gradientStops.size();
      for (size_t stop = 0; stop + 1 < stopCount; stop++)
      {
        float pos1 = a_gradient.m_gradientStops[stop + 0].m_position;
        float pos2 = a_gradient.m_gradientStops[stop + 1].m_position;

        if (dist >= pos1 && dist <= pos2)
        {
          // Two stops at the same position form a zero-width interval; avoid 0/0 there
          const float span = pos2 - pos1;
          dist = (span > 0.0f) ? (dist - pos1) / span : 0.0f;
          uint32_t col1 = a_gradient.m_gradientStops[stop + 0].m_color;
          uint32_t col2 = a_gradient.m_gradientStops[stop + 1].m_color;
          // NOTE(review): blendColorsF weights its first color by dist, so the start of an
          // interval (dist == 0) shows col2 and the end shows col1 - confirm this is intended.
          *dst = blendColorsF(col1, col2, dist);
          break;
        }
      }
#else
      *dst = blendColorsF(a_gradient.m_color1, a_gradient.m_color2, dist);
#endif
    }
}


/*
void DrawLineFloat(PixelBuffer* a_target, uint32_t a_color, int a_x1, int a_y1, int a_x2, int a_y2)
{
	float x = 0.0f;
	float y = 0.0f;
	float fdy = 1.0f;
	float fdx = 1.0f;
	int dx = a_x2 - a_x1;
	int dy = a_y2 - a_y1;
	int count = 1;
	if ( !dy && !dx ) {
		if ( a_x1 >= 0 && a_x1 < a_target->m_width && a_y1 >= 0 && a_y1 < a_target->m_height )
			a_target->m_pixels[a_y1*a_target->m_strideBytes/4 + a_x1] = a_color;
		return;
	}
	if ( dx*dx > dy*dy ) {
		count = dx;
		fdy = float(dy) / dx;
	} else {
		count = dy;
		fdx = float(dx) / dy;
	}
	if ( count < 0 ) {
		count = -count;
		fdy = -fdy;
		fdx = -fdx;
	}
	//count++;
  // TODO: Using fixed-point would be faster to avoid float->int conversion
	for (; count--; x+=fdx, y+=fdy)
	{
		int xi = int(a_x1 + x);
		int yi = int(a_y1 + y);
		if ( xi >= 0 && xi < a_target->m_width && yi >= 0 && yi < a_target->m_height )
			a_target->m_pixels[yi*a_target->m_strideBytes/4 + xi] = a_color;
	}
}
*/


// Moves the point (a_x, a_y), which lies on a line with direction (a_dx, a_dy), along that
// line so that it falls within a_bounds: x is clamped first and y recomputed from the line
// equation, then y is clamped and x recomputed.
// For horizontal or vertical lines only the coordinate along the line is clamped (the slope
// or its inverse would otherwise be a division by zero). A zero-length direction leaves the
// point untouched.
// The function cannot report a line that misses the bounds entirely; in that case the
// returned point can still be outside, so callers need their own rejection test.
// NOTE(review): the intercepts are truncated to int, which loses precision for shallow or
// steep lines; the callers in this file are currently commented out.
void ClipLinePoint(int& a_x, int& a_y, int a_dx, int a_dy, const Rectangle& a_bounds)
{
  const int bx1 = a_bounds.m_x;
  const int by1 = a_bounds.m_y;
  const int bx2 = bx1 + a_bounds.m_width - 1;
  const int by2 = by1 + a_bounds.m_height - 1;

  if (a_dx == 0 || a_dy == 0)
  {
    if (a_dx != 0)        // horizontal: y is constant, slide along x
      a_x = (a_x < bx1) ? bx1 : ((a_x > bx2) ? bx2 : a_x);
    else if (a_dy != 0)   // vertical: x is constant, slide along y
      a_y = (a_y < by1) ? by1 : ((a_y > by2) ? by2 : a_y);
    return;
  }

  const float m1 = float(a_dy) / a_dx;   // dy/dx
  const float m2 = float(a_dx) / a_dy;   // dx/dy
  const int c1 = a_y - m1 * a_x;         // y intercept
  const int c2 = a_x - m2 * a_y;         // x intercept

  a_x = (a_x < bx1) ? bx1 : ((a_x > bx2) ? bx2 : a_x);
  a_y = m1 * a_x + c1;
  a_y = (a_y < by1) ? by1 : ((a_y > by2) ? by2 : a_y);
  a_x = m2 * a_y + c2;
}


// Draws a horizontal run of pixels on row a_y from a_x1 to a_x2 inclusive (the two x values
// may be given in either order). The run is clipped to the target width; a run that lies
// completely outside the target draws nothing.
// When a_blend is false the top byte of the written color is forced to 0xFF. No blending
// with the destination is performed in either case; the color is simply stored.
void DrawHLine(PixelBuffer* a_target, uint32_t a_color, int a_x1, int a_x2, int a_y, bool a_blend)
{
  if (!a_target || !a_target->m_pixels || a_target->m_width < 1)
    return;
  if (a_y < 0 || a_y >= a_target->m_height)
    return;
  if (!a_blend)
    a_color |= 0xFF000000;
  if (a_x1 > a_x2)
  {
    int tmp = a_x1;
    a_x1 = a_x2;
    a_x2 = tmp;
  }

  // Reject runs that are entirely off one side. Clamping both ends instead would draw a
  // stray pixel in the first or last column.
  const int lastX = a_target->m_width - 1;
  if (a_x2 < 0 || a_x1 > lastX)
    return;
  if (a_x1 < 0)
    a_x1 = 0;
  if (a_x2 > lastX)
    a_x2 = lastX;

  uint32_t *pix = &a_target->m_pixels[a_y*a_target->m_strideBytes/4 + a_x1];
  for (int x = a_x1; x <= a_x2; ++x)
  {
    *pix = a_color;
    ++pix;
  }
}


// Draws a line from (a_x1, a_y1) to (a_x2, a_y2), both end points included, by stepping one
// pixel at a time along the major axis and advancing the minor axis with a 32.32 fixed point
// increment. Every pixel is tested against the clip rectangle before it is written.
// When a_blend is false the top byte of the written color is forced to 0xFF. No blending
// with the destination is performed in either case; the color is simply stored.
void DrawLine(PixelBuffer* a_target, uint32_t a_color, int a_x1, int a_y1, int a_x2, int a_y2, bool a_blend)
{
  ClipRect clip = SetupClip(a_target, true); if (!clip.valid) return;
  const int dx = a_x2 - a_x1;
  const int dy = a_y2 - a_y1;

  if (!a_blend)
    a_color |= 0xFF000000;

  // Single pixel case
  if ( !dy && !dx ) {
    if (PixelClipTest(clip, a_x1, a_y1))
      a_target->m_pixels[a_y1*a_target->m_strideBytes/4 + a_x1] = a_color;
    return;
  }

  // Work out gradients in 32:32 fixed point maths
  const unsigned shiftCount = 32;
  const int64_t one = 1LL << shiftCount;
  // Multiplication rather than << so that negative coordinates are well defined
  int64_t x = int64_t(a_x1) * one;
  int64_t y = int64_t(a_y1) * one;
  int64_t fdy = one;
  int64_t fdx = one;
  int count = 1;
  // Compare the squared deltas in 64 bits; in 32 bits they overflow once a delta exceeds 46340
  if ( int64_t(dx)*dx > int64_t(dy)*dy ) {
    count = dx;
    fdy = (dy * one) / dx; // fdy = dy/dx  and  fdx = 1   (in fixed-point math)
  } else {
    count = dy;            // dy is non-zero here: dx == dy == 0 was handled above
    fdx = (dx * one) / dy; // fdx = dx/dy  and  fdy = 1
  }
  if ( count < 0 ) {
    // Walking in the negative direction along the major axis: flip both increments
    count = -count;
    fdy = -fdy;
    fdx = -fdx;
  }

  // count is the number of steps between the end points; one more pixel than that is
  // plotted so that both end points are drawn.
  count++;
  for (; count > 0; count--, x += fdx, y += fdy)
  {
    const int xi = x >> shiftCount;
    const int yi = y >> shiftCount;
    if (PixelClipTest(clip, xi, yi))
      a_target->m_pixels[yi*a_target->m_strideBytes/4 + xi] = a_color;
  }
}


// Older line drawing routine, kept for its (unfinished) attempt at clipping the line before
// rasterising it instead of testing every pixel.
// It uses the same 32.32 fixed point stepping along the major axis as DrawLine.
// Two variants are selected at compile time by USE_LINE_CLIPPING (currently 0):
//  - 1: classify both end points against the target bounds, reject lines that are trivially
//       outside, skip the leading/trailing off-screen iterations, then draw (with blending
//       when a_blend is set). Pixels that still fall outside are reported on stderr.
//  - 0: iterate the whole line and bounds-test each pixel.
// When a_blend is false the top byte of a_color is forced to 0xFF.
void DrawLine__Old_But_With_Clipping_Attempt(PixelBuffer* a_target, uint32_t a_color, int a_x1, int a_y1, int a_x2, int a_y2, bool a_blend)
{
	int dx = a_x2 - a_x1;
	int dy = a_y2 - a_y1;

  if (!a_blend)
    a_color |= 0xFF000000;

  if (!a_target) // || !a_target->m_pixels || a_x1 < 0 || a_y1 < 0 || a_x2 < 0 || a_y2 < 0)
    return;

// line-clipping may be broken - getting crashes with it enabled
#define USE_LINE_CLIPPING 0

#if USE_LINE_CLIPPING
  // Might be a more optimal line clipping method
  // Figure out flags for the start and end points, and use those flags.
  // Bits 0-3 describe the start point (left, right, top, bottom), bits 4-7 the end point.
  uint8_t clippingFlags = 0;
  if ( a_x1 < 0 ) clippingFlags |= 1;
  else if ( a_x1 >= a_target->m_width ) clippingFlags |= 2;
  if ( a_y1 < 0 ) clippingFlags |= 4;
  else if ( a_y1 >= a_target->m_height) clippingFlags |= 8;
  if ( a_x2 < 0 ) clippingFlags |= 16;
  else if ( a_x2 >= a_target->m_width ) clippingFlags |= 32;
  if ( a_y2 < 0 ) clippingFlags |= 64;
  else if ( a_y2 >= a_target->m_height) clippingFlags |= 128;
  if ( clippingFlags ) {
    // Trivial rejections
    if ( (clippingFlags & 17) == 17 ) return; // off left   1 + 16
    if ( (clippingFlags & 34) == 34 ) return; // off right  2 + 32
    if ( (clippingFlags & 68) == 68 ) return; // off top    4 + 64
    if ( (clippingFlags & 136) == 136 ) return; // off bottom  8 + 128
  }
#endif

  // TODO: get ClipLinePoint working, then enable using it and disable the verbose clipping code lower down
  // Ideally the clipping can be applied to other functions in Graphics.cpp to reduce the bounds checking
  // inside the loops
  if (dx && dy)
  {
    /*
    Rectangle rect = { { { 0, 0 } }, { { a_target->m_width, a_target->m_height } } };
    ClipLinePoint(a_x1, a_y1, dx, dy, rect);
    ClipLinePoint(a_x2, a_y2, dx, dy, rect);
    */
  }

  // Single pixel case
  // NOTE(review): with USE_LINE_CLIPPING enabled the bounds test below is compiled out, and
  // the trivial rejections above only catch points that are off the same side twice.
	if ( !dy && !dx ) {
#if !USE_LINE_CLIPPING
		if ( a_x1 >= 0 && a_x1 < a_target->m_width && a_y1 >= 0 && a_y1 < a_target->m_height )
#endif
			a_target->m_pixels[a_y1*a_target->m_strideBytes/4 + a_x1] = a_color;
		return;
	}

  // 32.32 fixed point position and per-step increments; the major axis steps by exactly 1
  const unsigned shiftCount = 32;
	int count = 1;
	int64_t x = int64_t(a_x1) << shiftCount;
	int64_t y = int64_t(a_y1) << shiftCount;
	int64_t fdy = 1LL << shiftCount;
	int64_t fdx = 1LL << shiftCount;
	if ( dx*dx > dy*dy ) {
		count = dx;
		fdy = (dy * fdy) / dx; // fdy is dydx & fdx is 1
	} else {
		count = dy;
		fdx = (dx * fdx) / dy; // fdx is dxdy & fdy is 1
	}
	if ( count < 0 ) {
		count = -count;
		fdy = -fdy;
		fdx = -fdx;
	}

#if USE_LINE_CLIPPING
  // Try to start iterating the line in bounds
  // "iterations" is the number of leading steps to skip so that the start is on-screen
  int iterations = 0;
  if ( x < 0 ) {
    if ( fdx <= 0 ) {
      return; // Line starts off-screen and is going wrong way 
    } else {
      iterations = x / -fdx; // fdx can't be zero
    }
  }
  if ( y < 0 ) {
    if ( fdy <= 0 ) {
      return; // Line starts off-screen and is going wrong way 
    } else {
      iterations = std::max<int>(iterations, y / -fdy); // fdy can't be zero
    }
  }
  if ( a_x1 >= a_target->m_width ) {
    if ( fdx >= 0 ) {
      return; // Line starts off-screen and is going wrong way 
    } else {
      iterations = std::max<int>(iterations, (int64_t(a_x1 - (a_target->m_width-1))<<shiftCount) / -fdx); // fdx can't be zero
    }
  }
  if ( a_y1 >= a_target->m_height ) {
    if ( fdy >= 0 ) {
      return; // Line starts off-screen and is going wrong way 
    } else {
      iterations = std::max<int>(iterations, (int64_t(a_y1 - (a_target->m_height-1))<<shiftCount) / -fdy); // fdy can't be zero
    }
  }
  if ( iterations > count ) {
    return;
  } else {
    x += fdx * iterations;
    y += fdy * iterations;
    count -= iterations;

    // For some reason need this, sometimes xi is -1
    if (x < 0) {
      x += fdx;
      y += fdy;
      count--;
    }
  }

  // Now trim the tail: reduce the step count so the walk stops before leaving the target
  iterations = count;
	int64_t x2 = int64_t(a_x2) << shiftCount;
	int64_t y2 = int64_t(a_y2) << shiftCount;
  if ( x2 < 0 ) { // fdx can't be 0 by this point
    iterations = std::min<int>(iterations, (count-1) - ((x2 - (1LL<<(shiftCount))) / fdx));
  }
  if ( y2 < 0 ) {
    iterations = std::min<int>(iterations, (count-1) - ((y2 - (1LL<<(shiftCount))) / fdy));
  }
  if ( a_x2 >= a_target->m_width ) {
    iterations = std::min<int>(iterations, (count-1) - ((int64_t(a_x2 - (a_target->m_width+1))<<shiftCount) / fdx)); // fdx can't be zero
  }
  if ( a_y2 >= a_target->m_height ) {
    iterations = std::min<int>(iterations, (count-1) - ((int64_t(a_y2 - (a_target->m_height+1))<<shiftCount) / fdy)); // fdy can't be zero
  }
  count = iterations;

  if (count > 0)
	for (; count--; x += fdx, y += fdy)
	{
		int xi = x >> shiftCount;
		int yi = y >> shiftCount;
    //a_target->m_pixels[yi*a_target->m_strideBytes/4 + xi] = a_color;
		if ( xi >= 0 && xi < a_target->m_width && yi >= 0 && yi < a_target->m_height )
    {
      uint32_t *dst = &(a_target->m_pixels[yi*a_target->m_strideBytes/4 + xi]);
      *dst = (!a_blend) ? a_color : blendColors(a_color, *dst, (a_color>>24) & 0xff);
    }
    else {
      // Clipping above should have prevented this; report the stray pixel
      fprintf(stderr, "count: %i, xi: %i, yi: %i\n", count, (int)xi, (int)yi);
    }
	}

#else
  // Unclipped variant: count is not incremented here, so the end point itself is not plotted
	//count++;
  if (count > 0)
	for (; count--; x += fdx, y += fdy)
	{
		int xi = x >> shiftCount;
		int yi = y >> shiftCount;
    // NOTE(review): the y test uses m_height/2 while the x test uses the full m_width - confirm
		if ( xi >= 0 && xi < a_target->m_width && yi >= 0 && yi < (a_target->m_height/2) )
			a_target->m_pixels[yi*a_target->m_strideBytes/4 + xi] = a_color;
    //else {
    //  fprintf(stderr, "count: %i, xi: %i, yi: %i\n", count, (int)xi, (int)yi);
    //}
	}
#endif
}


void DrawPixels(PixelBuffer* a_target, uint32_t* a_bits, int a_x, int a_y, int a_width, int a_height, int a_xOffset, int a_yOffset, int a_pixelsWidth, int a_pixelsHeight)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
	for (int j = 0; j < a_height; j++)
		for (int i = 0; i < a_width; i++)
		{
			int x = i + a_x;
			int y = j + a_y;
      if (PixelClipTest(clip, x, y))
			//if ( y > 0 && y < a_target->m_height && x > 0 && x < a_target->m_width )
			{
				uint32_t *dst = &(a_target->m_pixels[(j+a_y)*a_target->m_strideBytes/4 + (i+a_x)]);
				uint32_t src = a_bits[(j+a_yOffset)*a_pixelsWidth+(i+a_xOffset)];
				if ( src & 0xff000000 )
					*dst = src;
			}
		}
}


void DrawPixelsAlpha(PixelBuffer* a_target, uint32_t* a_bits, int a_x, int a_y, int a_width, int a_height, int a_xOffset, int a_yOffset, int a_pixelsWidth, int a_pixelsHeight)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
	for (int j = 0; j < a_height; j++)
		for (int i = 0; i < a_width; i++)
		{
			int x = i + a_x;
			int y = j + a_y;
      if (PixelClipTest(clip, x, y))
			// if ( y > 0 && y < a_target->m_height && x > 0 && x < a_target->m_width )
			{
				uint32_t *dst = &(a_target->m_pixels[(j+a_y)*a_target->m_strideBytes/4 + (i+a_x)]);
				uint32_t src = a_bits[(j+a_yOffset)*a_pixelsWidth+(i+a_xOffset)];
				if ( src & 0xff000000 )
					*dst = blendColors(src, *dst, (src>>24) & 0xff);
			}
		}
}


// Specialized function for blitting a cached glyph to the output colorized with a_color
// a_bits are the rasterized glyph, where it contains alpha, as well as black for the character, and color on the edges for sub-pixel rendering
void DrawPixelsText(PixelBuffer* a_target, uint32_t a_color, uint32_t* a_bits, int a_x, int a_y, int a_width, int a_height, int a_xOffset, int a_yOffset, int a_pixelsWidth, int a_pixelsHeight)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
	for (int j = 0; j < a_height; j++)
		for (int i = 0; i < a_width; i++)
		{
			int x = i + a_x;
			int y = j + a_y;
      if (PixelClipTest(clip, x, y))
			// if ( y > 0 && y < a_target->m_height && x > 0 && x < a_target->m_width )
			{
				uint32_t *dst = &(a_target->m_pixels[(j+a_y)*a_target->m_strideBytes/4 + (i+a_x)]);
				uint32_t src = a_bits[(j+a_yOffset)*a_pixelsWidth+(i+a_xOffset)];
				if ( src & 0xff000000 )
					*dst = blendColors(a_color | src, *dst, (src>>24) & 0xff);
			}
		}
}


void DrawPixelsAlphaBlended(PixelBuffer* a_target, uint32_t* a_bits, int a_x, int a_y, int a_width, int a_height, int a_xOffset, int a_yOffset, int a_pixelsWidth, int a_pixelsHeight, int a_alpha)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;
	for (int j = 0; j < a_height; j++)
		for (int i = 0; i < a_width; i++)
		{
			int x = i + a_x;
			int y = j + a_y;
      if (PixelClipTest(clip, x, y))
			//if ( y > 0 && y < a_target->m_height && x > 0 && x < a_target->m_width )
			{
				uint32_t *dst = &(a_target->m_pixels[(j+a_y)*a_target->m_strideBytes/4 + (i+a_x)]);
				uint32_t src = a_bits[(j+a_yOffset)*a_pixelsWidth+(i+a_xOffset)];
				if ( src & 0xff000000 )
					*dst = blendColors(src, *dst, a_alpha);
			}
		}
}


// TODO: move to header/common, make use of else where
// perhaps PixelBuffer can be of PixelValue instead of uint32_t
//
// A 32-bit pixel that can be accessed either as one packed value or as four bytes.
// It converts implicitly from and to uint32_t, so it can be used wherever a packed pixel is.
// Which byte of m_components corresponds to which channel depends on the byte order of the
// machine, because the two union members share the same storage.
struct PixelValue
{
  PixelValue(uint32_t v) : m_value(v) {}        // intentionally not explicit
  operator uint32_t() const { return m_value; }
  PixelValue& operator=(uint32_t val) { m_value = val; return *this; }
  union
  {
    uint32_t  m_value;          // the packed pixel
    uint8_t   m_components[4];  // the same pixel, one byte per channel
  };
};


// Accumulator for pixel values: holds each of the four 8-bit channels of a PixelValue in its
// own 32-bit slot, so that many pixels can be summed without the channels overflowing into
// each other, then divided to form an average and converted back to a packed pixel.
struct WidePixelValue
{
  // Starts with all four channels at zero
  WidePixelValue() : m_components{ 0, 0, 0, 0 } {}

  // Replaces the accumulated channels with those of a single pixel
  WidePixelValue& operator=(PixelValue other) {
    for (int i = 0; i < 4; i++)
      m_components[i] = other.m_components[i];
    return *this;
  }

  // Adds the channels of a pixel to the accumulated sums
  WidePixelValue& operator+=(PixelValue other) {
    for (int i = 0; i < 4; i++)
      m_components[i] += other.m_components[i];
    return *this;
  }

  // Divides every channel by v (v must be non-zero), e.g. to turn sums into an average
  WidePixelValue& operator/=(int v) {
    for (int i = 0; i < 4; i++)
      m_components[i] /= v;
    return *this;
  }

  // Packs the channels back into a 32-bit pixel; each channel is narrowed to 8 bits.
  // Declared const so that it can also be used on const accumulators.
  operator uint32_t() const {
    PixelValue out(0);
    for (int i = 0; i < 4; i++)
      out.m_components[i] = m_components[i];
    return out;
  }

  uint32_t m_components[4];
};


template <bool axis>
void SmoothDownSampleHelper(uint32_t* a_dstBits, int a_dstW, int a_dstH, uint32_t* a_srcBits, int a_width, int a_height)
{
  int64_t maxJ = (axis) ? a_height : a_width;
  int64_t maxI = (axis) ? a_width : a_height;
  int64_t maxDI = (axis) ? a_dstW : a_dstH;
  if (!maxI)
    return;
  // 32.32 fixed point
  int64_t dDdS = (maxDI << 32) / (maxI);
  for (int j = 0; j < maxJ; j++) {
    int64_t d = (1LL<<32) / 3;
    int c = 0, di = 0;
    WidePixelValue col;
    for (int i = 0; i < maxI; i++) {
      int x = (axis) ? i : j;
      int y = (axis) ? j : i;
      col += a_srcBits[y * a_width + x];
      c++;
      d += dDdS; // DDA style
      if (d >= (1LL<<32)) { // emit
        d -= (1LL<<32);
        col /= c;
        if (axis)
          a_dstBits[y * a_dstW + di] = col;
        else
          a_dstBits[di * a_dstW + x] = col;
        c = 0;
        col = 0;
        di++;
        if (di == maxDI)
          break;
      }
    }  
  }
}


// Down-samples the a_width x a_height image in a_srcBits to a_dstW x a_dstH in a_dstBits by
// averaging, in two passes: first along x into a temporary a_dstW x a_height image, then
// along y into the destination.
// Returns false, without touching the destination, when the destination is larger than the
// source in either dimension (this function only shrinks) or when any dimension is negative.
// Returns true otherwise.
bool SmoothDownSample(uint32_t* a_dstBits, int a_dstW, int a_dstH, uint32_t* a_srcBits, int a_width, int a_height)
{
  if (a_dstW < 0 || a_dstH < 0 || a_width < 0 || a_height < 0)
    return false;
  if (a_dstW > a_width || a_dstH > a_height)
    return false;

  // Intermediate image, resized in x only. The vector releases the storage on every path and
  // the size is computed in size_t so that large images do not overflow an int.
  std::vector<uint32_t> tmpBits(size_t(a_height) * size_t(a_dstW));

  // First scale in the x-axis
  SmoothDownSampleHelper<true>(tmpBits.data(), a_dstW, a_height, a_srcBits, a_width, a_height);
  // Now scale in the y-axis
  SmoothDownSampleHelper<false>(a_dstBits, a_dstW, a_dstH, tmpBits.data(), a_dstW, a_height);
  return true;
}



#if 0

// Returns true when the file name a_file ends with the extension a_ext (including the dot,
// e.g. ".png"), compared without regard to ASCII case.
// Extensions of any non-zero length are accepted. Null pointers, an empty extension, or a
// file name shorter than the extension give false.
static bool TestFileExtension(const char* a_file, const char* a_ext)
{
	if (!a_file || !a_ext)
		return false;
	const size_t fileLen = strlen(a_file);
	const size_t extLen = strlen(a_ext);
	if (extLen == 0 || fileLen < extLen)
		return false;
	const char* tail = a_file + (fileLen - extLen);
	for (size_t i = 0; i < extLen; i++)
	{
		// toupper requires a value representable as unsigned char; a plain char can be
		// negative for non-ASCII bytes, so convert first.
		if (toupper((unsigned char)tail[i]) != toupper((unsigned char)a_ext[i]))
			return false;
	}
	return true;
}


// PNG decoder implemented elsewhere in the project.
extern int DecodePNG(std::vector<unsigned char>& out_image, unsigned long& image_width, unsigned long& image_height, const unsigned char* in_png, size_t in_size, bool convert_to_rgba32);


// Loads the image at 'file' into a_img. Only files whose name ends in ".png" are handled.
// On success a_img describes a newly allocated w x h buffer of 32-bit pixels (the caller
// becomes responsible for a_img.m_pixels) and true is returned; otherwise false is returned.
// After decoding, bits 0-7 and bits 16-23 of every pixel are swapped.
// When chromaKeyed is true, every pixel whose value is not exactly 0x00ff00 has its top byte
// set to 0xff.
// NOTE(review): no call that releases 'mapping' is visible in this function - confirm whether
// the memory mapping is leaked on both the success and failure paths.
// NOTE(review): a non-zero DecodePNG result is treated as success here; some decoders return
// 0 on success - confirm against the DecodePNG implementation.
static bool LoadImage(const char* file, PixelBuffer& a_img, bool chromaKeyed = true)
{
	if (TestFileExtension(file, ".png"))
	{
		// It's a PNG image
		MemoryMappingData* mapping = MemoryMapping_Open(file);
    if (!mapping)
      return false;
		// For 32bit builds, this code won't support input PNG files that are more than 4GiB, but neither can it support
		// having a buffer as large as that for storing the pixels in to, so perhaps best just to use 64bit builds if dealing with massive images.
		size_t siz = (size_t)MemoryMapping_GetSize(mapping);
		uint8_t* buf = (uint8_t*)MemoryMapping_GetAddress(mapping);
		std::vector<unsigned char> out_image;
    unsigned long w, h;
		if (DecodePNG(out_image, w, h, buf, siz, true))
    {
      a_img.m_height = h;
      a_img.m_width = w;
      a_img.m_strideBytes = w * sizeof(uint32_t);
      a_img.m_format = PF_ARGB8888;
      a_img.m_pixels = new uint32_t[w * h];
      memcpy(a_img.m_pixels, out_image.data(), w * h * sizeof(uint32_t));
      // Swap the lowest byte with the third byte of each pixel, keeping the other two
      for ( int i = 0; i < w*h; i++ )
        a_img.m_pixels[i] = ((a_img.m_pixels[i] << 16) & 0xff0000) |
          ((a_img.m_pixels[i] >> 16) & 0xff) | (a_img.m_pixels[i] & 0xff00ff00);
      // Make everything except the key value 0x00ff00 opaque
      if ( chromaKeyed )
        for ( int i = 0; i < w*h; i++ )
          if ( a_img.m_pixels[i] != 0x00ff00 )
            a_img.m_pixels[i] |= 0xff000000;
      return true;
    }
	}
  return false;
}


// Draws the sub-rectangle (x1, y1)-(x2, y2), inclusive, of the image stored in 'file' to the
// target at (x, y). Loaded images are kept in a function-local cache for the lifetime of the
// program, so each file is only decoded once.
// The cache is keyed by the file name's contents (std::string). Keying by the const char*
// pointer would compare addresses, so the same name passed from two different buffers would
// be loaded twice.
// Nothing is drawn when 'file' is null or the image cannot be loaded; a failed load is
// retried on the next call.
void DrawPixmapFile(PixelBuffer* a_target, const char* file, int x, int y, int x1, int y1, int x2, int y2)
{
	if (!file)
		return;
	static std::map<std::string,PixelBuffer> imageMap; // pixmap cache
	PixelBuffer& img = imageMap[file];  // value-initialised (null pixels) on first use
	if (!img.m_pixels)
	{
		if (!LoadImage(file, img, false))
			return;
	}
	DrawPixels(a_target, img.m_pixels, x, y, x2-x1+1, y2-y1+1, x1, y1, img.m_width, img.m_height);
}


// Draws the decimal digits of 'num' as a fixed-width field of 'width' digits, using 16x30
// digit cells taken from the image g_numbersPixels. The least significant digit is drawn in
// the right-most cell, and the cell for digit d is read from row offset 30*d of the image.
// 'col' is not used.
// NOTE(review): g_numbersPixels, g_numbersPixelsW and g_numbersPixelsH are not defined in the
// visible part of this file, and the line that loaded them is commented out.
void DrawNumber(PixelBuffer* a_target, uint32_t col, int x, int y, int num, int width)
{
//	LoadImage("numbers.png", g_numbersPixelsW, g_numbersPixelsH, g_numbersPixels, false);
	int divisor = 1;
	// Walk from the right-most cell to the left-most, moving up one decimal place each time
	for (int i = width; i != 0; i--, divisor*=10)
		DrawPixels(a_target, g_numbersPixels, x + (i-1)*16, y, 16, 30, 0, 30*((num/divisor)%10), g_numbersPixelsW, g_numbersPixelsH);
}


#endif


// Formats a printf-style message into a fixed 1024 byte buffer (longer messages are
// truncated) and writes it, followed by a newline, to stdout. On Windows the text is also
// sent to the debugger output. a_level is currently not used.
void LogMessage(LogLevel a_level, const char* a_formatString, ...)
{
  char text[1024];

  va_list args;
  va_start(args, a_formatString);
  vsnprintf(text, sizeof(text), a_formatString, args);
  va_end(args);

  text[sizeof(text) - 1] = 0; // Ensure null terminated
#ifdef _WIN32
  OutputDebugStringA(text);
#endif
  puts(text);
}


// Simple 3 control point bezier curve
// p1 and p3 are the 2 end points the curve starts/ends at
// p2 is another control point but not a point on the curve
// t is the ratio between the two end points to interpolate a result for
// Evaluates the quadratic bezier with end points p1, p3 and control point p2 at parameter t
// in [0, 1] (p2 influences the curve but is not on it), returning the point rounded to the
// nearest integer coordinates.
// Rounding is floor(v + 0.5). The previous ceil(v + 0.5) moved every exact integer result up
// by one, so even t == 0 and t == 1 did not return the end points themselves.
vec2i bezierCurve(const vec2i& p1, const vec2i& p2, const vec2i& p3, double t)
{
  const double t2 = 1.0 - t;
  // Bernstein weights of the three control points
  const double w1 = t2*t2;
  const double w2 = 2.0*t2*t;
  const double w3 = t*t;
  vec2i pos;
  pos.x = (int)std::floor(w1*p1.x + w2*p2.x + w3*p3.x + 0.5);
  pos.y = (int)std::floor(w1*p1.y + w2*p2.y + w3*p3.y + 0.5);
  return pos;
}


// Simple 3 control point bezier curve
// p1 and p3 are the 2 end points the curve starts/ends at
// p2 is another control point but not a point on the curve
// t is a ratio (from 0 to 65535) between the two end points to interpolate a result for
// Fixed point version of the quadratic bezier evaluation.
// p1 and p3 are the end points, p2 is the control point (not on the curve).
// t is the curve parameter in 16.16 fixed point: 0 is p1 and 0x10000 (65536) is p3.
// The three weights are 32.32 fixed point values that sum to exactly 1.0, and the result is
// rounded to the nearest integer by adding one half before dropping the fraction.
// The half is written as an unsigned 64-bit constant: the int expression (1<<31) is negative,
// so when widened it subtracted a half instead of adding it and every point came out one
// pixel low (t == 0 returned p1 - 1).
vec2i bezierCurveFixed(const vec2i& p1, const vec2i& p2, const vec2i& p3, uint64_t t)
{
  const uint64_t half = 1ULL << 31;   // 0.5 in 32.32 fixed point
  const uint64_t t2 = 0x10000 - t;
  const uint64_t t2t2 = t2*t2;
  const uint64_t t2t = t2*t;
  const uint64_t tt = t*t;
  // Negative coordinates wrap when converted to uint64_t, but the sum is still correct
  // modulo 2^64, and the final narrowing to int recovers the sign.
  vec2i pos;
  pos.x = int((t2t2*p1.x + 2*t2t*p2.x + tt*p3.x + half) >> 32);
  pos.y = int((t2t2*p1.y + 2*t2t*p2.y + tt*p3.y + half) >> 32);
  return pos;
}


// Draws the quadratic bezier (end points p1, p3, control point p2) as a chain of short
// straight lines. The number of segments is twice the larger of the x and y distances
// between the two end points, and the curve is evaluated with the fixed point bezier.
// Nothing is drawn when the target is invalid or when p1 and p3 coincide.
// (A floating point variant based on bezierCurve used to live here as commented-out code.)
void DrawCurve(PixelBuffer* a_target, uint32_t a_color, const vec2i& p1, const vec2i& p2, const vec2i& p3, bool a_blend)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;

  // Pick the delta with the larger magnitude and take its absolute value
  const int deltaX = p1.x - p3.x;
  const int deltaY = p1.y - p3.y;
  const int major = (deltaX*deltaX > deltaY*deltaY) ? deltaX : deltaY;
  const int segments = 2 * ((major < 0) ? -major : major);

  // Fixed precision version
  for (int seg = 0; seg < segments; ++seg)
  {
    // Parameter values of the two ends of this segment, in 16.16 fixed point
    const int tFrom = ((seg << 16) - (segments/2)) / segments;
    const int tTo = (((seg+1) << 16) - (segments/2)) / segments;
    const vec2i from = bezierCurveFixed(p1, p2, p3, tFrom);
    const vec2i to = bezierCurveFixed(p1, p2, p3, tTo);
    DrawLine(a_target, a_color, from.x, from.y, to.x, to.y, a_blend);
  }
}


// Plots the quadratic bezier (end points p1, p3, control point p2) as individual pixels,
// OR-ing a_color into each destination pixel rather than overwriting it. The curve is
// sampled at twice the larger of the x and y distances between the end points, using the
// fixed point bezier, and both ends of every sample interval are plotted.
// Nothing is drawn when the target is invalid or when p1 and p3 coincide.
// (A floating point variant based on bezierCurve used to live here as commented-out code.)
void DrawFontCurveInternal(PixelBuffer* a_target, uint32_t a_color, const vec2i& p1, const vec2i& p2, const vec2i& p3)
{
  ClipRect clip = SetupClip(a_target); if (!clip.valid) return;

  // Pick the delta with the larger magnitude and take its absolute value
  const int deltaX = p1.x - p3.x;
  const int deltaY = p1.y - p3.y;
  const int major = (deltaX*deltaX > deltaY*deltaY) ? deltaX : deltaY;
  const int samples = 2 * ((major < 0) ? -major : major);

  // OR the color into the pixel at p, if p is inside the clip rectangle
  auto plot = [&](const vec2i& p)
  {
    if (PixelClipTest(clip, p.x, p.y))
      a_target->m_pixels[p.y*a_target->m_strideBytes/4 + p.x] |= a_color;
  };

  // Fixed precision version
  for (int s = 0; s < samples; ++s)
  {
    // Parameter values of the two ends of this interval, in 16.16 fixed point
    const int tFrom = ((s << 16) - (samples/2)) / samples;
    const int tTo = (((s+1) << 16) - (samples/2)) / samples;
    const vec2i from = bezierCurveFixed(p1, p2, p3, tFrom);
    const vec2i to = bezierCurveFixed(p1, p2, p3, tTo);
    plot(from);
    plot(to);
  }
}


// Returns the bounding rectangle of all control points of all curves in the outline.
// m_x / m_y are the minimum coordinates; m_width / m_height are inclusive extents
// (max - min + 1).
// An outline with no curves yields an empty rectangle at the origin. Computing the extents
// from the initial sentinels instead would overflow (INT_MIN - INT_MAX + 1).
Rectangle glyphBounds(const Glyph::Outline& outline)
{
  Rectangle ret;
  if (outline.m_lines.size() == 0)
  {
    ret.m_x = 0;
    ret.m_y = 0;
    ret.m_width = 0;
    ret.m_height = 0;
    return ret;
  }

  int minX = INT_MAX, minY = INT_MAX;
  int maxX = INT_MIN, maxY = INT_MIN;
  for (size_t i = 0; i < outline.m_lines.size(); i++)
  {
    const Glyph::Outline::Curve& b = outline.m_lines[i];
    for (int c = 0; c < 3; c++) {
      const int x = b.m_controlPoints[c].m_x;
      const int y = b.m_controlPoints[c].m_y;
      if (x < minX) minX = x;
      if (x > maxX) maxX = x;
      if (y < minY) minY = y;
      if (y > maxY) maxY = y;
    }
  }
  ret.m_x = minX;
  ret.m_y = minY;
  ret.m_width = maxX - minX + 1;
  ret.m_height = maxY - minY + 1;
  return ret;
}


// TODO: remove these debugging variables
// Font rendering options. With DEBUG_FONT_RENDERING set to 1 they are ordinary mutable
// globals; with it set to 0 they become constants holding the same default values.
#define DEBUG_FONT_RENDERING  1
#if DEBUG_FONT_RENDERING
// When true, DrawGlyphFilled also draws the glyph outline (in 0xFF000000) over the filled glyph
bool drawWholeOutline = false;
// Name handed to getFont() by GetFontMetrics, GetGlyphMetrics and ProcessText
std::string fontOpt = "Arial";
// When true, RasterizeCharacter2 draws from its anti-aliased bitmap cache (m_aaSizeMap)
bool drawAAText = true;
#else
const bool drawWholeOutline = false;
const std::string fontOpt = "Arial";
const bool drawAAText = true;
#endif


// debug thing for debugging the rasterization stages:
// DrawText sets this from its a_flags argument for the duration of the call and then puts
// it back to 100. Any value other than 100 makes RasterizeCharacter2 bypass its caches and
// draw the raw rasterization buffer, and makes RasterizeCharacterToBuffer redraw the
// outline on top of the fill.
int g_stage = 100;   // 100 - means do all the stages
// When false, ProcessText multiplies the kerning adjustment by zero, i.e. ignores kerning
bool g_enableKerning = true;


// A rasterized glyph: a pixel buffer plus the rectangle describing it.
// Instances are produced by CreateCachedGlyphData, which allocates buf.m_pixels with
// new[]; ReleaseCachedGlyphData is the matching delete[].
// NOTE(review): copying this struct appears to copy only the pixel pointer, not the
// pixels - confirm against the PixelBuffer definition.
struct CachedGlyphData
{
  // x/y: glyph bounds origin as returned by glyphBounds (unscaled);
  // width/height: size of buf in pixels
  Rectangle   rect;
  // The rasterized pixels (PF_ARGB8888 in every place this file creates one)
  PixelBuffer buf;
};


// Per-glyph cache of rasterized bitmaps, stored by RasterizeCharacter2 in the glyph's
// m_userData.
// NOTE(review): nothing visible in this file releases the pixel buffers held in these
// maps - confirm who owns them when the cache is destroyed.
struct GlyphRasterizationCache : public Glyph::CacheDataInterface
{
  // Bitmaps straight from RasterizeCharacterToBuffer, keyed by the size they were rasterized at
  std::map<int, CachedGlyphData> m_sizeMap;
  // Bitmaps derived from m_sizeMap entries for anti-aliased drawing, keyed by the requested size
  std::map<int, CachedGlyphData> m_aaSizeMap;
};


// Allocates a zero-filled ARGB8888 buffer big enough to rasterize the outline at a_size.
//
// The returned rect keeps the outline's unscaled origin (from glyphBounds) while its
// width/height become the buffer dimensions in pixels: the bounds scaled by a_size / 1024
// (with bias added before the shift) plus 5 pixels of padding.
// TODO: the padding amount is unexplained - earlier notes only account for a one pixel
//       guard band needed because of the sub-pixel bias.
//
// A non-positive a_size prints "bad size" and terminates the process.
// The pixel storage is allocated with new[]; release it with ReleaseCachedGlyphData.
CachedGlyphData CreateCachedGlyphData(int a_size, const Glyph::Outline& outline, const int bias)
{
  if (a_size <= 0)
  {
    printf("bad size\n");
    exit(-1);
  }

  Rectangle scaled = glyphBounds(outline);
  scaled.m_width = ((scaled.m_width * a_size + bias) >> 10) + 5;
  scaled.m_height = ((scaled.m_height * a_size + bias) >> 10) + 5;

  const int pixelCount = scaled.m_width*scaled.m_height;
  PixelBuffer pixels = {
    new uint32_t[pixelCount](),   // value-initialized: every pixel starts as 0x00000000
    scaled.m_width*4,
    scaled.m_width,
    scaled.m_height,
    PF_ARGB8888,
    false
  };

  CachedGlyphData result;
  result.rect = scaled;
  result.buf = pixels;
  return result;
}


// Frees the pixel storage that CreateCachedGlyphData allocated for data.
// The pointer is cleared afterwards so that a stale copy of it is not left in the struct
// and so that releasing the same object a second time is harmless (delete[] of a null
// pointer does nothing).
void ReleaseCachedGlyphData(CachedGlyphData& data)
{
  delete[] data.buf.m_pixels;
  data.buf.m_pixels = nullptr;
}


// Draws every curve of the outline into buf, tagging each one with a colour that records
// which way it runs vertically.
//
// Control points are translated by the rect origin, scaled by a_size / 1024 (bias is added
// before the shift), moved one pixel in from the left edge and flipped vertically so the
// outline's increasing y runs up the buffer. A curve whose first point ends up lower in
// the buffer (larger y) than its last point is drawn in leftColor, any other curve in
// rightColor; RasterizeCharacterToBuffer's scanline fill keys off these two colours.
void DrawOutlineInternal(PixelBuffer& buf, const int a_size, const Rectangle& rect, const Glyph::Outline& outline, const int bias)
{
  const uint32_t leftColor = 0xFF0000AF;
  const uint32_t rightColor = 0xFFAF0000;

  const size_t curveCount = outline.m_lines.size();
  for (size_t idx = 0; idx != curveCount; ++idx)
  {
    const Glyph::Outline::Curve& curve = outline.m_lines[idx];

    // Transform the three control points into buffer pixel coordinates
    vec2i scaled[3];
    for (int k = 0; k < 3; ++k)
    {
      scaled[k].x = 1 + int((((curve.m_controlPoints[k].m_x - rect.m_x) * a_size) + bias) >> 10);
      scaled[k].y = rect.m_height - 1 - int((((curve.m_controlPoints[k].m_y - rect.m_y) * a_size) + bias) >> 10);
    }

    const bool startsBelowEnd = scaled[0].y > scaled[2].y;
    DrawFontCurveInternal(&buf, startsBelowEnd ? leftColor : rightColor, scaled[0], scaled[1], scaled[2]);
  }
}


// Rasterizes the outline at a_size into a newly allocated buffer and flood-fills its interior.
//
// Steps:
//   1. CreateCachedGlyphData allocates a zeroed buffer sized for the scaled outline.
//   2. DrawOutlineInternal draws the outline, colouring each curve with the "left" or
//      "right" colour according to its vertical direction.
//   3. One pass over the buffer, row by row from the top and left to right within each
//      row, writes fillColor into the pixels judged to be inside the outline.
//   4. Only when g_stage is not 100 (a debug stage), the outline is drawn again on top.
//
// The pixel storage in the returned data comes from new[] (see CreateCachedGlyphData);
// nothing in here frees it, so that is up to the caller.
CachedGlyphData RasterizeCharacterToBuffer(int a_size, const Glyph::Outline& outline)
{
  // Added to the scaled coordinates before the >> 10, i.e. rounds to nearest rather than truncating
  const int bias = 512;// 512;//0;//512;

  //a_size -= 1;
  //a_size++;
  CachedGlyphData data = CreateCachedGlyphData(a_size, outline, bias);

  Rectangle& rect = data.rect;
  PixelBuffer& buf = data.buf;

  // TODO: why does this magic number work?
  //rect.m_y -= 128;//bias;

  DrawOutlineInternal(buf, a_size, rect, outline, bias);

  // Must match the colours used by DrawOutlineInternal
  const uint32_t leftColor = 0xFF0000AF;
  const uint32_t rightColor = 0xFFAF0000;
  const uint32_t fillColor = 0xFF202020;

  // Simplest and best solution to flood-filling so far
  // Potentially could be done in a shader
  for (int j = 0; j < rect.m_height; j++)
  {
// #define GPU_TEST  // Try to do it just by looking at the pixel/texture values
                     // assumes can read the modified values and it
                     // processes in the same order
#ifndef GPU_TEST
    // State of the previous pixel in this row, carried from one iteration to the next
    // (all false at the start of every row)
    bool wasLeft = false;
    bool wasRight = false;
    bool wasFilled = false;
#endif
    for (int i = 0; i < rect.m_width; i++)
    {
      uint32_t color = buf.m_pixels[j*rect.m_width + i];
      // The row above has already been processed, so this sees its fill results; row 0 has nothing above it
      uint32_t aboveColor = (j == 0) ? 0 : buf.m_pixels[(j-1)*rect.m_width + i];
#ifdef GPU_TEST
      // Variant that derives the previous-pixel state from the buffer instead of carried locals
      uint32_t previousColor = (i == 0) ? 0 : buf.m_pixels[j*rect.m_width + (i-1)];
      bool wasLeft = (previousColor & leftColor) == leftColor;
      bool wasRight = (previousColor & rightColor) == rightColor;
      bool wasFilled = previousColor == fillColor;
#endif
      // Outline pixels are OR-ed together when drawn, so edges are recognised by a mask test
      // (a pixel can test as both left and right); fill pixels are recognised by exact value
      bool isLeft = (color & leftColor) == leftColor;
      bool isRight = (color & rightColor) == rightColor;
      bool aboveIsFill = aboveColor == fillColor;
      bool aboveIsRight = (aboveColor & rightColor) == rightColor;
      bool aboveIsLeft = (aboveColor & leftColor) == leftColor;

      // Fill a pixel when it is not itself an edge, the pixel to its left was a left edge
      // or was filled, and the pixel above is an edge or was filled
      bool fill = (!isLeft && !isRight && (wasFilled || wasLeft) && (aboveIsLeft || aboveIsRight || aboveIsFill));
        // && !(!wasLeft && wasRight && aboveIsRight) )

      if (fill)
      {
        buf.m_pixels[j*rect.m_width + i] = fillColor;
      }

#ifndef GPU_TEST
      wasFilled = fill;
      wasLeft = isLeft;
      wasRight = isRight;   // recorded but only referenced by the disabled clause above
#endif
    }
  }


  // Normal rendering stops here, returning the filled glyph
  if (g_stage == 100)
    return data;
  //if (g_stage == 5)
  //  return data;
 
  // For debugging
  // (debug stages only: draw the outline again over the filled result)
  DrawOutlineInternal(buf, a_size, rect, outline, bias);

  return data;
}


// Returns the smallest power of two that is greater than or equal to a_value.
// Values of 1 or less (including negative values) return 1.
// The largest power of two an int can hold is (INT_MAX / 2) + 1; values above that cannot
// be rounded up, so the result saturates at that power of two instead of overflowing the
// shift (which previously never terminated).
int nextPowerOfTwo(int a_value)
{
  const int largestPowerOfTwo = (INT_MAX >> 1) + 1;
  if (a_value >= largestPowerOfTwo)
    return largestPowerOfTwo;

  int i = 1;
  while (i < a_value)
    i <<= 1;
  return i;
}


// Pointer cast that is a checked dynamic_cast in normal builds. When the build defines
// NO_RTTI it falls back to an unchecked reinterpret_cast, in which case the caller is
// trusted to pass a pointer that really is of the requested type.
template <typename TargetPtr, typename SourcePtr>
TargetPtr dynamic_cast2(SourcePtr ptr)
{
#ifdef NO_RTTI
  return reinterpret_cast<TargetPtr>(ptr);
#else
  return dynamic_cast<TargetPtr>(ptr);
#endif
}


void DrawGlyphOutlineBlend(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_x, int& a_y);

// Draws one filled glyph into a_target in a_color, positioned relative to (a_x, a_y).
//
// Rasterized bitmaps are cached per glyph in a GlyphRasterizationCache stored in
// a_glyph.m_userData (created on first use):
//   - m_sizeMap holds the flood-filled bitmap from RasterizeCharacterToBuffer, keyed by
//     the size it was rasterized at.
//   - m_aaSizeMap (only used when drawAAText is set) holds the bitmap that is actually
//     drawn, keyed by the requested size. For sizes below 64 the glyph is rasterized at
//     3 * nextPowerOfTwo(a_size) and smoothly downsampled to a_size; for larger sizes it
//     is a copy of the m_sizeMap bitmap reduced to opaque black / fully transparent.
//
// When g_stage is not 100 (a debug stage requested through DrawText) the caches are
// bypassed: the glyph is rasterized afresh, drawn as-is with DrawPixelsAlpha, and freed.
void RasterizeCharacter2(PixelBuffer* a_target, uint32_t a_color, int a_size, int a_x, int a_y, const Glyph& a_glyph)
{
  const int bias = 0;//512;
  const Glyph::Outline& outline = a_glyph.m_outline;
  GlyphRasterizationCache* cache = dynamic_cast2<GlyphRasterizationCache*>(a_glyph.m_userData);

  if (!cache) // TODO: Assumes no one else is using m_userData with another type
  {
    // TODO: this is a cache per glyph - might be hard to have a global cache size limit / policy
    cache = new GlyphRasterizationCache;
    a_glyph.m_userData = cache;
  }

  // For aa, the size we scale down from, otherwise the original requested size
  int nonAASize = a_size;
  if (drawAAText && a_size < 64)
    nonAASize = nextPowerOfTwo(a_size) * 3;

  if (g_stage != 100)
  {
    CachedGlyphData data = RasterizeCharacterToBuffer(nonAASize, outline);
    const Rectangle& rect = data.rect;
    const PixelBuffer& buf = data.buf;
    const int yOff = a_size - rect.m_height + 1;

    // DrawPixelsText(a_target, a_color, buf.m_pixels, a_x + ((rect.m_x*a_size + bias)>>10), 
    DrawPixelsAlpha(a_target, buf.m_pixels, a_x + ((rect.m_x*a_size + bias)>>10),
        a_y + yOff - ((rect.m_y*a_size + bias)>>10), buf.m_width, buf.m_height, 0, 0, buf.m_width, buf.m_height);

    // This bitmap is never put in the cache, so it has to be freed here or every debug
    // draw leaks one buffer.
    // NOTE(review): assumes DrawPixelsAlpha has finished with the pixels when it returns.
    ReleaseCachedGlyphData(data);
    return;
  }

  // Make sure the glyph has been rasterized at the source size
  if (!cache->m_sizeMap.count(nonAASize))
    cache->m_sizeMap[nonAASize] = RasterizeCharacterToBuffer(nonAASize, outline);

  if (drawAAText && !cache->m_aaSizeMap.count(a_size))
  {
    const CachedGlyphData& data = cache->m_sizeMap[nonAASize];
    const Rectangle& rect = data.rect;
    CachedGlyphData ret = data;   // keeps rect.m_x / m_y; the buffer is replaced below
    if (a_size < 64) {
      // Shrink the oversized rasterization down to the requested size
      float aaScale = a_size / float(nonAASize);
      int w = rect.m_width * aaScale;
      int h = rect.m_height * aaScale;
      ret.buf = { new uint32_t[w*h], w*4, w, h, PF_ARGB8888, false };
      ret.rect.m_width = w;
      ret.rect.m_height = h;
      SmoothDownSample(ret.buf.m_pixels, w, h, data.buf.m_pixels, rect.m_width, rect.m_height);
    } else {
      // Same dimensions: any pixel with a non-zero alpha byte becomes opaque black, the rest transparent
      int w = rect.m_width;
      int h = rect.m_height;
      ret.buf = { new uint32_t[w*h], w*4, w, h, PF_ARGB8888, false };
      for (int i = 0; i < w*h; i++)
      {
        uint32_t col = data.buf.m_pixels[i];
        ret.buf.m_pixels[i] = (col & 0xFF000000) ? 0xFF000000 : 0x00000000;
      }
      // An earlier experiment softened the edges here by blending the glyph outline
      // (DrawGlyphOutlineBlend, colour 0x20000000) into ret.buf at the centre position and
      // its eight one-pixel neighbours; it is currently disabled.
    }
    cache->m_aaSizeMap[a_size] = ret;
  }

  // Without AA, nonAASize is just a_size
  const CachedGlyphData& data = drawAAText ? cache->m_aaSizeMap[a_size] : cache->m_sizeMap[nonAASize];
  const Rectangle& rect = data.rect;
  const PixelBuffer& buf = data.buf;
  const int yOff = a_size - rect.m_height + 1;

  DrawPixelsText(a_target, a_color, buf.m_pixels, a_x + ((rect.m_x*a_size + bias)>>10),
      a_y + yOff - ((rect.m_y*a_size + bias)>>10), buf.m_width, buf.m_height, 0, 0, buf.m_width, buf.m_height);
  // The bitmap stays owned by the cache, so it is not released here
}


void DrawGlyphOutline(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_x, int& a_y)
{
  // iterate the bezier curves
  for (size_t i = 0; i < a_glyph.m_outline.m_lines.size(); i++)
  {
    const Glyph::Outline::Curve& b = a_glyph.m_outline.m_lines[i];
    vec2i pnts[3];
    for (int c = 0; c < 3; c++)
      pnts[c] = (vec2i){ a_x + int((b.m_controlPoints[c].m_x * a_size) >> 10),
        a_y + a_size + 3 - int((b.m_controlPoints[c].m_y * a_size) >> 10) };    // TODO: this +3 is a bit magic to make it line up with filled drawing
    DrawCurve(a_target, a_color, pnts[0], pnts[1], pnts[2], false);
  }
}


void DrawGlyphOutlineBlend(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_x, int& a_y)
{
  // iterate the bezier curves
  for (size_t i = 0; i < a_glyph.m_outline.m_lines.size(); i++)
  {
    const Glyph::Outline::Curve& b = a_glyph.m_outline.m_lines[i];
    vec2i pnts[3];
    for (int c = 0; c < 3; c++)
      pnts[c] = (vec2i){ a_x + int((b.m_controlPoints[c].m_x * a_size) >> 10),
        a_y + a_size - int((b.m_controlPoints[c].m_y * a_size) >> 10) };
    DrawCurve(a_target, a_color, pnts[0], pnts[1], pnts[2], true);
  }
}


// ProcessTextHandler that draws a_glyph filled, via RasterizeCharacter2, one pixel below
// the given y position. In DEBUG_FONT_RENDERING builds with drawWholeOutline set, the
// glyph's outline is additionally drawn over it in 0xFF000000.
// Neither a_x nor a_y is modified.
void DrawGlyphFilled(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_x, int& a_y)
{
  // TODO: Add rasterized glyph caching?? - Perhaps try the SDF approach with multiple distance fields
  //         - Probably would want to do that with a few glyphs at a time
  //         - Then see about using it to draw text in varying styles (bold, normal, outlined,
  //           shadowed etc) in shaders
  //         - The text can skew, rotate, scale and move by manipulating the vertices. That needs
  //           tying in with kerning somehow.
  //       The next step after that would be to see if the same works with asian fonts/characters,
  //       and then with any font/language

  RasterizeCharacter2(a_target, a_color, a_size, a_x, a_y + 1, a_glyph);

#if DEBUG_FONT_RENDERING
  if (!drawWholeOutline)
    return;
  DrawGlyphOutline(a_glyph, a_target, 0xFF000000, a_size, a_x, a_y);
#endif
}


void GetGlyphExtents(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_width, int& a_height)
{
  Rectangle rect = glyphBounds(a_glyph.m_outline);
  if (rect.m_height > a_height)
    a_height = rect.m_height;
}


// Callback that ProcessText invokes once for each character of the string, with that
// character's glyph. a_x / a_y are references to ProcessText's running position values.
typedef void (*ProcessTextHandler)(const Glyph& a_glyph, PixelBuffer* a_target, uint32_t a_color, int a_size, int& a_x, int& a_y);


//FontManager g_fontManager;

// Returns the ascent, descent, x-height, cap-height and line gap of the font named by
// fontOpt, copied out of the font's own metrics structure.
FontMetrics GetFontMetrics()
{
  TrueTypeFont& selectedFont = getFont(fontOpt.c_str());
  Metrics& source = selectedFont.getMetrics();

  FontMetrics result;
  result.m_ascent    = source.m_ascent;
  result.m_descent   = source.m_descent;
  result.m_xHeight   = source.m_xHeight;
  result.m_capHeight = source.m_capHeight;
  result.m_lineGap   = source.m_lineGap;
  return result;
}


// Returns the metrics of the glyph for a_unicodeCharacter in the font named by fontOpt:
// advance width, left and right side bearings, and the glyph's min/max coordinates.
GlyphMetrics GetGlyphMetrics(uint32_t a_unicodeCharacter)
{
  TrueTypeFont& selectedFont = getFont(fontOpt.c_str());
  Glyph& source = selectedFont.getGlyph(a_unicodeCharacter);

  GlyphMetrics result;
  result.m_advanceWidth = source.m_metrics.m_advanceWidth;
  result.m_leftSideBearing = source.m_metrics.m_leftSideBearing;
  result.m_rightSideBearing = source.m_metrics.rightSideBearing();
  result.m_minX = source.m_metrics.m_min.m_x;
  result.m_minY = source.m_metrics.m_min.m_y;
  result.m_maxX = source.m_metrics.m_max.m_x;
  result.m_maxY = source.m_metrics.m_max.m_y;
  return result;
}


// Walks a_utf8String one byte at a time, calling a_func with the glyph for each character
// and then advancing a_x by that glyph's advance width (plus the kerning adjustment
// against the following character when g_enableKerning is set), scaled by a_size / 1024,
// plus one extra pixel of spacing. a_y is only changed if a_func changes it.
//
// Does nothing if a_target, a_fontFamily or a_utf8String is null or a_size is not positive.
//
// Known limitations:
//   - a_fontFamily is only null-checked; the font actually used is the one named by fontOpt.
//   - The string is not UTF-8 decoded: every byte is treated as one character code in the
//     range 0..255.
//   - Line breaks are not handled.
void ProcessText(ProcessTextHandler a_func, PixelBuffer* a_target, uint32_t a_color, const char* a_fontFamily, int a_size, int& a_x, int& a_y, const char* a_utf8String)
{
  if (!a_target || !a_fontFamily || !a_utf8String || a_size <= 0)
  {
    return;
  }

  // TODO: Doesn't use the given font, uses a fixed font name
  TrueTypeFont& font = getFont(fontOpt.c_str());
  // Font& font = getFont(a_fontFamily);

  // TODO: Fix me
//  getFontName(fontOpt.c_str());

  // TODO: Loop over the string and break on line-breaks and call some internal function which does the body of this function
  while (*a_utf8String)
  {
    // TODO: need to decode the UTF8 in to Unicode characters - this is assuming everything is latin1
    // Read the byte as unsigned: where plain char is signed, bytes of 0x80 and above would
    // otherwise sign-extend into character codes of 0xFFFFFF80 and up.
    const uint32_t unicodeCharacter = static_cast<unsigned char>(*a_utf8String);
    Glyph& glyph = font.getGlyph(unicodeCharacter);

    a_func(glyph, a_target, a_color, a_size, a_x, a_y);

    a_utf8String++;
    // Character to the right, for kerning; this is 0 once the end of the string is reached
    const uint32_t rightUnicodeCharacter = static_cast<unsigned char>(*a_utf8String);
    int16_t kerningAdjustment = (g_enableKerning ? 1 : 0) * font.getKerningAdjustment(unicodeCharacter, rightUnicodeCharacter);
    const float spacing = 1.0;
    a_x += (((glyph.m_metrics.m_advanceWidth + kerningAdjustment) * a_size) >> 10) + (int)(spacing + 0.5);
  }
}


// TODO: refactor with DrawText so that it aligns with same font, same character spacing etc
// Draws a_utf8String as unfilled glyph outlines starting at (a_x, a_y), by running
// ProcessText with DrawGlyphOutline as the per-glyph handler.
void DrawOutlineText(PixelBuffer* a_target, uint32_t a_color, const char* a_fontFamily, int a_size, int a_x, int a_y, const char* a_utf8String)
{
  const ProcessTextHandler outlineHandler = DrawGlyphOutline;
  ProcessText(outlineHandler, a_target, a_color, a_fontFamily, a_size, a_x, a_y, a_utf8String);
}


// Measures a_utf8String at a_size without drawing anything.
//
// ProcessText is run with GetGlyphExtents as the handler, so a_width accumulates the
// horizontal advance and a_height the tallest glyph bounding box (unscaled). The height is
// then scaled by a_size / 1024 with rounding, and fixed margins are added: 6 to the height
// and 10 to the width.
// If ProcessText rejects its arguments (it requires a non-null a_target even though
// nothing is drawn), the result is just those margins.
void GetTextExtents(PixelBuffer* a_target, const char* a_fontFamily, int a_size, const char* a_utf8String, int& a_width, int& a_height)
{
  const int bias = 512;   // rounds the scaled height to nearest

  a_width = 0;
  a_height = 0;
  ProcessText(GetGlyphExtents, a_target, 0xFF000000, a_fontFamily, a_size, a_width, a_height, a_utf8String);

  // TODO: more magic: the +6 and +10 margins
  a_height = ((a_height * a_size + bias) >> 10) + 6;
  a_width += 10;
  //a_height = a_size;
}


// Draws a_utf8String at (a_x, a_y), offset by (+4, +7), in a mode chosen by a_flags:
//   0          filled glyphs, normal rendering (g_stage = 100)
//   1          glyph outlines only
//   2          glyph outlines drawn with blending
//   otherwise  filled glyphs with g_stage set to a_flags - 3 (rasterization debug stages)
// g_stage is a global; it is set for the duration of the call and put back to 100 afterwards.
void DrawText(PixelBuffer* a_target, uint32_t a_color, const char* a_fontFamily, int a_size, int a_x, int a_y, const char* a_utf8String, int a_flags)
{
  ProcessTextHandler handler = DrawGlyphFilled;

  if (a_flags == 0)
  {
    g_stage = 100;
  }
  else
  {
    g_stage = a_flags - 3;
    if (a_flags == 1)
      handler = DrawGlyphOutline;
    else if (a_flags == 2)
      handler = DrawGlyphOutlineBlend;
  }

  // TODO: some dodgy offsets
  int penX = a_x + 4;
  int penY = a_y + 7;
  ProcessText(handler, a_target, a_color, a_fontFamily, a_size, penX, penY, a_utf8String);

  g_stage = 100;
}


/*

struct vec2f
{
  float x, y;
};

vec2f operator*(const vec2f& v, float c)
{
  return (vec2f){ v.x*c, v.y*c };
}

vec2f operator*(float c, const vec2f& v)
{
  return (vec2f){ v.x*c, v.y*c };
}

vec2f operator+(const vec2f& v1, const vec2f& v2)
{
  return (vec2f){ v1.x+v2.x, v1.y+v2.y };
}

// Generalized spline function
// a_tension       -1.0 -> 1.0     Round -> Tight
// a_bias          -1.0 -> 1.0     Pre shoot -> Post shoot
// a_continuity    -1.0 -> 1.0     Box corners -> Inverted corners
vec2f kochanekBartelsSpline(const std::vector<vec2f>& a_points, float t, float a_tension, float a_bias, float a_continuity)
{
  int pnts = a_points.size();
  if (!pnts)
    return (vec2f){ 0.0f, 0.0f };
  t = std::max(0.0f, std::min(1.0f, t)); // clamp t to be between 0->1
  int intervals = pnts - 1; // the intervals are the segments between each point 
  int n = std::max(0, std::min(pnts-1, int(intervals * t))); // figure out which interval t is between
  t = t * intervals - n; // the new t value is the ratio between the points at n and n+1
  float tmp1 = 0.5f * (1.0f - a_tension);
  float A = tmp1 * (1.0f + a_continuity) * (1.0f + a_bias); // A,B,C,D are the parameter factors
  float B = tmp1 * (1.0f - a_continuity) * (1.0f - a_bias);
  float C = tmp1 * (1.0f - a_continuity) * (1.0f + a_bias);
  float D = tmp1 * (1.0f + a_continuity) * (1.0f - a_bias);
  const vec2f& pnt0 = a_points[std::max(n - 1, 0)];
  const vec2f& pnt1 = a_points[n];
  const vec2f& pnt2 = a_points[std::min(pnts - 1, n + 1)];
  const vec2f& pnt3 = a_points[std::min(pnts - 1, n + 2)];
  vec2f vA =  -A*pnt0 +      (2+A-B-C)*pnt1 +  (-2+B+C-D)*pnt2 +  D*pnt3; // vA,vB,vC - polynomial factors
  vec2f vB = 2*A*pnt0 + (-3-2*A+2*B+C)*pnt1 + (3-2*B-C+D)*pnt2 + -D*pnt3;
  vec2f vC =  -A*pnt0 +          (A-B)*pnt1 +           B*pnt2;
  return ((vA*t + vB)*t + vC)*t + pnt1; // blends by the factors between the 4 closest control points
}

vec2f catmulRomSpline(const std::vector<vec2f>& a_points, float t)
{
  return kochanekBartelsSpline(a_points, t, 0.0f, 0.0f, 0.0f);
}

vec2f cubicSpline(const std::vector<vec2f>& a_points, float t)
{
  return kochanekBartelsSpline(a_points, t, 1.0f, 0.0f, 0.0f);
}

vec2f lineSpline(const std::vector<vec2f>& a_points, float t)
{
  return kochanekBartelsSpline(a_points, t, 0.0f, 0.0f, -1.0f);
}

*/

END_NAMESPACE

