Newer
Older
Import / research / ui / toolkit / unicode_reference / BidiReferenceC / 6.3.0 / source / brtable.c
/*
**      Unicode Bidirectional Algorithm
**      Reference Implementation
**      (c) Copyright 2013
**      Unicode, Inc. All rights reserved.
**      Unpublished rights reserved under U.S. copyright laws.
*/

/*
 * brtable.c
 *
 * Module to create and access tables of Bidi_Class and any
 * other Unicode character properties required for correct
 * implementation of the UBA.
 *
 * Values needed to initialize the tables are read
 * from UnicodeData.txt and other UCD data files of explicit
 * versions, as needed.
 *
 * Exports:
 *  br_InitTable
 *  br_Init
 *  br_GetInitDone
 *  br_GetBC
 *  br_GetBPT
 *  br_GetBPB
 */

/*
 * Change history:
 *
 * 2013-Jun-02 kenw   Created.
 * 2013-Jun-20 kenw   Added use of br_ErrPrint.
 * 2013-Jun-26 kenw   Added Trace15 to br_Init.
 * 2013-Jun-27 kenw   Correct incorrect error printing in
 *                       br_parsePropertyData.
 */

#include <string.h>
#include <ctype.h>
#include <stdio.h>

#include "bidirefp.h"

static int linesProcessed;

static int initDone = 0;

#define UNICODE_DATA ( 0 )
#define BRACKET_DATA ( 1 )

/*
 * Arrays to store basic data for each character.
 */
static BIDIDATA planes01[0x20000];  /* 2048 K  Inefficient, but gets the job done. */

static BIDIDATA plane2[0x800];   /* Only needed for compatibility CJK range. */

static BIDIDATA plane14[0x200];  /* Just for tag and variation selector characters. */

static BIDIDATA handata;         /* Static value shared for all unified CJK. */

/***********************************************************/

/*
 * SECTION: Utility routines.
 */

int br_GetInitDone ( void )
{
    return ( initDone );
}

#define DELIM ';'

/*
 * skipField
 *
 * Skip over one semi-colon delimited field.
 */

static char *skipField ( char *src )
{
char *sp;

    sp = src;
    while ( (*sp != DELIM) && (*sp != '\0') && (*sp != '\n') )
    {
        sp++;
    }
    if ( ( *sp == DELIM ) || ( *sp == '\n') )
    {
        sp++;
    }
    return sp;
}

/*
 * skipSpace
 *
 * Skip over spaces.
 */

static char *skipSpace ( char *src )
{
char *sp;

    sp = src;
    while ( (*sp == ' ') && (*sp != '\0') && (*sp != '\n') )
    {
        sp++;
    }
  return sp;
}

/*
 * br_PrintErr
 *
 * Print out a diagnostic error for a parsing evaluationa failure.
 */

static void br_PrintErr ( int n, char *s )
{
    if ( Trace ( Trace15 ) )
    {
        if ( n == -1 )
        {
            printf ( "Error: Uneven hex digits in code point [%s].\n", s );
        }
        else
        {
            sprintf ( "Error: Invalid hex digit in code point [%s].\n", s );
        }
    }
}


/***********************************************************/

/*
 * SECTION: Array access utility routines.
 *
 * Abstract the access to the BidiData array by code point.
 */

static BDPTR getBidiDataPtr ( U_Int_32 i )
{
    if ( i <= 0x33FF )
    {
        return ( &(planes01[i] ) );
    }
    else if ( i <= CJK_EXTA_LAST )
    {
        return ( &handata );
    }
    else if ( i <= 0x4DFF )
    {
        return ( &(planes01[i] ) );
    }
    else if ( i <= CJK_URO_LAST )
    {
        return ( &handata );
    }
    else if ( i <= 0x1FFFF )
    {
        return ( &(planes01[i] ) );
    }
    else if ( i <= CJK_EXTB_LAST )
    {
        return ( &handata );
    }
    else if ( i <= CJK_EXTC_LAST )
    {
        return ( &handata );
    }
    else if ( i <= CJK_EXTD_LAST )
    {
        return ( &handata );
    }
    else if ( ( i >= 0x2F800 ) && ( i <= 0x2FFFF ) )
    {
        return ( &(plane2[i - 0x2F800] ) );
    }
    else if ( ( i >= 0xE0000 ) && ( i <= 0xE01FF ) )
    {
        return ( &(plane14[i - 0xE0000]) );
    }
    else
    {
        return ((BDPTR)NULL);
    }
}

/*
 * Exported functions to get property values from the
 * property tables for particular characters.
 */

BIDIPROP br_GetBC ( U_Int_32 c )
{
BDPTR dp;
    
    dp = getBidiDataPtr ( c );
    return ( dp != NULL ) ? dp->bidivalue : BIDI_Unknown ;
}

BPTPROP br_GetBPT ( U_Int_32 c )
{
BDPTR dp;
    
    dp = getBidiDataPtr ( c );
    return ( dp != NULL ) ? dp->bpt : BIDI_None ;
}

U_Int_32 br_GetBPB ( U_Int_32 c )
{
BDPTR dp;
    
    dp = getBidiDataPtr ( c );
    return ( dp != NULL ) ? dp->bpb : BPB_None ;
}

/***********************************************************/

/*
 * SECTION: Initialize the BidiData arrays.
 *
 * Special case the Han and Hangul ranges, which are not
 * explicitly listed in the UnicodeData files.
 */

static void initBidiData ( void )
{
int i;
BDPTR tmp;

/*
 * First initialize the static record for handata.
 */
    handata.han = 1;
    handata.bidivalue = BIDI_L;
    handata.bpb = BPB_None;
    handata.bpt = BPT_None;

/*
 * Then scan through the arrays and initialize
 * all values. Skip setting values if tmp turns up
 * pointing to the handata record.
 *
 * For this initial setting, default to GC_Cn
 * (unassigned) and to Bidi_Class L.
 */

    for ( i = 0; i <= 0x1FFFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        if ( tmp == &handata )
            continue;
        tmp->han = 0;
        tmp->bidivalue = BIDI_L;
        tmp->bpb = BPB_None;
        tmp->bpt = BPT_None;
    }
    for ( i = 0x2F800; i <= 0x2FFFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->han = 0;
        tmp->bidivalue = BIDI_L;
        tmp->bpb = BPB_None;
        tmp->bpt = BPT_None;
    }
    for ( i = 0xE0000; i <= 0xE01FF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->han = 0;
        tmp->bidivalue = BIDI_L;
        tmp->bpb = BPB_None;
        tmp->bpt = BPT_None;
    }
/*
 * Special case the ranges for default values
 * for the Bidi_Class which are not L. This covers
 * the various right-to-left block ranges.
 * See DerivedBidiClass.txt for details.
 */
    for ( i = 0x0600; i <= 0x07BF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0x08A0; i <= 0x08FF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0xFB50; i <= 0xFDCF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0xFDF0; i <= 0xFDFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0xFE70; i <= 0xFEFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0x1EE00; i <= 0x1EEFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_AL;
    }
    for ( i = 0x0590; i <= 0x05FF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
    for ( i = 0x07C0; i <= 0x089F; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
    for ( i = 0xFB1D; i <= 0xFB4F; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
    for ( i = 0x10800; i <= 0x10FFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
    for ( i = 0x1E800; i <= 0x1EDFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
    for ( i = 0x1EF00; i <= 0x1EFFF; i++ )
    {
        tmp = getBidiDataPtr ( i );
        tmp->bidivalue = BIDI_R;
    }
}

/***********************************************************/

/*
 * processUnicodeData
 *
 * Process a line from UnicodeData.txt
 */

static int processUnicodeData ( char *buf )
{
char *sp;
char localbuf[256];
char errString[80];
int nn;
int nn2;
U_Int_32 n;
BDPTR tmp;

    sp = buf;

    sp = copyField ( localbuf, sp );

    nn = convertHexToInt ( &n, localbuf );
    if ( nn != 0 )
    {
        br_PrintErr ( nn, localbuf );
        return ( -1 );
    }

    /* Field (field 1): Unicode name */

    sp = copyField ( localbuf, sp );

    /* 
     * Special case range values in UnicodeData.txt, which are
     * marked by first and last point labels in angle brackets.
     * Just skip these entries without processing.
     * The General_Category and Bidi_Class values are set
     * instead by default initialization of the table ranges.
     */

    if ( localbuf[0] == '<' )
    {
    	if ( ( strstr ( localbuf, "First" ) != NULL )  ||
    		 ( strstr ( localbuf, "Last" ) != NULL ) )
    	{
    		return ( 0 );
    	}
    }

    /* Omit 1 field (field 2): Unicode category */

    sp = skipField ( sp );

    /* Omit 1 field (field 3): Combining class */

    sp = skipField ( sp );

    /* Span and copy 1 field (field 4): Bidi category */

    sp = copyField ( localbuf, sp );
    nn2 = br_GetBCFromLabel ( localbuf );
    if ( nn2 == BIDI_Unknown )
    {
        sprintf ( errString, "U+%04X Bidi_Class value %s unknown!\n", n, localbuf );
        br_ErrPrint ( errString );
        return ( -1 );
    }

    /* At this point we have a valid BC value. Stuff it in the table. */

    tmp = getBidiDataPtr ( n );
    tmp->bidivalue = (BIDIPROP)nn2;

    /* Omit 7 fields (fields 5 - 11) */

    /* Diagnostic output */
#ifdef NOTDEF
	if ( ( n >= 0x200B ) && ( n <= 0x202F ) )
	{
		printf ( "n=%04X, gc=%d, bc=%d\n", n, nn, nn2);
	}
#endif
    return ( 1 );
}

/***********************************************************/

/*
 * processBracketData
 *
 * Process a line from BidiBrackets.txt
 *
 * Lines are of the format:
 *
 * 0028; 0029; o # LEFT PARENTHESIS
 * 0029; 0028; c # RIGHT PARENTHESIS
 *
 * field 0 is a Unicode code point.
 * field 1 is the bpb property: a Unicode code point or "<none>"
 * field 2 is the bpt property: "o" or "c" or "n"
 *
 * For all of the lines actually defined in the file, bpb != none.
 *
 * Note that unlike UnicodeData.txt, BidiBrackets.txt includes
 * spaces after the semicolons. These need to be skipped over
 * before parsing the subsequent fields.
 */

static int processBracketData ( char *buf )
{
char *sp;
char localbuf[256];
int nn;
U_Int_32 n;
U_Int_32 n2;
BPTPROP bpt;
BDPTR tmp;

    sp = buf;

    sp = copyField ( localbuf, sp );

    nn = convertHexToInt ( &n, localbuf );
    if ( nn != 0 )
    {
        br_PrintErr ( nn, localbuf );
        return ( -1 );
    }

    sp = skipSpace ( sp );

    /* Field (field 1): bpb value, another code point */

    sp = copyField ( localbuf, sp );

    nn = convertHexToInt ( &n2, localbuf );
    if ( nn != 0 )
    {
        br_PrintErr ( nn, localbuf );
        return ( -1 );
    }

    sp = skipSpace ( sp );

    /* Field (field 1): bpt value, a symbolic value, "o" or "c" */

    /* 
     * This parse uses copySubField atm, because this field in
     * BidiBrackets.txt is not terminated by a semicolon, but
     * rather is followed by " # " and the character name as
     * a comment.
     */

    sp = copySubField ( localbuf, sp );

    if ( strcmp ( localbuf, "o" ) == 0 )
    {
        bpt = BPT_O;
    }
    else if ( strcmp ( localbuf, "c" ) == 0 )
    {
        bpt = BPT_C;
    }
    else
    {
        bpt = BPT_None;
    }

    /* At this point we have valid bpb and bpt values. Stuff them in the table. */

    tmp = getBidiDataPtr ( n );
    tmp->bpb = n2;
    tmp->bpt = bpt;

    return ( 1 );
}

/***********************************************************/

/*
 * SECTION: parsePropertyData
 *
 * Parse the UnicodeData.txt file (or any other file using
 * the same generic format), branching by status value to specific
 * parse routines for each line.
 */

static int br_parsePropertyData ( int status, int version )
{
FILE *fdi;
int rc;
int i;
char *fileName;
char buffer[512];

    if ( Trace ( Trace10 ) )
    {
        printf ( "Trace: Entering br_parsePropertyData\n" );
    }

    linesProcessed = 0;

    if ( status == UNICODE_DATA )
    {
        if ( version == UBA63 )
        {
            fileName = "UnicodeData-6.3.0.txt";
        }
        else
        {
            fileName = "UnicodeData-6.2.0.txt";
        }
    }
    else if ( status == BRACKET_DATA )
    {
        fileName = "BidiBrackets-6.3.0.txt";
    }
    else
    {
        fileName = "Not Defined";
    }

    fdi = fopen ( fileName, "rt" );

    if ( fdi == NULL )
    {
        if ( Trace ( Trace15 ) )
        {
            printf ( "Cannot open property data file: \"%s\"\n", fileName );
        }
        return -1 ;
    }
    /* Do the work */

    while ( fgets ( buffer, 512, fdi ) != NULL )
    {
    int slen;
    int lineIsBlank;

    /* Don't process empty lines or comments. */
        slen = strlen ( buffer );
        if ( ( slen == 0 ) || ( buffer[0] == '#' ) || ( buffer[0] == ';' ) )
            continue ;
    /* Also check for non-zero length lines with just whitespace */
        lineIsBlank = 1;
        i = 0 ;
        while ( lineIsBlank && ( i < slen ) )
        {
            if ( !isspace (buffer[i]) )
            {
                lineIsBlank = 0;
            }
            i++;
        }
        if ( lineIsBlank )
            continue;

/*      fputs ( buffer, stdout ); */

        switch ( status )
        {
            case UNICODE_DATA :
                rc = processUnicodeData ( buffer );
                linesProcessed++;
                break;
            case BRACKET_DATA :
                rc = processBracketData ( buffer );
                linesProcessed++;
                break;
            default :
                rc = -1;
        }

        if ( rc == -1 )
            break;
    }
    fclose ( fdi );

    if ( rc < 0 )
    {
        return ( rc );
    }
    else if ( Trace ( Trace10 ) )
    {
    	printf ( "Debug: Processed %d lines from %s\n", linesProcessed, fileName );
    }

    return ( 0 );
}

/***********************************************************/

/*
 * SECTION: Initialize the property tables.
 */

/*
 * br_InitTable
 *
 * Initialize the arrays, then read in values from
 * UnicodeData.txt.
 *
 * If the input is set to FORMAT_B, then no parsed
 * property data is required. Exit without reading
 * in property data files.
 */

int br_InitTable ( int version )
{
int rc;

    /* Bombproof against repeat invocations */

    if ( initDone )
    {
        return ( 1 );
    }

    if ( Trace ( Trace10 ) )
    {
        printf ( "Trace: Entering br_InitTable\n" );
    }

    if ( GetFileFormat() == FORMAT_B )
    {
        initDone = 1;
        return ( 1 );
    }

    /*
     * Initialize the property arrays with
     * default values.
     */

	initBidiData();

	rc = br_parsePropertyData ( UNICODE_DATA, version );
    if ( rc < 0 )
    {
        if ( Trace ( Trace15 ) )
        {
            printf ( "Error: Failure in parsing UnicodeData.txt: %d.\n", rc );
        }
        return ( rc );
    }

    if ( version == UBA63 )
    {
        rc = br_parsePropertyData ( BRACKET_DATA, version );
        if ( rc < 0 )
        {
            if ( Trace ( Trace15 ) )
            {
                printf ( "Error: Failure in parsing BidiBrackets.txt: %d.\n", rc );
            }
            return ( rc );
        }
    }

    initDone = 1;
	return ( 1 );
}

/*
 * br_Init
 *
 * Public API. This forces the file format to FORMAT_A and
 * then initializes the tables.
 *
 * Set Trace0 and Trace15 on as the default for output display. These
 * may be turned off for the br_QueryTestResults API, or the
 * client may turn on other trace flags for br_ProcessTestResults.
 *
 * Used for an external application making use of the public APIs
 * to run and/or query individual test cases.
 */

int br_Init ( int version )
{
int rc;

    /* Bombproof against repeat invocations */

    if ( initDone )
    {
        return ( 1 );
    }

    SetFileFormat ( FORMAT_A );

    SetUBAVersion ( version );

    TraceOn ( Trace0 );
    TraceOn ( Trace15 );

    rc = br_InitTable ( version );

    return ( rc );
}