/*
** Unicode Bidirectional Algorithm
** Reference Implementation
** Copyright 2015, Unicode, Inc. All rights reserved.
** Unpublished rights reserved under U.S. copyright laws.
*/
/*
* bidirefp.h
*
* Private types, constants, and declarations used by bidiref.
*/
/*
* Change history:
*
* 2013-Jun-02 kenw Created.
* 2013-Jun-05 kenw Added diretyBit to UBACONTEXT.
* 2013-Jun-19 kenw Add declaration for br_ErrPrint.
* 2013-Jul-08 kenw Move BR_MAXINPUTLEN to bidiref.h.
* 2014-Apr-17 kenw Add accelerator flags to BIDIUNIT.
* 2015-Jun-05 kenw Set MAXPAIRINGDEPTH to 63 for UBA 8.0.
* 2015-Aug-19 kenw Add CJK Extension E range.
* 2015-Dec-04 kenw Bumped up bracket stack size by one.
*/
#ifndef __BIDIREFP_LOADED
#define __BIDIREFP_LOADED
#ifdef __cplusplus
extern "C" {
#endif
#include "bidiref.h"
/*
* Constants defining Unihan ranges.
*/
#define CJK_URO_FIRST (0x4E00)
#define CJK_URO_LAST (0x9FCC)
#define CJK_EXTA_FIRST (0x3400)
#define CJK_EXTA_LAST (0x4DB5)
#define CJK_EXTB_FIRST (0x20000)
#define CJK_EXTB_LAST (0x2A6D6)
#define CJK_EXTC_FIRST (0x2A700)
#define CJK_EXTC_LAST (0x2B734)
#define CJK_EXTD_FIRST (0x2B740)
#define CJK_EXTD_LAST (0x2B81D)
#define CJK_EXTE_FIRST (0x2B820)
#define CJK_EXTE_LAST (0x2CEA1)
/*
* Enumerated types for relevant Unicode properties,
* to make reference to them easier to understand in the algorithm.
*/
typedef enum
{
BIDI_None,
BIDI_Unknown,
BIDI_L, /* 0x00 strong: left-to-right (bc=L) */
BIDI_ON, /* 0x01 neutral: Other Neutral (bc=ON) */
BIDI_R, /* 0x02 strong: right-to-left (bc=R) */
BIDI_EN, /* 0x03 weak: European Number (bc=EN) */
BIDI_ES, /* 0x04 weak: European Number Separator (bc=ES) */
BIDI_ET, /* 0x05 weak: European Number Terminator (bc=ET) */
BIDI_AN, /* 0x06 weak: Arabic Number (bc=AN) */
BIDI_CS, /* 0x07 weak: Common Number Separator (bc=CS) */
BIDI_B, /* 0x08 neutral: Paragraph Separator (bc=B) */
BIDI_S, /* 0x09 neutral: Segment Separator (bc=S) */
BIDI_WS, /* 0x0A neutral: White Space (bc=WS) */
BIDI_AL, /* 0x0B strong: right-to-left Arabic (bc=AL) */
BIDI_NSM, /* 0x0C weak: non-spacing mark (bc=NSM) */
BIDI_BN, /* 0x0D weak: boundary neutral (bc=BN) */
BIDI_PDF, /* 0x0E format: pop directional formatting (bc=PDF) */
BIDI_LRE, /* format: left-to-right embedding */
BIDI_LRO, /* format: left-to-right override */
BIDI_RLE, /* format: right-to-left embedding */
BIDI_RLO, /* format: right-to-left override */
BIDI_LRI, /* format: left-to-right isolate */
BIDI_RLI, /* format: right-to-left isolate */
BIDI_FSI, /* format: first strong isolate */
BIDI_PDI, /* format: pop directional isolate */
BIDI_MAX
} BIDIPROP;
typedef enum
{
BPT_O, BPT_C, BPT_None
} BPTPROP;
/*
* File format defines.
*
* These defines can be used to distinguish between formats
* of data to be read in. They are used to branch the
* parsing functions in brinput.c.
*/
#define FORMAT_A (0) /* Format used in BidiCharacterTest.txt */
#define FORMAT_B (1) /* Format used in BidiTest.txt */
/*
* ALGORITHM_STATE is used to store how far along in
* algorithm the data has been processed. This can be
* used to help the debug display to show relevant
* information.
*/
typedef enum
{
State_Unitialized, /* context allocated, but no data */
State_Initialized, /* test case data read in and parsed */
State_P3Done, /* rules done through P3 */
State_X8Done, /* rules done through X8 */
State_X9Done, /* rules done through X9 */
State_RunsDone, /* rules done through X10 part 1: runs are identified */
State_X10Done, /* rules done through X10: runs & seqs are identified */
State_W1Done, /* rules done through W1: combining marks are resolved */
State_W2Done, /* rules done through W2: */
State_W3Done, /* rules done through W3: */
State_W4Done, /* rules done through W4: */
State_W5Done, /* rules done through W5: */
State_W6Done, /* rules done through W6: */
State_W7Done, /* rules done through W7: weak types are resolved */
State_N0Done, /* rules done through N0: bracket pairs are resolved */
State_N1Done, /* rules done through N1: */
State_N2Done, /* rules done through N2: neutral types are resolved */
State_I2Done, /* rules done through I2: implicit levels are resolved */
State_L1Done, /* rules done through L1: trailing whitespace resolved */
State_L2Done, /* rules done through L2: reordering data available */
State_Complete /* finished application of rules: ready for checking */
} ALGORITHM_STATE;
/*
* BIDIPROPDATA
*
* This struct is used to store property information
* about characters and code points.
*
* Only limited property data is stored -- just the data
* needed for running the UBA algorithm.
*/
typedef struct {
int han;
BIDIPROP bidivalue;
U_Int_32 bpb;
BPTPROP bpt;
} BIDIDATA;
typedef BIDIDATA *BDPTR;
#define BPB_None (0xFFFFFFFF)
typedef enum
{
Dir_LTR, Dir_RTL, Dir_Auto, Dir_Unknown
} Paragraph_Direction;
typedef enum
{
Override_Neutral, Override_LTR, Override_RTL
} D_Override_Status;
/*
* A single status stack definition is shared by both the
* UBA62 and UBA63 reference implementations. The
* UBA62 implementation makes no use of the isolate_status
* field.
*
* The status stack is used by rules X1-X8.
*/
typedef struct
{
int embedding_level;
D_Override_Status override_status; /* direction */
int isolate_status; /* boolean */
} STATUSSTACKELEMENT;
typedef STATUSSTACKELEMENT *STACKPTR;
/*
* The maximum_depth for embedding for UBA62.
*/
#define MAX_DEPTH_62 61
/*
* The maximum_depth for embedding for UBA63.
*/
#define MAX_DEPTH_63 125
/*
* Typedefs used in the bracket pairing algorithm in rule N0.
*/
typedef struct {
U_Int_32 bracket;
int pos;
} BRACKETSTACKELEMENT;
typedef BRACKETSTACKELEMENT *BRACKETSTACKPTR;
typedef struct Pairing_Element_Struct {
struct Pairing_Element_Struct *next;
int openingpos;
int closingpos;
} PAIRINGELEMENT;
typedef PAIRINGELEMENT *PAIRINGPTR;
/*
* MAXPAIRINGDEPTH is used to declare the bracket pairing stack.
*
* For UBA 8.0 and subsequent versions, this depth is specified
* to be exactly 63. (See brrule.c, where the stack allocation is
* made as MAXPAIRINGDEPTH +1.)
*
* For UBA 6.3 (and 7.0), these depth was implementation-dependent,
* and not specified by UAX #9. For the 6.3 and 7.0 versions of
* bidiref it was set to half the MAXINPUTLEN, because in principle an
* input string could consist *only* of brackets to pair up: "(({{[]}}))",
* etc., in which case the maximum possible set of pairs would be
* half the text length.
*
* Starting with the 8.0 version of bidiref, however, 63 is used
* also for testing the 6.3 and 7.0 versions of UBA, and for
* consistency, the stack full error handling is also based
* on the 8.0 specification.
*/
#define MAXPAIRINGDEPTH (63)
/*
* Provide a reasonable length for input-handling buffers, based
* on MAXINPUTLEN, which limits how many code points the
* implementation will handle for an input string.
*/
#define BUFFERLEN (5 * BR_MAXINPUTLEN) /* for data chunks parsed from input */
#define RAWBUFFERLEN (2 * BUFFERLEN) /* full unparsed input line */
/*
* BIDIUNIT
*
* This struct is the primitive manipulated by
* the UBA reference implementation. It is used
* to construct a vector in the UBACONTEXT,
* containing the character data (stored as UTF-32),
* the looked-up property data, so that the UBA,
* which works primarily on the basis of the Bidi_Class
* values, has the correct input, along with the
* level and order information.
*
* Starting with Version 2.0 of bidiref, a number of
* accelerator flags are added, to speed up checking of
* common combinations of Bidi_Class values. Each flag
* is conceptually a Boolean, and could be stored as a bit
* in a bitfield, but in this reference implementation
* is simply stored as another int field in the BIDIUNIT.
* These values are all initialized when the BIDIUNIT
* vector is initialized for a testcase string. Some need
* to be reset, whenever a rule changes the current bc
* value for a character.
*/
typedef struct {
U_Int_32 c; /* character value, stored as UTF-32 */
BIDIPROP bc; /* current Bidi_Class value */
BIDIPROP orig_bc; /* store original value for Bidi_Class */
int bc_numeric; /* accelerator for bc = (AN or EN) */
int bc_isoinit; /* accelerator for bc = (LRI or RLI or FSI) */
U_Int_32 bpb; /* bidi paired bracket property */
BPTPROP bpt; /* bidi paired bracket type property */
int level; /* current embedding level */
int expLevel; /* store expected level for test case */
int order; /* store position for reordering */
int order2; /* spare order array used in reversal algorithm */
} BIDIUNIT;
typedef BIDIUNIT *BIDIUNITPTR;
#define NOLEVEL (-1)
/*
* A text chain is simply an array of pointers to BIDIUNITs.
*
* This data structure is introduced to help in the abstraction
* of rule application between UBA62 (and earlier), where
* rules apply to level runs, and UBA63 (and later), where
* rules apply to isolating run sequences. Because
* isolating run sequences may contain indefinite lists
* of discontiguous level runs, it is difficult to
* specify rule application directly to the isolating
* run sequences -- the concept of previous character
* and next character in the sequence gets rather complex
* and makes the implementation difficult.
*
* One possible approach to the isolating run sequences
* would be to simply clone the entire text into
* allocated contiguous BIDIUNIT vectors for each
* isolating run sequence identified. However, that makes
* bookkeeping in the original input structure a bit more
* complex. Instead, whenever an isolating run sequence
* is identified, a contiguous text chain is allocated
* and initialized instead. This keeps all the data
* in the original input structure, and simply embodies
* the traversal order (and prior and next relation)
* needed for rule application.
*
* So that the rule application can be uniform, a
* text chain is also allocated for level runs, as well.
*
* The text chain is hung off the "textChain" field in
* both the BIDIRUN and the ISOLATING_RUN_SEQUENCE
* struct definitions. The "len" field defines the
* length of that array.
*/
/*
* BIDIRUN
*
* Once embedding levels are determined, the UBA
* treats each contiguous sequence at the same level
* as a distinct run.
*
* To simplify the processing of runs, a list of runs
* is constructed as a linked list, which hangs off
* the UBACONTEXT.
*
* Each BIDIRUN consists of pointers to the first
* and last BIDIUNIT of the run, and then additional
* information calculated during the X10 rule, when
* the runs are identified.
*
* This may not be the most efficient approach to
* an implementation, but it makes it much easier
* to express the X10 rule and subsequent rules which
* process the runs individually.
*
* The seqID is only used in processing run lists
* into isolating run sequence lists in UBA63.
* It is initialized to zero. During processing
* to identify isolating run sequences, it is set
* to the sequence id of the isolating run sequence
* that a run is assigned to.
*/
typedef struct BidiRunStruct {
struct BidiRunStruct *next; /* next run */
BIDIUNITPTR first; /* first element of this run */
BIDIUNITPTR last; /* last element of this run */
int len; /* explicit len, to simplify processing */
int runID; /* stamp run with id for debugging */
int seqID; /* isolating run sequence id */
int level; /* embedding level of this run */
BIDIPROP sor; /* direction of start of run: L or R */
BIDIPROP eor; /* direction of end of run: L or R */
BIDIUNITPTR *textChain; /* constructed text chain */
} BIDIRUN;
typedef BIDIRUN *BIDIRUNPTR;
/*
* The BidiRunListStruct abstracts the creation of
* a list of bidi runs for attachment to the isolating
* run lists. Instead of duplicating all the run information
* into that list, the list consists just of pointers to
* the already allocated basic list of runs.
*/
typedef struct BidiRunListStruct {
struct BidiRunListStruct *next;
BIDIRUNPTR run;
} BIDIRUNLISTELEMENT;
typedef BIDIRUNLISTELEMENT *BIDIRUNLISTPTR;
/*
* ISOLATING_RUN_SEQUENCE
*
* This is a concept introduced in UBA63.
*
* Essentially it consists of an ordered sequence of
* bidi runs, as defined in BD13.
*
* It is implemented here by attaching the list of runs
* associated with this particular isolating run sequence.
* The attached list just contains pointers to each of
* the relevant BIDIRUN structs in the already constructed
* sequential list of runs attached to the UBACONTEXT.
*
* All runs associated with a single isolating run sequence
* are at the same level, so that level can be stored
* in the isolating run sequence struct for ease of access.
*
* Each isolating run sequence has a start of sequence (sos)
* and end of sequence (eos) directional value assigned.
* These are calculated based on the sor and eor values
* for the associated list of runs, but again, are stored
* in the isolating run sequence struct for ease of access.
*/
typedef struct IRSequenceStruct {
struct IRSequenceStruct *next; /* next sequence */
BIDIRUNLISTPTR theRuns; /* list of runs in this sequence */
BIDIRUNLISTPTR lastRun; /* for list appending */
int len; /* explicit len, to simplify processing */
int seqID; /* stamp seq with id for debugging */
int level; /* embedding level of this seq */
BIDIPROP sos; /* direction of start of seq: L or R */
BIDIPROP eos; /* directino of end of seq: L or R */
BIDIUNITPTR *textChain; /* constructed text chain */
} ISOLATING_RUN_SEQUENCE;
typedef ISOLATING_RUN_SEQUENCE *IRSEQPTR;
/*
* UBACONTEXT
*
* This struct is used to store all context associated
* with the bidi reference UBA processing, including
* input, expected test output,
* and the constructed runs and other intermediate data.
*
* theText, testLen, paragraphDirection are input.
* theRuns and paragraphEmbeddingLevel are calculated.
* expEmbeddingLevel, expOrder are parsed
* from the testcase data and checked against
* calculated values.
* For simplicity, the expected levels data parsed
* from the test case are stored with theText.
*
* Starting from Version 2.0, theText pointer is
* allocated statically, simply pointing to a static
* buffer of BR_MAXINPUTLEN length, to cut down on
* repeated dynamic allocations during long testcase
* runs.
*/
typedef struct {
ALGORITHM_STATE state; /* track state */
int dirtyBit; /* used for debug output control */
int testId; /* used for tagging trace output */
Paragraph_Direction paragraphDirection; /* input */
int paragraphEmbeddingLevel; /* calculated */
int textLen; /* input */
BIDIUNITPTR theText; /* input */
BIDIRUNPTR theRuns; /* calculated */
BIDIRUNPTR lastRun; /* for list appending */
IRSEQPTR theSequences; /* calculated: UBA63 only */
IRSEQPTR lastSequence; /* for list appending */
int expEmbeddingLevel; /* expected test result */
char *expOrder; /* expected test result */
} UBACONTEXT;
typedef UBACONTEXT *UBACTXTPTR;
/*
* BIDI_RULE_TYPE
*
* An enumerated type which facilitates lookup of
* rule callbacks and other data by rule type.
*/
typedef enum {
UBA_RULE_W1,
UBA_RULE_W2,
UBA_RULE_W3,
UBA_RULE_W4,
UBA_RULE_W5,
UBA_RULE_W6,
UBA_RULE_W7,
UBA_RULE_N0,
UBA_RULE_N1,
UBA_RULE_N2,
UBA_RULE_Last
} BIDI_RULE_TYPE;
/*
* RULE_CONTEXT
*
* A struct used to package up parameter information
* for a class of rules, to make parameter passing
* and function prototype neater.
*/
typedef struct {
BIDIUNITPTR *textChain;
int len;
int level;
BIDIPROP sot;
BIDIPROP eot;
} BIDI_RULE_CONTEXT;
typedef BIDI_RULE_CONTEXT *BIDIRULECTXTPTR;
/*
* BIDIRUN_RULE_FPTR
*
* Function type of a UBA rule which operates on
* a single text chain.
*
* This type is used to better abstract the bidi rule
* dispatch mechanism, to distinguish version-specific
* operations to extract level runs from the logic
* which then applies to that run.
*/
typedef int (*BIDIRUN_RULE_FPTR)( BIDIRULECTXTPTR brp );
typedef struct {
BIDIRUN_RULE_FPTR ruleMethod; /* actual function callback for rule */
char *ruleLabel; /* printable label for the function */
char *ruleNumber; /* W1, W2, ... */
char *ruleError;
} RULE_TBL_ELEMENT;
typedef RULE_TBL_ELEMENT *RTEPTR;
/*
* Private Function Declarations
*/
/*
* Exported from libbidir library
*/
extern int Trace ( U_Int_32 traceValue );
extern void TraceOffAll ( void );
extern int GetFileFormat ( void );
extern void SetFileFormat ( int format );
extern int GetUBAVersion ( void );
extern void SetUBAVersion ( int version );
extern int convertHexToInt ( U_Int_32 *dest, char *source );
extern int br_Evaluate ( char *data, int *outval );
extern char *copyField ( char *dest, char *src );
extern char *copySubField ( char *dest, char *src );
extern void br_ErrPrint ( char *errString );
extern int br_GetBCFromLabel ( char* src );
extern BIDIPROP br_GetBC ( U_Int_32 c );
extern BPTPROP br_GetBPT ( U_Int_32 c );
extern U_Int_32 br_GetBPB ( U_Int_32 c );
extern int br_InitTable ( int version );
extern int br_UBA ( UBACTXTPTR ctxt );
extern int br_Check ( UBACTXTPTR ctxt );
/*
* Exported from brinput.c for use in bidiref.c
*/
extern int br_RunStaticTest ( void );
extern int br_RunFileTests ( int version, char *input );
/*
* Exported from brtable.c for use in brtest.c
*/
extern int br_GetInitDone ( void );
#ifdef __cplusplus
}
#endif
#endif /* __BIDIREFP_LOADED */