tempest/resources/3rdparty/xercesc-3.1.2/tools/ICUData/ICUData.cpp


								/*

								 * Licensed to the Apache Software Foundation (ASF) under one or more

								 * contributor license agreements.  See the NOTICE file distributed with

								 * this work for additional information regarding copyright ownership.

								 * The ASF licenses this file to You under the Apache License, Version 2.0

								 * (the "License"); you may not use this file except in compliance with

								 * the License.  You may obtain a copy of the License at

								 *

								 *      http://www.apache.org/licenses/LICENSE-2.0

								 *

								 * Unless required by applicable law or agreed to in writing, software

								 * distributed under the License is distributed on an "AS IS" BASIS,

								 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								 * See the License for the specific language governing permissions and

								 * limitations under the License.

								 */


								/*

								 * $Id: ICUData.cpp 470094 2006-11-01 20:41:30Z amassari $

								 */


								// ---------------------------------------------------------------------------

								//  This program is designed to parse a standard ICU .UCM file and spit out

								//  a C++ code fragment that represents the tables required by the intrinsic

								//  XML parser transcoders.

								//

								//  The file format is pretty simple and this program is not intended to be

								//  industrial strength by any means. Its use by anyone but the author is

								//  at the user's own risk.

								//

								//  The code looks for the min/max bytes per character to know what kind of

								//  table to spit out, but for now only handles single char sets.

								// ---------------------------------------------------------------------------


								// ---------------------------------------------------------------------------

								//  Includes

								// ---------------------------------------------------------------------------

								#include    <ctype.h>

								#include    <stdio.h>

								#include    <stdlib.h>

								#include    <iostream.h>

								#include    <string.h>


								// ---------------------------------------------------------------------------

								//  Const data

								// ---------------------------------------------------------------------------

								//  Number of XlatRec slots in gMainTable, i.e. the maximum number of
								//  CHARMAP entries that can be held in memory.
								static const unsigned int   gMaxInRecs = 1024;


								// ---------------------------------------------------------------------------

								//  Local data types

								// ---------------------------------------------------------------------------

								//  One mapping entry loaded from the CHARMAP section of the input file:
								//  a Unicode value paired with the single byte code page value it maps to.
								struct XlatRec

								{

								    unsigned short  uniVal;     // Unicode value (loader rejects values > 0xFFFF)

								    unsigned char   cpVal;      // Code page value (loader rejects values > 0xFF)

								};


								// ---------------------------------------------------------------------------

								//  Local data

								//

								//  gInFile

								//  gOutFile

								//      These are the file streams for the input UCM file and the output file

								//      that we write the C++ code to.

								//

								//  fLineNum

								//      Used to track the current line number in the source file, for error

								//      reporting.

								//

								//  gMainTable

								//  gMainTableSz

								//      This is the table that is filled in from the original source document.

								//      We don't know how big it will be, but it's not likely to be much more

								//      than 300 entries or so (256 output code points with some multiply

								//      mapped Unicode code points.) So we make it extra large and watch for

								//      possible overflow.

								//

								//      The size value is bumped up as we load entries into it during the

								//      parse of the file.

								//

								//  gMaxChar

								//  gMinChar

								//      The min/max chars that are used to represent a character. These are

								//      read from the header of the input file.

								//

								//  gRepChar

								//      The replacement character to be used. This is read from the header of

								//      the input file.

								// ---------------------------------------------------------------------------

								static FILE*            gInFile;            // Input UCM file, opened by main() from argV[1]

								static FILE*            gOutFile;           // Generated output file, opened by main() from argV[2]

								static unsigned int     fLineNum;           // Lines read so far by getLine(); used in error messages

								static XlatRec          gMainTable[gMaxInRecs];     // Entries loaded by loadTable()

								static unsigned int     gMainTableSz = 0;   // Number of valid entries in gMainTable

								static unsigned int     gMaxChar;           // Value of the <mb_cur_max> header line

								static unsigned int     gMinChar;           // Value of the <mb_cur_min> header line

								static unsigned char    gRepChar = 1;       // Value of the <subchar> header line; 1 if none was seen


								// ---------------------------------------------------------------------------

								//  Local functions

								// ---------------------------------------------------------------------------

								//
								//  Reads the next line from gInFile that does not start with '#' into
								//  toFill, bumping fLineNum for every line consumed (comment lines
								//  included). Any trailing '#' comment and trailing whitespace (including
								//  the newline left by fgets) are removed in place.
								//
								//  toFill      Caller's buffer; must hold at least maxChars chars.
								//  maxChars    Size of toFill, passed straight to fgets.
								//  eofOk       If true, end of input is reported by returning ~0U. If
								//              false, end of input is a fatal error.
								//
								//  Returns the length of the cleaned up line, which is zero for a line
								//  that is empty or holds only whitespace and/or a trailing comment.
								//  Read errors, and end of input when eofOk is false, print a message
								//  and exit(1).
								//
								static unsigned int getLine(        char* const     toFill
								                            , const unsigned int    maxChars
								                            , const bool            eofOk = false)
								{
								    while (true)
								    {
								        if (!fgets(toFill, (int)maxChars, gInFile))
								        {
								            if (!feof(gInFile))
								            {
								                cout << "Error processing input at line: " << fLineNum << endl;
								                exit(1);
								            }

								            if (eofOk)
								                return ~0U;

								            //
								            //  The buffer holds stale data at this point, so we must not
								            //  fall through and hand it back to the caller as a new line.
								            //
								            cout << "Unexpected end of input at line: " << fLineNum << endl;
								            exit(1);
								        }
								        fLineNum++;

								        // Whole line comments are skipped, anything else is returned
								        if (toFill[0] != '#')
								            break;
								    }

								    // Chop off any trailing comment by putting a null over the # char
								    char* const hashPtr = strchr(toFill, '#');
								    if (hashPtr)
								        *hashPtr = 0;

								    //
								    //  Strip trailing whitespace. We work with a length so that an empty
								    //  or all whitespace line cannot walk us off the front of the buffer,
								    //  and go through unsigned char since isspace() is not defined for
								    //  negative char values.
								    //
								    unsigned int lineLen = (unsigned int)strlen(toFill);
								    while (lineLen && isspace((unsigned char)toFill[lineLen - 1]))
								        lineLen--;
								    toFill[lineLen] = 0;

								    // And return the count of chars we got
								    return lineLen;
								}


								//
								//  Parses the numeric value held in srcStr, skipping leading whitespace.
								//  A value starting with a backslash must be in the form \xXX and is read
								//  as hex; anything else is read as decimal. The number must run to the
								//  end of the string.
								//
								//  Returns the parsed value. If the string is empty, the \x prefix is
								//  malformed, there are no digits, or there is trailing junk, a message
								//  with the current line number is printed and the program exits.
								//
								static unsigned int extractVal(char* const srcStr)
								{
								    char* srcPtr = srcStr;

								    // Run forward to the first non-space
								    while (isspace((unsigned char)*srcPtr))
								        srcPtr++;

								    if (!*srcPtr)
								    {
								        cout << "Invalid numeric value on line: " << fLineNum << endl;
								        exit(1);
								    }

								    //
								    //  Work out the radix. For the hex form, check the x before skipping
								    //  over it so that a lone trailing backslash cannot move us past the
								    //  terminating null.
								    //
								    int radix = 10;
								    if (*srcPtr == '\\')
								    {
								        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
								        {
								            cout << "Invalid numeric value on line: " << fLineNum << endl;
								            exit(1);
								        }
								        srcPtr += 2;
								        radix = 16;
								    }

								    char* endPtr;
								    const unsigned int retVal = (unsigned int)strtoul(srcPtr, &endPtr, radix);

								    //
								    //  We should have consumed at least one digit and translated up to
								    //  the end of the string. If endPtr never moved, strtoul found no
								    //  number at all and its zero result means nothing.
								    //
								    if ((endPtr == srcPtr) || *endPtr)
								    {
								        cout << "Invalid numeric value on line: " << fLineNum << endl;
								        exit(1);
								    }

								    return retVal;
								}


								static void loadTable()

								{

								    //

								    //  Just loop, reading lines at a time, until we either find the start

								    //  of the character table or hit the end of the file. Along the way, we

								    //  should see a few header values that we store away.

								    //

								    const unsigned int  tmpBufSz = 2048;

								    char                tmpBuf[tmpBufSz - 1];

								    while (getLine(tmpBuf, tmpBufSz))

								    {

								        //

								        //  Check for one of the special values we are intersted int. If

								        //  its CHARMAP, then we fall out of this loop.

								        //

								        if (!strcmp(tmpBuf, "CHARMAP"))

								            break;


								        if (!strncmp(tmpBuf, "<mb_cur_max>", 12))

								        {

								            gMaxChar = extractVal(&tmpBuf[12]);

								        }

								         else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))

								        {

								            gMinChar = extractVal(&tmpBuf[12]);

								        }

								         else if (!strncmp(tmpBuf, "<subchar>", 9))

								        {

								            gRepChar = (char)extractVal(&tmpBuf[9]);

								        }

								    }


								    //

								    //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry

								    //  will be in the form:

								    //

								    //      <UXXXX>     \xXX

								    //

								    //  Where X is a hex number.

								    //

								    char* endPtr;

								    while (getLine(tmpBuf, tmpBufSz))

								    {

								        // Watch for the end of table

								        if (!strcmp(tmpBuf, "END CHARMAP"))

								            break;


								        // The absolute minium it could be is 12 chars

								        if (strlen(tmpBuf) < 12)

								        {

								            cout << "Line " << fLineNum << " is too short to hold a valid entry"

								                 << endl;

								            exit(1);

								        }


								        // Make sure the first token meets the criteria

								        if ((tmpBuf[0] != '<')

								        ||  (tmpBuf[1] != 'U')

								        ||  (tmpBuf[6] != '>'))

								        {

								            cout << "Line " << fLineNum << " has a badly formed Unicode value"

								                 << endl;

								            exit(1);

								        }


								        //

								        //  Looks reasonable so lets try to convert it. We can play tricks

								        //  with this buffer, so put a null over the > char.

								        //

								        tmpBuf[6] = 0;

								        const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);

								        if (*endPtr)

								        {

								            cout << "Invalid Unicode value on line " << fLineNum << endl;

								            exit(1);

								        }


								        //

								        //  Ok, lets search over to the second token. We have to find a \\

								        //  character.

								        //

								        char* srcPtr = &tmpBuf[7];

								        while (*srcPtr && (*srcPtr != '\\'))

								            srcPtr++;


								        // If we never found it, its in error

								        if (!*srcPtr)

								        {

								            cout << "Never found second token on line " << fLineNum << endl;

								            exit(1);

								        }


								        // Try to translate it

								        srcPtr += 2;

								        const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);

								        if (*endPtr)

								        {

								            cout << "Invalid code page value on line " << fLineNum << endl;

								            exit(1);

								        }


								        // Make sure that the values are within range

								        if (uniVal > 0xFFFF)

								        {

								            cout << "Unicode value is too big on line " << fLineNum << endl;

								            exit(1);

								        }


								        if (cpVal > 0xFF)

								        {

								            cout << "Code page value is too big on line " << fLineNum << endl;

								            exit(1);

								        }


								        // Looks reasonable, so add a new entry to the global table

								        gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;

								        gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;

								        gMainTableSz++;

								    }

								}


								int compFuncTo(const void* p1, const void* p2)

								{

								    const XlatRec* rec1 = (const XlatRec*)p1;

								    const XlatRec* rec2 = (const XlatRec*)p2;


								    return (int)rec1->uniVal - (int)rec2->uniVal;

								}


								int compFuncFrom(const void* p1, const void* p2)

								{

								    const XlatRec* rec1 = (const XlatRec*)p1;

								    const XlatRec* rec2 = (const XlatRec*)p2;


								    //

								    //  Since there can be multiple Unicode chars that map to a single

								    //  code page char, we have to handle the situationw here they are

								    //  equal specially. If the code page vals are equal, then the one

								    //  with the smaller Unicode code point is considered smaller.

								    //

								    if (rec1->cpVal == rec2->cpVal)

								        return (int)rec1->uniVal - (int)rec2->uniVal;


								    // Else use the code page value for sorting

								    return (int)rec1->cpVal - (int)rec2->cpVal;

								}


								//
								//  Writes the two tables for a single byte encoding to gOutFile:
								//
								//  gFromTable  256 XMLCh values indexed by code page value, holding the
								//              Unicode value for that code page value or 0xFFFF if the
								//              input had no mapping for it.
								//  gToTable    One { Unicode, code page } pair per loaded entry, sorted
								//              by Unicode value, followed by the gToTableSz constant.
								//
								//  Exits with a message if the header values did not indicate a single
								//  byte encoding. Note that gMainTable is re-sorted in place.
								//
								static void formatSBTables()
								{
								    // For now, only handle single byte char sets
								    if ((gMinChar != 1) || (gMaxChar != 1))
								    {
								        cout << "formatSBTables can only handle single byte encodings"
								             << endl;
								        exit(1);
								    }

								    //
								    //  First, we want to sort the table by the code page value field. This
								    //  is the order required for the 'from' table to convert from the code
								    //  page to the internal Unicode format.
								    //
								    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

								    //
								    //  Now spit out the header for the table. This is the same for all
								    //  of them, since they are static to the file and can just all have
								    //  the same name.
								    //
								    fprintf
								    (
								        gOutFile
								        , "static const XMLCh gFromTable[256] =\n{\n    "
								    );

								    //
								    //  Walk every possible code page value, with a separate index into
								    //  the sorted table that only moves when we consume entries. Driving
								    //  the loop by the code page value means that a gap in the mappings
								    //  costs us a 0xFFFF but never a table entry, and that the unmapped
								    //  values after the last entry get the 0xFFFF marker as well.
								    //
								    unsigned int index = 0;
								    unsigned int curValue;
								    for (curValue = 0; curValue < 256; curValue++)
								    {
								        // Eight values per output line
								        if (curValue)
								            fputs((curValue % 8) ? ", " : "\n  , ", gOutFile);

								        if ((index < gMainTableSz)
								        &&  (gMainTable[index].cpVal == curValue))
								        {
								            //
								            //  The sort put the entry with the smallest Unicode value
								            //  first, so that is the one we want.
								            //
								            fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);

								            // Skip it and however many dups there are for this value
								            while ((index < gMainTableSz)
								            &&     (gMainTable[index].cpVal == curValue))
								            {
								                index++;
								            }
								        }
								         else
								        {
								            // Nothing maps to this code page value
								            fprintf(gOutFile, "0xFFFF");
								        }
								    }

								    // And print the trailer for this table
								    fprintf(gOutFile, "\n};\n\n");

								    //
								    //  Now lets sort by the Unicode value field. This sort is used for
								    //  the 'to' table. The Unicode value is found by binary search and
								    //  used to map to the right output encoding value.
								    //
								    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

								    // Output the table header for this one
								    fprintf
								    (
								        gOutFile
								        , "static const XMLTransService::TransRec gToTable[] =\n{\n    "
								    );

								    for (index = 0; index < gMainTableSz; index++)
								    {
								        // Four pairs per output line
								        if (index)
								            fputs((index % 4) ? ", " : "\n  , ", gOutFile);

								        fprintf
								        (
								            gOutFile
								            , "{ 0x%04X, 0x%02X }"
								            , (unsigned int)gMainTable[index].uniVal
								            , (unsigned int)gMainTable[index].cpVal
								        );
								    }

								    // Print the trailer for this table
								    fprintf(gOutFile, "\n};\n");

								    // And print out the table size constant
								    fprintf(gOutFile, "static const unsigned int gToTableSz = %u;\n", gMainTableSz);
								}


								//  Prints the command line syntax, followed by a blank line, to cout.
								static void showUsage()
								{
								    cout << "ICUData inputUCMfile outputfile\n";
								    cout << endl;
								}


								// ---------------------------------------------------------------------------

								//  The parameters are:

								//

								//  argV[1] = The source UCM file

								//  argV[2] = The path to the output file

								// ---------------------------------------------------------------------------

								int main(int argC, char** argV)

								{

								    // We have to have 3 parameters

								    if (argC != 3)

								    {

								        showUsage();

								        return 1;

								    }


								    // Try to open the first file for input

								    gInFile = fopen(argV[1], "rt");

								    if (!gInFile)

								    {

								        cout << "Could not find input file: " << argV[1] << endl;

								        return 1;

								    }


								    // Try to open the second file for output (truncated)

								    gOutFile = fopen(argV[2], "wt+");

								    if (!gOutFile)

								    {

								        cout << "Could not create output file: " << argV[1] << endl;

								        return 1;

								    }


								    //

								    //  This will parse the file and load the table. It will also look for

								    //  a couple of key fields in the file header and store that data into

								    //  globals.

								    //

								    loadTable();


								    // If we didn't get any table entries, then give up

								    if (!gMainTableSz)

								    {

								        cout << "No translation table entries were found in the file" << endl;

								        return 1;

								    }


								    //

								    //  Ok, we got the data loaded. Now lets output the tables. This method

								    //  spit out both tables to the output file, in a format ready to be

								    //  incorporated directly into the source code.

								    //

								    formatSBTables();


								    // Close our files

								    fclose(gInFile);

								    fclose(gOutFile);


								    return 0;

								}