You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
538 lines
15 KiB
538 lines
15 KiB
/*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
* this work for additional information regarding copyright ownership.
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
* (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
|
|
/*
|
|
* $Id: ICUData.cpp 470094 2006-11-01 20:41:30Z amassari $
|
|
*/
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// This program is designed to parse a standard ICU .UCM file and spit out
|
|
// a C++ code fragment that represents the tables required by the intrinsic
|
|
// XML parser transcoders.
|
|
//
|
|
// The file format is pretty simple and this program is not intended to be
|
|
// industrial strength by any means. Its use by anyone but the author is
|
|
// at the user's own risk.
|
|
//
|
|
// The code looks for the min/max bytes per character to know what kind of
|
|
// table to spit out, but for now only handles single char sets.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Includes
|
|
// ---------------------------------------------------------------------------
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <iostream.h>
|
|
#include <string.h>
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Const data
|
|
// ---------------------------------------------------------------------------
|
|
// Capacity, in records, of the table that holds the parsed input entries.
static const unsigned int gMaxInRecs = 1024U;
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Local data types
|
|
// ---------------------------------------------------------------------------
|
|
//
//  One translation record read from the input file: a Unicode code point
//  paired with the single byte code page value it maps to.
//
struct XlatRec
{
    // The Unicode side of the mapping (limited to 16 bits)
    unsigned short  uniVal;

    // The code page side of the mapping (limited to 8 bits)
    unsigned char   cpVal;
};
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Local data
|
|
//
|
|
// gInFile
|
|
// gOutFile
|
|
// These are the file stream for the input UCM file and the output file
|
|
// that we write the C++ code to.
|
|
//
|
|
// fLineNum
|
|
// Used to track the current line number in the source file, for error
|
|
// reporting.
|
|
//
|
|
// gMainTable
|
|
// gMainTableSz
|
|
// This is the table that is filled in from the original source document.
|
|
// We don't know how big it will be, but it's not likely to be much more
|
|
// than 300 entries or so (256 output code points with some multiply
|
|
// mapped Unicode code points.) So we make it extra large and watch for
|
|
// possible overflow.
|
|
//
|
|
// The size value is bumped up as we load entries into it during the
|
|
// parse of the file.
|
|
//
|
|
// gMaxChar
|
|
// gMinChar
|
|
// The min/max chars that are used to represent a character. These are
|
|
// read from the header of the input file.
|
|
//
|
|
// gRepChar
|
|
// The replacement character to be used. This is read from the header of
|
|
// the input file.
|
|
// ---------------------------------------------------------------------------
|
|
static FILE* gInFile;
|
|
static FILE* gOutFile;
|
|
static unsigned int fLineNum;
|
|
static XlatRec gMainTable[gMaxInRecs];
|
|
static unsigned int gMainTableSz = 0;
|
|
static unsigned int gMaxChar;
|
|
static unsigned int gMinChar;
|
|
static unsigned char gRepChar = 1;
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Local functions
|
|
// ---------------------------------------------------------------------------
|
|
static unsigned int getLine( char* const toFill
|
|
, const unsigned int maxChars
|
|
, const bool eofOk = false)
|
|
{
|
|
while (true)
|
|
{
|
|
if (!fgets(toFill, maxChars, gInFile))
|
|
{
|
|
if (feof(gInFile))
|
|
{
|
|
if (eofOk)
|
|
return ~0UL;
|
|
else
|
|
cout << "Unexpected end of input at line: " << fLineNum << endl;
|
|
}
|
|
else
|
|
{
|
|
cout << "Error processing input at line: " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
}
|
|
fLineNum++;
|
|
|
|
//
|
|
// If its not a comment, then break out
|
|
//
|
|
if (toFill[0] != '#')
|
|
break;
|
|
}
|
|
|
|
//
|
|
// There could be a trailing comment on this line, so lets get rid
|
|
// of it. Search for a # char and put a null there.
|
|
//
|
|
char* endPtr = toFill;
|
|
while (*endPtr && (*endPtr != '#'))
|
|
endPtr++;
|
|
if (*endPtr == '#')
|
|
*endPtr = 0;
|
|
|
|
// Strip trailing whitespace
|
|
endPtr = toFill + (strlen(toFill) - 1);
|
|
while (isspace(*endPtr))
|
|
endPtr--;
|
|
*(endPtr + 1) = 0;
|
|
|
|
// And return the count of chars we got
|
|
return strlen(toFill);
|
|
}
|
|
|
|
|
|
static unsigned int extractVal(char* const srcStr)
|
|
{
|
|
char* srcPtr = srcStr;
|
|
|
|
// Run forward to the first non-space
|
|
while (isspace(*srcPtr))
|
|
srcPtr++;
|
|
|
|
if (!*srcPtr)
|
|
{
|
|
cout << "Invalid numeric value on line: " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
//
|
|
// If it starts with \, then its a hex value in the form \xXX. Else its
|
|
// just a decimal value.
|
|
//
|
|
unsigned int retVal;
|
|
char* endPtr;
|
|
if (*srcPtr == '\\')
|
|
{
|
|
// Skip the \\x and interpret as a hex value
|
|
srcPtr += 2;
|
|
retVal = (unsigned int)strtoul(srcPtr, &endPtr, 16);
|
|
}
|
|
else
|
|
{
|
|
retVal = (unsigned int)strtoul(srcPtr, &endPtr, 10);
|
|
}
|
|
|
|
// We should have translated up to the end of the string
|
|
if (*endPtr)
|
|
{
|
|
cout << "Invalid numeric value on line: " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
return retVal;
|
|
}
|
|
|
|
|
|
static void loadTable()
|
|
{
|
|
//
|
|
// Just loop, reading lines at a time, until we either find the start
|
|
// of the character table or hit the end of the file. Along the way, we
|
|
// should see a few header values that we store away.
|
|
//
|
|
const unsigned int tmpBufSz = 2048;
|
|
char tmpBuf[tmpBufSz - 1];
|
|
while (getLine(tmpBuf, tmpBufSz))
|
|
{
|
|
//
|
|
// Check for one of the special values we are intersted int. If
|
|
// its CHARMAP, then we fall out of this loop.
|
|
//
|
|
if (!strcmp(tmpBuf, "CHARMAP"))
|
|
break;
|
|
|
|
if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
|
|
{
|
|
gMaxChar = extractVal(&tmpBuf[12]);
|
|
}
|
|
else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
|
|
{
|
|
gMinChar = extractVal(&tmpBuf[12]);
|
|
}
|
|
else if (!strncmp(tmpBuf, "<subchar>", 9))
|
|
{
|
|
gRepChar = (char)extractVal(&tmpBuf[9]);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Ok, now we just run till we hit the "END CHARMAP" line. Each entry
|
|
// will be in the form:
|
|
//
|
|
// <UXXXX> \xXX
|
|
//
|
|
// Where X is a hex number.
|
|
//
|
|
char* endPtr;
|
|
while (getLine(tmpBuf, tmpBufSz))
|
|
{
|
|
// Watch for the end of table
|
|
if (!strcmp(tmpBuf, "END CHARMAP"))
|
|
break;
|
|
|
|
// The absolute minium it could be is 12 chars
|
|
if (strlen(tmpBuf) < 12)
|
|
{
|
|
cout << "Line " << fLineNum << " is too short to hold a valid entry"
|
|
<< endl;
|
|
exit(1);
|
|
}
|
|
|
|
// Make sure the first token meets the criteria
|
|
if ((tmpBuf[0] != '<')
|
|
|| (tmpBuf[1] != 'U')
|
|
|| (tmpBuf[6] != '>'))
|
|
{
|
|
cout << "Line " << fLineNum << " has a badly formed Unicode value"
|
|
<< endl;
|
|
exit(1);
|
|
}
|
|
|
|
//
|
|
// Looks reasonable so lets try to convert it. We can play tricks
|
|
// with this buffer, so put a null over the > char.
|
|
//
|
|
tmpBuf[6] = 0;
|
|
const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
|
|
if (*endPtr)
|
|
{
|
|
cout << "Invalid Unicode value on line " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
//
|
|
// Ok, lets search over to the second token. We have to find a \\
|
|
// character.
|
|
//
|
|
char* srcPtr = &tmpBuf[7];
|
|
while (*srcPtr && (*srcPtr != '\\'))
|
|
srcPtr++;
|
|
|
|
// If we never found it, its in error
|
|
if (!*srcPtr)
|
|
{
|
|
cout << "Never found second token on line " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
// Try to translate it
|
|
srcPtr += 2;
|
|
const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
|
|
if (*endPtr)
|
|
{
|
|
cout << "Invalid code page value on line " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
// Make sure that the values are within range
|
|
if (uniVal > 0xFFFF)
|
|
{
|
|
cout << "Unicode value is too big on line " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
if (cpVal > 0xFF)
|
|
{
|
|
cout << "Code page value is too big on line " << fLineNum << endl;
|
|
exit(1);
|
|
}
|
|
|
|
// Looks reasonable, so add a new entry to the global table
|
|
gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
|
|
gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
|
|
gMainTableSz++;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
int compFuncTo(const void* p1, const void* p2)
|
|
{
|
|
const XlatRec* rec1 = (const XlatRec*)p1;
|
|
const XlatRec* rec2 = (const XlatRec*)p2;
|
|
|
|
return (int)rec1->uniVal - (int)rec2->uniVal;
|
|
}
|
|
|
|
|
|
int compFuncFrom(const void* p1, const void* p2)
|
|
{
|
|
const XlatRec* rec1 = (const XlatRec*)p1;
|
|
const XlatRec* rec2 = (const XlatRec*)p2;
|
|
|
|
//
|
|
// Since there can be multiple Unicode chars that map to a single
|
|
// code page char, we have to handle the situationw here they are
|
|
// equal specially. If the code page vals are equal, then the one
|
|
// with the smaller Unicode code point is considered smaller.
|
|
//
|
|
if (rec1->cpVal == rec2->cpVal)
|
|
return (int)rec1->uniVal - (int)rec2->uniVal;
|
|
|
|
// Else use the code page value for sorting
|
|
return (int)rec1->cpVal - (int)rec2->cpVal;
|
|
}
|
|
|
|
|
|
static void formatSBTables()
|
|
{
|
|
// For now, only handle single byte char sets
|
|
if ((gMinChar != 1) || (gMaxChar != 1))
|
|
{
|
|
cout << "formatSBTables can only handle single byte encodings"
|
|
<< endl;
|
|
exit(1);
|
|
}
|
|
|
|
//
|
|
// First, we want to sort the table by the code page value field. This
|
|
// is the order required for the 'from' table to convert from the code
|
|
// page to the internal Unicode format.
|
|
//
|
|
qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);
|
|
|
|
//
|
|
// Now spit out the header for the table. This is the same for all
|
|
// of them, since they are static to the file and can just all have
|
|
// the same name.
|
|
//
|
|
fprintf
|
|
(
|
|
gOutFile
|
|
, "static const XMLCh gFromTable[256] =\n{\n "
|
|
);
|
|
|
|
//
|
|
// Now for each unique entry in the cp value field, we want to put out
|
|
// the Unicode value for that entry. Since we sorted them such that
|
|
// dups have the one with the smaller Unicode value in the lower index,
|
|
// we always hit the desired value first, and then can just skip over
|
|
// a duplicate.
|
|
//
|
|
unsigned int curValue = 0;
|
|
unsigned int index;
|
|
for (index = 0; index < gMainTableSz; index++)
|
|
{
|
|
if (curValue)
|
|
{
|
|
if (!(curValue % 8))
|
|
fprintf(gOutFile, "\n , ");
|
|
else
|
|
fprintf(gOutFile, ", ");
|
|
}
|
|
|
|
if (curValue == gMainTable[index].cpVal)
|
|
{
|
|
fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
|
|
|
|
// If there is a dump, then skip it
|
|
if (index < gMainTableSz)
|
|
{
|
|
if (gMainTable[index + 1].cpVal == curValue)
|
|
index++;
|
|
}
|
|
}
|
|
else if (curValue < gMainTable[index].cpVal)
|
|
{
|
|
fprintf(gOutFile, "0xFFFF");
|
|
}
|
|
else
|
|
{
|
|
// Screwed up
|
|
cout << "Current value got above target value\n" << endl;
|
|
exit(1);
|
|
}
|
|
curValue++;
|
|
|
|
// If the current value goes over 256, we are in trouble
|
|
if (curValue > 256)
|
|
{
|
|
cout << "The code page value cannot be > 256 in SB mode\n" << endl;
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
// And print the trailer for this table
|
|
fprintf(gOutFile, "\n};\n\n");
|
|
|
|
|
|
//
|
|
// Now lets sort by the Unicode value field. This sort is used for
|
|
// the 'to' table. The Unicode value is found by binary search and
|
|
// used to map to the right output encoding value.
|
|
//
|
|
qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);
|
|
|
|
// Output the table ehader for this one
|
|
fprintf
|
|
(
|
|
gOutFile
|
|
, "static const XMLTransService::TransRec gToTable[] =\n{\n "
|
|
);
|
|
|
|
for (index = 0; index < gMainTableSz; index++)
|
|
{
|
|
if (index)
|
|
{
|
|
if (!(index % 4))
|
|
fprintf(gOutFile, "\n , ");
|
|
else
|
|
fprintf(gOutFile, ", ");
|
|
}
|
|
|
|
fprintf
|
|
(
|
|
gOutFile
|
|
, "{ 0x%04X, 0x%02X }"
|
|
, (unsigned int)gMainTable[index].uniVal
|
|
, (unsigned int)gMainTable[index].cpVal
|
|
);
|
|
}
|
|
|
|
// Print the trailer for this table
|
|
fprintf(gOutFile, "\n};\n");
|
|
|
|
// And print out the table size constant
|
|
fprintf(gOutFile, "static const unsigned int gToTableSz = %d;\n", gMainTableSz);
|
|
}
|
|
|
|
static void showUsage()
|
|
{
|
|
cout << "ICUData inputUCMfile outputfile\n" << endl;
|
|
}
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// The parameters are:
|
|
//
|
|
// argV[1] = The source UCM file
|
|
// argV[2] = The path to the output file
|
|
// ---------------------------------------------------------------------------
|
|
int main(int argC, char** argV)
|
|
{
|
|
// We have to have 3 parameters
|
|
if (argC != 3)
|
|
{
|
|
showUsage();
|
|
return 1;
|
|
}
|
|
|
|
// Try to open the first file for input
|
|
gInFile = fopen(argV[1], "rt");
|
|
if (!gInFile)
|
|
{
|
|
cout << "Could not find input file: " << argV[1] << endl;
|
|
return 1;
|
|
}
|
|
|
|
// Try to open the second file for output (truncated)
|
|
gOutFile = fopen(argV[2], "wt+");
|
|
if (!gOutFile)
|
|
{
|
|
cout << "Could not create output file: " << argV[1] << endl;
|
|
return 1;
|
|
}
|
|
|
|
//
|
|
// This will parse the file and load the table. It will also look for
|
|
// a couple of key fields in the file header and store that data into
|
|
// globals.
|
|
//
|
|
loadTable();
|
|
|
|
// If we didn't get any table entries, then give up
|
|
if (!gMainTableSz)
|
|
{
|
|
cout << "No translation table entries were found in the file" << endl;
|
|
return 1;
|
|
}
|
|
|
|
//
|
|
// Ok, we got the data loaded. Now lets output the tables. This method
|
|
// spit out both tables to the output file, in a format ready to be
|
|
// incorporated directly into the source code.
|
|
//
|
|
formatSBTables();
|
|
|
|
// Close our files
|
|
fclose(gInFile);
|
|
fclose(gOutFile);
|
|
|
|
return 0;
|
|
}
|