/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Id: ICUData.cpp 470094 2006-11-01 20:41:30Z amassari $
 */

// ---------------------------------------------------------------------------
//  This program is designed to parse a standard ICU .UCM file and spit out
//  a C++ code fragment that represents the tables required by the intrinsic
//  XML parser transcoders.
//
//  The file format is pretty simple and this program is not intended to be
//  industrial strength by any means. Its use by anyone but the author is
//  at the user's own risk.
//
//  The code looks for the min/max bytes per character to know what kind of
//  table to spit out, but for now only handles single char sets.
// ---------------------------------------------------------------------------


// ---------------------------------------------------------------------------
//  Includes
// ---------------------------------------------------------------------------
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>

using std::cout;
using std::endl;


// ---------------------------------------------------------------------------
//  Const data
// ---------------------------------------------------------------------------
static const unsigned int gMaxInRecs = 1024;


// ---------------------------------------------------------------------------
//  Local data types
// ---------------------------------------------------------------------------

// One mapping read from the CHARMAP section: a Unicode code point and the
// single code page byte it corresponds to.
struct XlatRec
{
    unsigned short  uniVal;
    unsigned char   cpVal;
};


// ---------------------------------------------------------------------------
//  Local data
//
//  gInFile
//  gOutFile
//      These are the file streams for the input UCM file and the output file
//      that we write the C++ code to.
//
//  fLineNum
//      Used to track the current line number in the source file, for error
//      reporting.
//
//  gMainTable
//  gMainTableSz
//      This is the table that is filled in from the original source document.
//      We don't know how big it will be, but it's not likely to be much more
//      than 300 entries or so (256 output code points with some multiply
//      mapped Unicode code points.) So we make it extra large and watch for
//      possible overflow.
//
//      The size value is bumped up as we load entries into it during the
//      parse of the file.
//
//  gMaxChar
//  gMinChar
//      The max/min number of bytes used to represent a character. These are
//      read from the header of the input file.
//
//  gRepChar
//      The replacement character to be used. This is read from the header of
//      the input file.
// ---------------------------------------------------------------------------
static FILE*            gInFile;
static FILE*            gOutFile;
static unsigned int     fLineNum;
static XlatRec          gMainTable[gMaxInRecs];
static unsigned int     gMainTableSz = 0;
static unsigned int     gMaxChar;
static unsigned int     gMinChar;
static unsigned char    gRepChar = 1;


// ---------------------------------------------------------------------------
//  Local functions
// ---------------------------------------------------------------------------

//
//  Reads the next line of gInFile that does not start with '#' into toFill,
//  cuts it at the first '#' (trailing comment), strips trailing whitespace
//  (including the newline) and returns the length of what is left.
//
//  toFill      Buffer to receive the line; must hold at least maxChars chars.
//  maxChars    Size of toFill, passed straight to fgets().
//  eofOk       If true, hitting end of file returns ~0U instead of being
//              treated as a fatal error.
//
//  A blank (or whitespace only) line yields 0, which the loops in
//  loadTable() treat as the end of the section they are reading.
//
//  On a read error, or on end of file when eofOk is false, a message is
//  printed and the program exits with status 1.
//
static unsigned int getLine(        char* const     toFill
                            , const unsigned int    maxChars
                            , const bool            eofOk = false)
{
    while (true)
    {
        if (!fgets(toFill, (int)maxChars, gInFile))
        {
            if (feof(gInFile))
            {
                if (eofOk)
                    return ~0U;

                cout << "Unexpected end of input at line: "
                     << fLineNum << endl;
            }
             else
            {
                cout << "Error processing input at line: "
                     << fLineNum << endl;
            }

            //
            //  Both failure cases are fatal. Falling through here would
            //  re-process whatever stale text is still in the buffer.
            //
            exit(1);
        }
        fLineNum++;

        // If it's not a comment line, then break out
        if (toFill[0] != '#')
            break;
    }

    //
    //  There could be a trailing comment on this line, so let's get rid
    //  of it. Search for a # char and put a null there.
    //
    char* hashPtr = strchr(toFill, '#');
    if (hashPtr)
        *hashPtr = 0;

    //
    //  Strip trailing whitespace. Work with a length rather than a pointer
    //  so that an empty or all-whitespace line cannot make us step in front
    //  of the start of the buffer.
    //
    size_t len = strlen(toFill);
    while ((len > 0) && isspace((unsigned char)toFill[len - 1]))
        len--;
    toFill[len] = 0;

    // And return the count of chars we got
    return (unsigned int)len;
}


//
//  Converts the text of a header value into a number.
//
//  srcStr      Null terminated text following the header keyword. Leading
//              whitespace is skipped. A value starting with a backslash must
//              be in the form \xXX and is read as hex; anything else is read
//              as decimal.
//
//  Returns the converted value. If the text is empty, has no digits, has a
//  malformed \x prefix, or has anything left over after the number, a
//  message is printed and the program exits with status 1.
//
static unsigned int extractVal(char* const srcStr)
{
    char* srcPtr = srcStr;

    // Run forward to the first non-space
    while (isspace((unsigned char)*srcPtr))
        srcPtr++;

    if (!*srcPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }

    //
    //  If it starts with \, then it's a hex value in the form \xXX. Else it's
    //  just a decimal value.
    //
    int radix = 10;
    if (*srcPtr == '\\')
    {
        // Make sure there really is an 'x' before we step over the prefix
        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
        {
            cout << "Invalid numeric value on line: " << fLineNum << endl;
            exit(1);
        }
        srcPtr += 2;
        radix = 16;
    }

    char* endPtr;
    const unsigned long retVal = strtoul(srcPtr, &endPtr, radix);

    //
    //  We should have consumed at least one digit and translated up to the
    //  end of the string.
    //
    if ((endPtr == srcPtr) || *endPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }
    return (unsigned int)retVal;
}


//
//  Parses gInFile. The header is scanned for the <mb_cur_max>, <mb_cur_min>
//  and <subchar> values, which are stored in gMaxChar, gMinChar and gRepChar.
//  Then every entry between the CHARMAP and END CHARMAP lines is appended to
//  gMainTable, with gMainTableSz counting the entries.
//
//  Any malformed or out of range entry, or more than gMaxInRecs entries,
//  prints a message and exits with status 1.
//
static void loadTable()
{
    //
    //  Just loop, reading lines at a time, until we either find the start
    //  of the character table or hit the end of the file. Along the way, we
    //  should see a few header values that we store away.
    //
    //  The buffer must be exactly as big as the size we hand to getLine(),
    //  since fgets() may fill all of it.
    //
    const unsigned int  tmpBufSz = 2048;
    char                tmpBuf[tmpBufSz];

    while (getLine(tmpBuf, tmpBufSz))
    {
        //
        //  Check for one of the special values we are interested in. If
        //  it's CHARMAP, then we fall out of this loop.
        //
        if (!strcmp(tmpBuf, "CHARMAP"))
            break;

        if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
        {
            gMaxChar = extractVal(&tmpBuf[12]);
        }
         else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
        {
            gMinChar = extractVal(&tmpBuf[12]);
        }
         else if (!strncmp(tmpBuf, "<subchar>", 9))
        {
            gRepChar = (unsigned char)extractVal(&tmpBuf[9]);
        }
    }

    //
    //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry
    //  will be in the form:
    //
    //  <UXXXX> \xXX
    //
    //  Where X is a hex number.
    //
    char* endPtr;
    while (getLine(tmpBuf, tmpBufSz))
    {
        // Watch for the end of table
        if (!strcmp(tmpBuf, "END CHARMAP"))
            break;

        // The absolute minimum it could be is 12 chars
        if (strlen(tmpBuf) < 12)
        {
            cout << "Line " << fLineNum
                 << " is too short to hold a valid entry"
                 << endl;
            exit(1);
        }

        // Make sure the first token meets the criteria
        if ((tmpBuf[0] != '<')
        ||  (tmpBuf[1] != 'U')
        ||  (tmpBuf[6] != '>'))
        {
            cout << "Line " << fLineNum
                 << " has a badly formed Unicode value"
                 << endl;
            exit(1);
        }

        //
        //  Looks reasonable so let's try to convert it. We can play tricks
        //  with this buffer, so put a null over the > char.
        //
        tmpBuf[6] = 0;
        const unsigned long uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
        if ((endPtr == &tmpBuf[2]) || *endPtr)
        {
            cout << "Invalid Unicode value on line " << fLineNum << endl;
            exit(1);
        }

        //
        //  Ok, let's search over to the second token. We have to find a \
        //  character followed by an x.
        //
        char* srcPtr = &tmpBuf[7];
        while (*srcPtr && (*srcPtr != '\\'))
            srcPtr++;

        // If we never found it, it's in error
        if (!*srcPtr)
        {
            cout << "Never found second token on line " << fLineNum << endl;
            exit(1);
        }

        //
        //  Try to translate it. Check for the x first so that we never step
        //  over the terminating null when the line ends right after the \.
        //
        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }
        srcPtr += 2;
        const unsigned long cpVal = strtoul(srcPtr, &endPtr, 16);
        if ((endPtr == srcPtr) || *endPtr)
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }

        // Make sure that the values are within range
        if (uniVal > 0xFFFF)
        {
            cout << "Unicode value is too big on line " << fLineNum << endl;
            exit(1);
        }

        if (cpVal > 0xFF)
        {
            cout << "Code page value is too big on line " << fLineNum << endl;
            exit(1);
        }

        // Don't run off the end of the global table
        if (gMainTableSz >= gMaxInRecs)
        {
            cout << "Too many character map entries at line "
                 << fLineNum << endl;
            exit(1);
        }

        // Looks reasonable, so add a new entry to the global table
        gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
        gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
        gMainTableSz++;
    }
}


//
//  qsort() comparator that orders XlatRec entries by ascending Unicode
//  value. Used to build the 'to' table.
//
int compFuncTo(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;

    // Both values fit in 16 bits, so the int subtraction cannot overflow
    return (int)rec1->uniVal - (int)rec2->uniVal;
}


//
//  qsort() comparator that orders XlatRec entries by ascending code page
//  value. Used to build the 'from' table.
//
int compFuncFrom(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;

    //
    //  Since there can be multiple Unicode chars that map to a single
    //  code page char, we have to handle the situation where they are
    //  equal specially. If the code page vals are equal, then the one
    //  with the smaller Unicode code point is considered smaller.
    //
    if (rec1->cpVal == rec2->cpVal)
        return (int)rec1->uniVal - (int)rec2->uniVal;

    // Else use the code page value for sorting
    return (int)rec1->cpVal - (int)rec2->cpVal;
}


//
//  Writes the two single byte tables to gOutFile:
//
//  gFromTable  256 entries indexed by code page value, each holding the
//              Unicode value for that byte, or 0xFFFF if the input file had
//              no mapping for it.
//  gToTable    One { Unicode value, code page value } record per input
//              entry, sorted by Unicode value, followed by the gToTableSz
//              constant.
//
//  gMainTable is re-sorted in place, so it is left in Unicode order. Exits
//  with status 1 if the header did not describe a single byte encoding.
//
static void formatSBTables()
{
    // For now, only handle single byte char sets
    if ((gMinChar != 1) || (gMaxChar != 1))
    {
        cout << "formatSBTables can only handle single byte encodings"
             << endl;
        exit(1);
    }

    //
    //  First, we want to sort the table by the code page value field. This
    //  is the order required for the 'from' table to convert from the code
    //  page to the internal Unicode format.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

    //
    //  Now spit out the header for the table. This is the same for all
    //  of them, since they are static to the file and can just all have
    //  the same name.
    //
    fprintf
    (
        gOutFile
        , "static const XMLCh gFromTable[256] =\n{\n "
    );

    //
    //  Now walk all 256 possible code page values. For each one we put out
    //  the Unicode value of its entry, or 0xFFFF if it has none. Since we
    //  sorted them such that dups have the one with the smaller Unicode
    //  value in the lower index, we always hit the desired value first, and
    //  then can just skip over the duplicates.
    //
    //  The table index only moves forward when an entry has actually been
    //  consumed, so a gap in the code page values never swallows the entry
    //  that follows it.
    //
    unsigned int tblIndex = 0;
    for (unsigned int curValue = 0; curValue < 256; curValue++)
    {
        if (curValue)
        {
            if (!(curValue % 8))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        if ((tblIndex < gMainTableSz)
        &&  (gMainTable[tblIndex].cpVal == curValue))
        {
            fprintf
            (
                gOutFile
                , "0x%04X"
                , (unsigned int)gMainTable[tblIndex].uniVal
            );

            // Step over this entry and any dups of it
            while ((tblIndex < gMainTableSz)
            &&     (gMainTable[tblIndex].cpVal == curValue))
            {
                tblIndex++;
            }
        }
         else
        {
            fprintf(gOutFile, "0xFFFF");
        }
    }

    // And print the trailer for this table
    fprintf(gOutFile, "\n};\n\n");

    //
    //  Now let's sort by the Unicode value field. This sort is used for
    //  the 'to' table. The Unicode value is found by binary search and
    //  used to map to the right output encoding value.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

    // Output the table header for this one
    fprintf
    (
        gOutFile
        , "static const XMLTransService::TransRec gToTable[] =\n{\n "
    );

    for (unsigned int recIndex = 0; recIndex < gMainTableSz; recIndex++)
    {
        if (recIndex)
        {
            if (!(recIndex % 4))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        fprintf
        (
            gOutFile
            , "{ 0x%04X, 0x%02X }"
            , (unsigned int)gMainTable[recIndex].uniVal
            , (unsigned int)gMainTable[recIndex].cpVal
        );
    }

    // Print the trailer for this table
    fprintf(gOutFile, "\n};\n");

    // And print out the table size constant (it's unsigned, so use %u)
    fprintf
    (
        gOutFile
        , "static const unsigned int gToTableSz = %u;\n"
        , gMainTableSz
    );
}


// Prints the command line syntax
static void showUsage()
{
    cout << "ICUData inputUCMfile outputfile\n" << endl;
}


// ---------------------------------------------------------------------------
//  The parameters are:
//
//      argV[1] = The source UCM file
//      argV[2] = The path to the output file
//
//  Returns 0 on success and 1 on any failure. The parsing helpers exit the
//  program directly with status 1 on bad input.
// ---------------------------------------------------------------------------
int main(int argC, char** argV)
{
    // We have to have the program name plus the two parameters
    if (argC != 3)
    {
        showUsage();
        return 1;
    }

    // Try to open the first file for input
    gInFile = fopen(argV[1], "r");
    if (!gInFile)
    {
        cout << "Could not find input file: " << argV[1] << endl;
        return 1;
    }

    // Try to open the second file for output (truncated)
    gOutFile = fopen(argV[2], "w");
    if (!gOutFile)
    {
        cout << "Could not create output file: " << argV[2] << endl;
        fclose(gInFile);
        return 1;
    }

    //
    //  This will parse the file and load the table. It will also look for
    //  a couple of key fields in the file header and store that data into
    //  globals.
    //
    loadTable();

    int retVal = 0;
    if (!gMainTableSz)
    {
        // If we didn't get any table entries, then give up
        cout << "No translation table entries were found in the file" << endl;
        retVal = 1;
    }
     else
    {
        //
        //  Ok, we got the data loaded. Now let's output the tables. This
        //  method spits out both tables to the output file, in a format
        //  ready to be incorporated directly into the source code.
        //
        formatSBTables();
    }

    //
    //  Close our files on every path. The close of the output file is
    //  where buffered write failures show up, so check it.
    //
    fclose(gInFile);
    if (fclose(gOutFile) != 0)
    {
        cout << "Error writing output file: " << argV[2] << endl;
        retVal = 1;
    }
    return retVal;
}