You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

538 lines
15 KiB

  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /*
  18. * $Id: ICUData.cpp 470094 2006-11-01 20:41:30Z amassari $
  19. */
  20. // ---------------------------------------------------------------------------
  21. // This program is designed to parse a standard ICU .UCM file and spit out
  22. // a C++ code fragment that represents the tables required by the intrinsic
  23. // XML parser transcoders.
  24. //
  25. // The file format is pretty simple and this program is not intended to be
  26. // industrial strength by any means. Its use by anyone but the author is
  27. // at the user's own risk.
  28. //
  29. // The code looks for the min/max bytes per character to know what kind of
  30. // table to spit out, but for now only handles single char sets.
  31. // ---------------------------------------------------------------------------
  32. // ---------------------------------------------------------------------------
  33. // Includes
  34. // ---------------------------------------------------------------------------
  35. #include <ctype.h>
  36. #include <stdio.h>
  37. #include <stdlib.h>
  38. #include <iostream.h>
  39. #include <string.h>
  40. // ---------------------------------------------------------------------------
  41. // Const data
  42. // ---------------------------------------------------------------------------
  // Capacity of gMainTable, i.e. the number of XlatRec slots it is declared with.
  static const unsigned int gMaxInRecs = 0x400;
  44. // ---------------------------------------------------------------------------
  45. // Local data types
  46. // ---------------------------------------------------------------------------
  // One mapping record: a Unicode value paired with the single byte code
  // page value that the input file associates with it.
  struct XlatRec
  {
      unsigned short  uniVal;     // The Unicode value
      unsigned char   cpVal;      // The code page value
  };
  52. // ---------------------------------------------------------------------------
  53. // Local data
  54. //
  55. // gInFile
  56. // gOutFile
  57. // These are the file streams for the input UCM file and the output file
  58. // that we write the C++ code to.
  59. //
  60. // fLineNum
  61. // Used to track the current line number in the source file, for error
  62. // reporting.
  63. //
  64. // gMainTable
  65. // gMainTableSz
  66. // This is the table that is filled in from the original source document.
  67. // We don't know how big it will be, but it's not likely to be much more
  68. // than 300 entries or so (256 output code points with some multiply
  69. // mapped Unicode code points.) So we make it extra large and watch for
  70. // possible overflow.
  71. //
  72. // The size value is bumped up as we load entries into it during the
  73. // parse of the file.
  74. //
  75. // gMaxChar
  76. // gMinChar
  77. // The min/max chars that are used to represent a character. These are
  78. // read from the header of the input file.
  79. //
  80. // gRepChar
  81. // The replacement character to be used. This is read from the header of
  82. // the input file.
  83. // ---------------------------------------------------------------------------
  84. static FILE* gInFile;
  85. static FILE* gOutFile;
  86. static unsigned int fLineNum;
  87. static XlatRec gMainTable[gMaxInRecs];
  88. static unsigned int gMainTableSz = 0;
  89. static unsigned int gMaxChar;
  90. static unsigned int gMinChar;
  91. static unsigned char gRepChar = 1;
  92. // ---------------------------------------------------------------------------
  93. // Local functions
  94. // ---------------------------------------------------------------------------
  95. static unsigned int getLine( char* const toFill
  96. , const unsigned int maxChars
  97. , const bool eofOk = false)
  98. {
  99. while (true)
  100. {
  101. if (!fgets(toFill, maxChars, gInFile))
  102. {
  103. if (feof(gInFile))
  104. {
  105. if (eofOk)
  106. return ~0UL;
  107. else
  108. cout << "Unexpected end of input at line: " << fLineNum << endl;
  109. }
  110. else
  111. {
  112. cout << "Error processing input at line: " << fLineNum << endl;
  113. exit(1);
  114. }
  115. }
  116. fLineNum++;
  117. //
  118. // If its not a comment, then break out
  119. //
  120. if (toFill[0] != '#')
  121. break;
  122. }
  123. //
  124. // There could be a trailing comment on this line, so lets get rid
  125. // of it. Search for a # char and put a null there.
  126. //
  127. char* endPtr = toFill;
  128. while (*endPtr && (*endPtr != '#'))
  129. endPtr++;
  130. if (*endPtr == '#')
  131. *endPtr = 0;
  132. // Strip trailing whitespace
  133. endPtr = toFill + (strlen(toFill) - 1);
  134. while (isspace(*endPtr))
  135. endPtr--;
  136. *(endPtr + 1) = 0;
  137. // And return the count of chars we got
  138. return strlen(toFill);
  139. }
  140. static unsigned int extractVal(char* const srcStr)
  141. {
  142. char* srcPtr = srcStr;
  143. // Run forward to the first non-space
  144. while (isspace(*srcPtr))
  145. srcPtr++;
  146. if (!*srcPtr)
  147. {
  148. cout << "Invalid numeric value on line: " << fLineNum << endl;
  149. exit(1);
  150. }
  151. //
  152. // If it starts with \, then its a hex value in the form \xXX. Else its
  153. // just a decimal value.
  154. //
  155. unsigned int retVal;
  156. char* endPtr;
  157. if (*srcPtr == '\\')
  158. {
  159. // Skip the \\x and interpret as a hex value
  160. srcPtr += 2;
  161. retVal = (unsigned int)strtoul(srcPtr, &endPtr, 16);
  162. }
  163. else
  164. {
  165. retVal = (unsigned int)strtoul(srcPtr, &endPtr, 10);
  166. }
  167. // We should have translated up to the end of the string
  168. if (*endPtr)
  169. {
  170. cout << "Invalid numeric value on line: " << fLineNum << endl;
  171. exit(1);
  172. }
  173. return retVal;
  174. }
  175. static void loadTable()
  176. {
  177. //
  178. // Just loop, reading lines at a time, until we either find the start
  179. // of the character table or hit the end of the file. Along the way, we
  180. // should see a few header values that we store away.
  181. //
  182. const unsigned int tmpBufSz = 2048;
  183. char tmpBuf[tmpBufSz - 1];
  184. while (getLine(tmpBuf, tmpBufSz))
  185. {
  186. //
  187. // Check for one of the special values we are intersted int. If
  188. // its CHARMAP, then we fall out of this loop.
  189. //
  190. if (!strcmp(tmpBuf, "CHARMAP"))
  191. break;
  192. if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
  193. {
  194. gMaxChar = extractVal(&tmpBuf[12]);
  195. }
  196. else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
  197. {
  198. gMinChar = extractVal(&tmpBuf[12]);
  199. }
  200. else if (!strncmp(tmpBuf, "<subchar>", 9))
  201. {
  202. gRepChar = (char)extractVal(&tmpBuf[9]);
  203. }
  204. }
  205. //
  206. // Ok, now we just run till we hit the "END CHARMAP" line. Each entry
  207. // will be in the form:
  208. //
  209. // <UXXXX> \xXX
  210. //
  211. // Where X is a hex number.
  212. //
  213. char* endPtr;
  214. while (getLine(tmpBuf, tmpBufSz))
  215. {
  216. // Watch for the end of table
  217. if (!strcmp(tmpBuf, "END CHARMAP"))
  218. break;
  219. // The absolute minium it could be is 12 chars
  220. if (strlen(tmpBuf) < 12)
  221. {
  222. cout << "Line " << fLineNum << " is too short to hold a valid entry"
  223. << endl;
  224. exit(1);
  225. }
  226. // Make sure the first token meets the criteria
  227. if ((tmpBuf[0] != '<')
  228. || (tmpBuf[1] != 'U')
  229. || (tmpBuf[6] != '>'))
  230. {
  231. cout << "Line " << fLineNum << " has a badly formed Unicode value"
  232. << endl;
  233. exit(1);
  234. }
  235. //
  236. // Looks reasonable so lets try to convert it. We can play tricks
  237. // with this buffer, so put a null over the > char.
  238. //
  239. tmpBuf[6] = 0;
  240. const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
  241. if (*endPtr)
  242. {
  243. cout << "Invalid Unicode value on line " << fLineNum << endl;
  244. exit(1);
  245. }
  246. //
  247. // Ok, lets search over to the second token. We have to find a \\
  248. // character.
  249. //
  250. char* srcPtr = &tmpBuf[7];
  251. while (*srcPtr && (*srcPtr != '\\'))
  252. srcPtr++;
  253. // If we never found it, its in error
  254. if (!*srcPtr)
  255. {
  256. cout << "Never found second token on line " << fLineNum << endl;
  257. exit(1);
  258. }
  259. // Try to translate it
  260. srcPtr += 2;
  261. const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
  262. if (*endPtr)
  263. {
  264. cout << "Invalid code page value on line " << fLineNum << endl;
  265. exit(1);
  266. }
  267. // Make sure that the values are within range
  268. if (uniVal > 0xFFFF)
  269. {
  270. cout << "Unicode value is too big on line " << fLineNum << endl;
  271. exit(1);
  272. }
  273. if (cpVal > 0xFF)
  274. {
  275. cout << "Code page value is too big on line " << fLineNum << endl;
  276. exit(1);
  277. }
  278. // Looks reasonable, so add a new entry to the global table
  279. gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
  280. gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
  281. gMainTableSz++;
  282. }
  283. }
  284. int compFuncTo(const void* p1, const void* p2)
  285. {
  286. const XlatRec* rec1 = (const XlatRec*)p1;
  287. const XlatRec* rec2 = (const XlatRec*)p2;
  288. return (int)rec1->uniVal - (int)rec2->uniVal;
  289. }
  290. int compFuncFrom(const void* p1, const void* p2)
  291. {
  292. const XlatRec* rec1 = (const XlatRec*)p1;
  293. const XlatRec* rec2 = (const XlatRec*)p2;
  294. //
  295. // Since there can be multiple Unicode chars that map to a single
  296. // code page char, we have to handle the situationw here they are
  297. // equal specially. If the code page vals are equal, then the one
  298. // with the smaller Unicode code point is considered smaller.
  299. //
  300. if (rec1->cpVal == rec2->cpVal)
  301. return (int)rec1->uniVal - (int)rec2->uniVal;
  302. // Else use the code page value for sorting
  303. return (int)rec1->cpVal - (int)rec2->cpVal;
  304. }
  305. static void formatSBTables()
  306. {
  307. // For now, only handle single byte char sets
  308. if ((gMinChar != 1) || (gMaxChar != 1))
  309. {
  310. cout << "formatSBTables can only handle single byte encodings"
  311. << endl;
  312. exit(1);
  313. }
  314. //
  315. // First, we want to sort the table by the code page value field. This
  316. // is the order required for the 'from' table to convert from the code
  317. // page to the internal Unicode format.
  318. //
  319. qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);
  320. //
  321. // Now spit out the header for the table. This is the same for all
  322. // of them, since they are static to the file and can just all have
  323. // the same name.
  324. //
  325. fprintf
  326. (
  327. gOutFile
  328. , "static const XMLCh gFromTable[256] =\n{\n "
  329. );
  330. //
  331. // Now for each unique entry in the cp value field, we want to put out
  332. // the Unicode value for that entry. Since we sorted them such that
  333. // dups have the one with the smaller Unicode value in the lower index,
  334. // we always hit the desired value first, and then can just skip over
  335. // a duplicate.
  336. //
  337. unsigned int curValue = 0;
  338. unsigned int index;
  339. for (index = 0; index < gMainTableSz; index++)
  340. {
  341. if (curValue)
  342. {
  343. if (!(curValue % 8))
  344. fprintf(gOutFile, "\n , ");
  345. else
  346. fprintf(gOutFile, ", ");
  347. }
  348. if (curValue == gMainTable[index].cpVal)
  349. {
  350. fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
  351. // If there is a dump, then skip it
  352. if (index < gMainTableSz)
  353. {
  354. if (gMainTable[index + 1].cpVal == curValue)
  355. index++;
  356. }
  357. }
  358. else if (curValue < gMainTable[index].cpVal)
  359. {
  360. fprintf(gOutFile, "0xFFFF");
  361. }
  362. else
  363. {
  364. // Screwed up
  365. cout << "Current value got above target value\n" << endl;
  366. exit(1);
  367. }
  368. curValue++;
  369. // If the current value goes over 256, we are in trouble
  370. if (curValue > 256)
  371. {
  372. cout << "The code page value cannot be > 256 in SB mode\n" << endl;
  373. exit(1);
  374. }
  375. }
  376. // And print the trailer for this table
  377. fprintf(gOutFile, "\n};\n\n");
  378. //
  379. // Now lets sort by the Unicode value field. This sort is used for
  380. // the 'to' table. The Unicode value is found by binary search and
  381. // used to map to the right output encoding value.
  382. //
  383. qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);
  384. // Output the table ehader for this one
  385. fprintf
  386. (
  387. gOutFile
  388. , "static const XMLTransService::TransRec gToTable[] =\n{\n "
  389. );
  390. for (index = 0; index < gMainTableSz; index++)
  391. {
  392. if (index)
  393. {
  394. if (!(index % 4))
  395. fprintf(gOutFile, "\n , ");
  396. else
  397. fprintf(gOutFile, ", ");
  398. }
  399. fprintf
  400. (
  401. gOutFile
  402. , "{ 0x%04X, 0x%02X }"
  403. , (unsigned int)gMainTable[index].uniVal
  404. , (unsigned int)gMainTable[index].cpVal
  405. );
  406. }
  407. // Print the trailer for this table
  408. fprintf(gOutFile, "\n};\n");
  409. // And print out the table size constant
  410. fprintf(gOutFile, "static const unsigned int gToTableSz = %d;\n", gMainTableSz);
  411. }
  412. static void showUsage()
  413. {
  414. cout << "ICUData inputUCMfile outputfile\n" << endl;
  415. }
  416. // ---------------------------------------------------------------------------
  417. // The parameters are:
  418. //
  419. // argV[1] = The source UCM file
  420. // argV[2] = The path to the output file
  421. // ---------------------------------------------------------------------------
  422. int main(int argC, char** argV)
  423. {
  424. // We have to have 3 parameters
  425. if (argC != 3)
  426. {
  427. showUsage();
  428. return 1;
  429. }
  430. // Try to open the first file for input
  431. gInFile = fopen(argV[1], "rt");
  432. if (!gInFile)
  433. {
  434. cout << "Could not find input file: " << argV[1] << endl;
  435. return 1;
  436. }
  437. // Try to open the second file for output (truncated)
  438. gOutFile = fopen(argV[2], "wt+");
  439. if (!gOutFile)
  440. {
  441. cout << "Could not create output file: " << argV[1] << endl;
  442. return 1;
  443. }
  444. //
  445. // This will parse the file and load the table. It will also look for
  446. // a couple of key fields in the file header and store that data into
  447. // globals.
  448. //
  449. loadTable();
  450. // If we didn't get any table entries, then give up
  451. if (!gMainTableSz)
  452. {
  453. cout << "No translation table entries were found in the file" << endl;
  454. return 1;
  455. }
  456. //
  457. // Ok, we got the data loaded. Now lets output the tables. This method
  458. // spit out both tables to the output file, in a format ready to be
  459. // incorporated directly into the source code.
  460. //
  461. formatSBTables();
  462. // Close our files
  463. fclose(gInFile);
  464. fclose(gOutFile);
  465. return 0;
  466. }