genUnicode.py 12.7 KB
Newer Older
1
#!/usr/bin/python -u
2 3 4 5 6 7 8
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#	the current release, but are needed for ABI compatibility.  This
9 10
#	must be accomplished MANUALLY!  Please see the comments below under
#     'blockAliases'
11
#
12 13 14 15
import sys
import string
import time

16 17
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
18

19 20 21 22 23 24 25 26 27
#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
# Written as one literal; the long PrivateUse entry is a single string
# spelled across two lines via adjacent-literal concatenation.
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]
28 29 30 31 32 33 34 35

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

# sources names two whitespace-separated files: the blocks file first,
# then the per-character data file.
blockfile, catfile = sources.split()

36

37 38 39 40 41
#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a tuple with the applicable
# block range
#
42
# Maps block name (spaces removed) to a list of ("0xSTART", "0xEND")
# pairs.  The bounds stay as text because they are pasted verbatim into
# the generated C range comparisons.
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

try:
    for line in blocks:
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            # Expected shape: "START..END; Block Name"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # Missing ';' field or a range that is not exactly "a..b".
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))

73 74 75 76 77 78 79 80 81 82 83 84 85
# Register every legacy block name as the concatenation of the ranges of
# the current blocks it maps to.  Components that are not present in
# BlockNames are reported and skipped.
for block in blockAliases:
    alias = block.split(':')
    oldname = alias[0]
    for comp in alias[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (oldname, comp))
            continue
        BlockNames.setdefault(oldname, []).extend(BlockNames[comp])

86 87 88 89 90 91 92 93 94 95
#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
96
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

# Number of character lines accepted from the data file.
nbchar = 0
# Maps both the full category name and its first character (the general
# category) to the list of code points, as ints in file order.
Categories = {}
try:
    for line in data:
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # Field 0 is the code point in hexadecimal, field 2 the
            # category name.  int() rejects malformed hex instead of
            # silently treating bad digits as zero.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            if name == '':
                raise ValueError("empty category name")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)
finally:
    # Close the category file; the blocks file was already closed after
    # it was parsed.
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
148 149 150 151 152

#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges
153 154 155 156 157 158 159 160
def _collapse_ranges(values):
    """Collapse a sequence of code points into inclusive (low, high) runs.

    values is scanned in the order given; a run is extended for as long
    as each value is exactly one more than the previous one.  A lone
    value yields (v, v).  Returns a list of tuples, empty when values is
    empty.
    """
    runs = []
    if not values:
        return runs
    low = prev = values[0]
    for val in values[1:]:
        if val != prev + 1:
            # Gap: close the current run and start a new one at val.
            runs.append((low, prev))
            low = val
        prev = val
    runs.append((low, prev))
    return runs


# Replace each category's flat code point list by its list of ranges.
# Only existing keys are reassigned, so iterating the dict is safe.
for cat in Categories:
    Categories[cat] = _collapse_ranges(Categories[cat])
182 183 184 185 186 187 188 189 190 191 192

#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
# Sorted name lists; the emitted tables follow this order.
bkeys = sorted(BlockNames)

ckeys = sorted(Categories)

193 194 195 196
#
# Generate the resulting files
#
# Open the two generated files; both paths are relative to the current
# working directory.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # Do not leave the header handle dangling on the way out.
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Timestamp embedded in the banner comment of both generated files.
date = time.asctime(time.localtime(time.time()))

# Banner comment and opening guards of the generated header.  The three
# %s slots receive the UCD web page, the generation date and the source
# file names, in that order.
header.write(
"""/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))

238 239 240 241 242 243
# Banner, includes and type declarations of the generated C file, ending
# with the opening line of the xmlUnicodeBlocks table whose rows are
# written next.  The three %s slots receive the UCD web page, the
# generation date and the source file names, in that order.
output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int		    numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))

# One {"name", function} row per block.  '-' is dropped from the C
# identifier but kept in the lookup string.
block_rows = ['  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
              for block in bkeys]
output.write(',\n'.join(block_rows))
output.write('};\n\n')

# Same layout for the categories table.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = ['  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(cat_rows))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # Small categories get inline comparisons instead of a table.
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            # Range goes into the "S" table.
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            # Range goes into the "L" table.
            if numlong == 0:
                if numshort > 0:
                    # Terminate the pending "S" table before starting "L".
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        # Wrap the emitted C line once it grows past 60 characters.
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        pline += "{%s, %s}" % (hex(low), hex(high))
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
         % (name, numshort, numlong, sptr, lptr))


# Emit the two name-table descriptors and the C binary-search helper.
# The %s slots are the entry counts of the block and category tables.
# In the emitted C, sptr is const-qualified to match the 'table' member
# it is assigned from, and the doc comment names the real parameter.
output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    const xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
	mid = (low + high) / 2;
	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
	    return (sptr[mid].func);
	if (cmp < 0)
	    high = mid - 1;
	else
	    low = mid + 1;
    }
    return (NULL);
}

""" % (len(BlockNames), len(Categories)) )

# For every block: one prototype in the header and one documented C
# function whose body ORs together a comparison per range of the block.
for block in bkeys:
    ident = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % ident)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % ident)
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % block)
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n    return(" % ident)
    tests = ["((code >= %s) && (code <= %s))" % (low, high)
             for (low, high) in BlockNames[block]]
    output.write(" ||\n           ".join(tests))
    output.write(");\n}\n\n")
390

391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
# Prototype and definition of the by-name block test, which looks the
# block name up in xmlUnicodeBlockTbl and calls the function found.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
	return (-1);
    return (func(code));
}

""")
413

414
# For every category: one prototype in the header and one documented C
# function in the output file.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        # Categories above the threshold test against their xml<name>G
        # range group through xmlCharInRange.
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
            % name)
    else:
        # Otherwise OR together one inline comparison per range.
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                             hex(begin), hex(end)))
        if tests:
            output.write("    return(" + " ||\n           ".join(tests))
    output.write(");\n}\n\n")

441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
# Prototype and definition of the by-name category test, followed by the
# closing lines of the generated C file.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
output.write(
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
	return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
""")
466 467 468 469 470

# Close the extern "C" block and both preprocessor guards of the
# generated header, then release the two output files.
header.write("""
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
""")

header.close()
output.close()