#!/usr/bin/env slsh
% Generate C lookup tables (character width, combining flag, case
% mappings and character classes) from the Unicode character database.
%
% Usage: <script> UnicodeData.txt EastAsianWidth.txt
%
% Output files (written to the current directory):
%   slwcwidth.h  slcombin.h  sllower.h  slupper.h  slischar.h
%
% The unicode database contains 15 fields

% Print a usage message to stderr and exit with status 1.
private define usage ()
{
   () = fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt\n",
                 __argv[0]);
   exit (1);
}

if (__argc != 3)
  usage ();

private variable Unicode_Data_File = __argv[1];
private variable East_Asian_File = __argv[2];

% Create a structure of parallel arrays, each with num elements and
% indexed by code point.  Its fields are the 15 UnicodeData.txt fields
% plus east_asian_prop.  code_point is initialized to 0..num-1 and most
% of the string fields are initialized to "".
private define make_char_def_table (num)
{
   variable s = struct
     {
        code_point,
        char_name,
        general_cat,
        combining_class,
        bidirectional_cat,
        char_decomp_map,
        decimal_digit_val,
        digit_val,
        numeric_val,
        is_mirrored,
        unicode1_name,
        iso10646_comment,
        uppercase_mapping,
        lowercase_mapping,
        titlecase_mapping,
        east_asian_prop,
     };

   s.code_point = Int32_Type[num];
   s.code_point[*] = [0:num-1];
   s.char_name = String_Type[num];
   s.char_name[*] = "";
   s.general_cat = String_Type[num];
   s.general_cat[*] = "";
   s.combining_class = String_Type[num];
   s.combining_class[*] = "";
   s.bidirectional_cat = String_Type[num];
   s.bidirectional_cat[*] = "";
   s.char_decomp_map = String_Type[num];
   s.char_decomp_map[*] = "";
   s.decimal_digit_val = String_Type[num];
   s.digit_val = String_Type[num];
   s.numeric_val = String_Type[num];
   s.is_mirrored = Char_Type[num];
   s.unicode1_name = String_Type[num];
   s.iso10646_comment = String_Type[num];
   s.lowercase_mapping = Int32_Type[num];
   s.uppercase_mapping = Int32_Type[num];
   s.titlecase_mapping = Int32_Type[num];
   s.east_asian_prop = String_Type[num];
   s.east_asian_prop[*] = "";

   return s;
}

% For every range i, copy the value found at index starts[i] into the
% elements strictly between starts[i] and stops[i].  This is done for
% each field of s except code_point.  The field arrays are updated in
% place; nothing is assigned back to s.
private define fixup_ranges (starts, stops, s)
{
   variable nranges = length (starts);

   foreach (get_struct_field_names (s))
     {
        variable field_name = ();

        if (field_name == "code_point")
          continue;

        variable field = get_struct_field (s, field_name);
        _for (0, nranges-1, 1)
          {
             variable i = ();
             variable start = starts[i];
             variable stop = stops[i];
             field[[start+1:stop-1]] = field[start];
          }
     }
}

% Convert a string of hex digits (without a 0x prefix) to an integer.
% Surrounding whitespace is trimmed first.  The "0x0" prefix makes an
% empty field convert to 0.
private define hexstr_to_int (s)
{
   return integer (strcat ("0x0", strtrim (s)));
}

% Read the semicolon separated UnicodeData.txt file and return a table
% created by make_char_def_table.  Entries between "First"/"Last" range
% markers are filled in from the "First" entry, and case mappings that
% are 0 are replaced by the code point itself.
private define read_file (file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s", file);

   variable lines = fgetslines (fp);
   () = fclose (fp);

   % Get the code point of the last line since it determines the number of
   % code points
   variable num = 1+hexstr_to_int (strchop (lines[-1], ';', 0)[0]);

   variable s = make_char_def_table (num);
   variable is_range = Char_Type[num];
   variable i, j;

   foreach (lines)
     {
        variable line = ();
        variable fields = strchop (line, ';', 0);

        if (fields[2] == "Cs")
          continue;                    %  surrogate

        i = hexstr_to_int (fields[0]);

        variable field = fields[1];
        s.char_name[i] = field;

        % A range is specified if the field is of the form
        % <..., First> or <..., Last>
        if (field[0] == '<')
          {
             if (string_match (field, ", First>$", 1))
               is_range [i] = 1;
             else if (string_match (field, ", Last>$", 1))
               is_range[i] = -1;
          }

        s.general_cat[i] = fields[2];
        s.combining_class[i] = fields[3];
        s.bidirectional_cat[i] = fields[4];
        s.char_decomp_map[i] = fields[5];
        s.decimal_digit_val[i] = fields[6];
        s.digit_val[i] = fields[7];
        s.numeric_val[i] = fields[8];
        s.is_mirrored[i] = (fields[9] == "Y");
        s.unicode1_name[i] = fields[10];
        s.iso10646_comment[i] = fields[11];
        s.uppercase_mapping[i] = hexstr_to_int (fields[12]);
        s.lowercase_mapping[i] = hexstr_to_int (fields[13]);
        s.titlecase_mapping[i] = hexstr_to_int (fields[14]);
     }

   i = where (is_range == 1);
   if (length (i))
     {
        j = where (is_range == -1);
        if (length (i) != length (j))
          verror ("First and Last ranges do not match");

        fixup_ranges (i, j, s);
     }

   % A mapping of 0 means that no mapping was given: map to itself.
   i = where (s.lowercase_mapping == 0);
   s.lowercase_mapping[i] = s.code_point[i];

   i = where (s.uppercase_mapping == 0);
   s.uppercase_mapping[i] = s.code_point[i];

   i = where (s.titlecase_mapping == 0);
   s.titlecase_mapping[i] = s.code_point[i];

   return s;
}

% Read EastAsianWidth.txt and store the property of each code point in
% s.east_asian_prop.  Lines starting with '#' and blank lines are
% skipped.  The first token of a line is either a single code point or
% a START..STOP range; the second token is the property.
private define read_east_asian_file (s, file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s", file);

   foreach (fp) using ("line")
     {
        variable line = ();

        if (line[0] == '#')
          continue;

        line = strtrim (line);
        !if (strlen (line))
          continue;

        variable code, prop;
        variable fields = strtok (line, "; ");

        code = fields[0];
        if (is_substr (code, ".."))
          {
             code = strtok (code, ".");
             variable code_start = hexstr_to_int (code[0]);
             variable code_stop = hexstr_to_int (code[1]);
             prop = fields[1];
             _for (code_start, code_stop, 1)
               {
                  code = ();
                  s.east_asian_prop[code] = prop;
               }
             continue;
          }

        code = hexstr_to_int (code);
        s.east_asian_prop[code] = fields[1];
     }
}

% Character class bits.  They are also written to slischar.h as the
% SLCHARCLASS_* macros.
private variable LOWER  = 0x0001;
private variable UPPER  = 0x0002;
private variable ALPHA  = 0x0004;
private variable XDIGIT = 0x0008;
private variable SPACE  = 0x0010;
private variable BLANK  = 0x0020;
private variable CNTRL  = 0x0040;
private variable PRINT  = 0x0080;
private variable DIGIT  = 0x0100;
private variable GRAPH  = 0x0200;
private variable ALNUM  = 0x0400;
private variable PUNCT  = 0x0800;
private variable ASCII  = 0x1000;

private variable Classification_C_Table_Type = "_pSLuint16_Type";
private variable Classification_C_Table_Format = "0x%04X";

% Open file for writing, write the "automatically created" banner, and
% return the file pointer.
private define init_file (file)
{
   variable fp = fopen (file, "w");
   if (fp == NULL)
     verror ("Unable to open %s for writing", file);

   () = fprintf (fp, "/* This file was automatically created by %s */\n",
                 __argv[0]);
   return fp;
}

% Print a warning to stderr if some value of the array what does not fit
% in the C type named by datatype.  s.code_point is used to report the
% first offending character.  The "bit" type is not checked, and an
% unknown type produces a "not supported" message.
private define check_data_type (datatype, s, what, table_name)
{
   variable min_val, max_val;

   switch (datatype)
     { case "char": min_val = -128; max_val = 127; }
     { case "unsigned char": min_val = 0; max_val = 255; }
     { case "_pSLint16_Type": min_val = -32768; max_val = 32767; }
     { case "_pSLuint16_Type": min_val = 0; max_val = 0xFFFF; }
     { case "_pSLint32_Type": min_val = -2147483648; max_val = 0x7FFFFFFF; }
     { case "_pSLuint32_Type": min_val = 0; max_val = 0xFFFFFFFFUL; }
     { case "bit": return; }
     {
        () = fprintf (stderr, "check_data_type: %s not supported\n", datatype);
        return;
     }

   variable i = wherenot (min_val <= what <= max_val);
   if (length (i) == 0)
     return;

   () = fprintf (stderr, "***WARNING: table for %s needs a larger type for char 0x%04X\n",
                 table_name, s.code_point[i[0]]);
}

% Write the array what (indexed by code point) to a C header as a
% two-level table: an array of pointers to sub-tables, each sub-table
% having (1 shl shift_bits) entries, with identical sub-tables shared.
%
%   fp         an open file pointer, or a file name, in which case the
%              file is created via init_file
%   s          the character table; only s.code_point is used here
%   what       the values to tabulate
%   datatype   the C type of a table entry, or "bit" to pack the values
%              using 1, 2, 4 or 8 bits per value in an unsigned char
%   table_name used to form the names of the C table and macros
%   format     the printf format of a single table entry
%   greater_than_max_value
%              the value produced by the lookup macros for characters
%              beyond the table; for bit tables a non-zero value is also
%              used to pad what to a whole number of bytes
%
% The file is closed before returning.
private define write_toxxx_table (fp, s, what, datatype, table_name, format,
                                  shift_bits, greater_than_max_value)
{
   variable ch = s.code_point;
   variable use_bitmap = 0;
   variable i, j, k;
   variable bits_per_value;

   check_data_type (datatype, s, what, table_name);

   if (datatype == "bit")
     {
        % Find the smallest number of bits per value that can represent
        % max(what).  shift_bits_offset ends up as 3, 2, 1 or 0 for
        % 1, 2, 4 or 8 bits per value.
        variable max_what = max(what);
        bits_per_value = -1;
        variable shift_bits_offset = 4;
        foreach ([1,2,4,8])            %  7, 3, 1, 0
          {
             i = ();
             shift_bits_offset--;
             if (max_what >= (1 shl i))
               continue;
             bits_per_value = i;
             break;
          }
        if (bits_per_value == -1)
          verror ("bit data type cannot represent this object\n");

        datatype = "unsigned char";
        use_bitmap = 1;
     }

   if (use_bitmap)
     {
        variable num_values_per_8bits = 8/bits_per_value;
        ch = ch/num_values_per_8bits;
     }

   % Take advantage of the sparseness of the table.  To this end, write
   % N tables with nentries per table.
   variable nentries = (1 shl shift_bits);
   variable ntables = max(ch)/nentries + 1;
   variable data = Int_Type[ntables * nentries];

   if (use_bitmap)
     {
        % i is the number of bytes needed to hold the packed values
        i = length(what)/num_values_per_8bits;
        if (i * num_values_per_8bits < length(what))
          i++;

        if (greater_than_max_value)
          {
             % Fill out the values so that the bitmap has the correct
             % values for characters beyond the tabulated range.
             vmessage ("Padding table: num_values_per_8bits = %d",
                       num_values_per_8bits);
             variable new_what = @Array_Type(_typeof(what), [i*num_values_per_8bits]);
             new_what[[0:length(what)-1]] = what;
             new_what[[length(what):]] = greater_than_max_value;
             what = new_what;
          }

        variable bitmap = UChar_Type[i];
        variable bit = 0;
        _for (0, num_values_per_8bits-1, 1)
          {
             variable b = ();
             % values[n] is the b-th value packed into byte n
             variable values = what[[b::num_values_per_8bits]];
             _for (0, bits_per_value-1, 1)
               {
                  k = ();
                  i = where (values & (1 shl k));
                  bitmap[i] |= (1 shl bit);
                  bit++;
               }
          }
        what = bitmap;
     }

   data[[0:max(ch)]] = what;

   % unique_tables[0] is always the all-zero table.  tables[i] is the
   % index into unique_tables of the i-th sub-table.
   variable unique_tables = Array_Type[ntables];
   variable tables = Int_Type[ntables];
   variable num_unique = 0;

   unique_tables[0] = [1:nentries]*0;
   num_unique = 1;

   _for (0, ntables-1, 1)
     {
        i = ();
        variable table = data[nentries*i + [0:nentries-1]];

        j = 0;
        while (j < num_unique)
          {
             if (0 == length (where (unique_tables[j] != table)))
               break;
             j++;
          }
        tables[i] = j;
        if (j == num_unique)
          {
             unique_tables[num_unique] = table;
             num_unique++;
          }
     }

   % How many tables do we really need?
   i = where (tables != 0);
   ntables = 1 + i[-1];

   if (typeof (fp) == String_Type)
     fp = init_file (fp);

   variable bitmap_multiplier = 1;
   if (use_bitmap)
     bitmap_multiplier = num_values_per_8bits;

   variable table_lookup_name = sprintf ("SL_%s_LOOKUP", strup(table_name));
   variable max_char_name = sprintf ("SL_%s_MAX_CHAR", strup(table_name));
   variable assign_lookup_name = sprintf ("SL_%s_ALOOKUP", strup(table_name));
   table_name = sprintf ("_pSLwc_%s_Table", table_name);

   () = fprintf (fp, "#define %s 0x%Xul\n\n", max_char_name,
                 bitmap_multiplier * ntables * nentries);

   if (use_bitmap == 0)
     {
        () = fprintf (fp, "#define %s(x) \\\n", table_lookup_name);
        () = fprintf (fp, " (((unsigned)(x)>=%s)?%d:(%s[(unsigned)(x)>>%d][(unsigned)(x)&0x%X]))\n\n",
                      max_char_name, greater_than_max_value, table_name,
                      shift_bits, nentries-1);
     }
   else if (num_values_per_8bits == 8)   %  boolean (0 or 1)
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " && (_t[(unsigned)((x)>>%d)&0x%X] & (%d << ((x)&%d)))); \\\n",
                   shift_bits_offset, nentries - 1,
                   int(2^bits_per_value-1), num_values_per_8bits-1);
        ()=fprintf(fp, "}\n");
     }
   else                                %  bit mapped with num_values_per_8bits = 1,2, or 4
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " ? ((_t[(unsigned)((x)>>%d)&0x%X]>>(%d*((x)&%d)))&%d) : %d); \\\n",
                   shift_bits_offset, nentries - 1,
                   bits_per_value, num_values_per_8bits-1,
                   int(2^bits_per_value-1), greater_than_max_value);
        ()=fprintf(fp, "}\n");
     }

   () = fprintf (fp, "extern const %s *%s[%d];\n\n", datatype, table_name, ntables);
   () = fprintf (fp, "#ifdef DEFINE%s\n", strup (table_name));

   % Each output row holds 8 entries preceded by an index-range comment
   format = [format, format, format, format, format, format, format, format];
   format = strcat (" /*0x%02X-0x%02X*/ ", strjoin (format, ", "));

   _for (0, num_unique-1, 1)
     {
        i = ();

        % For bitmaps the all-zero table is represented by NULL
        if ((i == 0) and use_bitmap)
          continue;

        () = fprintf (fp, "static const %s Table_%02d[%d] =\n{\n", datatype, i, nentries);
        table = unique_tables[i];
        _for (0, nentries-1, 8)
          {
             j = ();
             if (j)
               () = fputs (",\n", fp);
             () = fprintf (fp, format, j, (j+7),
                           table[j], table[j+1], table[j+2], table[j+3],
                           table[j+4], table[j+5], table[j+6], table[j+7]);
          }
        () = fputs ("\n};\n\n", fp);
     }

   () = fprintf (fp, "const %s *%s[%d] =\n{", datatype, table_name, ntables);
   i = 0;
   while (i < ntables)
     {
        if (i)
          () = fputs (",", fp);
        !if (i mod 6)
          () = fputs ("\n", fp);

        if (use_bitmap and (tables[i] == 0))
          () = fprintf (fp, " NULL");
        else
          () = fprintf (fp, " Table_%02d", tables[i]);
        i++;
     }
   () = fputs ("\n};\n", fp);
   () = fprintf (fp, "#endif /* DEFINE%s */\n", strup(table_name));
   () = fclose (fp);

   % Size in bytes of a table entry.  The 16 bit types used by this
   % script are spelled _pSLint16_Type/_pSLuint16_Type, so "16" must be
   % matched as well as "short".
   variable size;
   if (is_substr (datatype, "char"))
     size = 1;
   else if (is_substr (datatype, "short") or is_substr (datatype, "16"))
     size = 2;
   else
     size = 4;

   if (use_bitmap == 0)
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*num_unique);
     }
   else
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*(num_unique-1));
     }
}

% Compute the character class bits (LOWER, UPPER, ..., ASCII) for each
% code point of the table s.  Returns a UShort_Type array parallel to
% s.code_point.
private define make_char_classes (s)
{
   variable i;
   variable code_point = s.code_point;
   variable gcat0 = int (s.general_cat);   %  first character of the category
   variable char_classes = UShort_Type[length(code_point)];

#iftrue
   % LOWER
   i = where (((code_point == s.lowercase_mapping)
               and (code_point != s.uppercase_mapping)));
   char_classes[i] |= LOWER;

   % UPPER
   i = where (((code_point == s.uppercase_mapping)
               or (code_point == s.titlecase_mapping))
              and (code_point != s.lowercase_mapping));
   char_classes[i] |= UPPER;
#endif

   % LOWER
   i = where ((s.general_cat == "Ll") and (0 == (char_classes & UPPER)));
   char_classes[i] |= LOWER;

   % UPPER
   i = where ((s.general_cat == "Lu") and (0 == (char_classes & LOWER)));
   char_classes[i] |= UPPER;

   % ALPHA
   i = where ((char_classes & (UPPER|LOWER)) or (gcat0 == 'L'));
   char_classes[i] |= ALPHA;

   % XDIGIT
   i = where (((code_point >= '0') and (code_point <= '9'))
              or ((code_point >= 'A') and (code_point <= 'F'))
              or ((code_point >= 'a') and (code_point <= 'f')));
   char_classes[i] |= XDIGIT;

   % SPACE, BLANK
   char_classes[' '] |= SPACE|BLANK;
   char_classes['\t'] |= SPACE|BLANK;
   char_classes['\n'] |= SPACE;
   char_classes['\r'] |= SPACE;
   char_classes['\f'] |= SPACE;
   char_classes['\v'] |= SPACE;

   % char_classes [where (s.bidirectional_cat == "WS")] |= SPACE;

   % Separators (general category Z*) are spaces unless their
   % decomposition mapping carries the <noBreak> tag.
   i = where ((gcat0 == 'Z')
              and not array_map (Int_Type, &is_substr, s.char_decomp_map, "<noBreak>"));
   char_classes [i] |= SPACE;

   % CNTRL
   char_classes[where(s.char_name == "<control>")] |= CNTRL;

   % PRINT
   char_classes[where((s.char_name != "") and not (char_classes & CNTRL))] |= PRINT;

   % DIGIT
   i = where ((char_classes & XDIGIT) and not (char_classes & ALPHA));
   char_classes[i] |= DIGIT;

   % GRAPH
   char_classes[where ((char_classes & PRINT) and not (char_classes & SPACE))] |= GRAPH;

   % ALNUM
   char_classes[where (char_classes & (ALPHA|DIGIT))] |= ALNUM;

   % PUNCT
   char_classes[where ((char_classes & GRAPH) and not (char_classes & ALNUM))] |= PUNCT;

   % ASCII
   char_classes[where (code_point < 0x80)] |= ASCII;

   return char_classes;
}

% Create file, write the SLCHARCLASS_* bit definitions to it, and then
% append the classification table for char_classes.
private define write_char_classes (file, s, char_classes)
{
   variable fp = init_file (file);

   () = fprintf (fp, "#define SLCHARCLASS_LOWER\t0x%04X\n", LOWER);
   () = fprintf (fp, "#define SLCHARCLASS_UPPER\t0x%04X\n", UPPER);
   () = fprintf (fp, "#define SLCHARCLASS_ALPHA\t0x%04X\n", ALPHA);
   () = fprintf (fp, "#define SLCHARCLASS_XDIGIT\t0x%04X\n", XDIGIT);
   () = fprintf (fp, "#define SLCHARCLASS_SPACE\t0x%04X\n", SPACE);
   () = fprintf (fp, "#define SLCHARCLASS_BLANK\t0x%04X\n", BLANK);
   () = fprintf (fp, "#define SLCHARCLASS_CNTRL\t0x%04X\n", CNTRL);
   () = fprintf (fp, "#define SLCHARCLASS_PRINT\t0x%04X\n", PRINT);
   () = fprintf (fp, "#define SLCHARCLASS_DIGIT\t0x%04X\n", DIGIT);
   () = fprintf (fp, "#define SLCHARCLASS_GRAPH\t0x%04X\n", GRAPH);
   () = fprintf (fp, "#define SLCHARCLASS_ALNUM\t0x%04X\n", ALNUM);
   () = fprintf (fp, "#define SLCHARCLASS_PUNCT\t0x%04X\n", PUNCT);
   () = fprintf (fp, "#define SLCHARCLASS_ASCII\t0x%04X\n", ASCII);
   () = fprintf (fp, "\n\n");

   write_toxxx_table (fp, s, char_classes,
                      Classification_C_Table_Type, "Classification",
                      Classification_C_Table_Format, 8, 0);
}

% Read both database files and write all of the table headers.
private define main ()
{
   variable s = read_file (Unicode_Data_File);
   read_east_asian_file (s, East_Asian_File);

   variable char_classes = make_char_classes (s);
   variable ch = s.code_point;

   variable is_combining = ((s.general_cat == "Mn") or (s.general_cat == "Me"));
   % Note: "Mc" (combining, yet spacing) is omitted here since I do
   % not know what that means.

   % Apparently Hangul (Conjoining Jamo) characters 0x1160 - 0x11FF
   % _behave_ like combining characters, but are not flagged as such in
   % the database.
   is_combining[[0x1160:0x11FF]] = 1;

#ifnfalse
   % Later assignments override earlier ones, so the order matters here.
   variable width = UChar_Type[length(ch)];
   width[*] = 1;
   width[where (s.east_asian_prop == "W")] = 2;
   width[where (s.east_asian_prop == "F")] = 2;
   width[where (s.east_asian_prop == "A")] = 3;   %  ambiguous
   width[where (s.general_cat == "Cf")] = 0;
   width[0xAD] = 3;                    %  SOFT-HYPHEN -- mark it as ambiguous
   width[where(is_combining)] = 0;
   %width[where (s.bidirectional_cat == "NSM")] = 0;
   width[where (0 == array_map (Int_Type, &strncmp, s.char_name, "ZERO WIDTH", 10))] = 0;
   width[[0x80:0x9F]] = 4;             %  displayed as <XX> by SLsmg
   write_toxxx_table ("slwcwidth.h", s, width, "bit", "Width", "0x%02X", 8, 1);
#endif

   write_toxxx_table ("slcombin.h", s, is_combining, "bit", "Combining",
                      "0x%02X", 6, 0);
   write_toxxx_table ("sllower.h", s, s.lowercase_mapping-ch, "_pSLint32_Type",
                      "Tolower", "% 5d", 7, 0);
   write_toxxx_table ("slupper.h", s, s.uppercase_mapping-ch, "_pSLint32_Type",
                      "Toupper", "% 5d", 7, 0);
   write_char_classes ("slischar.h", s, char_classes);
}

main();