1
1
s-lang/utf8/tools/mktables

639 строки
17 KiB
Plaintext
Исполняемый файл

#!/usr/bin/env slsh
% The unicode database contains 15 fields
% Print the command-line synopsis on stderr and terminate the script
% with exit status 1.
private define usage ()
{
   variable program = __argv[0];
   () = fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt\n", program);
   exit (1);
}
% Exactly two file names must follow the script name on the command line.
!if (__argc == 3)
  usage ();

% Input files, in the order shown by the usage message.
private variable Unicode_Data_File = __argv[1];
private variable East_Asian_File = __argv[2];
% Create the table that holds the per-code-point properties.
%
%   num : number of code points; every field is an array of this length.
%
% Returns a struct whose fields are parallel arrays indexed by code
% point.  code_point is filled with 0..num-1.  The string fields listed
% in the loop below start out as "", the remaining string fields are
% left with the default contents of a freshly created String_Type
% array, and the numeric fields are left zero-filled.
private define make_char_def_table (num)
{
   variable tbl = struct
     {
        code_point,
        char_name,
        general_cat,
        combining_class,
        bidirectional_cat,
        char_decomp_map,
        decimal_digit_val,
        digit_val,
        numeric_val,
        is_mirrored,
        unicode1_name,
        iso10646_comment,
        uppercase_mapping,
        lowercase_mapping,
        titlecase_mapping,
        east_asian_prop,
     };

   % Identity mapping index -> code point.
   tbl.code_point = Int32_Type[num];
   tbl.code_point[*] = [0:num-1];

   % String properties that are explicitly initialized to "".
   foreach (["char_name", "general_cat", "combining_class",
             "bidirectional_cat", "char_decomp_map", "east_asian_prop"])
     {
        variable blank_name = ();
        variable blanks = String_Type[num];
        blanks[*] = "";
        set_struct_field (tbl, blank_name, blanks);
     }

   % String properties that are only allocated, not filled.
   foreach (["decimal_digit_val", "digit_val", "numeric_val",
             "unicode1_name", "iso10646_comment"])
     {
        variable raw_name = ();
        set_struct_field (tbl, raw_name, String_Type[num]);
     }

   tbl.is_mirrored = Char_Type[num];

   % Case mappings; 0 means "no mapping recorded".
   tbl.lowercase_mapping = Int32_Type[num];
   tbl.uppercase_mapping = Int32_Type[num];
   tbl.titlecase_mapping = Int32_Type[num];

   return tbl;
}
% Fill in the interior of each First/Last range.
%
%   starts, stops : parallel arrays with the index of the first and of
%                   the last code point of every range
%   s             : the property table; its arrays are modified in place
%
% For every field except code_point, the entries strictly between
% starts[k] and stops[k] receive a copy of the value stored at starts[k].
private define fixup_ranges (starts, stops, s)
{
   variable names = get_struct_field_names (s);
   variable range_count = length (starts);
   variable r = 0;

   while (r < range_count)
     {
        variable first = starts[r];
        variable last = stops[r];
        variable interior = [first+1:last-1];

        foreach (names)
          {
             variable name = ();
             if (name != "code_point")
               {
                  variable values = get_struct_field (s, name);
                  values[interior] = values[first];
               }
          }
        r++;
     }
}
% Convert a string of hexadecimal digits (without a 0x prefix) to an
% integer.  Surrounding whitespace is ignored, and because the digits
% are appended to "0x0" an empty or blank string yields 0.
private define hexstr_to_int (s)
{
   variable digits = strtrim (s);
   return integer ("0x0" + digits);
}
% Read the semicolon-separated Unicode character database.
%
%   file : name of the UnicodeData.txt file
%
% Returns the property table created by make_char_def_table, sized from
% the code point on the last non-blank line of the file.  Lines whose
% general category is "Cs" (surrogates) are skipped.  Names of the form
% <xxx, First> / <xxx, Last> mark ranges whose interior is filled in by
% fixup_ranges.  Case mappings that are absent from the file are
% replaced by the code point itself.
%
% An error is raised if the file cannot be opened or read, contains no
% data, or contains a non-blank line with fewer than 15 fields.
private define read_file (file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s: %s", file, errno_string (errno));

   variable lines = fgetslines (fp);
   () = fclose (fp);
   if (typeof (lines) != Array_Type)
     verror ("Unable to read %s", file);

   % Ignore blank lines at the end of the file so that the table size
   % is taken from the last line that actually carries a code point.
   variable nlines = length (lines);
   while (nlines > 0)
     {
        if (strlen (strtrim (lines[nlines-1])))
          break;
        nlines--;
     }
   if (nlines == 0)
     verror ("%s contains no data", file);

   % Get the code point of the last line since it determines the number of
   % code points
   variable num = 1+hexstr_to_int (strchop (lines[nlines-1], ';', 0)[0]);
   variable s = make_char_def_table (num);
   variable is_range = Char_Type[num];
   variable i, j;

   foreach (lines[[0:nlines-1]])
     {
        variable line = ();
        !if (strlen (strtrim (line)))
          continue;                    % blank line

        variable fields = strchop (line, ';', 0);
        if (length (fields) < 15)
          verror ("%s: expected 15 fields in line: %s", file, strtrim (line));

        if (fields[2] == "Cs")
          continue;                    % surrogate

        i = hexstr_to_int (fields[0]);
        variable field = fields[1];
        s.char_name[i] = field;

        % A range is specified if the field is of the form <xxx, First>
        % or <xxx, Last>
        if (field[0] == '<')
          {
             if (string_match (field, ", First>$", 1))
               is_range [i] = 1;
             else if (string_match (field, ", Last>$", 1))
               is_range[i] = -1;
          }

        s.general_cat[i] = fields[2];
        s.combining_class[i] = fields[3];
        s.bidirectional_cat[i] = fields[4];
        s.char_decomp_map[i] = fields[5];
        s.decimal_digit_val[i] = fields[6];
        s.digit_val[i] = fields[7];
        s.numeric_val[i] = fields[8];
        s.is_mirrored[i] = (fields[9] == "Y");
        s.unicode1_name[i] = fields[10];
        s.iso10646_comment[i] = fields[11];
        s.uppercase_mapping[i] = hexstr_to_int (fields[12]);
        s.lowercase_mapping[i] = hexstr_to_int (fields[13]);
        s.titlecase_mapping[i] = hexstr_to_int (fields[14]);
     }

   % Propagate the properties of each range start to the range interior.
   i = where (is_range == 1);
   if (length (i))
     {
        j = where (is_range == -1);
        if (length (i) != length (j))
          verror ("First and Last ranges do not match");
        fixup_ranges (i, j, s);
     }

   % A mapping of 0 means that none was given: map the character to itself.
   i = where (s.lowercase_mapping == 0);
   s.lowercase_mapping[i] = s.code_point[i];
   i = where (s.uppercase_mapping == 0);
   s.uppercase_mapping[i] = s.code_point[i];
   i = where (s.titlecase_mapping == 0);
   s.titlecase_mapping[i] = s.code_point[i];

   return s;
}
% Read the East Asian width properties and store them in the table.
%
%   s    : property table; s.east_asian_prop is updated in place
%   file : name of the EastAsianWidth.txt file
%
% Lines that start with '#' and blank lines are ignored.  On every other
% line the first token (tokens are separated by ';' or ' ') is either a
% single hexadecimal code point or a range written as START..STOP, and
% the second token is the property value assigned to those code points.
private define read_east_asian_file (s, file)
{
   foreach (fopen (file, "r")) using ("line")
     {
        variable text = ();
        if (text[0] == '#')
          continue;

        text = strtrim (text);
        if (strlen (text) == 0)
          continue;

        variable tokens = strtok (text, "; ");
        variable spec = tokens[0];
        variable first, last;

        if (is_substr (spec, ".."))
          {
             variable ends = strtok (spec, ".");
             first = hexstr_to_int (ends[0]);
             last = hexstr_to_int (ends[1]);
          }
        else
          {
             % A single code point is a range of length one.
             first = hexstr_to_int (spec);
             last = first;
          }

        variable prop = tokens[1];
        variable cp = first;
        while (cp <= last)
          {
             s.east_asian_prop[cp] = prop;
             cp++;
          }
     }
}
% Character-class bit flags.  make_char_classes ORs these together into
% one value per code point, and write_char_classes emits each of them
% as a SLCHARCLASS_* #define in the generated header.
private variable LOWER = 0x0001;
private variable UPPER = 0x0002;
private variable ALPHA = 0x0004;
private variable XDIGIT = 0x0008;
private variable SPACE = 0x0010;
private variable BLANK = 0x0020;
private variable CNTRL = 0x0040;
private variable PRINT = 0x0080;
private variable DIGIT = 0x0100;
private variable GRAPH = 0x0200;
private variable ALNUM = 0x0400;
private variable PUNCT = 0x0800;
private variable ASCII = 0x1000;
% C element type of the generated classification table, and the printf
% format used to print each of its entries.
private variable Classification_C_Table_Type = "_pSLuint16_Type";
private variable Classification_C_Table_Format = "0x%04X";
% Create (or truncate) an output file and write the "automatically
% created" banner comment to it.
%
%   file : name of the file to create
%
% Returns the open file pointer; the caller is responsible for closing
% it.  An error is raised if the file cannot be opened for writing.
private define init_file (file)
{
   variable fp = fopen (file, "w");
   if (fp == NULL)
     verror ("Unable to open %s for writing: %s", file, errno_string (errno));

   () = fprintf (fp, "/* This file was automatically created by %s */\n", __argv[0]);
   return fp;
}
% Warn when the values of a table do not fit into its C element type.
%
%   datatype   : name of the C element type, or "bit"
%   s          : property table (used to report the offending code point)
%   what       : array of values that will be written to the table
%   table_name : name used in the warning message
%
% Nothing is checked for "bit".  An unrecognized type name is reported
% on stderr.  Otherwise a warning naming the first out-of-range
% character is written to stderr.  The function never raises an error
% itself and returns nothing.
private define check_data_type (datatype, s, what, table_name)
{
   variable lo, hi;

   if (datatype == "bit")
     return;

   if (datatype == "char")
     {
        lo = -128; hi = 127;
     }
   else if (datatype == "unsigned char")
     {
        lo = 0; hi = 255;
     }
   else if (datatype == "_pSLint16_Type")
     {
        lo = -32768; hi = 32767;
     }
   else if (datatype == "_pSLuint16_Type")
     {
        lo = 0; hi = 0xFFFF;
     }
   else if (datatype == "_pSLint32_Type")
     {
        lo = -2147483648;
        hi = 0x7FFFFFFF;
     }
   else if (datatype == "_pSLuint32_Type")
     {
        lo = 0; hi = 0xFFFFFFFFUL;
     }
   else
     {
        () = fprintf (stderr, "check_data_type: %s not supported\n", datatype);
        return;
     }

   % Indices of the values that fall outside of [lo, hi].
   variable bad = wherenot (lo <= what <= hi);
   if (length (bad))
     () = fprintf (stderr, "***WARNING: table for %s needs a larger type for char 0x%04X\n", table_name, s.code_point[bad[0]]);
}
% Write a two-level C lookup table, plus the macro used to access it.
%
%   fp          : an open file pointer, or a file name (String_Type), in
%                 which case the file is created via init_file.  The
%                 file is closed before returning.
%   s           : property table; only s.code_point is used here
%   what        : array with one value per code point
%   datatype    : C element type of the table, or "bit" to pack the
%                 values into 1, 2, 4 or 8 bits each inside an
%                 unsigned char
%   table_name  : base name used to build the C identifiers
%                 (SL_<NAME>_LOOKUP, SL_<NAME>_ALOOKUP, SL_<NAME>_MAX_CHAR,
%                 _pSLwc_<name>_Table)
%   format      : printf format for a single table entry
%   shift_bits  : each sub-table holds (1 shl shift_bits) entries
%   greater_than_max_value :
%                 value the lookup macro yields for characters beyond
%                 the table (not used by the 1-bit macro, which yields 0
%                 there).  If non-zero it is also used to pad a bitmap
%                 up to a whole number of bytes.
%
% The data are split into sub-tables of equal size.  Identical
% sub-tables are written only once, and trailing all-zero sub-tables
% are dropped.  For bitmaps an all-zero sub-table is represented by a
% NULL pointer instead of by a table.  An estimate of the table size is
% printed via vmessage.
private define write_toxxx_table (fp, s, what, datatype,
                                  table_name, format, shift_bits,
                                  greater_than_max_value)
{
   variable ch = s.code_point;
   variable use_bitmap = 0;
   variable i, j, k;
   variable bits_per_value;

   check_data_type (datatype, s, what, table_name);

   if (datatype == "bit")
     {
        % Pick the smallest number of bits per value (1, 2, 4 or 8) that
        % can hold the largest value.  shift_bits_offset ends up as
        % log2 of the number of values per byte (3, 2, 1, 0); the
        % corresponding masks num_values_per_8bits-1 are 7, 3, 1, 0.
        variable max_what = max(what);
        bits_per_value = -1;
        variable shift_bits_offset = 4;
        foreach ([1,2,4,8])                % 7, 3, 1, 0
          {
             i = ();
             shift_bits_offset--;
             if (max_what >= (1 shl i))
               continue;
             bits_per_value = i;
             break;
          }
        if (bits_per_value == -1)
          verror ("bit data type cannot represent this object\n");
        datatype = "unsigned char";
        use_bitmap = 1;
     }

   if (use_bitmap)
     {
        % Several characters share one byte, so index by byte.
        variable num_values_per_8bits = 8/bits_per_value;
        ch = ch/num_values_per_8bits;
     }

   % Take advantage of the sparseness of the table.  To this end, write
   % N tables with nentries per table.
   variable nentries = (1 shl shift_bits);
   variable ntables = max(ch)/nentries + 1;
   variable data = Int_Type[ntables * nentries];

   if (use_bitmap)
     {
        % Number of bytes needed, rounded up.
        i = length(what)/num_values_per_8bits;
        if (i * num_values_per_8bits < length(what))
          i++;

        if (greater_than_max_value)
          {
             % Fill out the last byte with the value used for characters
             % beyond the tabulated range.
             vmessage ("Padding table: num_values_per_8bits = %d", num_values_per_8bits);
             variable new_what = @Array_Type(_typeof(what), [i*num_values_per_8bits]);
             new_what[[0:length(what)-1]] = what;
             new_what[[length(what):]] = greater_than_max_value;
             what = new_what;
          }

        variable bitmap = UChar_Type[i];

        % Pack the values: the b-th value of each byte occupies bits
        % b*bits_per_value .. (b+1)*bits_per_value-1 of that byte.
        variable bit = 0;
        _for (0, num_values_per_8bits-1, 1)
          {
             variable b = ();
             variable values = what[[b::num_values_per_8bits]];
             _for (0, bits_per_value-1, 1)
               {
                  k = ();
                  i = where (values & (1 shl k));
                  bitmap[i] |= (1 shl bit);
                  bit++;
               }
          }
        what = bitmap;
     }

   data[[0:max(ch)]] = what;

   % unique_tables[0] is always the all-zero table; tables[i] is the
   % index of the unique table used for the i-th block of entries.
   variable unique_tables = Array_Type[ntables];
   variable tables = Int_Type[ntables];
   variable num_unique = 0;

   unique_tables[0] = [1:nentries]*0;
   num_unique = 1;

   _for (0, ntables-1, 1)
     {
        i = ();
        variable table = data[nentries*i + [0:nentries-1]];
        j = 0;
        while (j < num_unique)
          {
             if (0 == length (where (unique_tables[j] != table)))
               break;
             j++;
          }
        tables[i] = j;
        if (j == num_unique)
          {
             unique_tables[num_unique] = table;
             num_unique++;
          }
     }

   % How many tables do we really need?
   % NOTE(review): this fails with an index error if every sub-table is
   % all zero, i.e. if `what' contains only zeros.
   i = where (tables != 0);
   ntables = 1 + i[-1];

   if (typeof (fp) == String_Type)
     fp = init_file (fp);

   variable bitmap_multiplier = 1;
   if (use_bitmap)
     bitmap_multiplier = num_values_per_8bits;

   variable table_lookup_name = sprintf ("SL_%s_LOOKUP", strup(table_name));
   variable max_char_name = sprintf ("SL_%s_MAX_CHAR", strup(table_name));
   variable assign_lookup_name = sprintf ("SL_%s_ALOOKUP", strup(table_name));
   table_name = sprintf ("_pSLwc_%s_Table", table_name);

   () = fprintf (fp, "#define %s 0x%Xul\n\n", max_char_name,
                 bitmap_multiplier * ntables * nentries);

   if (use_bitmap == 0)
     {
        () = fprintf (fp, "#define %s(x) \\\n", table_lookup_name);
        () = fprintf (fp, " (((unsigned)(x)>=%s)?%d:(%s[(unsigned)(x)>>%d][(unsigned)(x)&0x%X]))\n\n",
                      max_char_name, greater_than_max_value, table_name, shift_bits, nentries-1);
     }
   else if (num_values_per_8bits == 8)    % boolean (0 or 1)
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " && (_t[(unsigned)((x)>>%d)&0x%X] & (%d << ((x)&%d)))); \\\n",
                   shift_bits_offset, nentries - 1, int(2^bits_per_value-1), num_values_per_8bits-1);
        ()=fprintf(fp, "}\n");
     }
   else                                % bit mapped with num_values_per_8bits = 1,2, or 4
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, " const %s *_t; \\\n", datatype);
        ()=fprintf(fp, " (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, " && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, " ? ((_t[(unsigned)((x)>>%d)&0x%X]>>(%d*((x)&%d)))&%d) : %d); \\\n",
                   shift_bits_offset, nentries - 1, bits_per_value,
                   num_values_per_8bits-1, int(2^bits_per_value-1),
                   greater_than_max_value);
        ()=fprintf(fp, "}\n");
     }

   () = fprintf (fp, "extern const %s *%s[%d];\n\n", datatype, table_name, ntables);
   () = fprintf (fp, "#ifdef DEFINE%s\n", strup (table_name));

   % Eight entries are written per line, preceded by a comment that
   % gives the range of indices on that line.
   format = [format, format, format, format, format, format, format, format];
   format = strcat (" /*0x%02X-0x%02X*/ ", strjoin (format, ", "));

   _for (0, num_unique-1, 1)
     {
        i = ();
        % The all-zero table of a bitmap is represented by NULL.
        if ((i == 0) and use_bitmap)
          continue;

        () = fprintf (fp, "static const %s Table_%02d[%d] =\n{\n",
                      datatype, i, nentries);
        table = unique_tables[i];
        _for (0, nentries-1, 8)
          {
             j = ();
             if (j)
               () = fputs (",\n", fp);
             () = fprintf (fp, format,
                           j, (j+7),
                           table[j], table[j+1], table[j+2], table[j+3],
                           table[j+4], table[j+5], table[j+6], table[j+7]);
          }
        () = fputs ("\n};\n\n", fp);
     }

   () = fprintf (fp, "const %s *%s[%d] =\n{", datatype, table_name, ntables);
   i = 0;
   while (i < ntables)
     {
        if (i) () = fputs (",", fp);
        !if (i mod 6)
          () = fputs ("\n", fp);
        if (use_bitmap and (tables[i] == 0))
          () = fprintf (fp, " NULL");
        else
          () = fprintf (fp, " Table_%02d", tables[i]);
        i++;
     }
   () = fputs ("\n};\n", fp);
   () = fprintf (fp, "#endif /* DEFINE%s */\n", strup(table_name));
   () = fclose (fp);

   % Size in bytes of one table element.  The 16-bit types used by this
   % script are spelled _pSLint16_Type/_pSLuint16_Type, so look for "16"
   % as well as for "short".
   variable size;
   if (is_substr (datatype, "char"))
     size = 1;
   else if (is_substr (datatype, "short") or is_substr (datatype, "16"))
     size = 2;
   else size = 4;

   % The estimate counts 4 bytes per pointer in the top-level table.
   if (use_bitmap == 0)
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*num_unique);
     }
   else
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*(num_unique-1));
     }
}
% Compute the character-class flags of every code point.
%
%   s : property table as returned by read_file
%
% Returns a UShort_Type array, indexed like s.code_point, in which the
% class bits (LOWER, UPPER, ALPHA, ...) of each character are set.  The
% classes are computed in a fixed order because later classes are
% defined in terms of earlier ones.
private define make_char_classes (s)
{
   variable cp = s.code_point;
   variable to_lower = s.lowercase_mapping;
   variable to_upper = s.uppercase_mapping;
   variable to_title = s.titlecase_mapping;
   variable gcat = s.general_cat;
   variable gcat_major = int (gcat);    % first character of the category
   variable classes = UShort_Type[length(cp)];
   variable idx;

#iftrue
   % LOWER: maps to itself under lowercasing but not under uppercasing
   idx = where (((cp == to_lower)
                 and (cp != to_upper)));
   classes[idx] |= LOWER;

   % UPPER: maps to itself under upper- or titlecasing but not under
   % lowercasing
   idx = where (((cp == to_upper)
                 or (cp == to_title))
                and (cp != to_lower));
   classes[idx] |= UPPER;
#endif

   % LOWER: category Ll, unless already classified as UPPER
   idx = where ((gcat == "Ll") and (0 == (classes & UPPER)));
   classes[idx] |= LOWER;

   % UPPER: category Lu, unless already classified as LOWER
   idx = where ((gcat == "Lu") and (0 == (classes & LOWER)));
   classes[idx] |= UPPER;

   % ALPHA: cased characters and all letter categories
   idx = where ((classes & (UPPER|LOWER)) or (gcat_major == 'L'));
   classes[idx] |= ALPHA;

   % XDIGIT: 0-9, A-F, a-f
   idx = where (((cp >= '0') and (cp <= '9'))
                or ((cp >= 'A') and (cp <= 'F'))
                or ((cp >= 'a') and (cp <= 'f')));
   classes[idx] |= XDIGIT;

   % SPACE, BLANK: space and tab are both, the other C whitespace
   % characters are SPACE only
   foreach ([' ', '\t'])
     {
        idx = ();
        classes[idx] |= SPACE|BLANK;
     }
   foreach (['\n', '\r', '\f', '\v'])
     {
        idx = ();
        classes[idx] |= SPACE;
     }

   % classes [where (s.bidirectional_cat == "WS")] |= SPACE;
   % Separator categories count as SPACE, except for the no-break ones.
   variable is_nobreak = array_map (Int_Type, &is_substr, s.char_decomp_map, "<noBreak>");
   idx = where ((gcat_major == 'Z')
                and not is_nobreak);
   classes [idx] |= SPACE;

   % CNTRL
   idx = where(s.char_name == "<control>");
   classes[idx] |= CNTRL;

   % PRINT: every named character that is not a control character
   idx = where((s.char_name != "") and not (classes & CNTRL));
   classes[idx] |= PRINT;

   % DIGIT: the hexadecimal digits that are not letters
   idx = where ((classes & XDIGIT) and not (classes & ALPHA));
   classes[idx] |= DIGIT;

   % GRAPH: printable but not whitespace
   idx = where ((classes & PRINT) and not (classes & SPACE));
   classes[idx] |= GRAPH;

   % ALNUM
   idx = where (classes & (ALPHA|DIGIT));
   classes[idx] |= ALNUM;

   % PUNCT: graphical but not alphanumeric
   idx = where ((classes & GRAPH) and not (classes & ALNUM));
   classes[idx] |= PUNCT;

   % ASCII
   idx = where (cp < 0x80);
   classes[idx] |= ASCII;

   return classes;
}
% Write the character classification header.
%
%   file         : name of the header file to create
%   s            : property table
%   char_classes : per-code-point class flags from make_char_classes
%
% The file starts with one SLCHARCLASS_* #define per class bit, followed
% by the classification lookup table written by write_toxxx_table (which
% also closes the file).
private define write_char_classes (file, s, char_classes)
{
   variable out = init_file (file);

   % One format per class bit, in the same order as the masks below.
   variable define_formats =
     ["#define SLCHARCLASS_LOWER\t0x%04X\n",
      "#define SLCHARCLASS_UPPER\t0x%04X\n",
      "#define SLCHARCLASS_ALPHA\t0x%04X\n",
      "#define SLCHARCLASS_XDIGIT\t0x%04X\n",
      "#define SLCHARCLASS_SPACE\t0x%04X\n",
      "#define SLCHARCLASS_BLANK\t0x%04X\n",
      "#define SLCHARCLASS_CNTRL\t0x%04X\n",
      "#define SLCHARCLASS_PRINT\t0x%04X\n",
      "#define SLCHARCLASS_DIGIT\t0x%04X\n",
      "#define SLCHARCLASS_GRAPH\t0x%04X\n",
      "#define SLCHARCLASS_ALNUM\t0x%04X\n",
      "#define SLCHARCLASS_PUNCT\t0x%04X\n",
      "#define SLCHARCLASS_ASCII\t0x%04X\n"];
   variable masks = [LOWER, UPPER, ALPHA, XDIGIT, SPACE, BLANK, CNTRL,
                     PRINT, DIGIT, GRAPH, ALNUM, PUNCT, ASCII];

   variable k = 0;
   while (k < length (masks))
     {
        () = fprintf (out, define_formats[k], masks[k]);
        k++;
     }
   () = fprintf (out, "\n\n");

   write_toxxx_table (out, s, char_classes, Classification_C_Table_Type,
                      "Classification", Classification_C_Table_Format, 8, 0);
}
% Entry point: read both Unicode data files and write the generated
% headers slwcwidth.h, slcombin.h, sllower.h, slupper.h and slischar.h
% to the current directory.
private define main ()
{
   variable s = read_file (Unicode_Data_File);
   read_east_asian_file (s, East_Asian_File);

   variable char_classes = make_char_classes (s);
   variable ch = s.code_point;

   variable is_combining = ((s.general_cat == "Mn") or (s.general_cat == "Me"));
   % Note: "Mc" (combining, yet spacing) is omitted here since I do
   % not know what that means.

   % Apparently Hangul (Conjoining Jamo) characters 0x1160 - 0x11FF
   % _behave_ like combining characters, but are not flagged as such in
   % the database.
   is_combining[[0x1160:0x11FF]] = 1;

#ifnfalse
   % Width codes: 0 = zero width, 1 = single, 2 = double,
   % 3 = ambiguous, 4 = displayed as <xx>.  Later assignments override
   % earlier ones.
   variable width = UChar_Type[length(ch)];
   width[*] = 1;
   width[where (s.east_asian_prop == "W")] = 2;
   width[where (s.east_asian_prop == "F")] = 2;
   width[where (s.east_asian_prop == "A")] = 3;   % ambiguous
   width[where (s.general_cat == "Cf")] = 0;
   width[0xAD] = 3;                   % SOFT-HYPHEN -- mark it as ambiguous
   width[where(is_combining)] = 0;
   %width[where (s.bidirectional_cat == "NSM")] = 0;
   width[where (0 == array_map (Int_Type, &strncmp, s.char_name,
                                "ZERO WIDTH", 10))]
     = 0;
   width[[0x80:0x9F]] = 4;            % displayed as <xx> by SLsmg

   write_toxxx_table ("slwcwidth.h", s, width, "bit",
                      "Width", "0x%02X", 8, 1);
#endif

   write_toxxx_table ("slcombin.h", s,
                      is_combining,
                      "bit", "Combining", "0x%02X", 6, 0);

   % The case tables store the offset from each character to its
   % mapping rather than the mapping itself.
   write_toxxx_table ("sllower.h", s, s.lowercase_mapping-ch, "_pSLint32_Type",
                      "Tolower", "% 5d", 7, 0);
   write_toxxx_table ("slupper.h", s, s.uppercase_mapping-ch, "_pSLint32_Type",
                      "Toupper", "% 5d", 7, 0);

   write_char_classes ("slischar.h", s, char_classes);
}

main();