842 строки
18 KiB
C
842 строки
18 KiB
C
|
#include "slinclud.h"
|
|||
|
#include <string.h>
|
|||
|
|
|||
|
#include "slang.h"
|
|||
|
#include "_slang.h"
|
|||
|
|
|||
|
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed by
 * that byte's value.  An entry of 0 marks a byte that cannot start a
 * sequence (0x00 and the continuation bytes 0x80-0xBF); 0xFE and 0xFF map
 * to 1.  The table is only ever read, so it is declared const.
 */
static const unsigned char Len_Map[256] =
{
   0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x00 - 0x0F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x10 - 0x1F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x20 - 0x2F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x30 - 0x3F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x40 - 0x4F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x50 - 0x5F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x60 - 0x6F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 0x70 - 0x7F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 0x80 - 0x8F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 0x90 - 0x9F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 0xA0 - 0xAF */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 0xB0 - 0xBF */
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /* 0xC0 - 0xCF */
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /* 0xD0 - 0xDF */
   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,   /* 0xE0 - 0xEF */
   4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1    /* 0xF0 - 0xFF */
};
|
|||
|
|
|||
|
/*
 * The code positions U+D800 to U+DFFF (UTF-16 surrogates) as well as
 * U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4 data.  UTF-8
 * decoders should treat them like malformed or overlong sequences for
 * safety reasons.
 *
 * IS_ILLEGAL_UNICODE(w) is non-zero when w is one of those code positions.
 * The argument is parenthesized so that an arbitrary expression may be
 * passed, but it is evaluated more than once: do not pass an expression
 * with side effects.
 */
#define IS_ILLEGAL_UNICODE(w) \
   ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
|
|||
|
|
|||
|
_INLINE_
|
|||
|
static int is_invalid_or_overlong_utf8 (SLuchar_Type *u, unsigned int len)
|
|||
|
{
|
|||
|
unsigned int i;
|
|||
|
unsigned char ch, ch1;
|
|||
|
|
|||
|
/* Check for invalid sequences */
|
|||
|
for (i = 1; i < len; i++)
|
|||
|
{
|
|||
|
if ((u[i] & 0xC0) != 0x80)
|
|||
|
return 1;
|
|||
|
}
|
|||
|
|
|||
|
/* Illegal (overlong) sequences */
|
|||
|
/* 1100000x (10xxxxxx) */
|
|||
|
/* 11100000 100xxxxx (10xxxxxx) */
|
|||
|
/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
|
|||
|
/* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
|
|||
|
/* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
|
|||
|
ch = *u;
|
|||
|
if ((ch == 0xC0) || (ch == 0xC1))
|
|||
|
return 1;
|
|||
|
|
|||
|
ch1 = u[1];
|
|||
|
if (((ch1 & ch) == 0x80)
|
|||
|
&& ((ch == 0xE0)
|
|||
|
|| (ch == 0xF0)
|
|||
|
|| (ch == 0xF8)
|
|||
|
|| (ch == 0xFC)))
|
|||
|
return 1;
|
|||
|
|
|||
|
if (len == 3)
|
|||
|
{
|
|||
|
/* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
|
|||
|
if ((ch == 0xED)
|
|||
|
&& ((ch1 >= 0xA0) && (ch1 <= 0xBF))
|
|||
|
&& (u[2] >= 0x80) && (u[2] <= 0xBF))
|
|||
|
return 1;
|
|||
|
/* Now FFFE and FFFF */
|
|||
|
if ((ch == 0xEF)
|
|||
|
&& (ch1 == 0xBF)
|
|||
|
&& ((u[2] == 0xBE) || (u[2] == 0xBF)))
|
|||
|
return 1;
|
|||
|
}
|
|||
|
return 0;
|
|||
|
}
|
|||
|
|
|||
|
/* This function assumes that the necessary checks have been made to ensure
|
|||
|
* a valid UTF-8 encoded character is present.
|
|||
|
*/
|
|||
|
_INLINE_
|
|||
|
static SLwchar_Type fast_utf8_decode (SLuchar_Type *u, unsigned int len)
|
|||
|
{
|
|||
|
static unsigned char masks[7] =
|
|||
|
{
|
|||
|
0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
|
|||
|
};
|
|||
|
SLuchar_Type *umax;
|
|||
|
SLwchar_Type w;
|
|||
|
|
|||
|
w = (*u & masks[len]);
|
|||
|
umax = u + len;
|
|||
|
u++;
|
|||
|
while (u < umax)
|
|||
|
{
|
|||
|
w = (w << 6)| (u[0] & 0x3F);
|
|||
|
u++;
|
|||
|
}
|
|||
|
return w;
|
|||
|
}
|
|||
|
|
|||
|
unsigned char *SLutf8_skip_char (unsigned char *s, unsigned char *smax)
|
|||
|
{
|
|||
|
unsigned int len;
|
|||
|
|
|||
|
if (s >= smax)
|
|||
|
return s;
|
|||
|
|
|||
|
len = Len_Map[*s];
|
|||
|
if (len <= 1)
|
|||
|
return s+1;
|
|||
|
|
|||
|
if (s + len > smax)
|
|||
|
return s+1;
|
|||
|
|
|||
|
if (is_invalid_or_overlong_utf8 (s, len))
|
|||
|
return s + 1;
|
|||
|
|
|||
|
return s + len;
|
|||
|
}
|
|||
|
|
|||
|
/* Skip forward over at most num characters in the range [s, smax) and
 * return the resulting position.  If dnum is non-NULL, *dnum is set to the
 * number of characters counted.
 *
 * A byte that does not begin a complete, valid multi-byte sequence is
 * counted as one character.  If ignore_combining is non-zero, a valid
 * multi-byte character for which SLwchar_wcwidth returns 0 is skipped
 * without being counted, and any such characters that immediately follow
 * the final position are skipped as well.
 */
SLuchar_Type *SLutf8_skip_chars (SLuchar_Type *s, SLuchar_Type *smax,
                                 unsigned int num, unsigned int *dnum,
                                 int ignore_combining)
{
   unsigned int count = 0;

   while ((count < num) && (s < smax))
     {
        unsigned int nbytes = Len_Map[*s];

        if ((nbytes <= 1)
            || (s + nbytes > smax)
            || is_invalid_or_overlong_utf8 (s, nbytes))
          {
             /* single byte, truncated, or invalid: one byte == one char */
             s++;
             count++;
             continue;
          }

        if ((ignore_combining == 0)
            || (0 != SLwchar_wcwidth (fast_utf8_decode (s, nbytes))))
          count++;

        s += nbytes;
     }

   if (ignore_combining)
     {
        SLwchar_Type wc;
        unsigned int nused;

        /* Swallow zero-width characters that trail the last one */
        while ((s < smax)
               && (NULL != SLutf8_decode (s, smax, &wc, &nused))
               && (0 == SLwchar_wcwidth (wc)))
          s += nused;
     }

   if (dnum != NULL)
     *dnum = count;

   return s;
}
|
|||
|
|
|||
|
|
|||
|
/* Skip backward from s over at most num characters, never moving before
 * smin, and return the resulting position.  If dnum is non-NULL, *dnum is
 * set to the number of characters counted.
 *
 * Bytes that cannot be matched to a valid sequence ending exactly at the
 * current end position are counted one byte at a time.  If ignore_combining
 * is non-zero, a validly decoded character for which SLwchar_wcwidth
 * returns 0 is skipped without being counted.
 */
SLuchar_Type *SLutf8_bskip_chars (SLuchar_Type *smin, SLuchar_Type *s,
                                  unsigned int num, unsigned int *dnum,
                                  int ignore_combining)
{
   unsigned int n;
   SLuchar_Type *smax = s;     /* end of the character being backed over */

   n = 0;
   while ((n < num) && (s > smin))
     {
        unsigned char ch;
        unsigned int dn;
        SLwchar_Type w;
        SLuchar_Type *s1;

        s--;
        ch = *s;
        if (ch < 0x80)
          {
             n++;
             smax = s;
             continue;
          }

        /* Back up over bytes whose Len_Map entry is 0, looking for the
         * byte that starts the sequence.  The scan is bounded by smin and
         * by SLUTF8_MAX_MBLEN steps.
         */
        dn = 0;
        while ((s != smin)
               && (Len_Map[ch] == 0)
               && (dn < SLUTF8_MAX_MBLEN))
          {
             s--;
             ch = *s;
             dn++;
          }

        /* The scan must have stopped on a lead byte (> 0xBF) that decodes
         * to a sequence ending exactly at smax.  Otherwise an invalid
         * sequence was backed over: count a single byte and resume from
         * just before it.
         */
        if ((ch <= 0xBF)
            || (NULL == (s1 = SLutf8_decode (s, smax, &w, NULL)))
            || (s1 != smax))
          {
             n++;
             smax--;
             s = smax;
             continue;
          }

        if ((ignore_combining == 0)
            || (0 != SLwchar_wcwidth (w)))
          n++;

        smax = s;
     }

   if (dnum != NULL)
     *dnum = n;

   return s;
}
|
|||
|
|
|||
|
/* Back up over the single character that ends at s and return its start.
 * s is returned unchanged when it is not beyond smin.  A preceding byte
 * below 0x80 is handled directly; anything else is delegated to
 * SLutf8_bskip_chars with a count of 1 and combining characters not
 * ignored.
 */
SLuchar_Type *SLutf8_bskip_char (SLuchar_Type *smin, SLuchar_Type *s)
{
   unsigned int nskipped;

   if (s <= smin)
     return s;

   if (*(s - 1) < 0x80)
     return s - 1;

   return SLutf8_bskip_chars (smin, s, 1, &nskipped, 0);
}
|
|||
|
|
|||
|
|
|||
|
/* This function counts the number of wide characters in a UTF-8 encoded
|
|||
|
* string. Each byte in an invalid sequence is counted as a single character.
|
|||
|
* If the string contains illegal values, the bytes making up the character is
|
|||
|
* counted as 1 character.
|
|||
|
*/
|
|||
|
unsigned int SLutf8_strlen (SLuchar_Type *s, int ignore_combining)
|
|||
|
{
|
|||
|
unsigned int count, len;
|
|||
|
|
|||
|
if (s == NULL)
|
|||
|
return 0;
|
|||
|
|
|||
|
len = strlen ((char *)s);
|
|||
|
(void) SLutf8_skip_chars (s, s + len, len, &count, ignore_combining);
|
|||
|
return count;
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
/*
|
|||
|
* This function returns NULL if the input does not correspond to a valid
|
|||
|
* UTF-8 sequence, otherwise, it returns the position of the next character
|
|||
|
* in the sequence.
|
|||
|
*/
|
|||
|
unsigned char *SLutf8_decode (unsigned char *u, unsigned char *umax,
|
|||
|
SLwchar_Type *wp, unsigned int *nconsumedp)
|
|||
|
{
|
|||
|
unsigned int len;
|
|||
|
unsigned char ch;
|
|||
|
SLwchar_Type w;
|
|||
|
|
|||
|
if (u >= umax)
|
|||
|
{
|
|||
|
*wp = 0;
|
|||
|
if (nconsumedp != NULL)
|
|||
|
*nconsumedp = 0;
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
*wp = ch = *u;
|
|||
|
if (ch < 0x80)
|
|||
|
{
|
|||
|
if (nconsumedp != NULL) *nconsumedp = 1;
|
|||
|
return u+1;
|
|||
|
}
|
|||
|
|
|||
|
len = Len_Map[ch];
|
|||
|
if (len < 2)
|
|||
|
{
|
|||
|
/* should not happen--- code here for completeness */
|
|||
|
if (nconsumedp != NULL) *nconsumedp = 1;
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
if (u + len > umax)
|
|||
|
{
|
|||
|
if (nconsumedp != NULL) *nconsumedp = 1; /* (unsigned int) (umax - u); */
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
if (is_invalid_or_overlong_utf8 (u, len))
|
|||
|
{
|
|||
|
if (nconsumedp != NULL)
|
|||
|
*nconsumedp = 1;
|
|||
|
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
if (nconsumedp != NULL)
|
|||
|
*nconsumedp = len;
|
|||
|
|
|||
|
*wp = w = fast_utf8_decode (u, len);
|
|||
|
|
|||
|
if (IS_ILLEGAL_UNICODE(w))
|
|||
|
return NULL;
|
|||
|
|
|||
|
return u + len;
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
/* Encode the wide character returning a pointer to the end of the
|
|||
|
* utf8 of the encoded multi-byte character. This function will also encode
|
|||
|
* illegal unicode values. It returns NULL if buflen is too small.
|
|||
|
* Otherwise, it returns a pointer at the end of the last encoded byte.
|
|||
|
* It does not null terminate the encoded string.
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_encode (SLwchar_Type w, SLuchar_Type *u, unsigned int ulen)
|
|||
|
{
|
|||
|
SLuchar_Type *umax = u + ulen;
|
|||
|
|
|||
|
/* U-00000000 - U-0000007F: 0xxxxxxx */
|
|||
|
if (w <= 0x7F)
|
|||
|
{
|
|||
|
if (u >= umax)
|
|||
|
return NULL;
|
|||
|
|
|||
|
*u++ = (unsigned char) w;
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
/* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
|
|||
|
if (w <= 0x7FF)
|
|||
|
{
|
|||
|
if ((u + 1) >= umax)
|
|||
|
return NULL;
|
|||
|
|
|||
|
*u++ = (w >> 6) | 0xC0;
|
|||
|
*u++ = (w & 0x3F) | 0x80;
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
/* First bad character starts at 0xD800 */
|
|||
|
|
|||
|
/* Allow illegal values to be encoded */
|
|||
|
|
|||
|
/*
|
|||
|
*if (IS_ILLEGAL_UNICODE(w))
|
|||
|
* return NULL;
|
|||
|
*/
|
|||
|
|
|||
|
/* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
|
|||
|
if (w <= 0xFFFF)
|
|||
|
{
|
|||
|
if (u+2 >= umax)
|
|||
|
return NULL;
|
|||
|
*u++ = (w >> 12 ) | 0xE0;
|
|||
|
goto finish_2;
|
|||
|
}
|
|||
|
|
|||
|
/* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|||
|
if (w <= 0x1FFFFF)
|
|||
|
{
|
|||
|
if (u+3 >= umax)
|
|||
|
return NULL;
|
|||
|
*u++ = (w >> 18) | 0xF0;
|
|||
|
goto finish_3;
|
|||
|
}
|
|||
|
|
|||
|
/* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|||
|
if (w <= 0x3FFFFFF)
|
|||
|
{
|
|||
|
if (u+4 >= umax)
|
|||
|
return NULL;
|
|||
|
*u++ = (w >> 24) | 0xF8;
|
|||
|
goto finish_4;
|
|||
|
}
|
|||
|
|
|||
|
/* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|||
|
if (w <= 0x7FFFFFFF)
|
|||
|
{
|
|||
|
if (u+5 >= umax)
|
|||
|
return NULL;
|
|||
|
*u++ = (w >> 30) | 0xFC;
|
|||
|
goto finish_5;
|
|||
|
}
|
|||
|
|
|||
|
/* unreached?? */
|
|||
|
return NULL;
|
|||
|
|
|||
|
finish_5: *u++ = ((w >> 24) & 0x3F)|0x80;
|
|||
|
finish_4: *u++ = ((w >> 18) & 0x3F)|0x80;
|
|||
|
finish_3: *u++ = ((w >> 12) & 0x3F)|0x80;
|
|||
|
finish_2: *u++ = ((w >> 6) & 0x3F)|0x80;
|
|||
|
*u++ = (w & 0x3F)|0x80;
|
|||
|
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
/* Like SLutf8_encode, but null terminates the result.
|
|||
|
* At least SLUTF8_MAX_MBLEN+1 bytes assumed.
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_encode_null_terminate (SLwchar_Type w, SLuchar_Type *u)
|
|||
|
{
|
|||
|
SLuchar_Type *p;
|
|||
|
|
|||
|
p = SLutf8_encode (w, u, SLUTF8_MAX_MBLEN);
|
|||
|
if (p != NULL)
|
|||
|
*p = 0;
|
|||
|
return p;
|
|||
|
}
|
|||
|
|
|||
|
#if 0
|
|||
|
/* Decode the UTF-8 bytes in [u, umax) into single bytes stored at b and
 * null terminate the result.  *np is set to the number of bytes stored.
 * Returns 0 upon success, or -1 if SLutf8_decode rejects a sequence.
 * Characters greater than 0xFF are currently truncated to their low 8
 * bits.  The size of b is not passed in, so it is up to the caller to
 * make it large enough.
 */
int SLutf8_decode_bytes (SLuchar_Type *u, SLuchar_Type *umax,
                         unsigned char *b, unsigned int *np)
{
   unsigned char *out = b;

   while (u < umax)
     {
        SLwchar_Type wc;

        if (0 == (*u & 0x80))
          {
             *out++ = *u++;
             continue;
          }

        u = SLutf8_decode (u, umax, &wc, NULL);
        if (u == NULL)
          return -1;                   /* FIXME: HANDLE ERROR */

        if (wc > 0xFF)
          {
#if 0
             sprintf (out, "<U+%04X>", wc);
             out += strlen (out);
             continue;
#endif
             /* FIXME: HANDLE ERROR */
             wc &= 0xFF;
          }

        *out++ = wc;
     }

   *np = out - b;
   *out = 0;
   return 0;
}
|
|||
|
|
|||
|
/* UTF-8 Encode the bytes between b and bmax storing the results in the
|
|||
|
* buffer defined by u and umax, returning the position following the
|
|||
|
* last encoded character. Upon return, *np is set to the number of bytes
|
|||
|
* sucessfully encoded.
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_encode_bytes (unsigned char *b, unsigned char *bmax,
|
|||
|
SLuchar_Type *u, unsigned int ulen,
|
|||
|
unsigned int *np)
|
|||
|
{
|
|||
|
unsigned char *bstart = b;
|
|||
|
SLuchar_Type *umax = u + ulen;
|
|||
|
|
|||
|
while (b < bmax)
|
|||
|
{
|
|||
|
SLuchar_Type *u1;
|
|||
|
|
|||
|
if (0 == (*b & 0x80))
|
|||
|
{
|
|||
|
if (u >= umax)
|
|||
|
break;
|
|||
|
|
|||
|
*u++ = *b++;
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
if (NULL == (u1 = SLutf8_encode (*b, u, umax - u)))
|
|||
|
break;
|
|||
|
u = u1;
|
|||
|
b++;
|
|||
|
}
|
|||
|
|
|||
|
*np = b - bstart;
|
|||
|
if (u < umax)
|
|||
|
*u = 0;
|
|||
|
|
|||
|
return u;
|
|||
|
}
|
|||
|
#endif
|
|||
|
|
|||
|
static SLuchar_Type *xform_utf8 (SLuchar_Type *u, SLuchar_Type *umax,
|
|||
|
SLwchar_Type (*fun)(SLwchar_Type))
|
|||
|
{
|
|||
|
SLuchar_Type *buf, *p;
|
|||
|
unsigned int malloced_len, len;
|
|||
|
|
|||
|
if (umax < u)
|
|||
|
return NULL;
|
|||
|
|
|||
|
len = 0;
|
|||
|
p = buf = NULL;
|
|||
|
malloced_len = 0;
|
|||
|
|
|||
|
while (1)
|
|||
|
{
|
|||
|
SLwchar_Type w;
|
|||
|
SLuchar_Type *u1;
|
|||
|
unsigned int nconsumed;
|
|||
|
|
|||
|
if (malloced_len <= len + SLUTF8_MAX_MBLEN)
|
|||
|
{
|
|||
|
SLuchar_Type *newbuf;
|
|||
|
malloced_len += 1 + (umax - u) + SLUTF8_MAX_MBLEN;
|
|||
|
|
|||
|
newbuf = (SLuchar_Type *)SLrealloc ((char *)buf, malloced_len);
|
|||
|
if (newbuf == NULL)
|
|||
|
{
|
|||
|
SLfree ((char *)buf);
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
buf = newbuf;
|
|||
|
p = buf + len;
|
|||
|
}
|
|||
|
|
|||
|
if (u >= umax)
|
|||
|
{
|
|||
|
*p = 0;
|
|||
|
p = (SLuchar_Type *) SLang_create_nslstring ((char *)buf, len);
|
|||
|
SLfree ((char *)buf);
|
|||
|
return p;
|
|||
|
}
|
|||
|
|
|||
|
if (NULL == (u1 = SLutf8_decode (u, umax, &w, &nconsumed)))
|
|||
|
{
|
|||
|
/* Invalid sequence */
|
|||
|
memcpy ((char *) p, u, nconsumed);
|
|||
|
p += nconsumed;
|
|||
|
len += nconsumed;
|
|||
|
u1 = u + nconsumed;
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
SLuchar_Type *p1;
|
|||
|
|
|||
|
p1 = SLutf8_encode ((*fun)(w), p, malloced_len);
|
|||
|
if (p1 == NULL)
|
|||
|
{
|
|||
|
SLfree ((char *)buf);
|
|||
|
SLang_verror (SL_INTERNAL_ERROR, "SLutf8_encode returned NULL");
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
len += p1 - p;
|
|||
|
p = p1;
|
|||
|
}
|
|||
|
|
|||
|
u = u1;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
/* Returned an uppercased version of an UTF-8 encoded string. Illegal or
|
|||
|
* invalid sequences will be returned as-is. This function returns
|
|||
|
* an SLstring.
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_strup (SLuchar_Type *u, SLuchar_Type *umax)
|
|||
|
{
|
|||
|
return xform_utf8 (u, umax, SLwchar_toupper);
|
|||
|
}
|
|||
|
|
|||
|
/* Returned an lowercased version of an UTF-8 encoded string. Illegal or
|
|||
|
* invalid sequences will be returned as-is. This function returns
|
|||
|
* an SLstring.
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_strlo (SLuchar_Type *u, SLuchar_Type *umax)
|
|||
|
{
|
|||
|
return xform_utf8 (u, umax, SLwchar_tolower);
|
|||
|
}
|
|||
|
|
|||
|
/* Compare at most nchars characters of the UTF-8 strings [a, amax) and
 * [b, bmax), returning -1, 0, or 1.  If cs is 0, characters that decode
 * successfully on both sides are compared after SLwchar_toupper has been
 * applied to them.  A character that decodes successfully compares greater
 * than one that does not; if neither decodes, the values left in the
 * output of SLutf8_decode are compared.  If one string runs out before
 * nchars characters have been compared, the string that ended compares
 * less than the other.
 */
int SLutf8_compare (SLuchar_Type *a, SLuchar_Type *amax,
                    SLuchar_Type *b, SLuchar_Type *bmax,
                    unsigned int nchars,
                    int cs)
{
   while ((nchars != 0) && (a < amax) && (b < bmax))
     {
        SLwchar_Type wa, wb;
        unsigned int nused;
        int a_valid = 1, b_valid = 1;

        if (*a < 0x80)
          wa = (SLwchar_Type) *a++;
        else
          {
             a_valid = (NULL != SLutf8_decode (a, amax, &wa, &nused));
             a += nused;
          }

        if (*b < 0x80)
          wb = (SLwchar_Type) *b++;
        else
          {
             b_valid = (NULL != SLutf8_decode (b, bmax, &wb, &nused));
             b += nused;
          }

        nchars--;

        /* Exactly one side decoded: that side is the greater one */
        if (a_valid != b_valid)
          return a_valid ? 1 : -1;

        if (a_valid && (cs == 0))
          {
             wa = SLwchar_toupper (wa);
             wb = SLwchar_toupper (wb);
          }

        if (wa != wb)
          return (wa > wb) ? 1 : -1;
     }

   if ((nchars == 0) || ((a >= amax) && (b >= bmax)))
     return 0;

   return (b >= bmax) ? 1 : -1;
}
|
|||
|
|
|||
|
|
|||
|
/* Returns an SLstring */
|
|||
|
SLstr_Type *SLutf8_subst_wchar (SLuchar_Type *u, SLuchar_Type *umax,
|
|||
|
SLwchar_Type wch, unsigned int pos,
|
|||
|
int ignore_combining)
|
|||
|
{
|
|||
|
SLuchar_Type *a, *a1, *b;
|
|||
|
unsigned int dpos;
|
|||
|
SLuchar_Type buf[SLUTF8_MAX_MBLEN+1];
|
|||
|
SLstr_Type *c;
|
|||
|
unsigned int n1, n2, n3, len;
|
|||
|
|
|||
|
a = SLutf8_skip_chars (u, umax, pos, &dpos, ignore_combining);
|
|||
|
|
|||
|
if ((dpos != pos) || (a == umax))
|
|||
|
{
|
|||
|
SLang_verror (SL_INDEX_ERROR, "Specified character position is invalid for string");
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
a1 = SLutf8_skip_chars (a, umax, 1, NULL, ignore_combining);
|
|||
|
|
|||
|
b = SLutf8_encode (wch, buf, SLUTF8_MAX_MBLEN);
|
|||
|
if (b == NULL)
|
|||
|
{
|
|||
|
SLang_verror (SL_UNICODE_ERROR, "Unable to encode wchar 0x%lX", (unsigned long)wch);
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
n1 = (a-u);
|
|||
|
n2 = (b-buf);
|
|||
|
n3 = (umax-a1);
|
|||
|
len = n1 + n2 + n3;
|
|||
|
c = _pSLallocate_slstring (len);
|
|||
|
if (c == NULL)
|
|||
|
return NULL;
|
|||
|
|
|||
|
memcpy (c, (char *)u, n1);
|
|||
|
memcpy (c+n1, (char *)buf, n2);
|
|||
|
memcpy (c+n1+n2, (char *)a1, n3);
|
|||
|
c[len] = 0;
|
|||
|
|
|||
|
/* No need to worry about this failing-- it frees its argument */
|
|||
|
return _pSLcreate_via_alloced_slstring (c, len);
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
/* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes. Result will be
|
|||
|
* null terminated. Returns position of NEXT character.
|
|||
|
* Analogous to: *p++
|
|||
|
*/
|
|||
|
SLuchar_Type *SLutf8_extract_utf8_char (SLuchar_Type *u,
|
|||
|
SLuchar_Type *umax,
|
|||
|
SLuchar_Type *utf8)
|
|||
|
{
|
|||
|
SLuchar_Type *u1;
|
|||
|
|
|||
|
u1 = SLutf8_skip_char (u, umax);
|
|||
|
memcpy ((char *)utf8, u, u1-u);
|
|||
|
utf8[u1-u] = 0;
|
|||
|
|
|||
|
return u1;
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
/* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
|
|||
|
* They also generate slang errors upon error.
|
|||
|
*/
|
|||
|
SLuchar_Type *_pSLinterp_decode_wchar (SLuchar_Type *u,
|
|||
|
SLuchar_Type *umax,
|
|||
|
SLwchar_Type *chp)
|
|||
|
{
|
|||
|
if (_pSLinterp_UTF8_Mode == 0)
|
|||
|
{
|
|||
|
if (u < umax)
|
|||
|
*chp = (SLwchar_Type) *u++;
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
if (NULL == (u = SLutf8_decode (u, umax, chp, NULL)))
|
|||
|
SLang_verror (SL_INVALID_UTF8, "Invalid UTF-8 encoded string");
|
|||
|
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
/* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
|
|||
|
* Upon success, it returns a pointer to the _end_ of the encoded character
|
|||
|
*/
|
|||
|
SLuchar_Type *_pSLinterp_encode_wchar (SLwchar_Type wch, SLuchar_Type *u, unsigned int *encoded_len)
|
|||
|
{
|
|||
|
SLuchar_Type *u1;
|
|||
|
|
|||
|
if (_pSLinterp_UTF8_Mode == 0)
|
|||
|
{
|
|||
|
*encoded_len = 1;
|
|||
|
*u++ = (SLuchar_Type) wch;
|
|||
|
*u++ = 0;
|
|||
|
return u;
|
|||
|
}
|
|||
|
|
|||
|
if (NULL == (u1 = SLutf8_encode_null_terminate (wch, u)))
|
|||
|
{
|
|||
|
SLang_verror (SL_UNICODE_ERROR, "Unable to encode character 0x%lX", (unsigned long)wch);
|
|||
|
return NULL;
|
|||
|
}
|
|||
|
|
|||
|
*encoded_len = (unsigned int) (u1 - u);
|
|||
|
return u1;
|
|||
|
}
|
|||
|
|
|||
|
#ifdef REGRESSION
/* Regression driver: decodes each string of long_tests with SLutf8_to_wc
 * and prints the resulting character values in hex, one line per string.
 * NOTE(review): SLutf8_to_wc is not defined in this file, and ok_tests is
 * never used -- confirm whether this driver still builds.
 */
int main (int argc, char **argv)
{
   unsigned char *s, *smax;
   char **t;
   char *ok_tests [] =
     {
        "",
        "",
        "<EFBFBD>",
        "",
        "<EFBFBD><EFBFBD><EFBFBD><EFBFBD>",
        NULL
     };
   char *long_tests [] =
     {
        "<EFBFBD><EFBFBD>",
        "<EFBFBD><EFBFBD><EFBFBD>",
        "<EFBFBD><EFBFBD><EFBFBD><EFBFBD>",
        "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>",
        "<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>",
        NULL
     };

   for (t = long_tests; *t != NULL; t++)
     {
        s = (unsigned char *) *t;
        smax = s + strlen ((char *)s);

        while (s < smax)
          {
             SLwchar_Type w;

             s = SLutf8_to_wc (s, smax, &w);
             if (s == NULL)
               {
                  fprintf (stderr, "SLutf8_to_wc failed\n");
                  break;
               }

             /* a decoded 0 ends the line early */
             if (w == 0)
               break;

             fprintf (stdout, " 0x%X", w);
          }

        fprintf (stdout, "\n");
     }

   return 0;
}
#endif
|
|||
|
|