nano/src/chars.c

964 lines
22 KiB
C
Raw Normal View History

/* $Id$ */
/**************************************************************************
* chars.c *
* *
* Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 *
* Free Software Foundation, Inc. *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 3, or (at your option) *
* any later version. *
* *
* This program is distributed in the hope that it will be useful, but *
* WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
* General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the Free Software *
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA *
* 02110-1301, USA. *
* *
**************************************************************************/
#include "proto.h"
#include <string.h>
#include <ctype.h>
#ifdef ENABLE_UTF8
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
/* Whether UTF-8 support has been enabled (set by utf8_init()). */
static bool use_utf8 = FALSE;
/* The wide character substituted for an invalid multibyte sequence:
 * Unicode U+FFFD (Replacement Character).  It is not substituted when
 * we're searching for a match to the invalid sequence. */
static const wchar_t bad_wchar = 0xFFFD;
/* The UTF-8 encoding of U+FFFD, and its length in bytes. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
static const int bad_mbchar_len = 3;
/* Switch on UTF-8 support for the multibyte helpers in this file.
 * After this call, using_utf8() reports TRUE. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}
/* Report whether UTF-8 support has been switched on by utf8_init(). */
bool using_utf8(void)
{
    return use_utf8;
}
#endif
#ifndef HAVE_ISBLANK
/* Replacement for isblank() on systems that lack it: c is blank if it
 * is whitespace and is either a tab or not a control character. */
bool nisblank(int c)
{
    const bool tab_or_printable = (c == '\t') || !is_cntrl_char(c);

    return isspace(c) && tab_or_printable;
}
#endif
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
/* Replacement for iswblank() on systems that lack it: wc is blank if
 * it is wide whitespace and is either a tab or not a control
 * character. */
bool niswblank(wchar_t wc)
{
    const bool tab_or_printable = (wc == '\t') || !is_cntrl_wchar(wc);

    return iswspace(wc) && tab_or_printable;
}
#endif
/* Return TRUE if c fits in a single unsigned byte, i.e. converting it
 * to unsigned char loses nothing, and FALSE otherwise. */
bool is_byte(int c)
{
    const unsigned int as_wide = (unsigned int)c;
    const unsigned char as_byte = (unsigned char)c;

    return (as_wide == as_byte);
}
/* Call mbtowc() with a null string, which resets its internal shift
 * state.  Used after mbtowc() has reported an invalid sequence. */
static void mbtowc_reset(void)
{
    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
}
/* Call wctomb() with a null buffer, which resets its internal shift
 * state.  Used after wctomb() has reported a conversion failure. */
static void wctomb_reset(void)
{
    IGNORE_CALL_RESULT(wctomb(NULL, 0));
}
/* Multibyte counterpart of isalnum(): classify the character that
 * starts at c.  With UTF-8 enabled, an invalid sequence is classified
 * as the replacement character; otherwise only the first byte is
 * examined. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswalnum(wide);
    }
#endif

    return isalnum((unsigned char)*c);
}
/* Multibyte counterpart of isblank(): classify the character that
 * starts at c.  With UTF-8 enabled, an invalid sequence is classified
 * as the replacement character; otherwise only the first byte is
 * examined. */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswblank(wide);
    }
#endif

    return isblank((unsigned char)*c);
}
/* Like iscntrl(), but restricted to the low control characters: return
 * TRUE only for values 0 through 31. */
bool is_ascii_cntrl_char(int c)
{
    return (c >= 0 && c <= 31);
}
/* Like iscntrl(), but also recognizing high-bit control characters.
 * The accepted ranges are 0..31, 127..159, and -128..-97 (the latter
 * being 128..159 as seen through a signed char). */
bool is_cntrl_char(int c)
{
    const bool signed_high = (c >= -128 && c <= -97);
    const bool ascii_low = (c >= 0 && c <= 31);
    const bool del_or_high = (c >= 127 && c <= 159);

    return signed_high || ascii_low || del_or_high;
}
#ifdef ENABLE_UTF8
/* Wide-character counterpart of is_cntrl_char(): return TRUE for the
 * low control characters (0..31) and for 127..159, which covers DEL
 * and the control characters with their high bits set. */
bool is_cntrl_wchar(wchar_t wc)
{
    const bool low_range = (wc >= 0 && wc <= 31);
    const bool del_or_high = (wc >= 127 && wc <= 159);

    return low_range || del_or_high;
}
#endif
/* Multibyte counterpart of iscntrl() that also recognizes control
 * characters with their high bits set.  With UTF-8 enabled, an invalid
 * sequence is classified as the replacement character; otherwise only
 * the first byte is examined. */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return is_cntrl_wchar(wide);
    }
#endif

    return is_cntrl_char((unsigned char)*c);
}
/* Multibyte counterpart of ispunct(): classify the character that
 * starts at c.  With UTF-8 enabled, an invalid sequence is classified
 * as the replacement character; otherwise only the first byte is
 * examined. */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswpunct(wide);
    }
#endif

    return ispunct((unsigned char)*c);
}
/* Return TRUE if the multibyte character at c counts as part of a
 * word: it is alphanumeric, or it is punctuation and allow_punct is
 * TRUE.  Return FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct && is_punct_mbchar(c));
}
/* Return the visible stand-in for the control character c, i.e. the
 * character shown after the "^": '@' for an embedded newline (treated
 * as an encoded null), '?' for NANO_CONTROL_8, and (c + 64) for
 * everything else. */
char control_rep(char c)
{
    assert(is_cntrl_char(c));

    /* Newlines embedded in a line stand for encoded nulls. */
    if (c == '\n')
	return '@';

    if (c == NANO_CONTROL_8)
	return '?';

    return c + 64;
}
#ifdef ENABLE_UTF8
/* Return the visible stand-in for the wide control character wc, i.e.
 * the character shown after the "^": '@' for an embedded newline
 * (treated as an encoded null), '?' for NANO_CONTROL_8, and (wc + 64)
 * for everything else. */
wchar_t control_wrep(wchar_t wc)
{
    assert(is_cntrl_wchar(wc));

    /* Newlines embedded in a line stand for encoded nulls. */
    if (wc == '\n')
	return '@';

    if (wc == NANO_CONTROL_8)
	return '?';

    return wc + 64;
}
#endif
/* c is a multibyte control character.  Store its visible stand-in (see
 * control_rep() and control_wrep()) in crep and the stand-in's length
 * in bytes in *crep_len, then return crep.  The result is not
 * null-terminated.  With UTF-8 enabled, if c is an invalid multibyte
 * sequence, crep gets Unicode 0xFFFD (Replacement Character) instead;
 * if the stand-in cannot be converted back to multibyte form,
 * *crep_len is set to zero. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    memcpy(crep, bad_mbchar, *crep_len);
	    return crep;
	}

	*crep_len = wctomb(crep, control_wrep(wide));
	if (*crep_len < 0) {
	    wctomb_reset();
	    *crep_len = 0;
	}

	return crep;
    }
#endif

    *crep_len = 1;
    *crep = control_rep(*c);

    return crep;
}
/* c is a multibyte non-control character.  Copy it to crep, store its
 * length in bytes in *crep_len, and return crep.  The result is not
 * null-terminated.  With UTF-8 enabled, if c is an invalid multibyte
 * sequence or not valid Unicode, crep gets Unicode 0xFFFD (Replacement
 * Character) instead; if the character cannot be converted back to
 * multibyte form, *crep_len is set to zero. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	/* Reject invalid Unicode characters. */
	if (mbtowc(&wide, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wide)) {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    memcpy(crep, bad_mbchar, *crep_len);
	    return crep;
	}

	*crep_len = wctomb(crep, wide);
	if (*crep_len < 0) {
	    wctomb_reset();
	    *crep_len = 0;
	}

	return crep;
    }
#endif

    *crep_len = 1;
    *crep = *c;

    return crep;
}
/* Multibyte counterpart of wcwidth(): return the display width in
 * columns of the character that starts at c.  With UTF-8 enabled, an
 * invalid sequence, or a character for which wcwidth() returns -1,
 * gets the width of the replacement character.  Without UTF-8, every
 * character is one column wide. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int columns;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	columns = wcwidth(wide);

	/* No width is known for this character: use the replacement
	 * character's width instead. */
	if (columns == -1)
	    columns = wcwidth(bad_wchar);

	return columns;
    }
#endif

    return 1;
}
/* Return the maximum number of bytes a multibyte character can occupy:
 * MB_CUR_MAX when UTF-8 support is enabled, and 1 otherwise. */
int mb_cur_max(void)
{
#ifdef ENABLE_UTF8
    if (use_utf8)
	return MB_CUR_MAX;
#endif

    return 1;
}
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * store its length in *chr_mb_len.  Otherwise, return an undefined
 * (dynamically allocated) multibyte character and store a length of
 * zero.  The returned buffer is not null-terminated, and the caller is
 * responsible for freeing it.  Without UTF-8 support, the result is the
 * single byte holding the low-order bits of chr. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;
    char low_byte;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    wctomb_reset();
	    *chr_mb_len = 0;
	}
    } else {
#endif
	/* Convert by value rather than reading the first byte of chr's
	 * object representation: the latter is the low-order byte only
	 * on little-endian machines. */
	low_byte = (char)chr;
	*chr_mb_len = 1;
	chr_mb = mallocstrncpy(NULL, &low_byte, 1);
#ifdef ENABLE_UTF8
    }
#endif

    return chr_mb;
}
/* Parse one multibyte character from buf and return the number of
 * bytes it occupies.  The result is always at least 1: an invalid
 * sequence and a null byte both count as a single byte.
 *
 * If chr isn't NULL, the character's bytes are copied into it without
 * a terminating null, so chr must be large enough to hold them.
 *
 * If col isn't NULL, *col is advanced by the character's display
 * width: up to the next multiple of tabsize for '\t' (so *col must hold
 * the current display width on entry), one column for the "^" plus the
 * width of the visible equivalent for a control character, and the
 * character's own width otherwise.
 *
 * When UTF-8 support is disabled, every character is exactly one
 * byte. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
{
int buf_mb_len;
assert(buf != NULL);
#ifdef ENABLE_UTF8
if (use_utf8) {
/* Get the number of bytes in the multibyte character. */
buf_mb_len = mblen(buf, MB_CUR_MAX);
/* If buf contains an invalid multibyte character, only
* interpret buf's first byte.  The mblen(NULL, 0) call resets
* mblen()'s shift state after the failure. */
if (buf_mb_len < 0) {
IGNORE_CALL_RESULT(mblen(NULL, 0));
buf_mb_len = 1;
/* mblen() returns zero for a null byte; count it as one byte. */
} else if (buf_mb_len == 0)
buf_mb_len++;
/* Save the multibyte character in chr. */
if (chr != NULL) {
int i;
for (i = 0; i < buf_mb_len; i++)
chr[i] = buf[i];
}
/* Save the column width of the wide character in col. */
if (col != NULL) {
/* If we have a tab, get its width in columns using the
* current value of col. */
if (*buf == '\t')
*col += tabsize - *col % tabsize;
/* If we have a control character, get its width using one
* column for the "^" that will be displayed in front of it,
* and the width in columns of its visible equivalent as
* returned by control_mbrep(). */
else if (is_cntrl_mbchar(buf)) {
/* Scratch buffer for the visible equivalent; freed below. */
char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
int ctrl_buf_mb_len;
(*col)++;
ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
&ctrl_buf_mb_len);
*col += mbwidth(ctrl_buf_mb);
free(ctrl_buf_mb);
/* If we have a normal character, get its width in columns
* normally. */
} else
*col += mbwidth(buf);
}
} else {
#endif
/* Get the number of bytes in the byte character. */
buf_mb_len = 1;
/* Save the byte character in chr. */
if (chr != NULL)
*chr = *buf;
if (col != NULL) {
/* If we have a tab, get its width in columns using the
* current value of col. */
if (*buf == '\t')
*col += tabsize - *col % tabsize;
/* If we have a control character, it's two columns wide:
* one column for the "^" that will be displayed in front of
* it, and one column for its visible equivalent as returned
* by control_mbrep(). */
else if (is_cntrl_char((unsigned char)*buf))
*col += 2;
/* If we have a normal character, it's one column wide. */
else
(*col)++;
}
#ifdef ENABLE_UTF8
}
#endif
return buf_mb_len;
}
/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos.  If pos is zero, zero is returned. */
size_t move_mbleft(const char *buf, size_t pos)
{
    /* How many bytes lie between the character being examined and
     * pos. */
    size_t distance = pos;

    assert(buf != NULL && pos <= strlen(buf));

    /* There is no library function to move backward one multibyte
     * character, so walk forward from the start of buf, one character
     * at a time, until the next step would reach or pass pos.  This
     * is O(pos). */
    for (;;) {
	int char_len = parse_mbchar(buf + pos - distance, NULL, NULL);

	if (distance <= char_len)
	    break;

	distance -= char_len;
    }

    return pos - distance;
}
/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos, i.e. pos plus the length in bytes of the
 * character at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
    const int char_len = parse_mbchar(buf + pos, NULL, NULL);

    return pos + char_len;
}
#ifndef HAVE_STRCASECMP
/* Replacement for strcasecmp() on systems that lack it: a
 * case-insensitive comparison with no limit on the length. */
int nstrcasecmp(const char *s1, const char *s2)
{
    const size_t no_limit = (size_t)-1;

    return strncasecmp(s1, s2, no_limit);
}
#endif
/* Multibyte counterpart of strcasecmp(): a case-insensitive
 * comparison of multibyte strings with no limit on the length. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    const size_t no_limit = (size_t)-1;

    return mbstrncasecmp(s1, s2, no_limit);
}
#ifndef HAVE_STRNCASECMP
/* Replacement for strncasecmp() on systems that lack it.  Compare at
 * most n bytes of s1 and s2, ignoring case, and return a negative,
 * zero, or positive value if s1 sorts before, equal to, or after s2.
 * Bytes are compared as unsigned char values, as strncasecmp() does. */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
    if (s1 == s2)
	return 0;

    assert(s1 != NULL && s2 != NULL);

    /* tolower() is only defined for values representable as unsigned
     * char (or EOF), so never hand it a possibly-negative plain
     * char. */
    for (; *s1 != '\0' && *s2 != '\0' && n > 0; s1++, s2++, n--) {
	if (tolower((unsigned char)*s1) != tolower((unsigned char)*s2))
	    break;
    }

    return (n > 0) ? tolower((unsigned char)*s1) -
	tolower((unsigned char)*s2) : 0;
}
#endif
/* Multibyte counterpart of strncasecmp().  Compare at most n multibyte
 * characters of s1 and s2, ignoring case, and return a negative, zero,
 * or positive value if s1 sorts before, equal to, or after s2.
 *
 * With UTF-8 enabled, an invalid multibyte sequence is compared as its
 * first byte, and it never matches a valid character, even one with
 * the same numeric value.  Without UTF-8, this is plain
 * strncasecmp(). */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Initialized so that the final comparison is well defined no
	 * matter how the loop is left. */
	wchar_t ws1 = 0, ws2 = 0;
	bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
	int diff;

	if (s1 == s2)
	    return 0;

	assert(s1 != NULL && s2 != NULL);

	while (n > 0) {
	    int s1_mb_len = parse_mbchar(s1, NULL, NULL);
	    int s2_mb_len = parse_mbchar(s2, NULL, NULL);

	    /* Decode the current character of each string before
	     * testing for the end of either, so that when one string
	     * is a prefix of the other, we compare the terminating
	     * null against the other string's next character instead
	     * of reusing the previous pair. */
	    bad_s1_mb = FALSE;
	    if (mbtowc(&ws1, s1, s1_mb_len) < 0) {
		mbtowc_reset();
		ws1 = (unsigned char)*s1;
		bad_s1_mb = TRUE;
	    }

	    bad_s2_mb = FALSE;
	    if (mbtowc(&ws2, s2, s2_mb_len) < 0) {
		mbtowc_reset();
		ws2 = (unsigned char)*s2;
		bad_s2_mb = TRUE;
	    }

	    if (*s1 == '\0' || *s2 == '\0' || bad_s1_mb != bad_s2_mb ||
		towlower(ws1) != towlower(ws2))
		break;

	    s1 += s1_mb_len;
	    s2 += s2_mb_len;
	    n--;
	}

	/* All n characters matched. */
	if (n == 0)
	    return 0;

	diff = (int)towlower(ws1) - (int)towlower(ws2);

	/* An invalid byte must not match a valid character that happens
	 * to have the same numeric value. */
	if (diff == 0 && bad_s1_mb != bad_s2_mb)
	    diff = bad_s1_mb ? 1 : -1;

	return diff;
    } else
#endif
	return strncasecmp(s1, s2, n);
}
#ifndef HAVE_STRCASESTR
/* Replacement for strcasestr() on systems that lack it.  Return a
 * pointer to the first case-insensitive occurrence of needle in
 * haystack, haystack itself if needle is empty, or NULL if there is no
 * match. */
char *nstrcasestr(const char *haystack, const char *needle)
{
    const char *cursor;
    size_t remaining, pattern_len;

    assert(haystack != NULL && needle != NULL);

    if (*needle == '\0')
	return (char *)haystack;

    pattern_len = strlen(needle);
    remaining = strlen(haystack);
    cursor = haystack;

    /* Stop as soon as what is left is shorter than the needle. */
    while (*cursor != '\0' && remaining >= pattern_len) {
	if (strncasecmp(cursor, needle, pattern_len) == 0)
	    return (char *)cursor;

	cursor++;
	remaining--;
    }

    return NULL;
}
#endif
/* This function is equivalent to strcasestr() for multibyte strings. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
if (use_utf8) {
size_t haystack_len, needle_len;
assert(haystack != NULL && needle != NULL);
if (*needle == '\0')
return (char *)haystack;
haystack_len = mbstrlen(haystack);
needle_len = mbstrlen(needle);
for (; *haystack != '\0' && haystack_len >= needle_len;
haystack += move_mbright(haystack, 0), haystack_len--) {
if (mbstrncasecmp(haystack, needle, needle_len) == 0)
return (char *)haystack;
}
return NULL;