diff options
Diffstat (limited to 'utf8.c')
| -rw-r--r-- | utf8.c | 250 |
1 files changed, 192 insertions, 58 deletions
@@ -1,4 +1,5 @@ #include "git-compat-util.h" +#include "strbuf.h" #include "utf8.h" /* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */ @@ -9,7 +10,8 @@ struct interval { }; /* auxiliary function for binary search in interval table */ -static int bisearch(wchar_t ucs, const struct interval *table, int max) { +static int bisearch(ucs_char_t ucs, const struct interval *table, int max) +{ int min = 0; int mid; @@ -56,11 +58,11 @@ static int bisearch(wchar_t ucs, const struct interval *table, int max) { * ISO 8859-1 and WGL4 characters, Unicode control characters, * etc.) have a column width of 1. * - * This implementation assumes that wchar_t characters are encoded + * This implementation assumes that ucs_char_t characters are encoded * in ISO 10646. */ -static int wcwidth(wchar_t ch) +static int git_wcwidth(ucs_char_t ch) { /* * Sorted list of non-overlapping intervals of non-spacing characters, @@ -150,62 +152,118 @@ static int wcwidth(wchar_t ch) } /* - * This function returns the number of columns occupied by the character - * pointed to by the variable start. The pointer is updated to point at - * the next character. If it was not valid UTF-8, the pointer is set to NULL. + * Pick one ucs character starting from the location *start points at, + * and return it, while updating the *start pointer to point at the + * end of that character. When remainder_p is not NULL, the location + * holds the number of bytes remaining in the string that we are allowed + * to pick from. Otherwise we are allowed to pick up to the NUL that + * would eventually appear in the string. *remainder_p is also reduced + * by the number of bytes we have consumed. + * + * If the string was not a valid UTF-8, *start pointer is set to NULL + * and the return value is undefined. */ -int utf8_width(const char **start) +static ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p) { unsigned char *s = (unsigned char *)*start; - wchar_t ch; + ucs_char_t ch; + size_t remainder, incr; + + /* + * A caller that assumes NUL terminated text can choose + * not to bother with the remainder length. We will + * stop at the first NUL. + */ + remainder = (remainder_p ? *remainder_p : 999); - if (*s < 0x80) { + if (remainder < 1) { + goto invalid; + } else if (*s < 0x80) { /* 0xxxxxxx */ ch = *s; - *start += 1; + incr = 1; } else if ((s[0] & 0xe0) == 0xc0) { /* 110XXXXx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] & 0xfe) == 0xc0) + if (remainder < 2 || + (s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) goto invalid; ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); - *start += 2; + incr = 2; } else if ((s[0] & 0xf0) == 0xe0) { /* 1110XXXX 10Xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || - /* surrogate? */ - (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || - /* U+FFFE or U+FFFF? */ - (s[0] == 0xef && s[1] == 0xbf && - (s[2] & 0xfe) == 0xbe)) + if (remainder < 3 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || + /* surrogate? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || + /* U+FFFE or U+FFFF? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) goto invalid; ch = ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); - *start += 3; + incr = 3; } else if ((s[0] & 0xf8) == 0xf0) { /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || - /* > U+10FFFF? */ - (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) + if (remainder < 4 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || + /* > U+10FFFF? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) goto invalid; ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); - *start += 4; + incr = 4; } else { invalid: *start = NULL; return 0; } - return wcwidth(ch); + *start += incr; + if (remainder_p) + *remainder_p = remainder - incr; + return ch; +} + +/* + * This function returns the number of columns occupied by the character + * pointed to by the variable start. The pointer is updated to point at + * the next character. When remainder_p is not NULL, it points at the + * location that stores the number of remaining bytes we can use to pick + * a character (see pick_one_utf8_char() above). + */ +int utf8_width(const char **start, size_t *remainder_p) +{ + ucs_char_t ch = pick_one_utf8_char(start, remainder_p); + if (!*start) + return 0; + return git_wcwidth(ch); +} + +/* + * Returns the total number of columns required by a null-terminated + * string, assuming that the string is utf8. Returns strlen() instead + * if the string does not look like a valid utf8 string. + */ +int utf8_strwidth(const char *string) +{ + int width = 0; + const char *orig = string; + + while (1) { + if (!string) + return strlen(orig); + if (!*string) + return width; + width += utf8_width(&string, NULL); + } } int is_utf8(const char *text) @@ -215,62 +273,132 @@ int is_utf8(const char *text) text++; continue; } - utf8_width(&text); + utf8_width(&text, NULL); if (!text) return 0; } return 1; } -static void print_spaces(int count) +static void strbuf_addchars(struct strbuf *sb, int c, size_t n) { - static const char s[] = " "; - while (count >= sizeof(s)) { - fwrite(s, sizeof(s) - 1, 1, stdout); - count -= sizeof(s) - 1; + strbuf_grow(sb, n); + memset(sb->buf + sb->len, c, n); + strbuf_setlen(sb, sb->len + n); +} + +static void strbuf_add_indented_text(struct strbuf *buf, const char *text, + int indent, int indent2) +{ + if (indent < 0) + indent = 0; + while (*text) { + const char *eol = strchrnul(text, '\n'); + if (*eol == '\n') + eol++; + strbuf_addchars(buf, ' ', indent); + strbuf_add(buf, text, eol - text); + text = eol; + indent = indent2; } - fwrite(s, count, 1, stdout); +} + +static size_t display_mode_esc_sequence_len(const char *s) +{ + const char *p = s; + if (*p++ != '\033') + return 0; + if (*p++ != '[') + return 0; + while (isdigit(*p) || *p == ';') + p++; + if (*p++ != 'm') + return 0; + return p - s; } /* * Wrap the text, if necessary. The variable indent is the indent for the * first line, indent2 is the indent for all other lines. + * If indent is negative, assume that already -indent columns have been + * consumed (and no extra indent is necessary for the first line). */ -void print_wrapped_text(const char *text, int indent, int indent2, int width) +int strbuf_add_wrapped_text(struct strbuf *buf, + const char *text, int indent1, int indent2, int width) { - int w = indent, assume_utf8 = is_utf8(text); - const char *bol = text, *space = NULL; + int indent, w, assume_utf8 = 1; + const char *bol, *space, *start = text; + size_t orig_len = buf->len; + + if (width <= 0) { + strbuf_add_indented_text(buf, text, indent1, indent2); + return 1; + } + +retry: + bol = text; + w = indent = indent1; + space = NULL; + if (indent < 0) { + w = -indent; + space = text; + } for (;;) { - char c = *text; + char c; + size_t skip; + + while ((skip = display_mode_esc_sequence_len(text))) + text += skip; + + c = *text; if (!c || isspace(c)) { if (w < width || !space) { const char *start = bol; + if (!c && text == start) + return w; if (space) start = space; else - print_spaces(indent); - fwrite(start, text - start, 1, stdout); - if (!c) { - putchar('\n'); - return; - } else if (c == '\t') - w |= 0x07; + strbuf_addchars(buf, ' ', indent); + strbuf_add(buf, start, text - start); + if (!c) + return w; space = text; + if (c == '\t') + w |= 0x07; + else if (c == '\n') { + space++; + if (*space == '\n') { + strbuf_addch(buf, '\n'); + goto new_line; + } + else if (!isalnum(*space)) + goto new_line; + else + strbuf_addch(buf, ' '); + } w++; text++; } else { - putchar('\n'); - text = bol = space + 1; +new_line: + strbuf_addch(buf, '\n'); + text = bol = space + isspace(*space); space = NULL; w = indent = indent2; } continue; } - if (assume_utf8) - w += utf8_width(&text); - else { + if (assume_utf8) { + w += utf8_width(&text, NULL); + if (!text) { + assume_utf8 = 0; + text = start; + strbuf_setlen(buf, orig_len); + goto retry; + } + } else { w++; text++; } @@ -291,11 +419,17 @@ int is_encoding_utf8(const char *name) * with iconv. If the conversion fails, returns NULL. */ #ifndef NO_ICONV +#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6)) + typedef const char * iconv_ibp; +#else + typedef char * iconv_ibp; +#endif char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding) { iconv_t conv; size_t insz, outsz, outalloc; - char *out, *outpos, *cp; + char *out, *outpos; + iconv_ibp cp; if (!in_encoding) return NULL; @@ -307,7 +441,7 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e outalloc = outsz + 1; /* for terminating NUL */ out = xmalloc(outalloc); outpos = out; - cp = (char *)in; + cp = (iconv_ibp)in; while (1) { size_t cnt = iconv(conv, &cp, &insz, &outpos, &outsz); |
