Adding upstream version 0.5.1
This commit is contained in:
346
libwc/utf8.c
Normal file
346
libwc/utf8.c
Normal file
@@ -0,0 +1,346 @@
|
||||
|
||||
#ifdef USE_UNICODE
|
||||
|
||||
#include "wc.h"
|
||||
#include "ucs.h"
|
||||
#include "utf8.h"
|
||||
#include "wtf.h"
|
||||
|
||||
/*
 * Per-byte classification table used by the UTF-8 decoders in this file.
 * The entry for a byte value is one of:
 *
 *   0      continuation byte (0x80-0xbf)
 *   1      single-byte character (0x20-0x7e)
 *   2..6   lead byte of a sequence that is 2..6 bytes long
 *   7      0xfe / 0xff, which never start a sequence
 *   8      control byte (0x00-0x1f and 0x7f)
 */
wc_uint8 WC_UTF8_MAP[ 0x100 ] = {
    /* 0x00 - 0x1f: control bytes */
    8, 8, 8, 8, 8, 8, 8, 8,  8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8,  8, 8, 8, 8, 8, 8, 8, 8,

    /* 0x20 - 0x7f: single-byte characters; 0x7f itself is a control byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 8,

    /* 0x80 - 0xbf: continuation bytes */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xc0 - 0xdf: lead byte of a 2-byte sequence */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xe0 - 0xef: lead byte of a 3-byte sequence */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xf0 - 0xff: 4-byte (f0-f7), 5-byte (f8-fb), 6-byte (fc-fd), invalid (fe-ff) */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 7, 7,
};
|
||||
|
||||
/*
 * Scratch buffer shared by the encoder helpers in this file.  Seven bytes
 * hold the longest output of wc_ucs_to_utf8(): six encoded bytes plus the
 * terminating NUL.  The contents are overwritten by every encode call.
 */
static wc_uchar utf8_buf[7];
|
||||
|
||||
size_t
|
||||
wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
|
||||
{
|
||||
if (ucs < WC_C_UTF8_L2) {
|
||||
utf8[0] = ucs;
|
||||
utf8[1] = 0;
|
||||
return 1;
|
||||
} else if (ucs < WC_C_UTF8_L3) {
|
||||
utf8[0] = (ucs >> 6) | 0xc0;
|
||||
utf8[1] = (ucs & 0x3f) | 0x80;
|
||||
utf8[2] = 0;
|
||||
return 2;
|
||||
} else if (ucs < WC_C_UTF8_L4) {
|
||||
utf8[0] = (ucs >> 12) | 0xe0;
|
||||
utf8[1] = ((ucs >> 6) & 0x3f) | 0x80;
|
||||
utf8[2] = (ucs & 0x3f) | 0x80;
|
||||
utf8[3] = 0;
|
||||
return 3;
|
||||
} else if (ucs < WC_C_UTF8_L5) {
|
||||
utf8[0] = (ucs >> 18) | 0xf0;
|
||||
utf8[1] = ((ucs >> 12) & 0x3f) | 0x80;
|
||||
utf8[2] = ((ucs >> 6) & 0x3f) | 0x80;
|
||||
utf8[3] = (ucs & 0x3f) | 0x80;
|
||||
utf8[4] = 0;
|
||||
return 4;
|
||||
} else if (ucs < WC_C_UTF8_L6) {
|
||||
utf8[0] = (ucs >> 24) | 0xf8;
|
||||
utf8[1] = ((ucs >> 18) & 0x3f) | 0x80;
|
||||
utf8[2] = ((ucs >> 12) & 0x3f) | 0x80;
|
||||
utf8[3] = ((ucs >> 6) & 0x3f) | 0x80;
|
||||
utf8[4] = (ucs & 0x3f) | 0x80;
|
||||
utf8[5] = 0;
|
||||
return 5;
|
||||
} else if (ucs <= WC_C_UCS4_END) {
|
||||
utf8[0] = (ucs >> 30) | 0xfc;
|
||||
utf8[1] = ((ucs >> 24) & 0x3f) | 0x80;
|
||||
utf8[2] = ((ucs >> 18) & 0x3f) | 0x80;
|
||||
utf8[3] = ((ucs >> 12) & 0x3f) | 0x80;
|
||||
utf8[4] = ((ucs >> 6) & 0x3f) | 0x80;
|
||||
utf8[5] = (ucs & 0x3f) | 0x80;
|
||||
utf8[6] = 0;
|
||||
return 6;
|
||||
} else {
|
||||
utf8[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
wc_uint32
|
||||
wc_utf8_to_ucs(wc_uchar *utf8)
|
||||
{
|
||||
wc_uint32 ucs;
|
||||
|
||||
switch (WC_UTF8_MAP[utf8[0]]) {
|
||||
case 1:
|
||||
ucs = (wc_uint32) utf8[0];
|
||||
if (ucs >= WC_C_UTF8_L2)
|
||||
break;
|
||||
return ucs;
|
||||
case 2:
|
||||
ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6)
|
||||
| (wc_uint32)(utf8[1] & 0x3f);
|
||||
if (ucs < WC_C_UTF8_L2)
|
||||
break;
|
||||
return ucs;
|
||||
case 3:
|
||||
ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12)
|
||||
| ((wc_uint32)(utf8[1] & 0x3f) << 6)
|
||||
| (wc_uint32)(utf8[2] & 0x3f);
|
||||
if (ucs < WC_C_UTF8_L3)
|
||||
break;
|
||||
return ucs;
|
||||
case 4:
|
||||
ucs = ((wc_uint32)(utf8[0] & 0x07) << 18)
|
||||
| ((wc_uint32)(utf8[1] & 0x3f) << 12)
|
||||
| ((wc_uint32)(utf8[2] & 0x3f) << 6)
|
||||
| (wc_uint32)(utf8[3] & 0x3f);
|
||||
if (ucs < WC_C_UTF8_L4)
|
||||
break;
|
||||
return ucs;
|
||||
case 5:
|
||||
ucs = ((wc_uint32)(utf8[0] & 0x03) << 24)
|
||||
| ((wc_uint32)(utf8[1] & 0x3f) << 18)
|
||||
| ((wc_uint32)(utf8[2] & 0x3f) << 12)
|
||||
| ((wc_uint32)(utf8[3] & 0x3f) << 6)
|
||||
| (wc_uint32)(utf8[4] & 0x3f);
|
||||
if (ucs < WC_C_UTF8_L5)
|
||||
break;
|
||||
return ucs;
|
||||
case 6:
|
||||
ucs = ((wc_uint32)(utf8[0] & 0x01) << 30)
|
||||
| ((wc_uint32)(utf8[1] & 0x3f) << 24)
|
||||
| ((wc_uint32)(utf8[2] & 0x3f) << 18)
|
||||
| ((wc_uint32)(utf8[3] & 0x3f) << 12)
|
||||
| ((wc_uint32)(utf8[4] & 0x3f) << 6)
|
||||
| (wc_uint32)(utf8[5] & 0x3f);
|
||||
if (ucs < WC_C_UTF8_L6)
|
||||
break;
|
||||
return ucs;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return WC_C_UCS4_ERROR;
|
||||
}
|
||||
|
||||
Str
|
||||
wc_conv_from_utf8(Str is, wc_ces ces)
|
||||
{
|
||||
Str os;
|
||||
wc_uchar *sp = (wc_uchar *)is->ptr;
|
||||
wc_uchar *ep = sp + is->length;
|
||||
wc_uchar *p;
|
||||
wc_uchar *q = NULL;
|
||||
int state = WC_UTF8_NOSTATE;
|
||||
size_t next = 0;
|
||||
wc_uint32 ucs;
|
||||
wc_status st;
|
||||
|
||||
for (p = sp; p < ep && *p < 0x80; p++)
|
||||
;
|
||||
if (p == ep)
|
||||
return is;
|
||||
os = Strnew_size(is->length * 4 / 3);
|
||||
if (p > sp)
|
||||
Strcat_charp_n(os, is->ptr, (int)(p - sp));
|
||||
|
||||
st.tag = NULL;
|
||||
st.ntag = 0;
|
||||
for (; p < ep; p++) {
|
||||
switch (state) {
|
||||
case WC_UTF8_NOSTATE:
|
||||
next = WC_UTF8_MAP[*p];
|
||||
switch (next) {
|
||||
case 1:
|
||||
wtf_push_ucs(os, (wc_uint32)*p, &st);
|
||||
break;
|
||||
case 8:
|
||||
Strcat_char(os, (char)*p);
|
||||
break;
|
||||
case 0:
|
||||
case 7:
|
||||
wtf_push_unknown(os, p, 1);
|
||||
break;
|
||||
default:
|
||||
q = p;
|
||||
next--;
|
||||
state = WC_UTF8_NEXT;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case WC_UTF8_NEXT:
|
||||
if (WC_UTF8_MAP[*p]) {
|
||||
wtf_push_unknown(os, q, p - q + 1);
|
||||
state = WC_UTF8_NOSTATE;
|
||||
break;
|
||||
}
|
||||
if (--next)
|
||||
break;
|
||||
state = WC_UTF8_NOSTATE;
|
||||
ucs = wc_utf8_to_ucs(q);
|
||||
if (ucs == WC_C_UCS4_ERROR ||
|
||||
(ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
|
||||
wtf_push_unknown(os, q, p - q + 1);
|
||||
else if (ucs != WC_C_UCS2_BOM)
|
||||
wtf_push_ucs(os, ucs, &st);
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch (state) {
|
||||
case WC_UTF8_NEXT:
|
||||
wtf_push_unknown(os, q, p - q);
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
static int
|
||||
wc_push_tag_to_utf8(Str os, int ntag)
|
||||
{
|
||||
char *p;
|
||||
|
||||
if (ntag) {
|
||||
p = wc_ucs_get_tag(ntag);
|
||||
if (p == NULL)
|
||||
ntag = 0;
|
||||
}
|
||||
if (ntag) {
|
||||
wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
for (; *p; p++) {
|
||||
wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
}
|
||||
} else {
|
||||
wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
}
|
||||
return ntag;
|
||||
}
|
||||
|
||||
void
|
||||
wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
|
||||
{
|
||||
while (1) {
|
||||
switch (WC_CCS_SET(cc.ccs)) {
|
||||
case WC_CCS_US_ASCII:
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
Strcat_char(os, (char)(cc.code & 0x7f));
|
||||
return;
|
||||
case WC_CCS_UCS2:
|
||||
case WC_CCS_UCS4:
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
wc_ucs_to_utf8(cc.code, utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
return;
|
||||
case WC_CCS_UCS_TAG:
|
||||
if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code));
|
||||
wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
return;
|
||||
case WC_CCS_ISO_8859_1:
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
wc_ucs_to_utf8((cc.code | 0x80), utf8_buf);
|
||||
Strcat_charp(os, (char *)utf8_buf);
|
||||
return;
|
||||
case WC_CCS_UNKNOWN_W:
|
||||
if (!WcOption.no_replace) {
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
Strcat_charp(os, WC_REPLACE_W);
|
||||
}
|
||||
return;
|
||||
case WC_CCS_UNKNOWN:
|
||||
if (!WcOption.no_replace) {
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
Strcat_charp(os, WC_REPLACE);
|
||||
}
|
||||
return;
|
||||
default:
|
||||
if (WcOption.ucs_conv &&
|
||||
(cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR)
|
||||
cc.ccs = WC_CCS_UCS2;
|
||||
else
|
||||
cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
wc_push_to_utf8_end(Str os, wc_status *st)
|
||||
{
|
||||
if (st->ntag)
|
||||
st->ntag = wc_push_tag_to_utf8(os, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
Str
|
||||
wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
|
||||
{
|
||||
static Str os;
|
||||
static wc_uchar buf[6];
|
||||
static size_t nbuf, next;
|
||||
wc_uint32 ucs;
|
||||
|
||||
if (st->state == -1) {
|
||||
st->state = WC_UTF8_NOSTATE;
|
||||
os = Strnew_size(8);
|
||||
st->tag = NULL;
|
||||
st->ntag = 0;
|
||||
nbuf = 0;
|
||||
}
|
||||
|
||||
switch (st->state) {
|
||||
case WC_UTF8_NOSTATE:
|
||||
switch (next = WC_UTF8_MAP[c]) {
|
||||
case 1:
|
||||
wtf_push_ucs(os, (wc_uint32)c, st);
|
||||
break;
|
||||
case 8:
|
||||
Strcat_char(os, (char)c);
|
||||
break;
|
||||
case 0:
|
||||
case 7:
|
||||
break;
|
||||
default:
|
||||
buf[nbuf++] = c;
|
||||
next--;
|
||||
st->state = WC_UTF8_NEXT;
|
||||
return NULL;
|
||||
}
|
||||
break;
|
||||
case WC_UTF8_NEXT:
|
||||
if (WC_UTF8_MAP[c])
|
||||
break;
|
||||
buf[nbuf++] = c;
|
||||
if (--next)
|
||||
return NULL;
|
||||
ucs = wc_utf8_to_ucs(buf);
|
||||
if (ucs == WC_C_UCS4_ERROR ||
|
||||
(ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
|
||||
break;
|
||||
if (ucs != WC_C_UCS2_BOM)
|
||||
wtf_push_ucs(os, ucs, st);
|
||||
break;
|
||||
}
|
||||
st->state = -1;
|
||||
return os;
|
||||
}
|
||||
|
||||
#endif
|
Reference in New Issue
Block a user