NLS: update handling of Unicode

This patch (as1239) updates the kernel's treatment of Unicode.  The
character-set conversion routines are well behind the current state of
the Unicode specification: They don't recognize the existence of code
points beyond plane 0 or of surrogate pairs in the UTF-16 encoding.

The old wchar_t 16-bit type is retained because it's still used in
lots of places.  This shouldn't cause any new problems; if a
conversion now results in an invalid 16-bit code, then previously it
must have yielded an undefined code.

Difficult-to-read names like "utf_mbstowcs" are replaced with more
transparent names like "utf8s_to_utf16s" and the ordering of the
parameters is rationalized (buffer lengths come immediately after the
pointers they refer to, and the inputs precede the outputs).
Fortunately the low-level conversion routines are used in only a few
places; the interfaces to the higher-level uni2char and char2uni
methods have been left unchanged.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Clemens Ladisch <clemens@ladisch.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
This commit is contained in:
Alan Stern
2009-04-30 10:08:18 -04:00
committad av Greg Kroah-Hartman
förälder a853a3d4eb
incheckning 74675a5850
9 ändrade filer med 183 tillägg och 138 borttagningar

Visa fil

@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/spinlock.h>
#include <asm/byteorder.h>
static struct nls_table default_table;
static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
{0, /* end of table */}
};
int
utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
#define UNICODE_MAX 0x0010ffff
#define PLANE_SIZE 0x00010000
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800
#define SURROGATE_LOW 0x00000400
#define SURROGATE_BITS 0x000003ff
int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
{
long l;
unsigned long l;
int c0, c, nc;
const struct utf8_table *t;
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
nc++;
if ((c0 & t->cmask) == t->cval) {
l &= t->lmask;
if (l < t->lval)
if (l < t->lval || l > UNICODE_MAX ||
(l & SURROGATE_MASK) == SURROGATE_PAIR)
return -1;
*p = l;
*pu = (unicode_t) l;
return nc;
}
if (n <= nc)
if (len <= nc)
return -1;
s++;
c = (*s ^ 0x80) & 0xFF;
@@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
}
return -1;
}
EXPORT_SYMBOL(utf8_to_utf32);
int
utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
{
__u16 *op;
const __u8 *ip;
int size;
op = pwcs;
ip = s;
while (*ip && n > 0) {
if (*ip & 0x80) {
size = utf8_mbtowc(op, ip, n);
if (size == -1) {
/* Ignore character and move on */
ip++;
n--;
} else {
op++;
ip += size;
n -= size;
}
} else {
*op++ = *ip++;
n--;
}
}
return (op - pwcs);
}
int
utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
{
long l;
unsigned long l;
int c, nc;
const struct utf8_table *t;
if (!s)
return 0;
l = wc;
l = u;
if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
return -1;
nc = 0;
for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
nc++;
if (l <= t->lmask) {
c = t->shift;
*s = t->cval | (l >> c);
*s = (u8) (t->cval | (l >> c));
while (c > 0) {
c -= 6;
s++;
*s = 0x80 | ((l >> c) & 0x3F);
*s = (u8) (0x80 | ((l >> c) & 0x3F));
}
return nc;
}
}
return -1;
}
EXPORT_SYMBOL(utf32_to_utf8);
int
utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
{
const __u16 *ip;
__u8 *op;
u16 *op;
int size;
unicode_t u;
op = pwcs;
while (*s && len > 0) {
if (*s & 0x80) {
size = utf8_to_utf32(s, len, &u);
if (size < 0) {
/* Ignore character and move on */
size = 1;
} else if (u >= PLANE_SIZE) {
u -= PLANE_SIZE;
*op++ = (wchar_t) (SURROGATE_PAIR |
((u >> 10) & SURROGATE_BITS));
*op++ = (wchar_t) (SURROGATE_PAIR |
SURROGATE_LOW |
(u & SURROGATE_BITS));
} else {
*op++ = (wchar_t) u;
}
s += size;
len -= size;
} else {
*op++ = *s++;
len--;
}
}
return op - pwcs;
}
EXPORT_SYMBOL(utf8s_to_utf16s);
/*
 * Return the value c interpreted according to the requested byte order:
 * UTF16_LITTLE_ENDIAN passes it through __le16_to_cpu(), UTF16_BIG_ENDIAN
 * through __be16_to_cpu(), and any other endian value returns c unchanged.
 */
static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
{
	if (endian == UTF16_LITTLE_ENDIAN)
		return __le16_to_cpu(c);

	if (endian == UTF16_BIG_ENDIAN)
		return __be16_to_cpu(c);

	/* No explicit byte order requested: use the value as stored. */
	return c;
}
int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
u8 *s, int maxlen)
{
u8 *op;
int size;
unsigned long u, v;
op = s;
ip = pwcs;
while (*ip && maxlen > 0) {
if (*ip > 0x7f) {
size = utf8_wctomb(op, *ip, maxlen);
while (len > 0 && maxlen > 0) {
u = get_utf16(*pwcs, endian);
if (!u)
break;
pwcs++;
len--;
if (u > 0x7f) {
if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
if (u & SURROGATE_LOW) {
/* Ignore character and move on */
continue;
}
if (len <= 0)
break;
v = get_utf16(*pwcs, endian);
if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
!(v & SURROGATE_LOW)) {
/* Ignore character and move on */
continue;
}
u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+ (v & SURROGATE_BITS);
pwcs++;
len--;
}
size = utf32_to_utf8(u, op, maxlen);
if (size == -1) {
/* Ignore character and move on */
} else {
@@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
maxlen -= size;
}
} else {
*op++ = (__u8) *ip;
*op++ = (u8) u;
maxlen--;
}
ip++;
}
return (op - s);
return op - s;
}
EXPORT_SYMBOL(utf16s_to_utf8s);
int register_nls(struct nls_table * nls)
{
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);
EXPORT_SYMBOL(utf8_mbtowc);
EXPORT_SYMBOL(utf8_mbstowcs);
EXPORT_SYMBOL(utf8_wctomb);
EXPORT_SYMBOL(utf8_wcstombs);
MODULE_LICENSE("Dual BSD/GPL");

Visa fil

@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
int n;
if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
if (boundlen <= 0)
return -ENAMETOOLONG;
n = utf32_to_utf8(uni, out, boundlen);
if (n < 0) {
*out = '?';
return -EINVAL;
}
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
int n;
unicode_t u;
if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
n = utf8_to_utf32(rawstring, boundlen, &u);
if (n < 0 || u > MAX_WCHAR_T) {
*uni = 0x003f; /* ? */
n = -EINVAL;
return -EINVAL;
}
*uni = (wchar_t) u;
return n;
}