diff options
author | Rickard Green <[email protected]> | 2013-01-10 12:47:46 +0100 |
---|---|---|
committer | Rickard Green <[email protected]> | 2013-01-16 17:16:52 +0100 |
commit | 0dd3b88cdf90283d9c276ee415f985cb764e522f (patch) | |
tree | 1584d76d9960339a03c04412ef7919473e7b2efc /erts/emulator/beam/erl_unicode.c | |
parent | 5d79f55ca441727578d34b78ee0d6d8aa80976ee (diff) | |
download | otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.gz otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.tar.bz2 otp-0dd3b88cdf90283d9c276ee415f985cb764e522f.zip |
UTF-8 support for distribution
Diffstat (limited to 'erts/emulator/beam/erl_unicode.c')
-rw-r--r-- | erts/emulator/beam/erl_unicode.c | 139 |
1 files changed, 115 insertions, 24 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index e24b6f1458..6600ce4a4a 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1154,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * When input to characters_to_list is a plain binary and the format is 'unicode', we do * a faster analyze and size count with this function. */ -int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) +static ERTS_INLINE int +analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) { + Uint latin1_count; + int is_latin1; *err_pos = source; + if (num_latin1_chars) { + is_latin1 = 1; + latin1_count = 0; + } *num_chars = 0; while (size) { if (((*source) & ((byte) 0x80)) == 0) { source++; - --size; + --size; + if (num_latin1_chars) + latin1_count++; } else if (((*source) & ((byte) 0xE0)) == 0xC0) { if (size < 2) { return ERTS_UTF8_INCOMPLETE; @@ -1173,6 +1182,11 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 2; size -= 2; + if (num_latin1_chars) { + latin1_count++; + if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) + is_latin1 = 0; + } } else if (((*source) & ((byte) 0xF0)) == 0xE0) { if (size < 3) { return ERTS_UTF8_INCOMPLETE; @@ -1188,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 3; size -= 3; + if (num_latin1_chars) + is_latin1 = 0; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { if (size < 4) { return ERTS_UTF8_INCOMPLETE; @@ -1205,22 +1221,41 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 4; size -= 4; + if (num_latin1_chars) + is_latin1 = 0; } else { return ERTS_UTF8_ERROR; } ++(*num_chars); *err_pos = source; + if (max_chars && size > 0 && *num_chars == max_chars) + return ERTS_UTF8_OK_MAX_CHARS; if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } + if (num_latin1_chars) + *num_latin1_chars = is_latin1 ? latin1_count : -1; return ERTS_UTF8_OK; } +int erts_analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0); +} + +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars); +} + /* * No errors should be able to occur - no overlongs, no malformed, no nothing - */ -Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, + */ +static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) { @@ -1275,6 +1310,12 @@ Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, return ret; } +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail); +} + static int is_candidate(Uint cp) { int index,pos; @@ -1849,14 +1890,14 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) } static BIF_RETTYPE -binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) { byte* bytes; byte *temp_alloc = NULL; Uint bin_size; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); if (enc == am_latin1) { @@ -1864,11 +1905,16 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, SYSTEM_LIMIT); + BIF_ERROR(proc, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put2(bytes, bin_size, 1); - erts_free_aligned_binary_bytes(temp_alloc); + a = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(a)) + goto badarg; BIF_RET(a); } else if (erts_atom_get((char *)bytes, bin_size, &a, 1)) { erts_free_aligned_binary_bytes(temp_alloc); @@ -1900,17 +1946,22 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) } if (!must_exist) { - res = am_atom_put((char*)bytes, bin_size); + res = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); } else if (!erts_atom_get((char*)bytes, bin_size, &res, 0)) { goto badarg; } erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(res)) + goto badarg; BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } } @@ -2625,30 +2676,70 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } -/* Assumes 'dest' has enough room. - */ -int erts_utf8_to_latin1(byte* dest, const byte* source, unsigned slen) +int erts_utf8_is_latin1_string(const byte *string, int len) +{ + /* Assumes string is encoded in valid UTF-8 */ + int i; + while (i < len) { + if ((string[i] & 0x80) == 0) + i++; + else if (i+1 < len + && (string[i] & 0xFE) == 0xC2 + && (string[i+1] & 0xC0) == 0x80) + i +=2; + else + return 0; + } + return 1; +} + +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) { + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ byte* dp = dest; while (slen > 0) { if ((source[0] & 0x80) == 0) { *dp++ = *source++; --slen; } - else if (slen > 1 && - (source[0] & 0xFE) == 0xC2 && - (source[1] & 0xC0) == 0x80) { + else { + ASSERT(slen > 1); + ASSERT((source[0] & 0xFE) == 0xC2); + ASSERT((source[1] & 0xC0) == 0x80); *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); source += 2; slen -= 2; } - else { - /* Just let unconvertable octets through. This should not happen - in a correctly upgraded system */ - *dp++ = *source++; - --slen; - } } return dp - dest; } +int erts_utf8_to_latin1_backwards(byte *dest, const byte *source, int slen) +{ + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ + int dix = 0; + int six = slen; + while (six > 0) { + six--; + dix--; + if ((source[six] & 0x80) == 0) + dest[dix] = source[six]; + else { + byte c; + ASSERT(six > 0); + ASSERT((source[six] & 0xC0) == 0x80); + ASSERT((source[six-1] & 0xFE) == 0xC2); + c = source[six] & 0x3F; + six--; + c |= source[six] << 6; + dest[dix] = c; + } + } + return -dix; +} |