From a9a57385f0a5648bbfeccc5e8ef0bfe2cdac80c7 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Fri, 14 Dec 2012 19:52:59 +0100 Subject: erts: Change internal representation of atoms to utf8 --- erts/emulator/beam/atom.h | 51 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'erts/emulator/beam/atom.h') diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index fd9c04d3d0..84dd6d8901 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -26,7 +26,9 @@ #include "erl_atom_table.h" -#define MAX_ATOM_LENGTH 255 +#define MAX_ATOM_CHARACTERS 255 +#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS) +#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 @@ -53,8 +55,8 @@ typedef struct atom { extern IndexTable erts_atom_table; ERTS_GLB_INLINE Atom* atom_tab(Uint i); -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term); -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term); +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1); #if ERTS_GLB_INLINE_INCL_FUNC_DEF ERTS_GLB_INLINE Atom* @@ -63,7 +65,7 @@ atom_tab(Uint i) return (Atom *) erts_index_lookup(&erts_atom_table, i); } -ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term) { Atom *a; if (!is_atom(term)) @@ -73,30 +75,50 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term) && sys_memcmp((void *) a->name, (void *) text, len) == 0); } -ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term) +ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) { Atom *a; int i, len; - char *aname; + const byte* aname; + const byte* s = (const byte*) str; + if (!is_atom(term)) return 0; a = atom_tab(atom_val(term)); len = a->len; - aname = (char *) a->name; - for (i = 0; i < len; i++) - if (aname[i] != str[i] || str[i] == '\0') - return 0; - return str[len] == '\0'; + aname = a->name; + if (is_latin1) { + for (i = 0; i < len; s++) { + if (aname[i] < 0x80) { + if (aname[i] != *s || *s == '\0') + return 0; + i++; + } + else { + if (aname[i] != (0xC0 | (*s >> 6)) || + aname[i+1] != (0x80 | (*s & 0x3F))) { + return 0; + } + i += 2; + } + } + } + else { + for (i = 0; i < len; i++, s++) + if (aname[i] != *s || *s == '\0') + return 0; + } + return *s == '\0'; } #endif /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a - * string literal. + * 7-bit ASCII string literal. */ #define ERTS_IS_ATOM_STR(LSTR, TERM) \ - (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) + (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM))) #define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) #define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) @@ -104,12 +126,13 @@ int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ +Eterm am_atom_put2(const byte*, int, int is_latin1); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); void atom_info(int, void *); void dump_atoms(int, void *); -int erts_atom_get(const char* name, int len, Eterm* ap); +int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1); void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used); #endif -- cgit v1.2.3 From 0dd3b88cdf90283d9c276ee415f985cb764e522f Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 10 Jan 2013 12:47:46 +0100 Subject: UTF-8 support for distribution --- erts/emulator/beam/atom.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'erts/emulator/beam/atom.h') diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index 84dd6d8901..f721999a4c 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -47,7 +47,8 @@ */ typedef struct atom { IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */ - int len; /* length of atom name */ + Sint16 len; /* length of atom name (UTF-8 encoded) */ + Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */ int ord0; /* ordinal value of first 3 bytes + 7 bits */ byte* name; /* name of atom */ } Atom; @@ -113,6 +114,12 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) #endif +typedef enum { + ERTS_ATOM_ENC_7BIT_ASCII, + ERTS_ATOM_ENC_LATIN1, + ERTS_ATOM_ENC_UTF8 +} ErtsAtomEncoding; + /* * Note, ERTS_IS_ATOM_STR() expects the first argument to be a * 7-bit ASCII string literal. @@ -125,8 +132,8 @@ ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1) int atom_table_size(void); /* number of elements */ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ -Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */ -Eterm am_atom_put2(const byte*, int, int is_latin1); +Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ +Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); int atom_erase(byte*, int); int atom_static_put(byte*, int); void init_atom_table(void); -- cgit v1.2.3