about | summary | refs | log | tree | commit | diff | stats
path: root/erts/emulator/beam/atom.h
diff options
context:
space:
mode:
author Sverker Eriksson <[email protected]> 2013-01-23 18:09:35 +0100
committer Sverker Eriksson <[email protected]> 2013-01-23 18:09:35 +0100
commit b8e623410d1c22fe6d5fdeb8ccb0b2305533f033 (patch)
tree 708d64e36e18b61ae1801c02ec3aeef42a697be3 /erts/emulator/beam/atom.h
parent e99df74bee7c245ec76678e336fcd09d4b51a089 (diff)
parent d6e3e256b850050b7a86323b2948009d5fcc30a9 (diff)
download otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.gz
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.tar.bz2
otp-b8e623410d1c22fe6d5fdeb8ccb0b2305533f033.zip
Merge branch 'sverk/r16/utf8-atoms'
* sverk/r16/utf8-atoms: erl_interface: Fix bug when transcoding atoms from and to UTF8 erl_interface: Changed erlang_char_encoding interface erts: Testcase doing unicode atom printout with ~w erl_interface: even more utf8 atom stuff erts: Fix bug in analyze_utf8 causing faulty latin1 detection Add UTF-8 node name support for epmd workaround... Fix merge conflict with hasse UTF-8 atom documentation test case erl_interface: utf8 atoms continued Add utf8 atom distribution test cases atom fixes for NIFs and atom_to_binary UTF-8 support for distribution Implement UTF-8 atom support for jinterface erl_interface: Enable decode of unicode atoms stdlib: Fix printing of unicode atoms erts: Change internal representation of atoms to utf8 erts: Refactor rename DFLAG(S)_INTERNAL_TAGS for conformity
Conflicts: erts/emulator/beam/io.c
OTP-10753
Diffstat (limited to 'erts/emulator/beam/atom.h')
-rw-r--r-- erts/emulator/beam/atom.h 62
1 files changed, 46 insertions, 16 deletions
diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h
index fd9c04d3d0..f721999a4c 100644
--- a/erts/emulator/beam/atom.h
+++ b/erts/emulator/beam/atom.h
@@ -26,7 +26,9 @@
#include "erl_atom_table.h"
-#define MAX_ATOM_LENGTH 255
+#define MAX_ATOM_CHARACTERS 255
+#define MAX_ATOM_SZ_FROM_LATIN1 (2*MAX_ATOM_CHARACTERS)
+#define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */
#define ATOM_LIMIT (1024*1024)
#define MIN_ATOM_TABLE_SIZE 8192
@@ -45,7 +47,8 @@
*/
typedef struct atom {
IndexSlot slot; /* MUST BE LOCATED AT TOP OF STRUCT!!! */
- int len; /* length of atom name */
+ Sint16 len; /* length of atom name (UTF-8 encoded) */
+ Sint16 latin1_chars; /* 0-255 if atom can be encoded in latin1; otherwise, -1 */
int ord0; /* ordinal value of first 3 bytes + 7 bits */
byte* name; /* name of atom */
} Atom;
@@ -53,8 +56,8 @@ typedef struct atom {
extern IndexTable erts_atom_table;
ERTS_GLB_INLINE Atom* atom_tab(Uint i);
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term);
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term);
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1);
#if ERTS_GLB_INLINE_INCL_FUNC_DEF
ERTS_GLB_INLINE Atom*
@@ -63,7 +66,7 @@ atom_tab(Uint i)
return (Atom *) erts_index_lookup(&erts_atom_table, i);
}
-ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_utf8_bytes(byte *text, size_t len, Eterm term)
{
Atom *a;
if (!is_atom(term))
@@ -73,43 +76,70 @@ ERTS_GLB_INLINE int erts_is_atom_bytes(byte *text, size_t len, Eterm term)
&& sys_memcmp((void *) a->name, (void *) text, len) == 0);
}
-ERTS_GLB_INLINE int erts_is_atom_str(char *str, Eterm term)
+ERTS_GLB_INLINE int erts_is_atom_str(const char *str, Eterm term, int is_latin1)
{
Atom *a;
int i, len;
- char *aname;
+ const byte* aname;
+ const byte* s = (const byte*) str;
+
if (!is_atom(term))
return 0;
a = atom_tab(atom_val(term));
len = a->len;
- aname = (char *) a->name;
- for (i = 0; i < len; i++)
- if (aname[i] != str[i] || str[i] == '\0')
- return 0;
- return str[len] == '\0';
+ aname = a->name;
+ if (is_latin1) {
+ for (i = 0; i < len; s++) {
+ if (aname[i] < 0x80) {
+ if (aname[i] != *s || *s == '\0')
+ return 0;
+ i++;
+ }
+ else {
+ if (aname[i] != (0xC0 | (*s >> 6)) ||
+ aname[i+1] != (0x80 | (*s & 0x3F))) {
+ return 0;
+ }
+ i += 2;
+ }
+ }
+ }
+ else {
+ for (i = 0; i < len; i++, s++)
+ if (aname[i] != *s || *s == '\0')
+ return 0;
+ }
+ return *s == '\0';
}
#endif
+typedef enum {
+ ERTS_ATOM_ENC_7BIT_ASCII,
+ ERTS_ATOM_ENC_LATIN1,
+ ERTS_ATOM_ENC_UTF8
+} ErtsAtomEncoding;
+
/*
* Note, ERTS_IS_ATOM_STR() expects the first argument to be a
- * string literal.
+ * 7-bit ASCII string literal.
*/
#define ERTS_IS_ATOM_STR(LSTR, TERM) \
- (erts_is_atom_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
+ (erts_is_atom_utf8_bytes((byte *) LSTR, sizeof(LSTR) - 1, (TERM)))
#define ERTS_DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
#define ERTS_INIT_AM(S) AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
int atom_table_size(void); /* number of elements */
int atom_table_sz(void); /* table size in bytes, excluding stored objects */
-Eterm am_atom_put(const char*, int); /* most callers pass plain char*'s */
+Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */
+Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc);
int atom_erase(byte*, int);
int atom_static_put(byte*, int);
void init_atom_table(void);
void atom_info(int, void *);
void dump_atoms(int, void *);
-int erts_atom_get(const char* name, int len, Eterm* ap);
+int erts_atom_get(const char* name, int len, Eterm* ap, int is_latin1);
void erts_atom_get_text_space_sizes(Uint *reserved, Uint *used);
#endif