From 5d79f55ca441727578d34b78ee0d6d8aa80976ee Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Sat, 5 Jan 2013 03:07:14 +0100 Subject: Implement UTF-8 atom support for jinterface --- .../com/ericsson/otp/erlang/AbstractNode.java | 4 +- .../com/ericsson/otp/erlang/OtpErlangAtom.java | 2 +- .../com/ericsson/otp/erlang/OtpExternal.java | 6 + .../com/ericsson/otp/erlang/OtpInputStream.java | 64 +++++++-- .../com/ericsson/otp/erlang/OtpOutputStream.java | 60 ++++++++- lib/jinterface/test/nc_SUITE.erl | 146 +++++++++++++++------ 6 files changed, 226 insertions(+), 56 deletions(-) diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java index 16cb544a16..c76fad5e45 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/AbstractNode.java @@ -90,6 +90,8 @@ public class AbstractNode { static final int dFlagExportPtrTag = 0x200; // NOT SUPPORTED static final int dFlagBitBinaries = 0x400; static final int dFlagNewFloats = 0x800; + static final int dFlagUnicodeIo = 0x1000; + static final int dFlagUtf8Atoms = 0x10000; int ntype = NTYPE_R6; int proto = 0; // tcp/ip @@ -98,7 +100,7 @@ public class AbstractNode { int creation = 0; int flags = dFlagExtendedReferences | dFlagExtendedPidsPorts | dFlagBitBinaries | dFlagNewFloats | dFlagFunTags - | dflagNewFunTags; + | dflagNewFunTags | dFlagUtf8Atoms; /* initialize hostname and default cookie */ static { diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java index ced4dbb8c2..2768edc6fa 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangAtom.java @@ -51,7 +51,7 @@ public class OtpErlangAtom extends OtpErlangObject implements Serializable, "null string value"); } - if (atom.length() > maxAtomLength) { + if (atom.codePointCount(0, atom.length()) > maxAtomLength) { throw new java.lang.IllegalArgumentException("Atom may not exceed " + maxAtomLength + " characters: " + atom); } diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java index e70b9a786b..2a4cd4fa2d 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpExternal.java @@ -88,6 +88,12 @@ public class OtpExternal { /** The tag used for old Funs */ public static final int funTag = 117; + /** The tag used for unicode atoms */ + public static final int atomUtf8Tag = 118; + + /** The tag used for small unicode atoms */ + public static final int smallAtomUtf8Tag = 119; + /** The tag used for compressed terms */ public static final int compressedTag = 80; diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java index ae5f4ee072..c2a79af841 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpInputStream.java @@ -351,26 +351,64 @@ public class OtpInputStream extends ByteArrayInputStream { */ public String read_atom() throws OtpErlangDecodeException { int tag; - int len; + int len = -1; byte[] strbuf; String atom; tag = read1skip_version(); - if (tag != OtpExternal.atomTag) { - throw new OtpErlangDecodeException( - "wrong tag encountered, expected " + OtpExternal.atomTag - + ", got " + tag); - } + switch (tag) { - len = read2BE(); + case OtpExternal.atomTag: + len = read2BE(); + strbuf = new byte[len]; + this.readN(strbuf); + try { + atom = new String(strbuf, "ISO-8859-1"); + } catch (final java.io.UnsupportedEncodingException e) { + throw new OtpErlangDecodeException( + "Failed to decode ISO-8859-1 atom"); + } + if (atom.length() > OtpExternal.maxAtomLength) { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + atom = atom.substring(0, OtpExternal.maxAtomLength); + } + break; - strbuf = new byte[len]; - this.readN(strbuf); - atom = OtpErlangString.newString(strbuf); + case OtpExternal.smallAtomUtf8Tag: + len = read1(); + /* fall through */ + case OtpExternal.atomUtf8Tag: + if (len < 0) { + len = read2BE(); + } + strbuf = new byte[len]; + this.readN(strbuf); + try { + atom = new String(strbuf, "UTF-8"); + } catch (final java.io.UnsupportedEncodingException e) { + throw new OtpErlangDecodeException( + "Failed to decode UTF-8 atom"); + } + if (atom.codePointCount(0, atom.length()) > OtpExternal.maxAtomLength) { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + final int[] cps = OtpErlangString.stringToCodePoints(atom); + atom = new String(cps, 0, OtpExternal.maxAtomLength); + } + break; - if (atom.length() > OtpExternal.maxAtomLength) { - atom = atom.substring(0, OtpExternal.maxAtomLength); + default: + throw new OtpErlangDecodeException( + "wrong tag encountered, expected " + OtpExternal.atomTag + + ", or " + OtpExternal.atomUtf8Tag + ", got " + tag); } return atom; @@ -1152,6 +1190,8 @@ public class OtpInputStream extends ByteArrayInputStream { return new OtpErlangLong(this); case OtpExternal.atomTag: + case OtpExternal.smallAtomUtf8Tag: + case OtpExternal.atomUtf8Tag: return new OtpErlangAtom(this); case OtpExternal.floatTag: diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java index 22ebb4688a..10bdf389cd 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpOutputStream.java @@ -343,9 +343,63 @@ public class OtpOutputStream extends ByteArrayOutputStream { * the string to write. */ public void write_atom(final String atom) { - write1(OtpExternal.atomTag); - write2BE(atom.length()); - writeN(atom.getBytes()); + String enc_atom; + byte[] bytes; + boolean isLatin1 = true; + + if (atom.codePointCount(0, atom.length()) <= OtpExternal.maxAtomLength) { + enc_atom = atom; + } + else { + /* + * Throwing an exception would be better I think, + * but truncation seems to be the way it has + * been done in other parts of OTP... + */ + enc_atom = new String(OtpErlangString.stringToCodePoints(atom), + 0, OtpExternal.maxAtomLength); + } + + for (int offset = 0; offset < enc_atom.length();) { + final int cp = enc_atom.codePointAt(offset); + if ((cp & ~0xFF) != 0) { + isLatin1 = false; + break; + } + offset += Character.charCount(cp); + } + try { + if (isLatin1) { + bytes = enc_atom.getBytes("ISO-8859-1"); + write1(OtpExternal.atomTag); + write2BE(bytes.length); + } + else { + bytes = enc_atom.getBytes("UTF-8"); + final int length = bytes.length; + if (length < 256) { + write1(OtpExternal.smallAtomUtf8Tag); + write1(length); + } + else { + write1(OtpExternal.atomUtf8Tag); + write2BE(length); + } + } + writeN(bytes); + } catch (final java.io.UnsupportedEncodingException e) { + /* + * Sigh, why didn't the API designer add an + * OtpErlangEncodeException to these encoding + * functions?!? Instead of changing the API we + * write an invalid atom and let it fail for + * whoever trying to decode this... Sigh, + * again... + */ + write1(OtpExternal.smallAtomUtf8Tag); + write1(2); + write2BE(0xffff); /* Invalid UTF-8 */ + } } /** diff --git a/lib/jinterface/test/nc_SUITE.erl b/lib/jinterface/test/nc_SUITE.erl index d5388e54f4..ccc87a6761 100644 --- a/lib/jinterface/test/nc_SUITE.erl +++ b/lib/jinterface/test/nc_SUITE.erl @@ -22,6 +22,15 @@ -include_lib("common_test/include/ct.hrl"). -include("test_server_line.hrl"). +-define(VERSION_MAGIC, 131). + +-define(ATOM_EXT, 100). +-define(REFERENCE_EXT, 101). +-define(PORT_EXT, 102). +-define(PID_EXT, 103). +-define(NEW_REFERENCE_EXT, 114). +-define(ATOM_UTF8_EXT, 118). +-define(SMALL_ATOM_UTF8_EXT, 119). -export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2, init_per_suite/1, @@ -45,6 +54,10 @@ unicode/1, unicode_list_to_string/1, unicode_string_to_list/1, + utf8_atom/1, + utf8_pid/1, + utf8_port/1, + utf8_ref/1, connect/1]). @@ -58,7 +71,9 @@ all() -> decompress_roundtrip, compress_roundtrip, integer_roundtrip, fun_roundtrip, lists_roundtrip, lists_roundtrip_2, lists_iterator, unicode, - unicode_list_to_string, unicode_string_to_list, connect]. + unicode_list_to_string, unicode_string_to_list, + utf8_atom, utf8_pid, utf8_port, utf8_ref, + connect]. groups() -> []. @@ -448,6 +463,71 @@ unicode_string_to_list(Config) when is_list(Config) -> end, ["unicode"]). +evil_smiley() -> + <<240,159,152,136>>. + +evil_smileys(0) -> + []; +evil_smileys(N) -> + [evil_smiley() | evil_smileys(N-1)]. + +utf8_atom(Config) when is_list(Config) -> + ES = evil_smiley(), + SmallUA = binary_to_term(list_to_binary([?VERSION_MAGIC, + ?SMALL_ATOM_UTF8_EXT, + size(ES), + ES])), + true = is_atom(SmallUA), + NoESs = 300 div size(ES), + ESs = evil_smileys(NoESs), + LargeUA = binary_to_term(list_to_binary([?VERSION_MAGIC, + ?ATOM_UTF8_EXT, + uint16_be(NoESs*size(ES)), + ESs])), + true = is_atom(LargeUA), + erlang:display({atom, SmallUA, LargeUA}), + do_echo([SmallUA, LargeUA], Config). + +utf8_nodenames_ext() -> + H = "@host", + ES = evil_smiley(), + SmallUANodeExt = list_to_binary([?SMALL_ATOM_UTF8_EXT, + size(ES)+length(H), + ES, + H]), + NoESs = 300 div size(ES), + ESs = evil_smileys(NoESs), + LargeUANodeExt = list_to_binary([?ATOM_UTF8_EXT, + uint16_be(NoESs*size(ES)+length(H)), + ESs, + H]), + {SmallUANodeExt, LargeUANodeExt}. + +utf8_pid(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallPid = mk_pid({SmallUANodeExt, 2}, 4711, 4711), + LargePid = mk_pid({LargeUANodeExt, 2}, 4711, 4711), + erlang:display({pid, SmallPid, node(SmallPid)}), + erlang:display({pid, LargePid, node(LargePid)}), + do_echo([SmallPid, LargePid], Config). + +utf8_port(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallPort = mk_port({SmallUANodeExt, 2}, 4711), + erlang:display({port, SmallPort, node(SmallPort)}), + LargePort = mk_port({LargeUANodeExt, 2}, 4711), + erlang:display({port, LargePort, node(LargePort)}), + do_echo([SmallPort, LargePort], Config). + +utf8_ref(Config) when is_list(Config) -> + {SmallUANodeExt, LargeUANodeExt} = utf8_nodenames_ext(), + SmallRef = mk_ref({SmallUANodeExt, 2}, [4711, 4711, 4711]), + erlang:display({ref, SmallRef, node(SmallRef)}), + LargeRef = mk_ref({LargeUANodeExt, 2}, [4711, 4711, 4711]), + erlang:display({ref, LargeRef, node(LargeRef)}), + do_echo([SmallRef, LargeRef], Config). + + %% Lazy list cp_gen(N) -> cp_gen(N, -1, 16#110000). @@ -646,16 +726,6 @@ make_name() -> ++ "-" ++ integer_to_list(B) ++ "-" ++ integer_to_list(C)). - - --define(VERSION_MAGIC, 131). - --define(ATOM_EXT, 100). --define(REFERENCE_EXT, 101). --define(PORT_EXT, 102). --define(PID_EXT, 103). --define(NEW_REFERENCE_EXT, 114). - uint32_be(Uint) when is_integer(Uint), 0 =< Uint, Uint < 1 bsl 32 -> [(Uint bsr 24) band 16#ff, (Uint bsr 16) band 16#ff, @@ -679,72 +749,70 @@ uint8(Uint) -> mk_pid({NodeName, Creation}, Number, Serial) when is_atom(NodeName) -> - mk_pid({atom_to_list(NodeName), Creation}, Number, Serial); -mk_pid({NodeName, Creation}, Number, Serial) -> + <> = term_to_binary(NodeName), + mk_pid({NodeNameExt, Creation}, Number, Serial); +mk_pid({NodeNameExt, Creation}, Number, Serial) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PID_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint32_be(Serial), uint8(Creation)])) of Pid when is_pid(Pid) -> Pid; {'EXIT', {badarg, _}} -> - exit({badarg, mk_pid, [{NodeName, Creation}, Number, Serial]}); + exit({badarg, mk_pid, [{NodeNameExt, Creation}, Number, Serial]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. mk_port({NodeName, Creation}, Number) when is_atom(NodeName) -> - mk_port({atom_to_list(NodeName), Creation}, Number); -mk_port({NodeName, Creation}, Number) -> + <> = term_to_binary(NodeName), + mk_port({NodeNameExt, Creation}, Number); +mk_port({NodeNameExt, Creation}, Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?PORT_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Port when is_port(Port) -> Port; {'EXIT', {badarg, _}} -> - exit({badarg, mk_port, [{NodeName, Creation}, Number]}); + exit({badarg, mk_port, [{NodeNameExt, Creation}, Number]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. -mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), - is_integer(Creation), - is_list(Numbers) -> - mk_ref({atom_to_list(NodeName), Creation}, Numbers); -mk_ref({NodeName, Creation}, [Number]) when is_list(NodeName), - is_integer(Creation), - is_integer(Number) -> +mk_ref({NodeName, Creation}, [Number] = NL) when is_atom(NodeName), + is_integer(Creation), + is_integer(Number) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, NL); +mk_ref({NodeNameExt, Creation}, [Number]) when is_integer(Creation), + is_integer(Number) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?REFERENCE_EXT, - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint32_be(Number), uint8(Creation)])) of Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, [Number]]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, [Number]]}); Other -> exit({unexpected_binary_to_term_result, Other}) end; -mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), +mk_ref({NodeName, Creation}, Numbers) when is_atom(NodeName), is_integer(Creation), is_list(Numbers) -> + <> = term_to_binary(NodeName), + mk_ref({NodeNameExt, Creation}, Numbers); +mk_ref({NodeNameExt, Creation}, Numbers) when is_integer(Creation), + is_list(Numbers) -> case catch binary_to_term(list_to_binary([?VERSION_MAGIC, ?NEW_REFERENCE_EXT, uint16_be(length(Numbers)), - ?ATOM_EXT, - uint16_be(length(NodeName)), - NodeName, + NodeNameExt, uint8(Creation), lists:map(fun (N) -> uint32_be(N) @@ -753,7 +821,7 @@ mk_ref({NodeName, Creation}, Numbers) when is_list(NodeName), Ref when is_reference(Ref) -> Ref; {'EXIT', {badarg, _}} -> - exit({badarg, mk_ref, [{NodeName, Creation}, Numbers]}); + exit({badarg, mk_ref, [{NodeNameExt, Creation}, Numbers]}); Other -> exit({unexpected_binary_to_term_result, Other}) end. -- cgit v1.2.3