Support UTF16-BE decoding (radareorg#15450)

* Support UTF16-BE decoding * Add spaces
tbodt · Nov 13, 2019 · a1f893e · a1f893e
1 parent 7b23752
commit a1f893e
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 13 deletions.
diff --git a/libr/core/cconfig.c b/libr/core/cconfig.c
@@ -3114,7 +3114,7 @@ R_API int r_core_config_init(RCore *core) {
 	SETICB ("bin.maxstrbuf", 1024*1024*10, & cb_binmaxstrbuf, "Maximum size of range to load strings from");
 	n = NODECB ("bin.str.enc", "guess", &cb_binstrenc);
 	SETDESC (n, "Default string encoding of binary");
-	SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "guess", NULL);
+	SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "utf16be", "guess", NULL);
 	SETCB ("bin.prefix", NULL, &cb_binprefix, "Prefix all symbols/sections/relocs with a specific string");
 	SETCB ("bin.rawstr", "false", &cb_rawstr, "Load strings from raw binaries");
 	SETCB ("bin.strings", "true", &cb_binstrings, "Load strings from rbin on startup");

diff --git a/libr/core/disasm.c b/libr/core/disasm.c
@@ -744,6 +744,8 @@ static RDisasmState * ds_init(RCore *core) {
 		ds->strenc = R_STRING_ENC_UTF16LE;
 	} else if (!strcmp (strenc_str, "utf32le")) {
 		ds->strenc = R_STRING_ENC_UTF32LE;
+	} else if (!strcmp (strenc_str, "utf16be")) {
+		ds->strenc = R_STRING_ENC_UTF16BE;
 	} else {
 		ds->strenc = R_STRING_ENC_GUESS;
 	}
@@ -3710,6 +3712,10 @@ static char *ds_esc_str(RDisasmState *ds, const char *str, int len, const char *
 		escstr = r_str_escape_utf32le (str, len, ds->show_asciidot, esc_bslash);
 		prefix = "U";
 		break;
+	case R_STRING_ENC_UTF16BE:
+		escstr = r_str_escape_utf16be (str, len, ds->show_asciidot, esc_bslash);
+		prefix = "ub";
+		break;
 	default:
 		str_len = strlen (str);
 		if ((str_len == 1 && len > 3 && str[2] && !str[3])

diff --git a/libr/include/r_util/r_str.h b/libr/include/r_util/r_str.h
@@ -14,6 +14,7 @@ typedef enum {
 	R_STRING_ENC_UTF8 = '8',
 	R_STRING_ENC_UTF16LE = 'u',
 	R_STRING_ENC_UTF32LE = 'U',
+	R_STRING_ENC_UTF16BE = 'b',
 	R_STRING_ENC_GUESS = 'g',
 } RStrEnc;
 
@@ -153,6 +154,7 @@ R_API char *r_str_escape_latin1(const char *buf, bool show_asciidot, bool esc_bs
 R_API char *r_str_escape_utf8(const char *buf, bool show_asciidot, bool esc_bslash);
 R_API char *r_str_escape_utf16le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
 R_API char *r_str_escape_utf32le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
+R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
 R_API void r_str_byte_escape(const char *p, char **dst, int dot_nl, bool default_dot, bool esc_bslash);
 R_API void r_str_uri_decode(char *buf);
 R_API char *r_str_uri_encode(const char *buf);

diff --git a/libr/include/r_util/r_utf16.h b/libr/include/r_util/r_utf16.h
@@ -4,7 +4,9 @@
 /* For RRune definition */
 #include "r_utf8.h"
 
+R_API int r_utf16_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian);
 R_API int r_utf16le_decode(const ut8 *ptr, int ptrlen, RRune *ch);
+R_API int r_utf16be_decode(const ut8 *ptr, int ptrlen, RRune *ch);
 R_API int r_utf16le_encode(ut8 *ptr, RRune ch);
 
 #endif //  R_UTF16_H
diff --git a/libr/util/str.c b/libr/util/str.c
@@ -1347,11 +1347,12 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 	}
 	switch (enc) {
 	case R_STRING_ENC_UTF16LE:
+	case R_STRING_ENC_UTF16BE:
 	case R_STRING_ENC_UTF32LE:
 		if (buf_size < 0) {
 			return NULL;
 		}
-		if (enc == R_STRING_ENC_UTF16LE) {
+		if (enc == R_STRING_ENC_UTF16LE || enc == R_STRING_ENC_UTF16BE) {
 			end = (char *)r_mem_mem_aligned ((ut8 *)buf, buf_size, (ut8 *)"\0\0", 2, 2);
 		} else {
 			end = (char *)r_mem_mem_aligned ((ut8 *)buf, buf_size, (ut8 *)"\0\0\0\0", 4, 4);
@@ -1375,10 +1376,13 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 	while (p < end) {
 		switch (enc) {
 		case R_STRING_ENC_UTF16LE:
+		case R_STRING_ENC_UTF16BE:
 		case R_STRING_ENC_UTF32LE:
-			ch_bytes = (enc == R_STRING_ENC_UTF16LE ?
-				    r_utf16le_decode ((ut8 *)p, end - p, &ch) :
-				    r_utf32le_decode ((ut8 *)p, end - p, &ch));
+			if (enc == R_STRING_ENC_UTF16LE || enc == R_STRING_ENC_UTF16BE) {
+				ch_bytes = r_utf16_decode ((ut8 *)p, end - p, &ch, enc == R_STRING_ENC_UTF16BE);
+			} else {
+				ch_bytes = r_utf32le_decode ((ut8 *)p, end - p, &ch);
+			}
 			if (ch_bytes == 0) {
 				p++;
 				continue;
@@ -1400,10 +1404,11 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
 				*q++ = "0123456789abcdef"[ch >> 4 * i & 0xf];
 			}
 		} else {
-			r_str_byte_escape (p, &q, false, false, esc_bslash);
+			r_str_byte_escape (p + (enc == R_STRING_ENC_UTF16BE), &q, false, false, esc_bslash);
 		}
 		switch (enc) {
 		case R_STRING_ENC_UTF16LE:
+		case R_STRING_ENC_UTF16BE:
 			p += ch_bytes < 2 ? 2 : ch_bytes;
 			break;
 		case R_STRING_ENC_UTF32LE:
@@ -1429,6 +1434,10 @@ R_API char *r_str_escape_utf32le(const char *buf, int buf_size, bool show_asciid
 	return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF32LE, show_asciidot, esc_bslash);
 }
 
+R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash) {
+	return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF16BE, show_asciidot, esc_bslash); // forwards all arguments to the shared escaper, selecting the UTF-16BE encoding
+}
+
 // JSON has special escaping requirements
 // TODO: merge with r_str_escape_utf() and r_str_byte_escape() using RStrEsc
 R_API char *r_str_escape_utf8_for_json(const char *buf, int buf_size) {

diff --git a/libr/util/utf16.c b/libr/util/utf16.c
@@ -3,32 +3,45 @@
 #include <r_types.h>
 #include <r_util.h>
 
-/* Convert an UTF-16LE buf into a unicode RRune */
-R_API int r_utf16le_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+/* Decode one UTF-16 code point from ptr (ptrlen bytes; big-endian when bigendian is true, little-endian otherwise) and store it in *ch if ch is non-NULL. Returns 4 for a surrogate pair, 2 for a unit with a non-zero high byte, 1 for a unit whose high byte is zero, and 0 when fewer than 2 bytes are available. */
+R_API int r_utf16_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian) {
 	if (ptrlen < 1) {
 		return 0;
 	}
-	if (ptrlen > 3 && (ptr[1] & 0xdc) == 0xd8 && (ptr[3] & 0xdc) == 0xdc) {
+	int high = !bigendian; // offset of the most significant byte inside each 16-bit unit
+	int low = !high; // offset of the least significant byte inside each 16-bit unit
+	if (ptrlen > 3 && (ptr[high] & 0xfc) == 0xd8 && (ptr[high + 2] & 0xfc) == 0xdc) { // lead unit D800-DBFF followed by trail unit DC00-DFFF
 		if (ch) {
-			*ch = ((ptr[1] & 3) << 24 | ptr[0] << 16 | (ptr[3] & 3) << 8 | ptr[2]) + 0x10000;
+			*ch = ((ptr[high] & 3) << 18 | ptr[low] << 10 | (ptr[high + 2] & 3) << 8 | ptr[low + 2])
+			      + 0x10000; // 10 payload bits per surrogate: lead bits go to 19..10, trail bits to 9..0
 		}
 		return 4;
 	}
-	if (ptrlen > 1 && ptr[1]) {
+	if (ptrlen > 1 && ptr[high]) { // non-zero high byte: a full 16-bit unit
 		if (ch) {
-			*ch = ptr[1] << 8 | ptr[0];
+			*ch = ptr[high] << 8 | ptr[low];
 		}
 		return 2;
 	}
 	if (ptrlen > 1) {
 		if (ch) {
-			*ch = (ut32)ptr[0];
+			*ch = (ut32)ptr[low]; // high byte is zero here, so the value is the low byte alone
 		}
 		return 1;
 	}
 	return 0;
 }
 
+/* Convert a UTF-16LE buffer into a Unicode RRune; forwards to r_utf16_decode with bigendian = false and returns its result */
+R_API int r_utf16le_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+	return r_utf16_decode (ptr, ptrlen, ch, false);
+}
+
+/* Convert a UTF-16BE buffer into a Unicode RRune; forwards to r_utf16_decode with bigendian = true and returns its result */
+R_API int r_utf16be_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
+	return r_utf16_decode (ptr, ptrlen, ch, true);
+}
+
 /* Convert a unicode RRune into a UTF-16LE buf */
 R_API int r_utf16le_encode(ut8 *ptr, RRune ch) {
 	if (ch < 0x10000) {