Skip to content

Commit

Permalink
Support UTF16-BE decoding (radareorg#15450)
Browse files Browse the repository at this point in the history
* Support UTF16-BE decoding

* Add spaces
  • Loading branch information
kazarmy authored Nov 13, 2019
1 parent 7b23752 commit a1f893e
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 13 deletions.
2 changes: 1 addition & 1 deletion libr/core/cconfig.c
Original file line number Diff line number Diff line change
Expand Up @@ -3114,7 +3114,7 @@ R_API int r_core_config_init(RCore *core) {
SETICB ("bin.maxstrbuf", 1024*1024*10, & cb_binmaxstrbuf, "Maximum size of range to load strings from");
n = NODECB ("bin.str.enc", "guess", &cb_binstrenc);
SETDESC (n, "Default string encoding of binary");
SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "guess", NULL);
SETOPTIONS (n, "latin1", "utf8", "utf16le", "utf32le", "utf16be", "guess", NULL);
SETCB ("bin.prefix", NULL, &cb_binprefix, "Prefix all symbols/sections/relocs with a specific string");
SETCB ("bin.rawstr", "false", &cb_rawstr, "Load strings from raw binaries");
SETCB ("bin.strings", "true", &cb_binstrings, "Load strings from rbin on startup");
Expand Down
6 changes: 6 additions & 0 deletions libr/core/disasm.c
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,8 @@ static RDisasmState * ds_init(RCore *core) {
ds->strenc = R_STRING_ENC_UTF16LE;
} else if (!strcmp (strenc_str, "utf32le")) {
ds->strenc = R_STRING_ENC_UTF32LE;
} else if (!strcmp (strenc_str, "utf16be")) {
ds->strenc = R_STRING_ENC_UTF16BE;
} else {
ds->strenc = R_STRING_ENC_GUESS;
}
Expand Down Expand Up @@ -3710,6 +3712,10 @@ static char *ds_esc_str(RDisasmState *ds, const char *str, int len, const char *
escstr = r_str_escape_utf32le (str, len, ds->show_asciidot, esc_bslash);
prefix = "U";
break;
case R_STRING_ENC_UTF16BE:
escstr = r_str_escape_utf16be (str, len, ds->show_asciidot, esc_bslash);
prefix = "ub";
break;
default:
str_len = strlen (str);
if ((str_len == 1 && len > 3 && str[2] && !str[3])
Expand Down
2 changes: 2 additions & 0 deletions libr/include/r_util/r_str.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ typedef enum {
R_STRING_ENC_UTF8 = '8',
R_STRING_ENC_UTF16LE = 'u',
R_STRING_ENC_UTF32LE = 'U',
R_STRING_ENC_UTF16BE = 'b',
R_STRING_ENC_GUESS = 'g',
} RStrEnc;

Expand Down Expand Up @@ -153,6 +154,7 @@ R_API char *r_str_escape_latin1(const char *buf, bool show_asciidot, bool esc_bs
R_API char *r_str_escape_utf8(const char *buf, bool show_asciidot, bool esc_bslash);
R_API char *r_str_escape_utf16le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
R_API char *r_str_escape_utf32le(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash);
R_API void r_str_byte_escape(const char *p, char **dst, int dot_nl, bool default_dot, bool esc_bslash);
R_API void r_str_uri_decode(char *buf);
R_API char *r_str_uri_encode(const char *buf);
Expand Down
2 changes: 2 additions & 0 deletions libr/include/r_util/r_utf16.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
/* For RRune definition */
#include "r_utf8.h"

R_API int r_utf16_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian);
R_API int r_utf16le_decode(const ut8 *ptr, int ptrlen, RRune *ch);
R_API int r_utf16be_decode(const ut8 *ptr, int ptrlen, RRune *ch);
R_API int r_utf16le_encode(ut8 *ptr, RRune ch);

#endif // R_UTF16_H
19 changes: 14 additions & 5 deletions libr/util/str.c
Original file line number Diff line number Diff line change
Expand Up @@ -1347,11 +1347,12 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
}
switch (enc) {
case R_STRING_ENC_UTF16LE:
case R_STRING_ENC_UTF16BE:
case R_STRING_ENC_UTF32LE:
if (buf_size < 0) {
return NULL;
}
if (enc == R_STRING_ENC_UTF16LE) {
if (enc == R_STRING_ENC_UTF16LE || enc == R_STRING_ENC_UTF16BE) {
end = (char *)r_mem_mem_aligned ((ut8 *)buf, buf_size, (ut8 *)"\0\0", 2, 2);
} else {
end = (char *)r_mem_mem_aligned ((ut8 *)buf, buf_size, (ut8 *)"\0\0\0\0", 4, 4);
Expand All @@ -1375,10 +1376,13 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
while (p < end) {
switch (enc) {
case R_STRING_ENC_UTF16LE:
case R_STRING_ENC_UTF16BE:
case R_STRING_ENC_UTF32LE:
ch_bytes = (enc == R_STRING_ENC_UTF16LE ?
r_utf16le_decode ((ut8 *)p, end - p, &ch) :
r_utf32le_decode ((ut8 *)p, end - p, &ch));
if (enc == R_STRING_ENC_UTF16LE || enc == R_STRING_ENC_UTF16BE) {
ch_bytes = r_utf16_decode ((ut8 *)p, end - p, &ch, enc == R_STRING_ENC_UTF16BE);
} else {
ch_bytes = r_utf32le_decode ((ut8 *)p, end - p, &ch);
}
if (ch_bytes == 0) {
p++;
continue;
Expand All @@ -1400,10 +1404,11 @@ static char *r_str_escape_utf(const char *buf, int buf_size, RStrEnc enc, bool s
*q++ = "0123456789abcdef"[ch >> 4 * i & 0xf];
}
} else {
r_str_byte_escape (p, &q, false, false, esc_bslash);
r_str_byte_escape (p + (enc == R_STRING_ENC_UTF16BE), &q, false, false, esc_bslash);
}
switch (enc) {
case R_STRING_ENC_UTF16LE:
case R_STRING_ENC_UTF16BE:
p += ch_bytes < 2 ? 2 : ch_bytes;
break;
case R_STRING_ENC_UTF32LE:
Expand All @@ -1429,6 +1434,10 @@ R_API char *r_str_escape_utf32le(const char *buf, int buf_size, bool show_asciid
return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF32LE, show_asciidot, esc_bslash);
}

R_API char *r_str_escape_utf16be(const char *buf, int buf_size, bool show_asciidot, bool esc_bslash) {
return r_str_escape_utf (buf, buf_size, R_STRING_ENC_UTF16BE, show_asciidot, esc_bslash);
}

// JSON has special escaping requirements
// TODO: merge with r_str_escape_utf() and r_str_byte_escape() using RStrEsc
R_API char *r_str_escape_utf8_for_json(const char *buf, int buf_size) {
Expand Down
27 changes: 20 additions & 7 deletions libr/util/utf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,45 @@
#include <r_types.h>
#include <r_util.h>

/*
 * Decode one code point from a UTF-16 buffer into a unicode RRune.
 *
 * ptr       - input bytes
 * ptrlen    - number of readable bytes at ptr
 * ch        - optional output; when non-NULL receives the decoded code point
 * bigendian - true for UTF-16BE byte order, false for UTF-16LE
 *
 * Returns:
 *   4 - a high/low surrogate pair was decoded (code point >= 0x10000)
 *   2 - a single 16-bit unit whose high byte is non-zero (an unpaired
 *       surrogate is returned unchanged through this path)
 *   1 - a single 16-bit unit whose high byte is zero; note that the unit
 *       still occupies two bytes of input
 *   0 - ptr is NULL or fewer than two bytes are available
 */
R_API int r_utf16_decode(const ut8 *ptr, int ptrlen, RRune *ch, bool bigendian) {
	if (!ptr || ptrlen < 2) {
		return 0;
	}
	/* Byte offsets of the most/least significant byte inside a 16-bit unit */
	const int high = bigendian ? 0 : 1;
	const int low = 1 - high;
	const ut32 lead = (ut32)ptr[high] << 8 | ptr[low];
	/* High surrogates are 0xD800..0xDBFF, low surrogates 0xDC00..0xDFFF:
	 * both ranges are identified by their top six bits (mask 0xFC00). */
	if (ptrlen > 3 && (lead & 0xfc00) == 0xd800) {
		const ut32 trail = (ut32)ptr[high + 2] << 8 | ptr[low + 2];
		if ((trail & 0xfc00) == 0xdc00) {
			if (ch) {
				/* Each surrogate carries 10 payload bits */
				*ch = (((lead & 0x3ff) << 10) | (trail & 0x3ff)) + 0x10000;
			}
			return 4;
		}
	}
	if (ptr[high]) {
		if (ch) {
			*ch = lead;
		}
		return 2;
	}
	if (ch) {
		*ch = (ut32)ptr[low];
	}
	return 1;
}

/* Decode one code point from a little-endian UTF-16 buffer into a unicode RRune.
 * Forwards to r_utf16_decode() with the byte order fixed to little-endian. */
R_API int r_utf16le_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
	const bool bigendian = false;
	return r_utf16_decode (ptr, ptrlen, ch, bigendian);
}

/* Decode one code point from a big-endian UTF-16 buffer into a unicode RRune.
 * Forwards to r_utf16_decode() with the byte order fixed to big-endian. */
R_API int r_utf16be_decode(const ut8 *ptr, int ptrlen, RRune *ch) {
	const bool bigendian = true;
	return r_utf16_decode (ptr, ptrlen, ch, bigendian);
}

/* Convert a unicode RRune into a UTF-16LE buf */
R_API int r_utf16le_encode(ut8 *ptr, RRune ch) {
if (ch < 0x10000) {
Expand Down

0 comments on commit a1f893e

Please sign in to comment.