Skip to content

Commit

Permalink
Added emitUTF8 setting. (open-source-parsers#1045)
Browse files Browse the repository at this point in the history
* Added emitUTF8 setting to emit UTF8 format JSON.

* Added a test for emitUTF8, with it in default, on and off states.

* Review comments addressed.

* Merged master into my branch & resolved conflicts.

* Fix clang-format errors.

* Fix clang-format errors.

* Fixed clang-format errors.

* Fixed clang-format errors.
  • Loading branch information
nicolaswilson authored and baylesj committed Oct 17, 2019
1 parent f59ac2a commit a955529
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 29 deletions.
75 changes: 46 additions & 29 deletions src/lib_json/json_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ static String toHex16Bit(unsigned int x) {
return result;
}

static String valueToQuotedStringN(const char* value, unsigned length) {
static String valueToQuotedStringN(const char* value, unsigned length,
bool emitUTF8 = false) {
if (value == nullptr)
return "";

Expand Down Expand Up @@ -310,21 +311,31 @@ static String valueToQuotedStringN(const char* value, unsigned length) {
// Should add a flag to allow this compatibility mode and prevent this
// sequence from occurring.
default: {
unsigned int cp = utf8ToCodepoint(c, end);
// don't escape non-control characters
// (short escape sequences are applied above)
if (cp < 0x80 && cp >= 0x20)
result += static_cast<char>(cp);
else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
result += "\\u";
result += toHex16Bit(cp);
} else { // codepoint is not in Basic Multilingual Plane
// convert to surrogate pair first
cp -= 0x10000;
result += "\\u";
result += toHex16Bit((cp >> 10) + 0xD800);
result += "\\u";
result += toHex16Bit((cp & 0x3FF) + 0xDC00);
if (emitUTF8) {
result += *c;
} else {
unsigned int codepoint = utf8ToCodepoint(c, end);
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
// don't escape non-control characters
// (short escape sequences are applied above)
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
result += static_cast<char>(codepoint);
} else if (codepoint <
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
// Multilingual Plane
result += "\\u";
result += toHex16Bit(codepoint);
} else { // codepoint is not in Basic Multilingual Plane
// convert to surrogate pair first
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
result += "\\u";
result += toHex16Bit((codepoint >> 10) + 0xD800);
result += "\\u";
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
}
}
} break;
}
Expand Down Expand Up @@ -864,7 +875,8 @@ struct BuiltStyledStreamWriter : public StreamWriter {
BuiltStyledStreamWriter(String indentation, CommentStyle::Enum cs,
String colonSymbol, String nullSymbol,
String endingLineFeedSymbol, bool useSpecialFloats,
unsigned int precision, PrecisionType precisionType);
bool emitUTF8, unsigned int precision,
PrecisionType precisionType);
int write(Value const& root, OStream* sout) override;

private:
Expand Down Expand Up @@ -893,19 +905,20 @@ struct BuiltStyledStreamWriter : public StreamWriter {
bool addChildValues_ : 1;
bool indented_ : 1;
bool useSpecialFloats_ : 1;
bool emitUTF8_ : 1;
unsigned int precision_;
PrecisionType precisionType_;
};
BuiltStyledStreamWriter::BuiltStyledStreamWriter(
String indentation, CommentStyle::Enum cs, String colonSymbol,
String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats,
unsigned int precision, PrecisionType precisionType)
bool emitUTF8, unsigned int precision, PrecisionType precisionType)
: rightMargin_(74), indentation_(std::move(indentation)), cs_(cs),
colonSymbol_(std::move(colonSymbol)), nullSymbol_(std::move(nullSymbol)),
endingLineFeedSymbol_(std::move(endingLineFeedSymbol)),
addChildValues_(false), indented_(false),
useSpecialFloats_(useSpecialFloats), precision_(precision),
precisionType_(precisionType) {}
useSpecialFloats_(useSpecialFloats), emitUTF8_(emitUTF8),
precision_(precision), precisionType_(precisionType) {}
int BuiltStyledStreamWriter::write(Value const& root, OStream* sout) {
sout_ = sout;
addChildValues_ = false;
Expand Down Expand Up @@ -942,7 +955,8 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
char const* end;
bool ok = value.getString(&str, &end);
if (ok)
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str)));
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str),
emitUTF8_));
else
pushValue("");
break;
Expand All @@ -966,7 +980,7 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
Value const& childValue = value[name];
writeCommentBeforeValue(childValue);
writeWithIndent(valueToQuotedStringN(
name.data(), static_cast<unsigned>(name.length())));
name.data(), static_cast<unsigned>(name.length()), emitUTF8_));
*sout_ << colonSymbol_;
writeValue(childValue);
if (++it == members.end()) {
Expand Down Expand Up @@ -1142,12 +1156,13 @@ StreamWriter::Factory::~Factory() = default;
StreamWriterBuilder::StreamWriterBuilder() { setDefaults(&settings_); }
StreamWriterBuilder::~StreamWriterBuilder() = default;
StreamWriter* StreamWriterBuilder::newStreamWriter() const {
String indentation = settings_["indentation"].asString();
String cs_str = settings_["commentStyle"].asString();
String pt_str = settings_["precisionType"].asString();
bool eyc = settings_["enableYAMLCompatibility"].asBool();
bool dnp = settings_["dropNullPlaceholders"].asBool();
bool usf = settings_["useSpecialFloats"].asBool();
const String indentation = settings_["indentation"].asString();
const String cs_str = settings_["commentStyle"].asString();
const String pt_str = settings_["precisionType"].asString();
const bool eyc = settings_["enableYAMLCompatibility"].asBool();
const bool dnp = settings_["dropNullPlaceholders"].asBool();
const bool usf = settings_["useSpecialFloats"].asBool();
const bool emitUTF8 = settings_["emitUTF8"].asBool();
unsigned int pre = settings_["precision"].asUInt();
CommentStyle::Enum cs = CommentStyle::All;
if (cs_str == "All") {
Expand Down Expand Up @@ -1179,7 +1194,7 @@ StreamWriter* StreamWriterBuilder::newStreamWriter() const {
pre = 17;
String endingLineFeedSymbol;
return new BuiltStyledStreamWriter(indentation, cs, colonSymbol, nullSymbol,
endingLineFeedSymbol, usf, pre,
endingLineFeedSymbol, usf, emitUTF8, pre,
precisionType);
}
static void getValidWriterKeys(std::set<String>* valid_keys) {
Expand All @@ -1189,6 +1204,7 @@ static void getValidWriterKeys(std::set<String>* valid_keys) {
valid_keys->insert("enableYAMLCompatibility");
valid_keys->insert("dropNullPlaceholders");
valid_keys->insert("useSpecialFloats");
valid_keys->insert("emitUTF8");
valid_keys->insert("precision");
valid_keys->insert("precisionType");
}
Expand Down Expand Up @@ -1220,6 +1236,7 @@ void StreamWriterBuilder::setDefaults(Json::Value* settings) {
(*settings)["enableYAMLCompatibility"] = false;
(*settings)["dropNullPlaceholders"] = false;
(*settings)["useSpecialFloats"] = false;
(*settings)["emitUTF8"] = false;
(*settings)["precision"] = 17;
(*settings)["precisionType"] = "significant";
//! [StreamWriterBuilderDefaults]
Expand Down
29 changes: 29 additions & 0 deletions src/test_lib_json/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2481,6 +2481,35 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, writeZeroes) {
}
}

JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
  // Expected document when non-ASCII code points are written as \uXXXX
  // escapes (the 4-byte sequences show up as surrogate pairs).
  const char* const escapedOutput =
      "{\n\t\"test\" : "
      "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}";
  // Expected document when the multi-byte UTF-8 sequences are passed through
  // unchanged; the tab and newline still use their short escapes.
  const char* const utf8Output =
      "{\n\t\"test\" : "
      "\"\\t\\n\xF0\x91\xA2\xA1=\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7\"\n}";

  // Input string: a tab and a newline (both need escaping) followed by UTF-8
  // text that mixes 1-, 2-, 3- and 4-byte sequences.
  Json::Value root;
  root["test"] = "\t\n\xF0\x91\xA2\xA1\x3D\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7";

  Json::StreamWriterBuilder builder;

  // emitUTF8 left at its default: output is unicode escaped.
  JSONTEST_ASSERT(Json::writeString(builder, root) == escapedOutput);

  // emitUTF8 switched on: output keeps the UTF-8 bytes as they are.
  builder.settings_["emitUTF8"] = true;
  JSONTEST_ASSERT(Json::writeString(builder, root) == utf8Output);

  // emitUTF8 switched back off: output is unicode escaped again.
  builder.settings_["emitUTF8"] = false;
  JSONTEST_ASSERT(Json::writeString(builder, root) == escapedOutput);
}

struct ReaderTest : JsonTest::TestCase {};

JSONTEST_FIXTURE_LOCAL(ReaderTest, parseWithNoErrors) {
Expand Down

0 comments on commit a955529

Please sign in to comment.