Skip to content

Commit

Permalink
Unicode normalization (mmp#225)
Browse files Browse the repository at this point in the history
Normalize Unicode strings for user-supplied names (objects, materials,
media, etc.). Note that there is no need to normalize strings for things
like the name of the selected sampler, light source types, or the
parameters provided to pbrt objects, as all of the valid ones are plain old
ASCII text. We also intentionally do not normalize pathnames, as doing so
can cause all sorts of trouble.
  • Loading branch information
mmp authored Feb 2, 2022
1 parent 6b63416 commit c7e3879
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 10 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@
[submodule "src/ext/lodepng"]
path = src/ext/lodepng
url = https://github.com/lvandeve/lodepng.git
[submodule "src/ext/utf8proc"]
path = src/ext/utf8proc
url = https://github.com/JuliaStrings/utf8proc.git
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ check_ext ("filesystem" "filesystem/filesystem" c5f9de30142453eb3c6fe991e82dfc25
check_ext ("libdeflate" "libdeflate/common" 1fd0bea6ca2073c68493632dafc4b1ddda1bcbc3)
check_ext ("lodepng" "lodepng/examples" 8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a)
check_ext ("stb" "stb/tools" af1a5bc352164740c1cc1354942b1c6b72eacb8a)
check_ext ("utf8proc" "utf8proc/bench" 2484e2ed5e1d9c19edcccf392a7d9920ad90dfaf)
check_ext ("zlib" "zlib/doc" 54d591eabf9fe0e84c725638f8d5d8d202a093fa)

add_compile_definitions ("$<$<CONFIG:DEBUG>:PBRT_DEBUG_BUILD>")
Expand Down Expand Up @@ -861,6 +862,7 @@ set (ALL_PBRT_LIBS
${LIBDEFLATE_LIBRARIES}
double-conversion
${PBRT_CUDA_LIB}
utf8proc
)

if (PBRT_CUDA_ENABLED)
Expand Down Expand Up @@ -1024,6 +1026,7 @@ set (PBRT_TEST_SOURCE
src/pbrt/util/sampling_test.cpp
src/pbrt/util/spectrum_test.cpp
src/pbrt/util/splines_test.cpp
src/pbrt/util/string_test.cpp
src/pbrt/util/taggedptr_test.cpp
src/pbrt/util/transform_test.cpp
src/pbrt/util/vecmath_test.cpp
Expand Down
7 changes: 7 additions & 0 deletions src/ext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,10 @@ set (FLIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/flip PARENT_SCOPE)
add_library (flip_lib STATIC ${CMAKE_CURRENT_SOURCE_DIR}/flip/flip.cpp)

set_property (TARGET flip_lib PROPERTY FOLDER "ext")

###########################################################################
# utf8proc

set (UTF8PROC_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/utf8proc PARENT_SCOPE)

add_subdirectory (utf8proc)
1 change: 1 addition & 0 deletions src/ext/utf8proc
Submodule utf8proc added at 2484e2
33 changes: 23 additions & 10 deletions src/pbrt/scene.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <pbrt/util/parallel.h>
#include <pbrt/util/print.h>
#include <pbrt/util/spectrum.h>
#include <pbrt/util/string.h>
#include <pbrt/util/transform.h>

#include <iostream>
Expand Down Expand Up @@ -122,11 +123,13 @@ void BasicSceneBuilder::Translate(Float dx, Float dy, Float dz, FileLoc loc) {
[=](auto t) { return t * pbrt::Translate(Vector3f(dx, dy, dz)); });
}

void BasicSceneBuilder::CoordinateSystem(const std::string &name, FileLoc loc) {
void BasicSceneBuilder::CoordinateSystem(const std::string &origName, FileLoc loc) {
// Normalize the user-supplied name before using it as a lookup key, so that
// the same key is produced here and in the later lookup by name.
std::string name = NormalizeUTF8(origName);
// Store graphicsState.ctm under the normalized name.
namedCoordinateSystems[name] = graphicsState.ctm;
}

void BasicSceneBuilder::CoordSysTransform(const std::string &name, FileLoc loc) {
void BasicSceneBuilder::CoordSysTransform(const std::string &origName, FileLoc loc) {
std::string name = NormalizeUTF8(origName);
if (namedCoordinateSystems.find(name) != namedCoordinateSystems.end())
graphicsState.ctm = namedCoordinateSystems[name];
else
Expand Down Expand Up @@ -230,8 +233,9 @@ void BasicSceneBuilder::WorldBegin(FileLoc loc) {
scene->SetOptions(filter, film, camera, sampler, integrator, accelerator);
}

void BasicSceneBuilder::MakeNamedMedium(const std::string &name,
void BasicSceneBuilder::MakeNamedMedium(const std::string &origName,
ParsedParameterVector params, FileLoc loc) {
std::string name = NormalizeUTF8(origName);
// Issue error if medium _name_ is multiply defined
if (mediumNames.find(name) != mediumNames.end()) {
ErrorExitDeferred(&loc, "Named medium \"%s\" redefined.", name);
Expand Down Expand Up @@ -302,7 +306,9 @@ void BasicSceneBuilder::Shape(const std::string &name, ParsedParameterVector par
}
}

void BasicSceneBuilder::ObjectBegin(const std::string &name, FileLoc loc) {
void BasicSceneBuilder::ObjectBegin(const std::string &origName, FileLoc loc) {
std::string name = NormalizeUTF8(origName);

VERIFY_WORLD("ObjectBegin");
pushedGraphicsStates.push_back(graphicsState);

Expand Down Expand Up @@ -356,7 +362,8 @@ void BasicSceneBuilder::ObjectEnd(FileLoc loc) {
activeInstanceDefinition = nullptr;
}

void BasicSceneBuilder::ObjectInstance(const std::string &name, FileLoc loc) {
void BasicSceneBuilder::ObjectInstance(const std::string &origName, FileLoc loc) {
std::string name = NormalizeUTF8(origName);
VERIFY_WORLD("ObjectInstance");

if (activeInstanceDefinition) {
Expand Down Expand Up @@ -643,15 +650,19 @@ void BasicSceneBuilder::Integrator(const std::string &name, ParsedParameterVecto
integrator = SceneEntity(name, std::move(dict), loc);
}

void BasicSceneBuilder::MediumInterface(const std::string &insideName,
const std::string &outsideName, FileLoc loc) {
void BasicSceneBuilder::MediumInterface(const std::string &origInsideName,
const std::string &origOutsideName, FileLoc loc) {
// Both medium names are user-supplied, so run each through NormalizeUTF8()
// before storing it.
std::string insideName = NormalizeUTF8(origInsideName);
std::string outsideName = NormalizeUTF8(origOutsideName);

// Record the normalized names as the current inside/outside media in the
// graphics state.
graphicsState.currentInsideMedium = insideName;
graphicsState.currentOutsideMedium = outsideName;
}

void BasicSceneBuilder::Texture(const std::string &name, const std::string &type,
void BasicSceneBuilder::Texture(const std::string &origName, const std::string &type,
const std::string &texname, ParsedParameterVector params,
FileLoc loc) {
std::string name = NormalizeUTF8(origName);
VERIFY_WORLD("Texture");

ParameterDictionary dict(std::move(params), graphicsState.textureAttributes,
Expand Down Expand Up @@ -691,8 +702,9 @@ void BasicSceneBuilder::Material(const std::string &name, ParsedParameterVector
graphicsState.currentMaterialName.clear();
}

void BasicSceneBuilder::MakeNamedMaterial(const std::string &name,
void BasicSceneBuilder::MakeNamedMaterial(const std::string &origName,
ParsedParameterVector params, FileLoc loc) {
std::string name = NormalizeUTF8(origName);
VERIFY_WORLD("MakeNamedMaterial");

ParameterDictionary dict(std::move(params), graphicsState.materialAttributes,
Expand All @@ -707,7 +719,8 @@ void BasicSceneBuilder::MakeNamedMaterial(const std::string &name,
scene->AddNamedMaterial(name, SceneEntity("", std::move(dict), loc));
}

void BasicSceneBuilder::NamedMaterial(const std::string &name, FileLoc loc) {
void BasicSceneBuilder::NamedMaterial(const std::string &origName, FileLoc loc) {
std::string name = NormalizeUTF8(origName);
VERIFY_WORLD("NamedMaterial");
graphicsState.currentMaterialName = name;
graphicsState.currentMaterialIndex = -1;
Expand Down
18 changes: 18 additions & 0 deletions src/pbrt/util/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#include <pbrt/util/string.h>

#include <pbrt/util/check.h>
#include <pbrt/util/error.h>

#define UTF8PROC_STATIC
#include <utf8proc/utf8proc.h>

#include <ctype.h>
#include <codecvt>
Expand Down Expand Up @@ -185,4 +189,18 @@ std::u16string UTF16FromUTF8(std::string str) {
return utf16;
}

std::string NormalizeUTF8(std::string str) {
utf8proc_option_t options = UTF8PROC_COMPOSE;

utf8proc_uint8_t *result;
utf8proc_ssize_t length = utf8proc_map((const unsigned char *)str.data(), str.size(),
&result, options);
if (length < 0)
ErrorExit("Unicode normalization error: %s: \"%s\"", utf8proc_errmsg(length), str);

str = std::string(result, result + length);
free(result);
return str;
}

} // namespace pbrt
2 changes: 2 additions & 0 deletions src/pbrt/util/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ std::wstring WStringFromUTF8(std::string str);
std::string UTF8FromWString(std::wstring str);
#endif // PBRT_IS_WINDOWS

std::string NormalizeUTF8(std::string str);

// InternedString Definition
class InternedString {
public:
Expand Down
26 changes: 26 additions & 0 deletions src/pbrt/util/string_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys.
// The pbrt source code is licensed under the Apache License, Version 2.0.
// SPDX: Apache-2.0

#include <gtest/gtest.h>

#include <pbrt/pbrt.h>
#include <pbrt/util/string.h>

#include <string>

using namespace pbrt;

TEST(Unicode, BasicNormalization) {
    // Two spellings of "Amélie" (see https://en.wikipedia.org/wiki/Unicode_equivalence):
    // one uses the single precomposed code point U+00E9, the other uses
    // U+0065 followed by the combining acute accent U+0301.
    const std::u16string composed16 = u"\u0041\u006d\u00e9\u006c\u0069\u0065";
    const std::u16string decomposed16 = u"\u0041\u006d\u0065\u0301\u006c\u0069\u0065";
    EXPECT_NE(composed16, decomposed16);

    // The two spellings must still differ once converted to UTF-8.
    const std::string composed8 = UTF8FromUTF16(composed16);
    const std::string decomposed8 = UTF8FromUTF16(decomposed16);
    EXPECT_NE(composed8, decomposed8);

    // The composed spelling is expected to pass through unchanged...
    EXPECT_EQ(composed8, NormalizeUTF8(composed8));
    // ...and the decomposed spelling is expected to normalize to the composed one.
    EXPECT_EQ(composed8, NormalizeUTF8(decomposed8));
}

0 comments on commit c7e3879

Please sign in to comment.