Skip to content

Commit

Permalink
Address Issue #119
Browse files Browse the repository at this point in the history
ExampleProvider now has save_mem_caches and load_mem_caches so the
current state of the in-memory cache can be saved and restored.
  • Loading branch information
dkoes committed May 29, 2024
1 parent 47213e1 commit 5a642b1
Show file tree
Hide file tree
Showing 20 changed files with 994 additions and 456 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ _site/
.bundle/
vendor/

# Ignore editors
.vscode
# Ignore editors (but not vscode)
/.pytest_cache/
.cproject
.idea
Expand Down
23 changes: 23 additions & 0 deletions .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"${workspaceFolder}/include/**",
"/usr/local/cuda/include",
"/usr/local/include/openbabel3/",
"/usr/include",
"/usr/include/c++/11/",
"${workspaceFolder}/include/**"],
"defines": [],
"compilerPath": "/usr/bin/c++",
"cStandard": "c17",
"cppStandard": "gnu++17",
"intelliSenseMode": "linux-gcc-x64",
"compileCommands": "${workspaceFolder}/build/compile_commands.json",
"configurationProvider": "ms-vscode.cmake-tools"
}
],
"version": 4
}
74 changes: 74 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"files.associations": {
"stdexcept": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"array": "cpp",
"atomic": "cpp",
"strstream": "cpp",
"bit": "cpp",
"*.tcc": "cpp",
"bitset": "cpp",
"chrono": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"cstdint": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"unordered_set": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"semaphore": "cpp",
"sstream": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"cinttypes": "cpp",
"typeindex": "cpp",
"typeinfo": "cpp",
"variant": "cpp"
},
"C_Cpp.files.exclude": {
"/usr/local/include/libmolgrid/**": true
}
}
24 changes: 24 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"version": "2.0.0",
"isShellCommand": true,
"tasks": [
{
"label": "build",
"options": {
"cwd": "${workspaceRoot}/build"
},
"type": "shell",
"command": "make -j24",
"group": "build"
},
{
"label": "build-debug",
"options": {
"cwd": "${workspaceRoot}/build-debug"
},
"type": "shell",
"command": "make -j24",
"group": "build"
}
]
}
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

option(BUILD_SHARED "Build shared library" ON)
option(BUILD_STATIC "Build static library" ON)
option(BUILD_COVERAGE "Build with code coverage" OFF)
Expand All @@ -36,7 +38,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR})

#dependencies
find_package(CUDA REQUIRED)
find_package(Boost REQUIRED COMPONENTS regex unit_test_framework program_options system filesystem iostreams)
find_package(Boost REQUIRED COMPONENTS regex unit_test_framework program_options system filesystem iostreams serialization)
find_package(OpenBabel3 REQUIRED)
include_directories(SYSTEM ${OPENBABEL3_INCLUDE_DIR})
find_package(ZLIB)
Expand Down
108 changes: 76 additions & 32 deletions include/libmolgrid/coord_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
#ifndef COORD_CACHE_H_
#define COORD_CACHE_H_

#include "libmolgrid/coordinateset.h"
#include "libmolgrid/atom_typer.h"
#include "libmolgrid/coordinateset.h"
#include "libmolgrid/example.h"
#include <boost/iostreams/device/mapped_file.hpp>

#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/unordered_map.hpp>
namespace libmolgrid {

/** \brief Load and cache molecular coordinates and atom types.
Expand All @@ -21,36 +24,77 @@ namespace libmolgrid {
* training runs.
*/
class CoordCache {
using MemCache = std::unordered_map<const char*, CoordinateSet>;
MemCache memcache;
std::shared_ptr<AtomTyper> typer;
std::string data_root;
std::string molcache;
bool use_cache = true; //is possible to disable caching
bool addh = true; ///protonate
bool make_vector_types = false; ///convert index types to vector, will also convert to type based radii and add a dummy type

//for memory mapped cache
boost::iostreams::mapped_file_source cache_map;
std::unordered_map<const char*, size_t> offsets; //map from names to position in cache_map

public:
CoordCache() {}
CoordCache(std::shared_ptr<AtomTyper> t,
const ExampleProviderSettings& settings = ExampleProviderSettings(),
const std::string& mc = "");
~CoordCache() {}

/** \brief Set coord to the appropriate CoordinateSet for fname
* @param[in] fname file name, not including root directory prefix, of molecular data
* @param[out] coord CoordinateSet for passed molecule
*/
void set_coords(const char *fname, CoordinateSet& coord);

/// return the number of types (channels) each example will have
size_t num_types() const { return typer->num_types(); }

std::vector<std::string> get_type_names() const { return typer->get_type_names(); }
// In-memory cache of coordinate sets. Note the key type is const char*, so
// entries are hashed and compared by pointer value, not by string contents
// (presumably the keys are pointers interned via string_cache - confirm).
using MemCache = std::unordered_map<const char *, CoordinateSet>;
MemCache memcache;
// atom typer; num_types() and get_type_names() forward to it
std::shared_ptr<AtomTyper> typer;
// presumably the root directory prefix prepended to file names - confirm
std::string data_root;
std::string molcache;
bool use_cache = true; // caching can be disabled by setting this to false
bool addh = true; /// protonate (add hydrogens)
bool make_vector_types =
false; /// convert index types to vector types; this also converts to
/// type-based radii and adds a dummy type

// state for the memory-mapped cache
boost::iostreams::mapped_file_source cache_map;
std::unordered_map<const char *, size_t>
offsets; // map from names to their position in cache_map

public:
CoordCache() {}
CoordCache(
std::shared_ptr<AtomTyper> t,
const ExampleProviderSettings &settings = ExampleProviderSettings(),
const std::string &mc = "");
~CoordCache() {}

/** \brief Set coord to the appropriate CoordinateSet for fname
* @param[in] fname file name, not including root directory prefix, of
* molecular data
* @param[out] coord CoordinateSet for passed molecule
*/
void set_coords(const char *fname, CoordinateSet &coord);

/// return the number of types (channels) each example will have
size_t num_types() const { return typer->num_types(); }

std::vector<std::string> get_type_names() const {
return typer->get_type_names();
}

/** \brief Write out current contents of memory cache to provided output
* stream.
* @param[in] out output stream
*/
void save_mem_cache(std::ostream &out) const;

/** \brief Write out current contents of memory cache to provided file.
 *
 * The file is opened in binary mode: this header pulls in the Boost binary
 * archive classes, and a binary archive written through a text-mode stream
 * is corrupted on platforms that translate line endings.
 * @param[in] fname file name
 * @throws std::invalid_argument if fname cannot be opened for writing
 */
void save_mem_cache(const std::string &fname) const {
  std::ofstream out(fname.c_str(), std::ios::out | std::ios::binary);
  if (!out)
    throw std::invalid_argument("Could not open file " + fname);
  // delegate to the stream overload, which performs the actual serialization
  save_mem_cache(out);
}

/** \brief Read contents of input stream into memory cache.
* @param[in] in input stream
*/
void load_mem_cache(std::istream &in);
/** \brief Read contents of provided file into memory cache.
 *
 * The file is opened in binary mode: this header pulls in the Boost binary
 * archive classes, and a binary archive read through a text-mode stream can
 * be misread on platforms that translate line endings.
 * @param[in] fname file name
 * @throws std::invalid_argument if fname cannot be opened for reading
 */
void load_mem_cache(const std::string &fname) {
  std::ifstream in(fname.c_str(), std::ios::in | std::ios::binary);
  if (!in)
    throw std::invalid_argument("Could not load file " + fname);
  // delegate to the stream overload, which performs the actual deserialization
  load_mem_cache(in);
}

/// return the number of entries currently held in the in-memory cache
size_t mem_cache_size() const {
return memcache.size();
}
};

} /* namespace libmolgrid */
Expand Down
26 changes: 26 additions & 0 deletions include/libmolgrid/coordinateset.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@

#include <vector>
#include <openbabel/mol.h>
#include <boost/serialization/access.hpp>

#include "libmolgrid/managed_grid.h"
#include "libmolgrid/string_cache.h"

namespace libmolgrid {

Expand Down Expand Up @@ -136,6 +139,29 @@ struct CoordinateSet {

///for debugging
void dump(std::ostream& out) const;

friend class boost::serialization::access;
/** \brief Boost.Serialization hook: save or load this CoordinateSet.
 *
 * coords, type_index, type_vector, radii and max_type are passed straight
 * to the archive. src is a C string pointer, so it is stored as a
 * std::string (empty when src is null) and re-interned through
 * string_cache when loading.
 * @param[in,out] ar archive being written to or read from
 * @param[in] version class version supplied by Boost (unused)
 */
template <class Archive> void serialize(Archive &ar, const unsigned version) {
  (void)version; // no versioned fields yet

  ar & coords;
  ar & type_index;
  ar & type_vector;
  ar & radii;
  ar & max_type;

  std::string name;
  if (Archive::is_saving::value) {
    if (src) {
      name = src;
    }
    ar & name;
  } else { // reading
    ar & name;
    // An empty name means src was null (or empty) when saved. Reset it
    // explicitly so that loading into an already-populated object does not
    // leave a stale pointer behind.
    src = name.empty() ? nullptr : string_cache.get(name);
  }
}
};

extern template size_t CoordinateSet::copyTo(Grid<float, 2, false>& c, Grid<float, 1, false>& t, Grid<float, 1, false>& r) const;
Expand Down
13 changes: 1 addition & 12 deletions include/libmolgrid/example.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <vector>
#include <unordered_set>
#include "libmolgrid/coordinateset.h"
#include "libmolgrid/string_cache.h"

namespace libmolgrid {

Expand Down Expand Up @@ -161,19 +162,7 @@ struct ExampleRef {
};


//for memory efficiency, only store a given string once and use the const char*
class StringCache {
    std::unordered_set<std::string> strings;
  public:
    /** \brief Return a canonical C string for s.
     * Repeated calls with equal strings return the same pointer.
     * @param[in] s string to intern
     * @return pointer to the character data of the stored copy of s
     */
    const char* get(const std::string& s)
    {
      //insert yields an iterator to the new or already-present element, so
      //no second lookup is needed; unordered_set never relocates elements on
      //rehash, so the returned pointer stays valid as the set grows
      return strings.insert(s).first->c_str();
    }
};

extern StringCache string_cache;

} /* namespace libmolgrid */

Expand Down
32 changes: 32 additions & 0 deletions include/libmolgrid/example_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,38 @@ class ExampleExtractor {
///return names of types for explicitly typed examples
///type names are prepended by coordinate set index
virtual std::vector<std::string> get_type_names() const;

/** \brief Write out current contents of memory caches to provided output
* stream.
* @param[in] out output stream
*/
void save_mem_caches(std::ostream &out) const;

/** \brief Write out current contents of memory caches to provided file.
 *
 * The file is opened in binary mode so that a binary archive is not
 * corrupted by line-ending translation (presumably the stream overload
 * writes Boost binary archives - confirm against the implementation).
 * @param[in] fname file name
 * @throws std::invalid_argument if fname cannot be opened for writing
 */
void save_mem_caches(const std::string &fname) const {
  std::ofstream out(fname.c_str(), std::ios::out | std::ios::binary);
  if (!out)
    throw std::invalid_argument("Could not open file " + fname);
  // delegate to the stream overload, which performs the actual serialization
  save_mem_caches(out);
}

/** \brief Read contents of input stream into memory caches.
* @param[in] in input stream
*/
void load_mem_caches(std::istream &in);
/** \brief Read contents of provided file into memory caches.
 *
 * The file is opened in binary mode so that a binary archive is not
 * misread because of line-ending translation (presumably the stream
 * overload reads Boost binary archives - confirm against the implementation).
 * @param[in] fname file name
 * @throws std::invalid_argument if fname cannot be opened for reading
 */
void load_mem_caches(const std::string &fname) {
  std::ifstream in(fname.c_str(), std::ios::in | std::ios::binary);
  if (!in)
    throw std::invalid_argument("Could not load file " + fname);
  // delegate to the stream overload, which performs the actual deserialization
  load_mem_caches(in);
}

size_t mem_caches_size() const;
};

} /* namespace libmolgrid */
Expand Down
Loading

0 comments on commit 5a642b1

Please sign in to comment.