diff --git a/.clangd b/.clangd new file mode 100644 index 00000000..8c76d81b --- /dev/null +++ b/.clangd @@ -0,0 +1,11 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 +IncludeBlocks: Preserve +DerivePointerAlignment: false +PointerAlignment: Right +--- +Language: Proto +BasedOnStyle: Google +... diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..44af7ac5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,19 @@ +src/curobo/content/assets/debug** filter=lfs diff=lfs merge=lfs -text +*.obj filter=lfs diff=lfs merge=lfs -text +*.stl filter=lfs diff=lfs merge=lfs -text +*.dae filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.mtl filter=lfs diff=lfs merge=lfs -text +*.nvblx filter=lfs diff=lfs merge=lfs -text +*.ply filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.svg filter=lfs diff=lfs merge=lfs -text +*.usd filter=lfs diff=lfs merge=lfs -text +*.usda filter=lfs diff=lfs merge=lfs -text +*.mp4 filter=lfs diff=lfs merge=lfs -text +*.webm filter=lfs diff=lfs merge=lfs -text +docs/_static/nvidia_logo.png filter=lfs diff=lfs merge=lfs -text +docs/images/motion_opt.png filter=lfs diff=lfs merge=lfs -text +src/curobo/content/assets/urdf/nvblox/out4_color.png filter=lfs diff=lfs merge=lfs -text +docs/_html_extra/* filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..b504c77b --- /dev/null +++ b/.gitignore @@ -0,0 +1,214 @@ +.vscode* +*.hdf5 +*data/ +*.bat +*.p +*.mlx + +*# +*logs +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +*.json + +# C extensions +*.so +requirements.txt +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/_source/ +_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.#* +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +experiments/ + +# torch pickles: +*.t +*.npy +*.zip +*.qdrep +*.sql* +*.npz +/examples/*.stl + +*Flymake log* +#*.pt +*.pth* +*.tar +examples/*.pt + +curobo.* +docs/modules.rst +docs/setup.rst +docs/install_instructions.rst +docs/_api/* + +# VIM +*swp +*swo + +examples/debug/*.onnx +*.engine +*.profile +*.cache +*.flatbuffers +#*.obj +*.glb +*.onnx +*stltoobj.bat +external/* +public/* +docs/doxygen/html/* +docs/doxygen/xml/* + + +docker/pkgs/* + + +nvidia_curobo*/ + +*.code-workspace +*.usda +*.pdf +#*.png +*.usd +*.obj +*.STL +*.dae +*.stl +*.mtl +*.SLDPRT + +*tfevents* + +*.csv \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..33b2ea8d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,15 @@ + +# Changelog + +## Version 0.0.0 + +- First version of CuRobo. diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..136ff7ad --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +####################################################### +# KEEP ITEMS IN ALPHABETICAL ORDER WITHIN THEIR GROUP # +####################################################### +# See this page for information about this file +# https://docs.gitlab.com/ee/user/project/code_owners.html + +# Default +NVIDIA \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..8f6fe577 --- /dev/null +++ b/LICENSE @@ -0,0 +1,77 @@ +Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + +NVIDIA License + +1. Definitions + +“Licensor” means any person or entity that distributes its Work. +“Work” means (a) the original work of authorship made available under this license, which may +include software, documentation, or other files, and (b) any additions to or derivative works +thereof that are made available under this license. +The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as +provided under U.S. copyright law; provided, however, that for the purposes of this license, +derivative works shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work. + +Works are “made available” under this license by including in or with the Work either (a) a +copyright notice referencing the applicability of this license to the Work, or (b) a copy of +this license. + +2. License Grant + +2.1 Copyright Grant. 
Subject to the terms and conditions of this license, each Licensor grants to +you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, +prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work +and any resulting derivative works in any form. + +3. Limitations + +3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this +license, (b) you include a complete copy of this license with your distribution, and (c) you retain +without modification any copyright, patent, trademark, or attribution notices that are present in +the Work. + +3.2 Derivative Works. You may specify that additional or different terms apply to the use, +reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) +Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) +you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your +Terms, this license (including the redistribution requirements in Section 3.1) will continue to +apply to the Work itself. + +3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for +use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use +the Work and any derivative works commercially. As used herein, “non-commercially” means for +research or evaluation purposes only. + +3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including +any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are +infringed by any Work, then your rights under this license from such Licensor (including the grant +in Section 2.1) will terminate immediately. + +3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ +names, logos, or trademarks, except as necessary to reproduce the notices described in this license. + +3.6 Termination. If you violate any term of this license, then your rights under this license +(including the grant in Section 2.1) will terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, +TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT +(INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR +RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF +GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +**************************************************************************************************** + +This package contains examples that use external assets (urdf and mesh files). Licenses for external +assets are listed in LICENSE_ASSETS. 
+ diff --git a/LICENSE_ASSETS b/LICENSE_ASSETS new file mode 100644 index 00000000..c34ce50e --- /dev/null +++ b/LICENSE_ASSETS @@ -0,0 +1,402 @@ + +Licenses for external assets are listed here: + +franka_ros 0.7.0 +---------------- + +The Franka urdf files in src/curobo/content/assets/robot/franka_description/ has minor modifications +from those in the "franka_description" component of the franka_ros package. + +Site: https://github.com/frankaemika/franka_ros/ +License: https://github.com/frankaemika/franka_ros/blob/0.7.0/NOTICE + https://github.com/frankaemika/franka_ros/blob/0.7.0/LICENSE + +Copyright 2017 Franka Emika GmbH + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + +*********************** +The files in src/curobo/content/assets/robot/kinova were obtained from +https://github.com/Kinovarobotics/ros_kortex/tree/noetic-devel/kortex_description with minor +modifications to make the gripper fingers as fixed joints. + +The license for this is given below: + +Copyright (c) 2018, Kinova inc. +All rights reserved. +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*********************** + +The files in src/curobo/content/assets/robot/ur_description have been obtained from +https://github.com/UniversalRobots/Universal_Robots_ROS2_Description with BSD 3-Clause License + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +*********************** +The files in src/curobo/content/assets/robot/jaco were obtained from +https://github.com/Kinovarobotics/kinova-ros which has BSD 3-Clause License + +Copyright (c) 2017, Kinova Robotics inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*********************** + +The urdfs in src/curobo/content/assets/robot/iiwa_allegro_description were obtained by +combining files from https://github.com/IFL-CAMP/iiwa_stack for the KUKA IIWA and +https://github.com/simlabrobotics/allegro_hand_ros_catkin for SimLab's Allegro Hand. + +Both have BSD 2-Cluase License as below. + +BSD 2-Clause License + +Copyright (c) 2016-2019, Salvatore Virga - salvo.virga@tum.de, Marco Esposito - marco.esposito@tum.de +Technische Universität München +Chair for Computer Aided Medical Procedures and Augmented Reality +Fakultät für Informatik / I16, Boltzmannstraße 3, 85748 Garching bei München, Germany +http://campar.in.tum.de +All rights reserved. + +Copyright (c) 2015, Robert Krug & Todor Stoyanov, AASS Research Center, Orebro University, Sweden +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Copyright (c) 2016, SimLab +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +********************************** +The files in src/curobo/content/assets/robot/techman were obtained from +https://github.com/TechmanRobotInc/tmr_ros1 which have BSD 3-Clause license. + +BSD 3-Clause License + +Copyright (c) 2019-2023, TECHMAN ROBOT INC. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*********************************** \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..2f36d885 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,18 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# This file list the additional files that should be include in the package distribution. +# +# References: +# * https://packaging.python.org/guides/using-manifest-in/ +# * https://setuptools.readthedocs.io/en/latest/userguide/datafiles.html +# * https://stackoverflow.com/questions/6028000/how-to-read-a-static-file-from-inside-a-python-package + +# graft +graft src/curobo/content +global-include py.typed \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 00000000..0b1a2cbd --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ + +# CuRobo + +*CUDA Accelerated Robot Library* + +**Check [curobo.org](https://curobo.org) for installing and getting started with examples!** + +Use [Discussions](https://github.com/NVlabs/curobo/discussions) for questions on using this package. + +Use [Issues](https://github.com/NVlabs/curobo/issues) if you find a bug. 
+ + For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/) + + ## Overview + + CuRobo is a CUDA-accelerated library containing a suite of robotics algorithms that, by leveraging parallel compute, run significantly faster than existing implementations. CuRobo currently provides the following algorithms: (1) forward and inverse kinematics; (2) collision checking between the robot and the world, with the world represented as cuboids, meshes, or depth images; (3) numerical optimization with gradient descent, L-BFGS, and MPPI; (4) geometric planning; (5) trajectory optimization; and (6) motion generation that combines inverse kinematics, geometric planning, and trajectory optimization to generate global motions within 30 ms. + +
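A minimal motion-generation sketch, assembled from the API calls that `benchmark/curobo_benchmark.py` in this diff uses; the start configuration and goal pose below are illustrative placeholders, not values taken from this repository.

```python
# Hedged sketch of CuRobo motion generation, mirroring the calls in
# benchmark/curobo_benchmark.py. Start joints and goal pose are placeholders.
from curobo.geom.sdf.world import WorldConfig
from curobo.types.base import TensorDeviceType
from curobo.types.math import Pose
from curobo.types.robot import RobotConfig
from curobo.types.state import JointState
from curobo.util_file import (
    get_robot_configs_path,
    get_world_configs_path,
    join_path,
    load_yaml,
)
from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig

# Load the Franka robot description and a tabletop collision world that ship
# with the library (both are referenced by the benchmark script).
robot_cfg = RobotConfig.from_dict(
    load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"],
    tensor_args=TensorDeviceType(),
)
world_cfg = WorldConfig.from_dict(
    load_yaml(join_path(get_world_configs_path(), "collision_table.yml"))
).get_obb_world()

motion_gen = MotionGen(MotionGenConfig.load_from_robot_config(robot_cfg, world_cfg))
motion_gen.warmup(enable_graph=True, warmup_js_trajopt=False)

# Plan from a joint-space start to a Cartesian goal pose given as
# [x, y, z, qw, qx, qy, qz] (wxyz quaternion order, as in the benchmark data).
start_state = JointState.from_position(
    motion_gen.tensor_args.to_device([[0.0, -1.3, 0.0, -2.5, 0.0, 2.0, 0.0]])
)
goal_pose = Pose.from_list([0.4, 0.0, 0.4, 0.0, 1.0, 0.0, 0.0])
result = motion_gen.plan_single(start_state, goal_pose, MotionGenPlanConfig(max_attempts=10))
if result.success.item():
    plan = result.get_interpolated_plan()  # JointState: position/velocity/acceleration/jerk
```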
+ + CuRobo performs trajectory optimization across many seeds in parallel to find a solution. Its trajectory optimization penalizes jerk and acceleration, encouraging smoother and shorter trajectories; the configuration sketch after the comparison below shows the knobs that control this behavior. Below we compare CuRobo's motion generation (left) against a BiRRT planner (right) on a pick and place task. + +
+ [video: CuRobo motion generation (left) vs. a BiRRT planner (right) on a pick and place task]
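The seed-parallel trajectory optimization described above is configured through `MotionGenConfig`. Continuing the hedged sketch from the Overview, the snippet below pulls out the knobs that `benchmark/curobo_benchmark.py` sets, with the benchmark's own values shown for reference rather than as tuned recommendations.

```python
# Seed and discretization knobs, using the values set by
# benchmark/curobo_benchmark.py (reference values, not recommendations).
motion_gen_config = MotionGenConfig.load_from_robot_config(
    robot_cfg,
    world_cfg,
    num_ik_seeds=30,         # goal-pose IK problems solved in parallel
    num_trajopt_seeds=12,    # trajectory optimization seeds run in parallel
    trajopt_tsteps=32,       # waypoints optimized per trajectory
    interpolation_dt=0.025,  # time step of the returned interpolated plan
    collision_activation_distance=0.03,  # meters from contact where collision cost activates
)
```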
+ +## Citation + +If you found this work useful, please cite the below report, + +``` +@article{curobo_report23, + title={CuRobo: Parallelized Collision-Free Minimum-Jerk Robot Motion Generation}, + author={Sundaralingam, Balakumar and Hari, Siva Kumar Sastry and + Fishman, Adam and Garrett, Caelan and Van Wyk, Karl and Blukis, Valts and + Millane, Alexander and Oleynikova, Helen and Handa, Ankur and + Ramos, Fabio and Ratliff, Nathan and Fox, Dieter}, + journal={arXiv preprint}, + year={2023} +} +``` \ No newline at end of file diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..d2b49f9b --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,13 @@ + +This folder contains scripts to run the motion planning benchmarks. + +Refer to Benchmarks & Profiling instructions in documentation for more information. diff --git a/benchmark/curobo_benchmark.py b/benchmark/curobo_benchmark.py new file mode 100644 index 00000000..74851bd9 --- /dev/null +++ b/benchmark/curobo_benchmark.py @@ -0,0 +1,727 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from copy import deepcopy +from typing import Optional + +# Third Party +import numpy as np +import torch +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Mesh +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +# torch.set_num_threads(8) +# torch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +# torch.backends.cuda.matmul.allow_tf32 = False +# torch.backends.cudnn.allow_tf32 = False + +# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True +np.random.seed(0) +# Standard Library +import argparse +import warnings +from typing import List, Optional + +# Third Party +from metrics import CuroboGroupMetrics, CuroboMetrics +from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw + + +def plot_cost_iteration(cost: torch.Tensor, save_path="cost", title="", log_scale=False): + # Third Party + import matplotlib.pyplot as plt + + fig = plt.figure(figsize=(5, 4)) + cost = cost.cpu().numpy() + # save to csv: + np.savetxt(save_path + ".csv", cost, delimiter=",") + + # if cost.shape[0] > 1: + colormap = plt.cm.winter + plt.gca().set_prop_cycle(plt.cycler("color", colormap(np.linspace(0, 1, cost.shape[0])))) + x = [i for i in range(cost.shape[-1])] + for i in range(cost.shape[0]): + plt.plot(x, cost[i], label="seed_" + str(i)) + plt.tight_layout() + # plt.title(title) + plt.xlabel("iteration") + plt.ylabel("cost") + if 
log_scale: + plt.yscale("log") + plt.grid() + # plt.legend() + plt.tight_layout() + plt.savefig(save_path + ".pdf") + plt.close() + + +def plot_traj(act_seq: JointState, dt=0.25, title="", save_path="plot.png", sma_filter=False): + # Third Party + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(4, 1, figsize=(5, 8), sharex=True) + t_steps = np.linspace(0, act_seq.position.shape[0] * dt, act_seq.position.shape[0]) + # compute acceleration from finite difference of velocity: + # act_seq.acceleration = (torch.roll(act_seq.velocity, -1, 0) - act_seq.velocity) / dt + # act_seq.acceleration = ( act_seq.velocity - torch.roll(act_seq.velocity, 1, 0)) / dt + # act_seq.acceleration[0,:] = 0.0 + # act_seq.jerk = ( act_seq.acceleration - torch.roll(act_seq.acceleration, 1, 0)) / dt + # act_seq.jerk[0,:] = 0.0 + if sma_filter: + kernel = 5 + sma = torch.nn.AvgPool1d(kernel_size=kernel, stride=1, padding=2, ceil_mode=False).cuda() + # act_seq.jerk = sma(act_seq.jerk) + # act_seq.acceleration[-1,:] = 0.0 + for i in range(act_seq.position.shape[-1]): + ax[0].plot(t_steps, act_seq.position[:, i].cpu(), "-", label=str(i)) + # act_seq.velocity[1:-1, i] = sma(act_seq.velocity[:,i].view(1,-1)).squeeze()#@[1:-2] + + ax[1].plot(t_steps[: act_seq.velocity.shape[0]], act_seq.velocity[:, i].cpu(), "-") + if sma_filter: + act_seq.acceleration[:, i] = sma( + act_seq.acceleration[:, i].view(1, -1) + ).squeeze() # @[1:-2] + + ax[2].plot(t_steps[: act_seq.acceleration.shape[0]], act_seq.acceleration[:, i].cpu(), "-") + if sma_filter: + act_seq.jerk[:, i] = sma(act_seq.jerk[:, i].view(1, -1)).squeeze() # @[1:-2]\ + + ax[3].plot(t_steps[: act_seq.jerk.shape[0]], act_seq.jerk[:, i].cpu(), "-") + ax[0].set_title(title + " dt=" + "{:.3f}".format(dt)) + ax[3].set_xlabel("Time(s)") + ax[3].set_ylabel("Jerk rad. s$^{-3}$") + ax[0].set_ylabel("Position rad.") + ax[1].set_ylabel("Velocity rad. s$^{-1}$") + ax[2].set_ylabel("Acceleration rad. 
s$^{-2}$") + ax[0].grid() + ax[1].grid() + ax[2].grid() + ax[3].grid() + # ax[0].legend(loc="upper right") + ax[0].legend(bbox_to_anchor=(0.5, 1.6), loc="upper center", ncol=4) + plt.tight_layout() + plt.savefig(save_path) + plt.close() + # plt.legend() + + +def load_curobo( + n_cubes: int, + enable_debug: bool = False, + tsteps: int = 30, + trajopt_seeds: int = 4, + mpinets: bool = False, + graph_mode: bool = True, + mesh_mode: bool = False, + cuda_graph: bool = True, + collision_buffer: float = -0.01, +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = collision_buffer + robot_cfg["kinematics"]["collision_spheres"] = "spheres/franka_mesh.yml" + robot_cfg["kinematics"]["collision_link_names"].remove("attached_object") + + # del robot_cfg["kinematics"] + + ik_seeds = 30 # 500 + if graph_mode: + trajopt_seeds = 4 + if trajopt_seeds >= 14: + ik_seeds = max(100, trajopt_seeds * 4) + if mpinets: + robot_cfg["kinematics"]["lock_joints"] = { + "panda_finger_joint1": 0.025, + "panda_finger_joint2": -0.025, + } + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_obb_world() + interpolation_steps = 2000 + c_checker = CollisionCheckerType.PRIMITIVE + c_cache = {"obb": n_cubes} + if mesh_mode: + c_checker = CollisionCheckerType.MESH + c_cache = {"mesh": n_cubes} + world_cfg = world_cfg.get_mesh_world() + if graph_mode: + interpolation_steps = 100 + + robot_cfg_instance = RobotConfig.from_dict(robot_cfg, tensor_args=TensorDeviceType()) + + K = robot_cfg_instance.kinematics.kinematics_config.joint_limits + K.position[0, :] -= 0.1 + K.position[1, :] += 0.1 + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg_instance, + world_cfg, + trajopt_tsteps=tsteps, + collision_checker_type=c_checker, + use_cuda_graph=cuda_graph, + collision_cache=c_cache, + position_threshold=0.005, # 5 mm + rotation_threshold=0.05, + num_ik_seeds=ik_seeds, + num_graph_seeds=trajopt_seeds, + num_trajopt_seeds=trajopt_seeds, + interpolation_dt=0.025, + store_ik_debug=enable_debug, + store_trajopt_debug=enable_debug, + interpolation_steps=interpolation_steps, + collision_activation_distance=0.03, + trajopt_dt=0.25, + finetune_dt_scale=1.05, # 1.05, + maximum_trajectory_dt=0.1, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=True, warmup_js_trajopt=False) + return mg, robot_cfg + + +def benchmark_mb( + write_usd=False, + save_log=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + override_tsteps: Optional[int] = None, + save_kpi=False, + graph_mode=False, + args=None, +): + # load dataset: + + interpolation_dt = 0.02 + # mpinets_data = True + # if mpinets_data: + file_paths = [motion_benchmaker_raw, mpinets_raw][:] + if args.demo: + file_paths = [demo_raw] + + # else:22 + # file_paths = [get_mb_dataset_path()][:1] + enable_debug = save_log or plot_cost + all_files = [] + og_tsteps = 32 + if override_tsteps is not None: + og_tsteps = override_tsteps + + og_trajopt_seeds = 12 + for file_path in file_paths: + all_groups = [] + mpinets_data = False + problems = file_path() + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + for key, v in tqdm(problems.items()): + tsteps = og_tsteps + trajopt_seeds = og_trajopt_seeds + + if "cage_panda" in key: + trajopt_seeds = 16 + # else: + # continue + if "table_under_pick_panda" in key: + tsteps = 44 + trajopt_seeds = 28 + + if "cubby_task_oriented" in 
key and "merged" not in key: + trajopt_seeds = 16 + if "dresser_task_oriented" in key: + trajopt_seeds = 16 + scene_problems = problems[key] # [:4] # [:1] # [:20] # [0:10] + n_cubes = check_problems(scene_problems) + # torch.cuda.empty_cache() + mg, robot_cfg = load_curobo( + n_cubes, + enable_debug, + tsteps, + trajopt_seeds, + mpinets_data, + graph_mode, + args.mesh, + not args.disable_cuda_graph, + collision_buffer=args.collision_buffer, + ) + m_list = [] + i = 0 + ik_fail = 0 + for problem in tqdm(scene_problems, leave=False): + i += 1 + if problem["collision_buffer_ik"] < 0.0: + # print("collision_ik:", problem["collision_buffer_ik"]) + continue + # if i != 269: # 226 + # continue + + plan_config = MotionGenPlanConfig( + max_attempts=100, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000, + enable_graph_attempt=3, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=graph_mode, + timeout=60, + enable_opt=not graph_mode, + ) + # if "table_under_pick_panda" in key: + # plan_config.enable_graph = True + # plan_config.partial_ik_opt = False + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + problem_name = "d_" + key + "_" + str(i) + + # reset planner + mg.reset(reset_seed=False) + if args.mesh: + world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world() + else: + world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world() + mg.world_coll_checker.clear_cache() + mg.update_world(world) + # continue + # load obstacles + + # run planner + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + goal_pose = Pose.from_list(pose) + + result = mg.plan_single( + start_state, + goal_pose, + plan_config, + ) + if result.status == "IK Fail": + ik_fail += 1 + # rint(plan_config.enable_graph, plan_config.enable_graph_attempt) + problem["solution"] = None + if plan_config.enable_finetune_trajopt: + problem_name = key + "_" + str(i) + else: + problem_name = "noft_" + key + "_" + str(i) + problem_name = "nosw_" + problem_name + if write_usd or save_log: + # CuRobo + from curobo.util.usd_helper import UsdHelper + + world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) + + gripper_mesh = Mesh( + name="target_gripper", + file_path=join_path( + get_assets_path(), + "robot/franka_description/meshes/visual/hand.dae", + ), + color=[0.0, 0.8, 0.1, 1.0], + pose=pose, + ) + world.add_obstacle(gripper_mesh) + # get costs: + if plot_cost: + dt = 0.5 + problem_name = "approx_wolfe_p" + problem_name + if result.optimized_dt is not None: + dt = result.optimized_dt.item() + if "trajopt_result" in result.debug_info: + success = result.success.item() + traj_cost = ( + # result.debug_info["trajopt_result"].debug_info["solver"]["cost"][0] + result.debug_info["trajopt_result"].debug_info["solver"]["cost"][-1] + ) + # print(traj_cost[0]) + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_cost"), + log_scale=False, + ) + if "finetune_trajopt_result" in result.debug_info: + traj_cost = result.debug_info["finetune_trajopt_result"].debug_info[ + "solver" + ]["cost"][0] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_ft_cost"), + log_scale=False, + ) + if result.success.item(): + # print("GT: ", 
result.graph_time) + q_traj = result.get_interpolated_plan() + problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist() + problem["solution"] = { + "position": result.get_interpolated_plan() + .position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.get_interpolated_plan() + .velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.get_interpolated_plan() + .acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.get_interpolated_plan() + .jerk.cpu() + .squeeze() + .numpy() + .tolist(), + "dt": interpolation_dt, + } + # print(problem["solution"]["position"]) + # exit() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "opt_traj": { + "position": result.optimized_plan.position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.optimized_plan.velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.optimized_plan.acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.optimized_plan.jerk.cpu().squeeze().numpy().tolist(), + "dt": result.optimized_dt.item(), + }, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + # print( + # "T: ", + # result.motion_time.item(), + # result.optimized_dt.item(), + # (len(problem["solution"]["position"]) - 1 ) * result.interpolation_dt, + # result.interpolation_dt, + # ) + # exit() + reached_pose = mg.compute_kinematics(result.optimized_plan[-1]).ee_pose + rot_error = goal_pose.angular_distance(reached_pose) * 100.0 + if args.graph: + solve_time = result.graph_time + else: + solve_time = result.solve_time + current_metrics = CuroboMetrics( + skip=False, + success=True, + time=result.total_time, + collision=False, + joint_limit_violation=False, + self_collision=False, + position_error=result.position_error.item() * 1000.0, + orientation_error=rot_error.item(), + eef_position_path_length=10, + eef_orientation_path_length=10, + attempts=result.attempts, + motion_time=result.motion_time.item(), + solve_time=solve_time, + ) + + if write_usd: + # CuRobo + + q_traj = result.get_interpolated_plan() + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_cfg, + world, + start_state, + q_traj, + dt=result.interpolation_dt, + save_path=join_path("log/usd/", problem_name) + ".usd", + interpolation_steps=1, + write_robot_usd_path="log/usd/assets/", + robot_usd_local_reference="assets/", + base_frame="/world_" + problem_name, + visualize_robot_spheres=True, + ) + + if write_plot: + problem_name = problem_name + plot_traj( + result.optimized_plan, + result.optimized_dt.item(), + # result.get_interpolated_plan(), + # result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + ".pdf"), + ) + plot_traj( + # result.optimized_plan, + # result.optimized_dt.item(), + result.get_interpolated_plan(), + result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + "_int.pdf"), + ) + # exit() + + m_list.append(current_metrics) + all_groups.append(current_metrics) + elif result.valid_query: + # print("fail") + # print(result.status) + current_metrics = CuroboMetrics() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": 
result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + + m_list.append(current_metrics) + all_groups.append(current_metrics) + else: + # print("invalid: " + problem_name) + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + + problem["solution_debug"] = debug + if False: + world.save_world_as_mesh(problem_name + ".obj") + + q_traj = start_state # .unsqueeze(0) + # CuRobo + from curobo.util.usd_helper import UsdHelper + + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_cfg, + world, + start_state, + q_traj, + dt=result.interpolation_dt, + save_path=join_path("log/usd/", problem_name) + ".usd", + interpolation_steps=1, + write_robot_usd_path="log/usd/assets/", + robot_usd_local_reference="assets/", + base_frame="/world_" + problem_name, + visualize_robot_spheres=True, + ) + if save_log: # and not result.success.item(): + # print("save log") + UsdHelper.write_motion_gen_log( + result, + robot_cfg, + world, + start_state, + Pose.from_list(pose), + join_path("log/usd/", problem_name) + "_debug", + write_ik=False, + write_trajopt=True, + visualize_robot_spheres=False, + grid_space=2, + ) + # exit() + + g_m = CuroboGroupMetrics.from_list(m_list) + if not args.kpi: + print( + key, + f"{g_m.success:2.2f}", + # g_m.motion_time, + g_m.time.mean, + # g_m.time.percent_75, + g_m.time.percent_98, + g_m.position_error.percent_98, + # g_m.position_error.median, + g_m.orientation_error.percent_98, + # g_m.orientation_error.median, + ) # , g_m.attempts) + print(g_m.attempts) + # print("MT: ", g_m.motion_time) + # $print(ik_fail) + # exit() + # print(g_m.position_error, g_m.orientation_error) + + g_m = CuroboGroupMetrics.from_list(all_groups) + if not args.kpi: + print( + "All: ", + f"{g_m.success:2.2f}", + g_m.motion_time.percent_98, + g_m.time.mean, + g_m.time.percent_75, + g_m.position_error.percent_75, + g_m.orientation_error.percent_75, + ) # g_m.time, g_m.attempts) + # print("MT: ", g_m.motion_time) + + # print(g_m.position_error, g_m.orientation_error) + + # exit() + if write_benchmark: + if not mpinets_data: + write_yaml(problems, args.file_name + "_mb_solution.yaml") + else: + write_yaml(problems, args.file_name + "_mpinets_solution.yaml") + all_files += all_groups + g_m = CuroboGroupMetrics.from_list(all_files) + # print(g_m.success, g_m.time, g_m.attempts, g_m.position_error, g_m.orientation_error) + print("######## FULL SET ############") + print("All: ", f"{g_m.success:2.2f}") + print("MT: ", g_m.motion_time) + print("PT:", g_m.time) + print("ST: ", g_m.solve_time) + print("position error (mm): ", g_m.position_error) + print("orientation error(%): ", g_m.orientation_error) + + if args.kpi: + kpi_data = { + "Success": g_m.success, + "Planning Time Mean": float(g_m.time.mean), + "Planning Time Std": float(g_m.time.std), + "Planning Time Median": float(g_m.time.median), + "Planning Time 75th": float(g_m.time.percent_75), + "Planning Time 98th": float(g_m.time.percent_98), + } + write_yaml(kpi_data, join_path(args.save_path, args.file_name + ".yml")) + + # run on mb dataset: + + +def check_problems(all_problems): + n_cube = 0 + for problem in all_problems: + cache = ( + 
WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world().get_cache_dict() + ) + n_cube = max(n_cube, cache["obb"]) + return n_cube + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default=".", + help="path to save file", + ) + parser.add_argument( + "--file_name", + type=str, + default="mg_curobo_", + help="File name prefix to use to save benchmark results", + ) + parser.add_argument( + "--collision_buffer", + type=float, + default=-0.00, # in meters + help="Robot collision buffer", + ) + + parser.add_argument( + "--graph", + action="store_true", + help="When True, runs only geometric planner", + default=False, + ) + parser.add_argument( + "--mesh", + action="store_true", + help="When True, converts obstacles to meshes", + default=False, + ) + parser.add_argument( + "--kpi", + action="store_true", + help="When True, saves minimal metrics", + default=False, + ) + + parser.add_argument( + "--demo", + action="store_true", + help="When True, runs only on small dataaset", + default=False, + ) + parser.add_argument( + "--disable_cuda_graph", + action="store_true", + help="When True, disable cuda graph during benchmarking", + default=False, + ) + parser.add_argument( + "--write_benchmark", + action="store_true", + help="When True, writes paths to file", + default=False, + ) + + args = parser.parse_args() + + setup_curobo_logger("error") + benchmark_mb( + save_log=False, + write_usd=False, + write_plot=False, + write_benchmark=args.write_benchmark, + plot_cost=False, + save_kpi=args.kpi, + graph_mode=args.graph, + args=args, + ) diff --git a/benchmark/curobo_nvblox_benchmark.py b/benchmark/curobo_nvblox_benchmark.py new file mode 100644 index 00000000..5c3db341 --- /dev/null +++ b/benchmark/curobo_nvblox_benchmark.py @@ -0,0 +1,558 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Standard Library +import argparse +from copy import deepcopy +from typing import Optional + +# Third Party +import matplotlib.pyplot as plt +import numpy as np +import torch +from metrics import CuroboGroupMetrics, CuroboMetrics +from nvblox_torch.datasets.mesh_dataset import MeshDataset +from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Mesh +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +np.random.seed(0) + + +def plot_cost_iteration(cost: torch.Tensor, save_path="cost", title="", log_scale=False): + fig = plt.figure(figsize=(5, 4)) + cost = cost.cpu().numpy() + # save to csv: + np.savetxt(save_path + ".csv", cost, delimiter=",") + + # if cost.shape[0] > 1: + colormap = plt.cm.winter + plt.gca().set_prop_cycle(plt.cycler("color", colormap(np.linspace(0, 1, cost.shape[0])))) + x = [i for i in range(cost.shape[-1])] + for i in range(cost.shape[0]): + plt.plot(x, cost[i], label="seed_" + str(i)) + plt.tight_layout() + # plt.title(title) + plt.xlabel("iteration") + plt.ylabel("cost") + if log_scale: + plt.yscale("log") + plt.grid() + # plt.legend() + plt.tight_layout() + plt.savefig(save_path + ".pdf") + plt.close() + + +def plot_traj(act_seq: JointState, dt=0.25, title="", save_path="plot.png", sma_filter=False): + fig, ax = plt.subplots(4, 1, figsize=(5, 8), sharex=True) + t_steps = np.linspace(0, act_seq.position.shape[0] * dt, act_seq.position.shape[0]) + + if sma_filter: + kernel = 5 + sma = torch.nn.AvgPool1d(kernel_size=kernel, stride=1, padding=2, ceil_mode=False).cuda() + + for i in range(act_seq.position.shape[-1]): + ax[0].plot(t_steps, act_seq.position[:, i].cpu(), "-", label=str(i)) + + ax[1].plot(t_steps[: act_seq.velocity.shape[0]], act_seq.velocity[:, i].cpu(), "-") + if sma_filter: + act_seq.acceleration[:, i] = sma( + act_seq.acceleration[:, i].view(1, -1) + ).squeeze() # @[1:-2] + + ax[2].plot(t_steps[: act_seq.acceleration.shape[0]], act_seq.acceleration[:, i].cpu(), "-") + if sma_filter: + act_seq.jerk[:, i] = sma(act_seq.jerk[:, i].view(1, -1)).squeeze() # @[1:-2]\ + + ax[3].plot(t_steps[: act_seq.jerk.shape[0]], act_seq.jerk[:, i].cpu(), "-") + ax[0].set_title(title + " dt=" + "{:.3f}".format(dt)) + ax[3].set_xlabel("Time(s)") + ax[3].set_ylabel("Jerk rad. s$^{-3}$") + ax[0].set_ylabel("Position rad.") + ax[1].set_ylabel("Velocity rad. s$^{-1}$") + ax[2].set_ylabel("Acceleration rad. 
s$^{-2}$") + ax[0].grid() + ax[1].grid() + ax[2].grid() + ax[3].grid() + ax[0].legend(bbox_to_anchor=(0.5, 1.6), loc="upper center", ncol=4) + plt.tight_layout() + plt.savefig(save_path) + plt.close() + + +def load_curobo( + n_cubes: int, + enable_debug: bool = False, + tsteps: int = 30, + trajopt_seeds: int = 4, + mpinets: bool = False, + graph_mode: bool = False, + cuda_graph: bool = True, +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.015 + + ik_seeds = 30 # 500 + if graph_mode: + trajopt_seeds = 4 + if trajopt_seeds >= 14: + ik_seeds = max(100, trajopt_seeds * 4) + if mpinets: + robot_cfg["kinematics"]["lock_joints"] = { + "panda_finger_joint1": 0.025, + "panda_finger_joint2": -0.025, + } + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_nvblox_online.yml")) + ) + interpolation_steps = 2000 + if graph_mode: + interpolation_steps = 100 + robot_cfg_instance = RobotConfig.from_dict(robot_cfg, tensor_args=TensorDeviceType()) + K = robot_cfg_instance.kinematics.kinematics_config.joint_limits + K.position[0, :] -= 0.1 + K.position[1, :] += 0.1 + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg_instance, + world_cfg, + trajopt_tsteps=tsteps, + collision_checker_type=CollisionCheckerType.BLOX, + use_cuda_graph=cuda_graph, + position_threshold=0.005, # 0.5 cm + rotation_threshold=0.05, + num_ik_seeds=ik_seeds, + num_graph_seeds=trajopt_seeds, + num_trajopt_seeds=trajopt_seeds, + interpolation_dt=0.025, + store_ik_debug=enable_debug, + store_trajopt_debug=enable_debug, + interpolation_steps=interpolation_steps, + collision_activation_distance=0.025, + state_finite_difference_mode="CENTRAL", + trajopt_dt=0.25, + minimize_jerk=True, + finetune_dt_scale=1.05, + maximum_trajectory_dt=0.1, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=True, warmup_js_trajopt=False) + return mg, robot_cfg + + +def benchmark_mb( + write_usd=False, + save_log=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + override_tsteps: Optional[int] = None, + args=None, +): + # load dataset: + graph_mode = args.graph + interpolation_dt = 0.02 + file_paths = [demo_raw, motion_benchmaker_raw, mpinets_raw][2:] + + enable_debug = save_log or plot_cost + all_files = [] + og_tsteps = 32 + if override_tsteps is not None: + og_tsteps = override_tsteps + + og_trajopt_seeds = 12 + + if args.graph: + og_trajopt_seeds = 4 + for file_path in file_paths: + all_groups = [] + mpinets_data = False + problems = file_path() + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + + mg, robot_cfg = load_curobo( + 1, + enable_debug, + og_tsteps, + og_trajopt_seeds, + mpinets_data, + graph_mode, + not args.disable_cuda_graph, + ) + + for key, v in tqdm(problems.items()): + scene_problems = problems[key][:] # [:1] # [:20] # [0:10] + m_list = [] + i = 0 + ik_fail = 0 + for problem in tqdm(scene_problems, leave=False): + i += 1 + + plan_config = MotionGenPlanConfig( + max_attempts=10, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000, + enable_graph_attempt=3, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=graph_mode, + timeout=60, + enable_opt=not graph_mode, + ) + + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + problem_name = "d_" + key + "_" + str(i) + + # reset planner + mg.reset(reset_seed=False) + world = 
WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_mesh_world( + merge_meshes=True + ) + mesh = world.mesh[0].get_trimesh_mesh() + + # world.save_world_as_mesh(problem_name + ".stl") + mg.world_coll_checker.clear_cache() + m_dataset = MeshDataset( + None, n_frames=200, image_size=640, save_data_dir=None, trimesh_mesh=mesh + ) + tensor_args = mg.tensor_args + for j in range(len(m_dataset)): + data = m_dataset[j] + cam_obs = CameraObservation( + rgb_image=tensor_args.to_device(data["rgba"]), + depth_image=tensor_args.to_device(data["depth"]), + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=mg.tensor_args.device)), + ) + cam_obs = cam_obs + + mg.add_camera_frame(cam_obs, "world") + + mg.process_camera_frames("world", False) + torch.cuda.synchronize() + mg.world_coll_checker.update_blox_hashes() + torch.cuda.synchronize() + if save_log or write_usd: + # nvblox_obs = mg.world_coll_checker.get_mesh_from_blox_layer("world", mode="nvblox") + + # nvblox_obs.save_as_mesh("debug_tsdf.obj") + nvblox_obs = mg.world_coll_checker.get_mesh_from_blox_layer( + "world", mode="voxel" + ) + nvblox_obs.color = [0.0, 0.0, 0.8, 0.8] + # nvblox_obs.save_as_mesh("debug_voxel_occ.obj") + # exit() + nvblox_obs.name = "nvblox_world" + world.add_obstacle(nvblox_obs) + # run planner + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + if result.status == "IK Fail": + ik_fail += 1 + problem["solution"] = None + if plan_config.enable_finetune_trajopt: + problem_name = key + "_" + str(i) + else: + problem_name = "noft_" + key + "_" + str(i) + problem_name = "nvblox_" + problem_name + if write_usd or save_log: + # CuRobo + from curobo.util.usd_helper import UsdHelper + + world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) + if len(world.mesh) > 1: + world.mesh[1].color = [125 / 255, 255 / 255, 70.0 / 255, 1.0] + gripper_mesh = Mesh( + name="target_gripper", + file_path=join_path( + get_assets_path(), + "robot/franka_description/meshes/visual/hand.dae", + ), + color=[0.0, 0.8, 0.1, 1.0], + pose=pose, + ) + world.add_obstacle(gripper_mesh) + + # get costs: + if plot_cost: + dt = 0.5 + problem_name = "approx_wolfe_p" + problem_name + if result.optimized_dt is not None: + dt = result.optimized_dt.item() + if "trajopt_result" in result.debug_info: + success = result.success.item() + traj_cost = result.debug_info["trajopt_result"].debug_info["solver"][ + "cost" + ][-1] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_cost"), + log_scale=False, + ) + if "finetune_trajopt_result" in result.debug_info: + traj_cost = result.debug_info["finetune_trajopt_result"].debug_info[ + "solver" + ]["cost"][0] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_ft_cost"), + log_scale=False, + ) + if result.success.item(): + q_traj = result.get_interpolated_plan() + problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist() + problem["solution"] = { + "position": result.get_interpolated_plan() + .position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.get_interpolated_plan() + .velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.get_interpolated_plan() + 
.acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.get_interpolated_plan() + .jerk.cpu() + .squeeze() + .numpy() + .tolist(), + "dt": interpolation_dt, + } + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "opt_traj": { + "position": result.optimized_plan.position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.optimized_plan.velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.optimized_plan.acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.optimized_plan.jerk.cpu().squeeze().numpy().tolist(), + "dt": result.optimized_dt.item(), + }, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + current_metrics = CuroboMetrics( + skip=False, + success=True, + time=result.total_time, + collision=False, + joint_limit_violation=False, + self_collision=False, + position_error=result.position_error.item() * 100.0, + orientation_error=result.rotation_error.item() * 100.0, + eef_position_path_length=10, + eef_orientation_path_length=10, + attempts=result.attempts, + motion_time=result.motion_time.item(), + solve_time=result.solve_time, + ) + + if write_usd: + # CuRobo + + q_traj = result.get_interpolated_plan() + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_cfg, + world, + start_state, + q_traj, + dt=result.interpolation_dt, + save_path=join_path("log/usd/", problem_name) + ".usd", + interpolation_steps=1, + write_robot_usd_path="log/usd/assets/", + robot_usd_local_reference="assets/", + base_frame="/world_" + problem_name, + visualize_robot_spheres=True, + ) + + if write_plot: + problem_name = problem_name + plot_traj( + result.optimized_plan, + result.optimized_dt.item(), + # result.get_interpolated_plan(), + # result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + ".pdf"), + ) + plot_traj( + # result.optimized_plan, + # result.optimized_dt.item(), + result.get_interpolated_plan(), + result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + "_int.pdf"), + ) + + m_list.append(current_metrics) + all_groups.append(current_metrics) + elif result.valid_query: + current_metrics = CuroboMetrics() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + + m_list.append(current_metrics) + all_groups.append(current_metrics) + else: + print("invalid: " + problem_name) + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + if save_log: + UsdHelper.write_motion_gen_log( + result, + robot_cfg, + world, + start_state, + Pose.from_list(pose), + join_path("log/usd/", problem_name) + "_debug", + write_ik=not result.success.item(), + write_trajopt=True, + visualize_robot_spheres=True, + grid_space=2, + ) + g_m = 
CuroboGroupMetrics.from_list(m_list) + print( + key, + f"{g_m.success:2.2f}", + g_m.time.mean, + g_m.time.percent_98, + g_m.position_error.percent_98, + g_m.orientation_error.percent_98, + ) + g_m = CuroboGroupMetrics.from_list(all_groups) + print( + "All: ", + f"{g_m.success:2.2f}", + g_m.motion_time.percent_98, + g_m.time.mean, + g_m.time.percent_75, + g_m.position_error.percent_75, + g_m.orientation_error.percent_75, + ) + print(g_m.attempts) + if write_benchmark: + if not mpinets_data: + write_yaml(problems, "mb_curobo_solution_nvblox.yaml") + else: + write_yaml(problems, "mpinets_curobo_solution_nvblox.yaml") + all_files += all_groups + g_m = CuroboGroupMetrics.from_list(all_files) + print("######## FULL SET ############") + print("All: ", f"{g_m.success:2.2f}") + print("MT: ", g_m.motion_time) + print("PT:", g_m.time) + print("ST: ", g_m.solve_time) + print("accuracy: ", g_m.position_error, g_m.orientation_error) + + +if __name__ == "__main__": + setup_curobo_logger("error") + parser = argparse.ArgumentParser() + + parser.add_argument( + "--graph", + action="store_true", + help="When True, runs only geometric planner", + default=False, + ) + parser.add_argument( + "--disable_cuda_graph", + action="store_true", + help="When True, disable cuda graph during benchmarking", + default=False, + ) + args = parser.parse_args() + benchmark_mb( + save_log=False, + write_usd=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + args=args, + ) diff --git a/benchmark/curobo_nvblox_profile.py b/benchmark/curobo_nvblox_profile.py new file mode 100644 index 00000000..cd554882 --- /dev/null +++ b/benchmark/curobo_nvblox_profile.py @@ -0,0 +1,193 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
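The grouped statistics printed above (mean, 75th/98th percentiles of time and error) come from `CuroboGroupMetrics`, defined in `benchmark/metrics.py` later in this diff. A small sketch of that aggregation, with made-up timings:

```python
# Hedged sketch with illustrative values; CuroboMetrics/CuroboGroupMetrics are
# the dataclasses from benchmark/metrics.py, where unset fields keep defaults.
from metrics import CuroboGroupMetrics, CuroboMetrics

per_problem = [
    CuroboMetrics(skip=False, success=True, time=0.21),
    CuroboMetrics(skip=False, success=True, time=0.35),
    CuroboMetrics(),  # failed query: success stays False, time stays np.inf
]
group = CuroboGroupMetrics.from_list(per_problem)
# time statistics are computed over successful, unskipped problems only
print(f"{group.success:2.2f}", group.time.mean, group.time.percent_98)
```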
+# + +# Standard Library +import time +from typing import Any, Dict, List + +# Third Party +import numpy as np +import torch +import torch.autograd.profiler as profiler +from robometrics.datasets import demo_raw +from torch.profiler import ProfilerActivity, profile, record_function +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Mesh +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, +) +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +# torch.set_num_threads(8) +# ttorch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +np.random.seed(10) +# Third Party +from nvblox_torch.datasets.mesh_dataset import MeshDataset + +# CuRobo +from curobo.types.camera import CameraObservation + + +def load_curobo(n_cubes: int, enable_log: bool = False): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.015 + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + "collision_nvblox_online.yml", + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.BLOX, + use_cuda_graph=False, + position_threshold=0.005, + rotation_threshold=0.05, + num_ik_seeds=30, + num_trajopt_seeds=12, + interpolation_dt=0.02, + # grad_trajopt_iters=200, + store_ik_debug=enable_log, + store_trajopt_debug=enable_log, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=False) + # print("warmed up") + # exit() + return mg + + +def benchmark_mb(write_usd=False, save_log=False): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + spheres = robot_cfg["kinematics"]["collision_spheres"] + if isinstance(spheres, str): + spheres = load_yaml(join_path(get_robot_configs_path(), spheres))["collision_spheres"] + + plan_config = MotionGenPlanConfig( + max_attempts=2, + enable_graph_attempt=3, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=False, + ) + # load dataset: + + file_paths = [demo_raw] + all_files = [] + for file_path in file_paths: + all_groups = [] + + problems = file_path() + + for key, v in tqdm(problems.items()): + # if key not in ["table_under_pick_panda"]: + # continue + scene_problems = problems[key] # [:2] + n_cubes = check_problems(scene_problems) + mg = load_curobo(n_cubes, save_log) + m_list = [] + i = 0 + for problem in tqdm(scene_problems, leave=False): + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + # reset planner + mg.reset(reset_seed=False) + world = WorldConfig.from_dict(problem["obstacles"]).get_mesh_world( + merge_meshes=True + ) + # clear cache: + mesh = world.mesh[0].get_trimesh_mesh() + mg.clear_world_cache() + obs = [] + # get camera_observations: + m_dataset = MeshDataset( + None, n_frames=200, image_size=640, save_data_dir=None, trimesh_mesh=mesh + ) + obs = [] + tensor_args = mg.tensor_args + for j in range(len(m_dataset)): + with profiler.record_function("nvblox/create_camera_images"): + data = m_dataset[j] + cam_obs = CameraObservation( + 
rgb_image=tensor_args.to_device(data["rgba"]), + depth_image=tensor_args.to_device(data["depth"]), + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=mg.tensor_args.device)), + ) + obs.append(cam_obs) + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + for j in range(len(obs)): + cam_obs = obs[j] + cam_obs.rgb_image = None + with profiler.record_function("nvblox/add_camera_images"): + mg.add_camera_frame(cam_obs, "world") + + with profiler.record_function("nvblox/process_camera_images"): + mg.process_camera_frames("world", False) + mg.world_coll_checker.update_blox_hashes() + + # run planner + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + print("Exporting the trace..") + prof.export_chrome_trace("log/trace/trajopt_global_nvblox.json") + print(result.success, result.status) + exit() + + +def get_metrics_obstacles(obs: Dict[str, List[Any]]): + obs_list = [] + if "cylinder" in obs and len(obs["cylinder"].items()) > 0: + for _, vi in enumerate(obs["cylinder"].values()): + obs_list.append( + Cylinder( + np.ravel(vi["pose"][:3]), vi["radius"], vi["height"], np.ravel(vi["pose"][3:]) + ) + ) + if "cuboid" in obs and len(obs["cuboid"].items()) > 0: + for _, vi in enumerate(obs["cuboid"].values()): + obs_list.append( + Cuboid(np.ravel(vi["pose"][:3]), np.ravel(vi["dims"]), np.ravel(vi["pose"][3:])) + ) + return obs_list + + +def check_problems(all_problems): + n_cube = 0 + for problem in all_problems: + cache = WorldConfig.from_dict(problem["obstacles"]).get_obb_world().get_cache_dict() + n_cube = max(n_cube, cache["obb"]) + return n_cube + + +if __name__ == "__main__": + setup_curobo_logger("error") + benchmark_mb() diff --git a/benchmark/curobo_profile.py b/benchmark/curobo_profile.py new file mode 100644 index 00000000..7e72692d --- /dev/null +++ b/benchmark/curobo_profile.py @@ -0,0 +1,188 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
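Both profile scripts in this diff wrap the planner call in the same `torch.profiler` pattern: label phases with `record_function`, capture CPU and CUDA activity, and export a Chrome trace. A condensed, self-contained version follows; the workload and output path are placeholders, not part of the benchmark.

```python
import torch
import torch.autograd.profiler as profiler
from torch.profiler import ProfilerActivity, profile

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    with profiler.record_function("example/matmul"):  # named span in the trace
        a = torch.randn(1024, 1024, device="cuda")
        b = a @ a
torch.cuda.synchronize()
prof.export_chrome_trace("log/trace/example_trace.json")  # open in chrome://tracing
```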
+# + +# Standard Library +import argparse +import time +from typing import Any, Dict, List + +# Third Party +import numpy as np +import torch + +# from geometrout.primitive import Cuboid, Cylinder +# from geometrout.transform import SE3 +# from robometrics.robot import CollisionSphereConfig, Robot +from torch.profiler import ProfilerActivity, profile, record_function +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.geom.types import Mesh +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +# torch.set_num_threads(8) +# ttorch.use_deterministic_algorithms(True) +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +np.random.seed(10) +# Third Party +from robometrics.datasets import demo_raw + + +def load_curobo( + n_cubes: int, enable_log: bool = False, mesh_mode: bool = False, cuda_graph: bool = False +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = -0.015 + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_obb_world() + + c_checker = CollisionCheckerType.PRIMITIVE + c_cache = {"obb": n_cubes} + if mesh_mode: + c_checker = CollisionCheckerType.MESH + c_cache = {"mesh": n_cubes} + world_cfg = world_cfg.get_mesh_world() + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + trajopt_tsteps=32, + collision_checker_type=c_checker, + use_cuda_graph=cuda_graph, + collision_cache=c_cache, + ee_link_name="panda_hand", + position_threshold=0.005, + rotation_threshold=0.05, + num_ik_seeds=30, + num_trajopt_seeds=10, + interpolation_dt=0.02, + # grad_trajopt_iters=200, + store_ik_debug=enable_log, + store_trajopt_debug=enable_log, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=False) + return mg + + +def benchmark_mb(args): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + spheres = robot_cfg["kinematics"]["collision_spheres"] + if isinstance(spheres, str): + spheres = load_yaml(join_path(get_robot_configs_path(), spheres))["collision_spheres"] + + plan_config = MotionGenPlanConfig( + max_attempts=2, + enable_graph_attempt=3, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=False, + ) + # load dataset: + + file_paths = [demo_raw] + all_files = [] + for file_path in file_paths: + all_groups = [] + + problems = file_path() + + for key, v in tqdm(problems.items()): + # if key not in ["table_under_pick_panda"]: + # continue + scene_problems = problems[key] # [:2] + n_cubes = check_problems(scene_problems) + mg = load_curobo(n_cubes, False, args.mesh, args.cuda_graph) + m_list = [] + i = 0 + for problem in tqdm(scene_problems, leave=False): + q_start = problem["start"] + + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + # reset planner + mg.reset(reset_seed=False) + if args.mesh: + world = WorldConfig.from_dict(problem["obstacles"]).get_mesh_world() + + else: + world = WorldConfig.from_dict(problem["obstacles"]).get_obb_world() + + mg.update_world(world) 
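+ # When --mesh is set, obstacles stay as triangle meshes for the MESH collision
+ # checker configured in load_curobo(); otherwise they are baked into oriented
+ # bounding boxes for the PRIMITIVE checker (see c_checker/c_cache above).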
+ # continue + # load obstacles + + # run planner + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + print("Exporting the trace...") + prof.export_chrome_trace(join_path(args.save_path, args.file_name) + ".json") + print(result.success, result.status) + exit() + + +def check_problems(all_problems): + n_cube = 0 + for problem in all_problems: + cache = WorldConfig.from_dict(problem["obstacles"]).get_obb_world().get_cache_dict() + n_cube = max(n_cube, cache["obb"]) + return n_cube + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default="log/trace", + help="path to save file", + ) + parser.add_argument( + "--file_name", + type=str, + default="motion_gen_trace", + help="File name prefix to use to save benchmark results", + ) + parser.add_argument( + "--mesh", + action="store_true", + help="When True, converts obstacles to meshes", + default=False, + ) + parser.add_argument( + "--cuda_graph", + action="store_true", + help="When True, uses CUDA graph capture during profiling", + default=False, + ) + + args = parser.parse_args() + + setup_curobo_logger("error") + benchmark_mb(args) diff --git a/benchmark/curobo_python_profile.py b/benchmark/curobo_python_profile.py new file mode 100644 index 00000000..1af5bed7 --- /dev/null +++ b/benchmark/curobo_python_profile.py @@ -0,0 +1,180 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
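This next script, `curobo_python_profile.py`, complements the Chrome traces with `cProfile` dumps of startup cost. A sketch of writing and reading back such a dump; the profiled workload and file path are placeholders.

```python
# Hedged sketch: profile a stand-in workload with cProfile, dump a .prof file,
# and read it back with pstats (paths are illustrative).
import cProfile
import pstats

pr = cProfile.Profile()
pr.enable()
sum(i * i for i in range(100_000))  # stand-in for demo_motion_gen()
pr.disable()
pr.dump_stats("startup_example.prof")

# Inspect the dump: top five functions by cumulative time.
pstats.Stats("startup_example.prof").sort_stats("cumtime").print_stats(5)
```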
+# + +# Standard Library +import argparse +import cProfile +import time + +# Third Party +import torch +from torch.profiler import ProfilerActivity, profile, record_function + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_robot_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +def demo_motion_gen(): + # Standard Library + + st_time = time.time() + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=False, + num_trajopt_seeds=4, + num_graph_seeds=1, + evaluate_interpolated_trajectory=True, + interpolation_dt=0.01, + ) + motion_gen = MotionGen(motion_gen_config) + + # st_time = time.time() + motion_gen.warmup(batch=50, enable_graph=False, warmup_js_trajopt=False) + print("motion gen time:", time.time() - st_time) + + # print(time.time() - st_time) + return + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + print(retract_cfg) + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + result = motion_gen.plan( + start_state, retract_pose, enable_graph=True, enable_opt=False, max_attempts=1 + ) + print(result.optimized_plan.position.shape) + traj = result.get_interpolated_plan() # $.position.view(-1, 7) # optimized plan + print("Trajectory Generated: ", result.success, result.optimized_dt.item()) + + +def demo_basic_robot(): + st_time = time.time() + tensor_args = TensorDeviceType() + # load a urdf: + config_file = load_yaml(join_path(get_robot_path(), "franka.yml")) + + urdf_file = config_file["robot_cfg"]["kinematics"][ + "urdf_path" + ] # Send global path starting with "/" + base_link = config_file["robot_cfg"]["kinematics"]["base_link"] + ee_link = config_file["robot_cfg"]["kinematics"]["ee_link"] + robot_cfg = RobotConfig.from_basic(urdf_file, base_link, ee_link, tensor_args) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + print("base kin time:", time.time() - st_time) + return + # compute forward kinematics: + + # q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + # out = kin_model.get_state(q) + # here is the kinematics state: + # print(out) + + +def demo_full_config_robot(config_file): + st_time = time.time() + tensor_args = TensorDeviceType() + # load a urdf: + + robot_cfg = RobotConfig.from_dict(config_file, tensor_args) + + # kin_model = CudaRobotModel(robot_cfg.kinematics) + print("full kin time: ", time.time() - st_time) + # compute forward kinematics: + # q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + # out = kin_model.get_state(q) + # here is the kinematics state: + # print(out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + 
"--save_path", + type=str, + default="log/trace", + help="path to save file", + ) + parser.add_argument( + "--file_name", + type=str, + default="startup_trace", + help="File name prefix to use to save benchmark results", + ) + parser.add_argument( + "--motion_gen", + action="store_true", + help="When True, runs startup for motion generation", + default=False, + ) + parser.add_argument( + "--kinematics", + action="store_true", + help="When True, runs startup for kinematics", + default=True, + ) + + args = parser.parse_args() + + # cProfile.run('demo_motion_gen()') + config_file = load_yaml(join_path(get_robot_path(), "franka.yml"))["robot_cfg"] + + # Third Party + + if args.kinematics: + for _ in range(5): + demo_full_config_robot(config_file) + + pr = cProfile.Profile() + pr.enable() + demo_full_config_robot(config_file) + pr.disable() + filename = join_path(args.save_path, args.file_name) + "_kinematics_cprofile.prof" + pr.dump_stats(filename) + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + demo_full_config_robot(config_file) + filename = join_path(args.save_path, args.file_name) + "_kinematics_trace.json" + prof.export_chrome_trace(filename) + + if args.motion_gen: + for _ in range(5): + demo_motion_gen() + + pr = cProfile.Profile() + pr.enable() + demo_motion_gen() + pr.disable() + filename = join_path(args.save_path, args.file_name) + "_motion_gen_cprofile.prof" + pr.dump_stats(filename) + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + demo_motion_gen() + filename = join_path(args.save_path, args.file_name) + "_motion_gen_trace.json" + prof.export_chrome_trace(filename) diff --git a/benchmark/ik_benchmark.py b/benchmark/ik_benchmark.py new file mode 100644 index 00000000..a3798e38 --- /dev/null +++ b/benchmark/ik_benchmark.py @@ -0,0 +1,182 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Standard Library +import argparse +import time + +# Third Party +import numpy as np +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.geom.types import WorldConfig +from curobo.rollout.arm_base import ArmBase, ArmBaseConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.util_file import ( + get_motion_gen_robot_list, + get_robot_configs_path, + get_robot_list, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig + +torch.backends.cudnn.benchmark = True +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + + +def run_full_config_collision_free_ik( + robot_file, + world_file, + batch_size, + use_cuda_graph=False, + collision_free=True, + high_precision=False, +): + tensor_args = TensorDeviceType() + robot_data = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + if not collision_free: + robot_data["kinematics"]["collision_link_names"] = None + robot_data["kinematics"]["lock_joints"] = {} + robot_cfg = RobotConfig.from_dict(robot_data) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + position_threshold = 0.005 + if high_precision: + position_threshold = 0.001 + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=position_threshold, + num_seeds=30, + self_collision_check=collision_free, + self_collision_opt=collision_free, + tensor_args=tensor_args, + use_cuda_graph=use_cuda_graph, + high_precision=high_precision, + regularization=False, + # grad_iters=500, + ) + ik_solver = IKSolver(ik_config) + + for _ in range(3): + q_sample = ik_solver.sample_configs(batch_size) + while q_sample.shape[0] == 0: + q_sample = ik_solver.sample_configs(batch_size) + + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + + st_time = time.time() + result = ik_solver.solve_batch(goal) + torch.cuda.synchronize() + total_time = (time.time() - st_time) / q_sample.shape[0] + return ( + total_time, + 100.0 * torch.count_nonzero(result.success).item() / len(q_sample), + np.percentile(result.position_error[result.success].cpu().numpy(), 98).item(), + np.percentile(result.rotation_error[result.success].cpu().numpy(), 98).item(), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default=".", + help="path to save file", + ) + parser.add_argument( + "--high_precision", + action="store_true", + help="When True, enables IK for 1 mm precision, when False 5mm precision", + default=False, + ) + parser.add_argument( + "--file_name", + type=str, + default="ik", + help="File name prefix to use to save benchmark results", + ) + + args = parser.parse_args() + + b_list = [1, 10, 100, 1000][-1:] + + robot_list = get_motion_gen_robot_list() + + world_file = "collision_test.yml" + + print("running...") + data = { + "robot": [], + "IK time(ms)": [], + "Collision-Free IK time(ms)": [], + "Batch Size": [], + "Success IK": [], + "Success Collision-Free IK": [], + "Position Error(mm)": [], + "Orientation Error": [], + "Position Error Collision-Free IK(mm)": [], + "Orientation Error Collision-Free IK": [], + } + for robot_file in robot_list: + # create a sampler 
with dof: + for b_size in b_list: + # sample test configs: + + dt_cu_ik, succ, p_err, q_err = run_full_config_collision_free_ik( + robot_file, + world_file, + batch_size=b_size, + use_cuda_graph=True, + collision_free=False, + high_precision=args.high_precision, + ) + dt_cu_ik_cfree, success, p_err_c, q_err_c = run_full_config_collision_free_ik( + robot_file, + world_file, + batch_size=b_size, + use_cuda_graph=True, + collision_free=True, + ) + + # print(dt_cu/b_size, dt_cu_cg/b_size) + data["robot"].append(robot_file) + data["IK time(ms)"].append(dt_cu_ik * 1000.0) + data["Batch Size"].append(b_size) + data["Success Collision-Free IK"].append(success) + data["Success IK"].append(succ) + + data["Position Error(mm)"].append(p_err * 1000.0) + data["Orientation Error"].append(q_err) + data["Position Error Collision-Free IK(mm)"].append(p_err_c * 1000.0) + data["Orientation Error Collision-Free IK"].append(q_err_c) + + data["Collision-Free IK time(ms)"].append(dt_cu_ik_cfree * 1000.0) + write_yaml(data, join_path(args.save_path, args.file_name + ".yml")) + try: + # Third Party + import pandas as pd + + df = pd.DataFrame(data) + print(df) + df.to_csv(join_path(args.save_path, args.file_name + ".csv")) + except ImportError: + pass diff --git a/benchmark/kinematics_benchmark.py b/benchmark/kinematics_benchmark.py new file mode 100644 index 00000000..03d4f6d1 --- /dev/null +++ b/benchmark/kinematics_benchmark.py @@ -0,0 +1,231 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +import argparse +import time + +# Third Party +import numpy as np +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.rollout.arm_base import ArmBase, ArmBaseConfig +from curobo.types.base import TensorDeviceType +from curobo.types.robot import RobotConfig +from curobo.util_file import ( + get_robot_configs_path, + get_robot_list, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) + +torch.backends.cudnn.benchmark = True +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + + +def load_curobo(robot_file, world_file): + # load curobo arm base? 
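+ # ArmBase bundles kinematics, self-collision, and world-collision terms built
+ # from the task config files below; bench_collision_curobo() later calls its
+ # rollout_constraint() to check feasibility for a whole batch of configurations.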
+ + world_cfg = load_yaml(join_path(get_world_configs_path(), world_file)) + + base_config_data = load_yaml(join_path(get_task_configs_path(), "base_cfg.yml")) + graph_config_data = load_yaml(join_path(get_task_configs_path(), "graph.yml")) + # base_config_data["constraint"]["self_collision_cfg"]["weight"] = 0.0 + # if not compute_distance: + # base_config_data["constraint"]["primitive_collision_cfg"]["classify"] = False + robot_config_data = load_yaml(join_path(get_robot_configs_path(), robot_file)) + + arm_base = ArmBaseConfig.from_dict( + robot_config_data["robot_cfg"], + graph_config_data["model"], + base_config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_cfg, + ) + arm_base = ArmBase(arm_base) + return arm_base + + +def bench_collision_curobo(robot_file, world_file, q_test, use_cuda_graph=True): + arm_base = load_curobo(robot_file, world_file) + # load graph module: + tensor_args = TensorDeviceType() + q_test = tensor_args.to_device(q_test).unsqueeze(1) + + tensor_args = TensorDeviceType() + q_warm = q_test + 0.5 + + if not use_cuda_graph: + for _ in range(10): + out = arm_base.rollout_constraint(q_warm) + torch.cuda.synchronize() + + torch.cuda.synchronize() + + st_time = time.time() + out = arm_base.rollout_constraint(q_test) + + torch.cuda.synchronize() + dt = time.time() - st_time + else: + q = q_warm.clone() + + g = torch.cuda.CUDAGraph() + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for i in range(3): + out = arm_base.rollout_constraint(q_warm) + torch.cuda.current_stream().wait_stream(s) + with torch.cuda.graph(g): + out = arm_base.rollout_constraint(q_warm) + + for _ in range(10): + q.copy_(q_warm.detach()) + g.replay() + a = out.feasible + # print(a) + # a = ee_mat.clone() + # q_new = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + + torch.cuda.synchronize() + st_time = time.time() + + q.copy_(q_test.detach().requires_grad_(False)) + g.replay() + a = out.feasible + # print(a) + # a = ee_mat.clone() + torch.cuda.synchronize() + dt = time.time() - st_time + return dt + + +def bench_kin_curobo(robot_file, q_test, use_cuda_graph=True, use_coll_spheres=True): + robot_data = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + if not use_coll_spheres: + robot_data["kinematics"]["collision_spheres"] = None + robot_data["kinematics"]["collision_link_names"] = None + robot_data["kinematics"]["use_global_cumul"] = False + robot_data["kinematics"]["lock_joints"] = {} + tensor_args = TensorDeviceType() + robot_data["kinematics"]["use_global_cumul"] = False + + robot_cfg = RobotConfig.from_dict(robot_data) + robot_model = CudaRobotModel(robot_cfg.kinematics) + + if not use_cuda_graph: + for _ in range(10): + q = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + out = robot_model.forward(q) + torch.cuda.synchronize() + + q = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + torch.cuda.synchronize() + + st_time = time.time() + out = robot_model.forward(q) + torch.cuda.synchronize() + dt = time.time() - st_time + else: + q = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + + g = torch.cuda.CUDAGraph() + q = q.detach() + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for i in range(3): + ee_mat, _, _, _, _, _, _ = robot_model.forward(q=q) + torch.cuda.current_stream().wait_stream(s) + with 
torch.cuda.graph(g): + ee_mat, _, _, _, _, _, _ = robot_model.forward(q=q) + q_new = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + + for _ in range(10): + q.copy_(q_new.detach().requires_grad_(False)) + g.replay() + # a = ee_mat.clone() + q_new = torch.rand((b_size, robot_model.get_dof()), **vars(tensor_args)) + + torch.cuda.synchronize() + st_time = time.time() + + q.copy_(q_new.detach().requires_grad_(False)) + g.replay() + # a = ee_mat.clone() + torch.cuda.synchronize() + dt = time.time() - st_time + return dt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--save_path", + type=str, + default=".", + help="path to save file", + ) + parser.add_argument( + "--file_name", + type=str, + default="kinematics", + help="File name prefix to use to save benchmark results", + ) + + args = parser.parse_args() + b_list = [1, 10, 100, 1000, 10000] + + robot_list = get_robot_list() + + world_file = "collision_test.yml" + + print("running...") + data = {"robot": [], "Kinematics": [], "Collision Checking": [], "Batch Size": []} + for robot_file in robot_list: + arm_sampler = load_curobo(robot_file, world_file) + + # create a sampler with dof: + for b_size in b_list: + # sample test configs: + q_test = arm_sampler.sample_random_actions(b_size).cpu().numpy() + + dt_cu_cg = bench_collision_curobo( + robot_file, + world_file, + q_test, + use_cuda_graph=True, + ) + dt_kin_cg = bench_kin_curobo( + robot_file, q_test, use_cuda_graph=True, use_coll_spheres=True + ) + + data["robot"].append(robot_file) + data["Collision Checking"].append(dt_cu_cg) + data["Kinematics"].append(dt_kin_cg) + data["Batch Size"].append(b_size) + write_yaml(data, join_path(args.save_path, args.file_name + ".yml")) + try: + # Third Party + import pandas as pd + + df = pd.DataFrame(data) + df.to_csv(join_path(args.save_path, args.file_name + ".csv")) + except ImportError: + pass diff --git a/benchmark/metrics.py b/benchmark/metrics.py new file mode 100644 index 00000000..d8ba65ba --- /dev/null +++ b/benchmark/metrics.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from dataclasses import dataclass +from typing import List + +# Third Party +import numpy as np +from robometrics.statistics import Statistic, TrajectoryGroupMetrics, TrajectoryMetrics + + +@dataclass +class CuroboMetrics(TrajectoryMetrics): + time: float = np.inf + + +@dataclass +class CuroboGroupMetrics(TrajectoryGroupMetrics): + time: float = np.inf + + @classmethod + def from_list(cls, group: List[CuroboMetrics]): + unskipped = [m for m in group if not m.skip] + successes = [m for m in unskipped if m.success] + data = super().from_list(group) + data.time = Statistic.from_list([m.time for m in successes]) + return data diff --git a/benchmark/robometrics_benchmark.py b/benchmark/robometrics_benchmark.py new file mode 100644 index 00000000..07381e29 --- /dev/null +++ b/benchmark/robometrics_benchmark.py @@ -0,0 +1,612 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
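The capture/replay code in `bench_collision_curobo` and `bench_kin_curobo` above follows PyTorch's standard CUDA graph recipe: warm up on a side stream, capture once, then copy new inputs into the captured tensors and replay. A self-contained toy version, with a matmul standing in for the kinematics/collision rollout:

```python
import torch

x = torch.randn(512, 512, device="cuda")

# Warm up on a side stream so capture sees initialized kernels and allocations.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        out = (x @ x).sum()
torch.cuda.current_stream().wait_stream(s)

# Capture once; each replay reuses the recorded kernels and memory addresses.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    out = (x @ x).sum()

# New inputs must be copied into the captured tensor, never rebound.
x.copy_(torch.randn(512, 512, device="cuda"))
g.replay()
torch.cuda.synchronize()
print(out.item())
```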
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +import argparse +import time +from copy import deepcopy +from typing import Any, Dict, List, Optional + +# Third Party +import numpy as np +import torch +from geometrout.primitive import Cuboid, Cylinder +from geometrout.transform import SE3 +from metrics import CuroboGroupMetrics, CuroboMetrics +from robometrics.datasets import demo_raw, motion_benchmaker_raw, mpinets_raw +from robometrics.evaluator import Evaluator +from robometrics.robot import CollisionSphereConfig, Robot +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, + write_yaml, +) +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +torch.manual_seed(0) + +torch.backends.cudnn.benchmark = True + +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +np.random.seed(0) + + +def load_curobo( + n_cubes: int, + enable_debug: bool = False, + tsteps: int = 30, + trajopt_seeds: int = 4, + mpinets: bool = False, + graph_mode: bool = True, +): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.0 + robot_cfg["kinematics"]["collision_spheres"] = "spheres/franka_mesh.yml" + robot_cfg["kinematics"]["collision_link_names"].remove("attached_object") + + ik_seeds = 30 # 500 + if graph_mode: + trajopt_seeds = 4 + if trajopt_seeds >= 14: + ik_seeds = max(100, trajopt_seeds * 4) + if mpinets: + robot_cfg["kinematics"]["lock_joints"] = { + "panda_finger_joint1": 0.025, + "panda_finger_joint2": -0.025, + } + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_obb_world() + interpolation_steps = 2000 + if graph_mode: + interpolation_steps = 100 + robot_cfg_instance = RobotConfig.from_dict(robot_cfg, tensor_args=TensorDeviceType()) + + K = robot_cfg_instance.kinematics.kinematics_config.joint_limits + K.position[0, :] -= 0.1 + K.position[1, :] += 0.1 + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg_instance, + world_cfg, + trajopt_tsteps=tsteps, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=True, + collision_cache={"obb": n_cubes}, + position_threshold=0.005, # 0.5 cm + rotation_threshold=0.05, + num_ik_seeds=ik_seeds, + num_graph_seeds=trajopt_seeds, + num_trajopt_seeds=trajopt_seeds, + interpolation_dt=0.025, + interpolation_steps=interpolation_steps, + collision_activation_distance=0.03, + state_finite_difference_mode="CENTRAL", + trajopt_dt=0.25, + minimize_jerk=True, + finetune_dt_scale=1.05, # 1.05, + maximum_trajectory_dt=0.1, + ) + mg = MotionGen(motion_gen_config) + mg.warmup(enable_graph=True, warmup_js_trajopt=False) + 
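+ # warmup() runs the solvers once (building CUDA graphs, since use_cuda_graph is
+ # enabled above) so the first benchmarked problem does not absorb one-time setup
+ # cost; the ee_link edit below touches only the returned dict, not the planner.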
robot_cfg["kinematics"]["ee_link"] = "panda_hand" + return mg, robot_cfg + + +def benchmark_mb( + write_usd=False, + save_log=False, + write_plot=False, + write_benchmark=False, + plot_cost=False, + override_tsteps: Optional[int] = None, + save_kpi=False, + graph_mode=False, + args=None, +): + interpolation_dt = 0.02 + + enable_debug = False + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + urdf = join_path(get_assets_path(), robot_cfg["kinematics"]["urdf_path"]) + spheres = robot_cfg["kinematics"]["collision_spheres"] + + if isinstance(spheres, str): + spheres = load_yaml(join_path(get_robot_configs_path(), spheres))["collision_spheres"] + for s in spheres: + for k in spheres[s]: + k["radius"] = max(0.001, k["radius"] - 0.015) + data = { + "collision_spheres": spheres, + "self_collision_ignore": robot_cfg["kinematics"]["self_collision_ignore"], + "self_collision_buffer": robot_cfg["kinematics"]["self_collision_buffer"], + } + config = CollisionSphereConfig.load_from_dictionary(data) + metrics_robot = Robot(urdf, config) + evaluator = Evaluator(metrics_robot) + all_files = [] + + # modify robot joint limits as some start states in the dataset are at the joint limits: + if True: + for k in evaluator.robot.actuated_joints: + k.limit.lower -= 0.1 + k.limit.upper += 0.1 + + plan_config = MotionGenPlanConfig( + max_attempts=60, + enable_graph_attempt=3, + enable_finetune_trajopt=False, + partial_ik_opt=True, + ) + file_paths = [motion_benchmaker_raw, mpinets_raw] + if args.demo: + file_paths = [demo_raw] + # load dataset: + og_tsteps = 32 + if override_tsteps is not None: + og_tsteps = override_tsteps + + og_trajopt_seeds = 12 + for file_path in file_paths: + all_groups = [] + mpinets_data = False + problems = file_path() + if "dresser_task_oriented" in list(problems.keys()): + mpinets_data = True + for key, v in tqdm(problems.items()): + tsteps = og_tsteps + trajopt_seeds = og_trajopt_seeds + if "cage_panda" in key: + trajopt_seeds = 16 + if "table_under_pick_panda" in key: + tsteps = 44 + trajopt_seeds = 28 + + if "cubby_task_oriented" in key and "merged" not in key: + trajopt_seeds = 16 + + if "dresser_task_oriented" in key: + trajopt_seeds = 16 + + scene_problems = problems[key] # [:4] # [:1] # [:20] # [0:10] + n_cubes = check_problems(scene_problems) + mg, robot_cfg = load_curobo( + n_cubes, enable_debug, tsteps, trajopt_seeds, mpinets_data, graph_mode + ) + m_list = [] + i = 0 + ik_fail = 0 + for problem in tqdm(scene_problems, leave=False): + i += 1 + if problem["collision_buffer_ik"] < 0.0: + continue + + plan_config = MotionGenPlanConfig( + max_attempts=100, # 00, # 00, # 100, # 00, # 000,#,00,#00, # 5000, + enable_graph_attempt=3, + enable_finetune_trajopt=True, + partial_ik_opt=False, + enable_graph=graph_mode, + timeout=60, + enable_opt=not graph_mode, + ) + q_start = problem["start"] + pose = ( + problem["goal_pose"]["position_xyz"] + problem["goal_pose"]["quaternion_wxyz"] + ) + + problem_name = "d_" + key + "_" + str(i) + + # reset planner + mg.reset(reset_seed=False) + world = WorldConfig.from_dict(deepcopy(problem["obstacles"])).get_obb_world() + # world.save_world_as_mesh(problem_name + ".stl") + mg.world_coll_checker.clear_cache() + mg.update_world(world) + # continue + # load obstacles + + # run planner + start_state = JointState.from_position(mg.tensor_args.to_device([q_start])) + result = mg.plan_single( + start_state, + Pose.from_list(pose), + plan_config, + ) + if result.status == "IK Fail": + ik_fail += 1 + if 
result.success.item(): + eval_obs = get_metrics_obstacles(problem["obstacles"]) + + q_traj = result.get_interpolated_plan() + ee_trajectory = mg.compute_kinematics(q_traj) + + q_metrics = q_traj.position.cpu().numpy() + ee_m = ee_trajectory.ee_pose + ee_pos = ee_m.position.cpu().numpy() + ee_q = ee_m.quaternion.cpu().numpy() + se3_list = [ + SE3(np.ravel(ee_pos[p]), np.ravel(ee_q[p])) + for p in range(ee_m.position.shape[0]) + ] + # add gripper position: + q_met = np.zeros((q_metrics.shape[0], 8)) + q_met[:, :7] = q_metrics + q_met[:, 7] = 0.04 + if mpinets_data: + q_met[:, 7] = 0.025 + st_time = time.time() + + current_metrics = evaluator.evaluate_trajectory( + q_met, + se3_list, + SE3(np.ravel(pose[:3]), np.ravel(pose[3:])), + eval_obs, + result.total_time, + ) + # if not current_metrics.success: + # print(current_metrics) + # write_usd = True + # else: + # write_usd = False + # rint(plan_config.enable_graph, plan_config.enable_graph_attempt) + problem["solution"] = None + if plan_config.enable_finetune_trajopt: + problem_name = key + "_" + str(i) + else: + problem_name = "noft_" + key + "_" + str(i) + problem_name = "nosw_" + problem_name + if write_usd or save_log: + # CuRobo + from curobo.util.usd_helper import UsdHelper + + world.randomize_color(r=[0.5, 0.9], g=[0.2, 0.5], b=[0.0, 0.2]) + + gripper_mesh = Mesh( + name="target_gripper", + file_path=join_path( + get_assets_path(), + "robot/franka_description/meshes/visual/hand_ee_link.dae", + ), + color=[0.0, 0.8, 0.1, 1.0], + pose=pose, + ) + world.add_obstacle(gripper_mesh) + # get costs: + if plot_cost: + dt = 0.5 + problem_name = "approx_wolfe_p" + problem_name + if result.optimized_dt is not None: + dt = result.optimized_dt.item() + if "trajopt_result" in result.debug_info: + success = result.success.item() + traj_cost = ( + # result.debug_info["trajopt_result"].debug_info["solver"]["cost"][0] + result.debug_info["trajopt_result"].debug_info["solver"]["cost"][-1] + ) + # print(traj_cost[0]) + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_cost"), + log_scale=False, + ) + if "finetune_trajopt_result" in result.debug_info: + traj_cost = result.debug_info["finetune_trajopt_result"].debug_info[ + "solver" + ]["cost"][0] + traj_cost = torch.cat(traj_cost, dim=-1) + plot_cost_iteration( + traj_cost, + title=problem_name + "_" + str(success) + "_" + str(dt), + save_path=join_path("log/plot/", problem_name + "_ft_cost"), + log_scale=False, + ) + if result.success.item() and current_metrics.success: + q_traj = result.get_interpolated_plan() + problem["goal_ik"] = q_traj.position.cpu().squeeze().numpy()[-1, :].tolist() + problem["solution"] = { + "position": result.get_interpolated_plan() + .position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.get_interpolated_plan() + .velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.get_interpolated_plan() + .acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.get_interpolated_plan() + .jerk.cpu() + .squeeze() + .numpy() + .tolist(), + "dt": interpolation_dt, + } + # print(problem["solution"]["position"]) + # exit() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "opt_traj": { + "position": 
result.optimized_plan.position.cpu() + .squeeze() + .numpy() + .tolist(), + "velocity": result.optimized_plan.velocity.cpu() + .squeeze() + .numpy() + .tolist(), + "acceleration": result.optimized_plan.acceleration.cpu() + .squeeze() + .numpy() + .tolist(), + "jerk": result.optimized_plan.jerk.cpu().squeeze().numpy().tolist(), + "dt": result.optimized_dt.item(), + }, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + # print( + # "T: ", + # result.motion_time.item(), + # result.optimized_dt.item(), + # (len(problem["solution"]["position"]) - 1 ) * result.interpolation_dt, + # result.interpolation_dt, + # ) + # exit() + current_metrics = CuroboMetrics( + skip=False, + success=True, + time=result.total_time, + collision=False, + joint_limit_violation=False, + self_collision=False, + position_error=result.position_error.item() * 100.0, + orientation_error=result.rotation_error.item() * 100.0, + eef_position_path_length=10, + eef_orientation_path_length=10, + attempts=result.attempts, + motion_time=result.motion_time.item(), + solve_time=result.solve_time, + ) + + if write_usd: + # CuRobo + + q_traj = result.get_interpolated_plan() + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_cfg, + world, + start_state, + q_traj, + dt=result.interpolation_dt, + save_path=join_path("log/usd/", problem_name) + ".usd", + interpolation_steps=1, + write_robot_usd_path="log/usd/assets/", + robot_usd_local_reference="assets/", + base_frame="/world_" + problem_name, + visualize_robot_spheres=True, + ) + + if write_plot: + problem_name = problem_name + plot_traj( + result.optimized_plan, + result.optimized_dt.item(), + # result.get_interpolated_plan(), + # result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + ".pdf"), + ) + plot_traj( + # result.optimized_plan, + # result.optimized_dt.item(), + result.get_interpolated_plan(), + result.interpolation_dt, + title=problem_name, + save_path=join_path("log/plot/", problem_name + "_int.pdf"), + ) + # exit() + + m_list.append(current_metrics) + all_groups.append(current_metrics) + elif result.valid_query: + # print("fail") + # print(result.status) + current_metrics = CuroboMetrics() + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + + m_list.append(current_metrics) + all_groups.append(current_metrics) + else: + # print("invalid: " + problem_name) + debug = { + "used_graph": result.used_graph, + "attempts": result.attempts, + "ik_time": result.ik_time, + "graph_time": result.graph_time, + "trajopt_time": result.trajopt_time, + "total_time": result.total_time, + "solve_time": result.solve_time, + "status": result.status, + "valid_query": result.valid_query, + } + problem["solution_debug"] = debug + if save_log: # and not result.success.item(): + # print("save log") + UsdHelper.write_motion_gen_log( + result, + robot_cfg, + world, + start_state, + Pose.from_list(pose), + join_path("log/usd/", problem_name) + "_debug", + write_ik=False, + write_trajopt=True, + visualize_robot_spheres=False, + grid_space=2, + ) + # exit() + + g_m = CuroboGroupMetrics.from_list(m_list) + print( + key, + f"{g_m.success:2.2f}", + # g_m.motion_time, + g_m.time.mean, + # g_m.time.percent_75, + g_m.time.percent_98, 
+            g_m.position_error.percent_98,
+            # g_m.position_error.median,
+            g_m.orientation_error.percent_98,
+            # g_m.orientation_error.median,
+        )  # , g_m.attempts)
+        print(g_m.attempts)
+
+    g_m = CuroboGroupMetrics.from_list(all_groups)
+    print(
+        "All: ",
+        f"{g_m.success:2.2f}",
+        g_m.motion_time.percent_98,
+        g_m.time.mean,
+        g_m.time.percent_75,
+        g_m.position_error.percent_75,
+        g_m.orientation_error.percent_75,
+    )  # g_m.time, g_m.attempts)
+
+    if write_benchmark:
+        if not mpinets_data:
+            write_yaml(problems, "robometrics_mb_curobo_solution.yaml")
+        else:
+            write_yaml(problems, "robometrics_mpinets_curobo_solution.yaml")
+    all_files += all_groups
+    g_m = CuroboGroupMetrics.from_list(all_files)
+    print("######## FULL SET ############")
+    print("All: ", f"{g_m.success:2.2f}")
+    print("MT: ", g_m.motion_time)
+    print("PT:", g_m.time)
+    print("ST: ", g_m.solve_time)
+    print("position accuracy: ", g_m.position_error)
+    print("orientation accuracy: ", g_m.orientation_error)
+
+    if args.kpi:
+        kpi_data = {"Success": g_m.success, "Planning Time": float(g_m.time.mean)}
+        write_yaml(kpi_data, join_path(args.save_path, args.file_name + ".yml"))
+
+
+def get_metrics_obstacles(obs: Dict[str, List[Any]]):
+    obs_list = []
+    if "cylinder" in obs and len(obs["cylinder"].items()) > 0:
+        for _, vi in enumerate(obs["cylinder"].values()):
+            obs_list.append(
+                Cylinder(
+                    np.ravel(vi["pose"][:3]), vi["radius"], vi["height"], np.ravel(vi["pose"][3:])
+                )
+            )
+    if "cuboid" in obs and len(obs["cuboid"].items()) > 0:
+        for _, vi in enumerate(obs["cuboid"].values()):
+            obs_list.append(
+                Cuboid(np.ravel(vi["pose"][:3]), np.ravel(vi["dims"]), np.ravel(vi["pose"][3:]))
+            )
+    return obs_list
+
+
+def check_problems(all_problems):
+    n_cube = 0
+    for problem in all_problems:
+        cache = WorldConfig.from_dict(problem["obstacles"]).get_obb_world().get_cache_dict()
+        n_cube = max(n_cube, cache["obb"])
+    return n_cube
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default=".",
+        help="Path to folder where benchmark results are saved",
+    )
+    parser.add_argument(
+        "--file_name",
+        type=str,
+        default="mg",
+        help="File name prefix to use to save benchmark results",
+    )
+    parser.add_argument(
+        "--graph",
+        action="store_true",
+        help="When True, runs only the geometric planner",
+        default=False,
+    )
+    parser.add_argument(
+        "--kpi",
+        action="store_true",
+        help="When True, saves minimal metrics",
+        default=False,
+    )
+
+    parser.add_argument(
+        "--demo",
+        action="store_true",
+        help="When True, runs only on a small dataset",
+        default=False,
+    )
+
+    args = parser.parse_args()
+    setup_curobo_logger("error")
+    benchmark_mb(args=args, save_kpi=args.kpi)
diff --git a/docker/.dockerignore b/docker/.dockerignore
new file mode 100644
index 00000000..8bf02f5c
--- /dev/null
+++ b/docker/.dockerignore
@@ -0,0 +1,3 @@
+*.git
+*_build
+*build
\ No newline at end of file
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 00000000..413413cd
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,36 @@
+
+# Docker Instructions
+
+## Running docker from NGC (Recommended)
+1. `sh build_user_docker.sh $UID`
+2. `sh start_docker_x86.sh` starts the container

## Building your own docker image with CuRobo
+1. Set the default NVIDIA runtime so CUDA kernels can be compiled during the docker build. Edit or create `/etc/docker/daemon.json` and add the `"default-runtime": "nvidia"` entry (the `runtimes` block will typically already exist):
+   ```
+   {
+     "runtimes": {
+       "nvidia": {
+         "path": "/usr/bin/nvidia-container-runtime",
+         "runtimeArgs": []
+       }
+     },
+     "default-runtime": "nvidia"
+   }
+   ```
+2. `sh pull_repos.sh`
+3. `bash build_dev_docker.sh`
+4. Change the docker image name in `user.dockerfile` if you used a tag other than `curobo_docker:latest` in step 3
+5. `sh build_user_docker.sh`
+6. `sh start_docker_x86.sh` starts the container
+
diff --git a/docker/arm64.dockerfile b/docker/arm64.dockerfile
new file mode 100644
index 00000000..e99666de
--- /dev/null
+++ b/docker/arm64.dockerfile
@@ -0,0 +1,181 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+FROM nvcr.io/nvidia/l4t-pytorch:r35.1.0-pth1.13-py3 AS l4t_pytorch

+# Install base packages:
+RUN apt-get update &&\
+    apt-get install -y sudo git bash unattended-upgrades glmark2 &&\
+    rm -rf /var/lib/apt/lists/*
+
+
+# Deal with getting tons of debconf messages
+# See: https://github.com/phusion/baseimage-docker/issues/58
+RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
+
+# TODO: Don't hardcode the timezone to Los_Angeles; pull it from the host instead
+# Set timezone info
+RUN apt-get update && apt-get install -y \
+    tzdata \
+    && rm -rf /var/lib/apt/lists/* \
+    && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \
+    && echo "America/Los_Angeles" > /etc/timezone \
+    && dpkg-reconfigure -f noninteractive tzdata
+
+# Install apt-get packages necessary for building, downloading, etc.
+# NOTE: Dockerfile best practices recommend having apt-get update
+# and install commands in one line to avoid apt-get caching issues.
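+# For example, a cache-safe layer combines them (sketch):
+#   RUN apt-get update && apt-get install -y <package> && rm -rf /var/lib/apt/lists/*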
+# https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#run +RUN apt-get update && apt-get install -y \ + curl \ + lsb-core \ + software-properties-common \ + wget \ + && rm -rf /var/lib/apt/lists/* + +RUN add-apt-repository -y ppa:git-core/ppa + +RUN apt-get update && apt-get install -y \ + build-essential \ + cmake \ + git \ + git-lfs \ + iputils-ping \ + make \ + openssh-server \ + openssh-client \ + libeigen3-dev \ + libssl-dev \ + python3-pip \ + python3-ipdb \ + python3-tk \ + python3-wstool \ + sudo git bash unattended-upgrades \ + apt-utils \ + terminator \ + && rm -rf /var/lib/apt/lists/* + +ARG ROS_PKG=ros_base # desktop does not work +ENV ROS_DISTRO=noetic +ENV ROS_ROOT=/opt/ros/${ROS_DISTRO} +ENV ROS_PYTHON_VERSION=3 + +ENV DEBIAN_FRONTEND=noninteractive + +WORKDIR /workspace + + +# +# add the ROS deb repo to the apt sources list +# +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + build-essential \ + curl \ + wget \ + gnupg2 \ + lsb-release \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +RUN sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list' +RUN curl -s https://raw.githubusercontent.com/ros/rosdistro/master/ros.asc | apt-key add - + + +# +# install bootstrap dependencies +# +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libpython3-dev \ + python3-rosdep \ + python3-rosinstall-generator \ + python3-vcstool \ + build-essential && \ + rosdep init && \ + rosdep update && \ + rm -rf /var/lib/apt/lists/* + + +# +# download/build the ROS source +# +RUN mkdir ros_catkin_ws && \ + cd ros_catkin_ws && \ + rosinstall_generator ${ROS_PKG} vision_msgs --rosdistro ${ROS_DISTRO} --deps --tar > ${ROS_DISTRO}-${ROS_PKG}.rosinstall && \ + mkdir src && \ + vcs import --input ${ROS_DISTRO}-${ROS_PKG}.rosinstall ./src && \ + apt-get update && \ + rosdep install --from-paths ./src --ignore-packages-from-source --rosdistro ${ROS_DISTRO} --skip-keys python3-pykdl -y && \ + python3 ./src/catkin/bin/catkin_make_isolated --install --install-space ${ROS_ROOT} -DCMAKE_BUILD_TYPE=Release && \ + rm -rf /var/lib/apt/lists/* + + +RUN pip3 install trimesh \ + numpy-quaternion \ + networkx \ + pyyaml \ + rospkg \ + rosdep \ + empy +# copy pkgs directory: +COPY pkgs /pkgs + + +# install warp: +RUN cd /pkgs/warp && python3 build_lib.py && pip3 install . + +# install curobolib-extras/warp_torch: +RUN cd /pkgs/warp_torch && pip3 install . + +# install nvblox: +RUN apt-get update && \ + apt-get install -y libgoogle-glog-dev libgtest-dev curl libsqlite3-dev && \ + cd /usr/src/googletest && cmake . && cmake --build . --target install && \ + rm -rf /var/lib/apt/lists/* +RUN cd /pkgs/nvblox/nvblox && mkdir build && cd build && cmake .. && \ + make -j8 && make install + +# install curobolib-extras/nvblox_torch: +RUN cd /pkgs/nvblox_torch && sh install.sh + +# install curobolib and curobo: +RUN cd /pkgs/curobolib && pip3 install . + +RUN cd /pkgs/curobo && pip3 install . 
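+
+# Optional sanity check (hypothetical, not part of the original build): uncomment
+# to verify that curobo imports correctly inside the image.
+# RUN python3 -c "import curobo; print(curobo.__file__)"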
+ +# sudo apt-get install lcov libbullet-extras-dev python3-catkin-tools swig ros-noetic-ifopt libyaml-cpp-dev libjsoncpp-dev +# pip3 install tqdm +# sudo apt-get install liborocos-kdl-dev ros-noetic-fcl libpcl-dev libompl-dev libnlopt-cxx-dev liburdf-dev libkdlparser-dev +# clone taskflow and compile + + +# libfranka: +RUN apt-get update && apt-get install -y build-essential cmake git libpoco-dev libeigen3-dev && cd /pkgs && git clone --recursive https://github.com/frankaemika/libfranka && cd libfranka && git checkout 0.7.1 && git submodule update && mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF .. && \ + cmake --build . && cpack -G DEB && dpkg -i libfranka*.deb && rm -rf /var/lib/apt/lists/* +# franka_ros and franka_motion_control: +RUN mkdir -p /curobo_ws/src && mv /pkgs/curobo_ros /curobo_ws/src/ && mv /pkgs/franka_motion_control /curobo_ws/src/ + +RUN cd /curobo_ws/src && git clone --branch 0.7.1 https://github.com/frankaemika/franka_ros.git && \ + git clone https://github.com/justagist/franka_panda_description.git + + +RUN apt-get update && apt-get install -y ros-noetic-robot-state-publisher ros-noetic-rviz \ + ros-noetic-moveit-visualization ros-noetic-moveit-msgs \ + ros-noetic-controller-interface ros-noetic-combined-robot-hw ros-noetic-joint-limits-interface \ + ros-noetic-control-msgs ros-noetic-controller-manager ros-noetic-realtime-tools ros-noetic-eigen-conversions ros-noetic-tf-conversions ros-noetic-moveit-ros-visualization && rm -rf /var/lib/apt/lists/* + +RUN cd /curobo_ws && \ + /bin/bash -c "source /opt/ros/noetic/setup.bash && catkin_make -DCMAKE_BUILD_TYPE=Release" + +#RUN apt-get update && apt-get install -y ros-noetic-robot-state-publisher ros-noetic-rviz ros-noetic-moveit-msgs\ +# ros-noetic-moveit-ros ros-noetic-controller-interface ros-noetic-combined-robot-hw ros-noetic-joint-limits-interface ros-noetic-control-msgs && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/base.dockerfile b/docker/base.dockerfile new file mode 100644 index 00000000..a177f75f --- /dev/null +++ b/docker/base.dockerfile @@ -0,0 +1,30 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +#@FROM nvcr.io/nvidia/pytorch:22.12-py3 +FROM nvcr.io/nvidia/pytorch:23.08-py3 AS torch_cuda_base + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN apt-get update && apt-get install -y \ + tzdata \ + && rm -rf /var/lib/apt/lists/* \ + && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \ + && echo "America/Los_Angeles" > /etc/timezone \ + && dpkg-reconfigure -f noninteractive tzdata + +RUN apt-get update &&\ + apt-get install -y sudo git bash software-properties-common graphviz &&\ + rm -rf /var/lib/apt/lists/* + + + +RUN python -m pip install --upgrade pip && python3 -m pip install graphviz diff --git a/docker/build_dev_docker.sh b/docker/build_dev_docker.sh new file mode 100644 index 00000000..66daa8e8 --- /dev/null +++ b/docker/build_dev_docker.sh @@ -0,0 +1,46 @@ +#!/bin/bash +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+
+
+# This script builds a dev Docker image. Run it with `bash build_dev_docker.sh`.
+
+# Make sure you have pulled all required repos into the pkgs folder (see pull_repos.sh).
+
+# Check which architecture to build for:
+arch=`uname -m`
+
+if [ ${arch} == "x86_64" ]; then
+    echo "Building for X86 Architecture"
+    dockerfile="x86.dockerfile"
+elif [ ${arch} == "aarch64" ]; then
+    echo "Building for ARM Architecture"
+    dockerfile="arm64.dockerfile"
+else
+    echo "Unknown Architecture ${arch}, defaulting to X86"
+    dockerfile="x86.dockerfile"
+fi
+
+# build docker file:
+# Make sure you enable the nvidia runtime by editing/creating /etc/docker/daemon.json
+# with the following content (add the "default-runtime" line; the other lines will
+# already exist in your json file):
+# {
+#     "runtimes": {
+#         "nvidia": {
+#             "path": "/usr/bin/nvidia-container-runtime",
+#             "runtimeArgs": []
+#         }
+#     },
+#     "default-runtime": "nvidia"
+# }
+#
echo "${dockerfile}"
+docker build -t curobo_docker:latest -f ${dockerfile} .
diff --git a/docker/build_user_docker.sh b/docker/build_user_docker.sh
new file mode 100644
index 00000000..f3cc2367
--- /dev/null
+++ b/docker/build_user_docker.sh
@@ -0,0 +1,11 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+docker build --build-arg USERNAME=$USER --no-cache --build-arg USER_ID=$1 --tag curobo_user_docker:latest -f user.dockerfile .
\ No newline at end of file
diff --git a/docker/ros1_x86.dockerfile b/docker/ros1_x86.dockerfile
new file mode 100644
index 00000000..c4e00311
--- /dev/null
+++ b/docker/ros1_x86.dockerfile
@@ -0,0 +1,69 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+## +FROM nvcr.io/nvidia/pytorch:23.02-py3 + + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN apt-get update && apt-get install -y \ + tzdata \ + && rm -rf /var/lib/apt/lists/* \ + && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \ + && echo "America/Los_Angeles" > /etc/timezone \ + && dpkg-reconfigure -f noninteractive tzdata + +RUN apt-get update &&\ + apt-get install -y sudo git bash software-properties-common graphviz &&\ + rm -rf /var/lib/apt/lists/* + + + +RUN python -m pip install --upgrade pip && python3 -m pip install graphviz + +# Install ROS noetic +RUN sh -c 'echo "deb http://packages.ros.org/ros/ubuntu focal main" > /etc/apt/sources.list.d/ros-latest.list' \ +&& apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 \ +&& apt-get update && apt-get install -y \ + ros-noetic-desktop-full git build-essential python3-rosdep \ + && rm -rf /var/lib/apt/lists/* + + +# install realsense and azure kinect +# Install the RealSense library (https://github.com/IntelRealSense/librealsense/blob/master/doc/distribution_linux.md#installing-the-packages) +#RUN sudo apt-key adv --keyserver keys.gnupg.net --recv-key F6E65AC044F831AC80A06380C8B3A55A6F3EFCDE || sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key F6E65AC044F831AC80A06380C8B3A55A6F3EFCDE +#RUN sudo add-apt-repository "deb https://librealsense.intel.com/Debian/apt-repo $(lsb_release -cs) main" -u +#RUN apt-get update && apt-get install -y \ +# librealsense2-dkms \ +# software-properties-common \ +# librealsense2-utils \ +# && rm -rf /var/lib/apt/lists/* + + +# install moveit from source for all algos: +ARG ROS_DISTRO=noetic +RUN apt-get update && apt-get install -y \ + ros-$ROS_DISTRO-apriltag-ros \ + ros-$ROS_DISTRO-realsense2-camera \ + ros-$ROS_DISTRO-ros-numpy \ + ros-$ROS_DISTRO-vision-msgs \ + ros-$ROS_DISTRO-franka-ros \ + ros-$ROS_DISTRO-moveit-resources \ + ros-$ROS_DISTRO-rosparam-shortcuts \ + libglfw3-dev \ + ros-$ROS_DISTRO-collada-urdf \ + ros-$ROS_DISTRO-ur-msgs \ + swig \ + && rm -rf /var/lib/apt/lists/* + + +RUN apt-get update && rosdep init && rosdep update && apt-get install -y ros-noetic-moveit-ros-visualization && rm -rf /var/lib/apt/lists/* +RUN pip3 install netifaces + diff --git a/docker/start_docker_arm64.sh b/docker/start_docker_arm64.sh new file mode 100644 index 00000000..96461aee --- /dev/null +++ b/docker/start_docker_arm64.sh @@ -0,0 +1,22 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## +docker run --rm -it \ +--runtime nvidia \ +--mount type=bind,src=/home/$USER/code,target=/home/$USER/code \ +--hostname ros1-docker \ +--add-host ros1-docker:127.0.0.1 \ +--network host \ +--gpus all \ +--env ROS_HOSTNAME=localhost \ +--env DISPLAY=$DISPLAY \ +--volume /tmp/.X11-unix:/tmp/.X11-unix \ +--volume /dev/input:/dev/input \ +curobo_user_docker:latest diff --git a/docker/start_docker_x86.sh b/docker/start_docker_x86.sh new file mode 100644 index 00000000..3400e97f --- /dev/null +++ b/docker/start_docker_x86.sh @@ -0,0 +1,22 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +docker run --rm -it \ +--privileged --mount type=bind,src=/home/$USER/code,target=/home/$USER/code \ +-e NVIDIA_DISABLE_REQUIRE=1 \ +-e NVIDIA_DRIVER_CAPABILITIES=all --device /dev/dri \ +--hostname ros1-docker \ +--add-host ros1-docker:127.0.0.1 \ +--gpus all \ +--network host \ +--env DISPLAY=unix$DISPLAY \ +--volume /tmp/.X11-unix:/tmp/.X11-unix \ +--volume /dev:/dev \ +curobo_user_docker:latest diff --git a/docker/start_docker_x86_robot.sh b/docker/start_docker_x86_robot.sh new file mode 100644 index 00000000..a4d7cbfc --- /dev/null +++ b/docker/start_docker_x86_robot.sh @@ -0,0 +1,24 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +docker run --rm -it \ +--privileged --mount type=bind,src=/home/$USER/code,target=/home/$USER/code \ +-e NVIDIA_DISABLE_REQUIRE=1 \ +-e NVIDIA_DRIVER_CAPABILITIES=all --device /dev/dri \ +--hostname ros1-docker \ +--add-host ros1-docker:127.0.0.1 \ +--gpus all \ +--network host \ +--env ROS_MASTER_URI=http://127.0.0.1:11311 \ +--env ROS_IP=127.0.0.1 \ +--env DISPLAY=unix$DISPLAY \ +--volume /tmp/.X11-unix:/tmp/.X11-unix \ +--volume /dev/input:/dev/input \ +curobo_user_docker:latest \ No newline at end of file diff --git a/docker/user.dockerfile b/docker/user.dockerfile new file mode 100644 index 00000000..6ab23a1a --- /dev/null +++ b/docker/user.dockerfile @@ -0,0 +1,34 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## + +# Check architecture and load: +FROM curobo_docker:latest +# Set variables +ARG USERNAME +ARG USER_ID + +# Set environment variables + +# Set up sudo user +#RUN /sbin/adduser --disabled-password --gecos '' --uid $USER_ID $USERNAME +RUN useradd -l -u $USER_ID -g users $USERNAME + +RUN /sbin/adduser $USERNAME sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Set user +USER $USERNAME +WORKDIR /home/$USERNAME +ENV USER=$USERNAME +ENV PATH="${PATH}:/home/${USER}/.local/bin" + +RUN echo 'completed' + diff --git a/docker/x86.dockerfile b/docker/x86.dockerfile new file mode 100644 index 00000000..c35289db --- /dev/null +++ b/docker/x86.dockerfile @@ -0,0 +1,93 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +FROM nvcr.io/nvidia/pytorch:23.08-py3 AS torch_cuda_base + +LABEL maintainer "User Name" + +# add GL: +RUN apt-get update && apt-get install -y --no-install-recommends \ + pkg-config \ + libglvnd-dev \ + libgl1-mesa-dev \ + libegl1-mesa-dev \ + libgles2-mesa-dev && \ + rm -rf /var/lib/apt/lists/* + +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES graphics,utility,compute + + +RUN apt-get update &&\ + apt-get install -y sudo git bash unattended-upgrades glmark2 &&\ + rm -rf /var/lib/apt/lists/* + + +# Deal with getting tons of debconf messages +# See: https://github.com/phusion/baseimage-docker/issues/58 +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +# Set timezone info +RUN apt-get update && apt-get install -y \ + tzdata \ + software-properties-common \ + && rm -rf /var/lib/apt/lists/* \ + && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \ + && echo "America/Los_Angeles" > /etc/timezone \ + && dpkg-reconfigure -f noninteractive tzdata \ + && add-apt-repository -y ppa:git-core/ppa + +RUN apt-get update && apt-get install -y \ + curl \ + lsb-core \ + wget \ + build-essential \ + cmake \ + git \ + git-lfs \ + iputils-ping \ + make \ + openssh-server \ + openssh-client \ + libeigen3-dev \ + libssl-dev \ + python3-pip \ + python3-ipdb \ + python3-tk \ + python3-wstool \ + sudo git bash unattended-upgrades \ + apt-utils \ + terminator \ + && rm -rf /var/lib/apt/lists/* + +# push defaults to bashrc: +RUN apt-get update && apt-get install --reinstall -y \ + libmpich-dev \ + hwloc-nox libmpich12 mpich \ + && rm -rf /var/lib/apt/lists/* + +# This is required to enable mpi lib access: +ENV PATH="${PATH}:/opt/hpcx/ompi/bin" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/hpcx/ompi/lib" + + + +ENV TORCH_CUDA_ARCH_LIST "7.0+PTX" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + + +# copy pkgs directory: clone curobo into docker/pkgs folder. 
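+# Assumed layout (see pull_repos.sh): docker/pkgs/curobo/ holds a clone of this repository.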
+COPY pkgs /pkgs
+
+RUN pip install "robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git"
+
+
+RUN cd /pkgs/curobo && pip3 install .[dev,usd] --no-build-isolation
+
diff --git a/examples/collision_check_example.py b/examples/collision_check_example.py
new file mode 100644
index 00000000..05bac4bc
--- /dev/null
+++ b/examples/collision_check_example.py
@@ -0,0 +1,57 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+"""Example computing collision distances using CuRobo."""
+# Third Party
+import torch
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig
+
+if __name__ == "__main__":
+    robot_file = "franka.yml"
+    world_file = "collision_test.yml"
+    tensor_args = TensorDeviceType()
+    # Alternative: pass pose weights explicitly, e.g.
+    # config = RobotWorldConfig.load_from_config(robot_file, world_file,
+    #     pose_weight=[10, 200, 1, 10], collision_activation_distance=0.0)
+    # An equivalent world described inline (kept for reference; world_file is used below):
+    world_config = {
+        "cuboid": {
+            "table": {"dims": [2, 2, 0.2], "pose": [0.4, 0.0, 0.3, 1, 0, 0, 0]},
+            "cube_1": {"dims": [0.1, 0.1, 0.2], "pose": [0.4, 0.0, 0.5, 1, 0, 0, 0]},
+        },
+        "mesh": {
+            "scene": {
+                "pose": [1.5, 0.080, 1.6, 0.043, -0.471, 0.284, 0.834],
+                "file_path": "scene/nvblox/srl_ur10_bins.obj",
+            }
+        },
+    }
+    config = RobotWorldConfig.load_from_config(
+        robot_file, world_file, collision_activation_distance=0.0
+    )
+    curobo_fn = RobotWorld(config)
+
+    # query collision distance for 10 random spheres of radius 0.2:
+    q_sph = torch.randn((10, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype)
+    q_sph[..., 3] = 0.2
+    d = curobo_fn.get_collision_distance(q_sph)
+    print(d)
+
+    # sample 5 joint configurations and report world and self collision distances:
+    q_s = curobo_fn.sample(5, mask_valid=False)
+
+    d_world, d_self = curobo_fn.get_world_self_collision_distance_from_joints(q_s)
+    print("Collision Distance:")
+    print("World:", d_world)
+    print("Self:", d_self)
diff --git a/examples/ik_example.py b/examples/ik_example.py
new file mode 100644
index 00000000..7798a6f0
--- /dev/null
+++ b/examples/ik_example.py
@@ -0,0 +1,234 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+import time
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.geom.types import WorldConfig
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
+from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig
+
+torch.backends.cudnn.benchmark = True
+
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+
+def demo_basic_ik():
+    tensor_args = TensorDeviceType()
+
+    config_file = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))
+    urdf_file = config_file["robot_cfg"]["kinematics"][
+        "urdf_path"
+    ]  # Send global path starting with "/"
+    base_link = config_file["robot_cfg"]["kinematics"]["base_link"]
+    ee_link = config_file["robot_cfg"]["kinematics"]["ee_link"]
+    robot_cfg = RobotConfig.from_basic(urdf_file, base_link, ee_link, tensor_args)
+
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        None,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=20,
+        self_collision_check=False,
+        self_collision_opt=False,
+        tensor_args=tensor_args,
+        use_cuda_graph=False,
+    )
+    ik_solver = IKSolver(ik_config)
+
+    for _ in range(10):
+        q_sample = ik_solver.sample_configs(5000)
+        kin_state = ik_solver.fk(q_sample)
+        goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+
+        st_time = time.time()
+        result = ik_solver.solve_batch(goal)
+        torch.cuda.synchronize()
+        print(
+            "Success, Solve Time(s), hz, Position Error(x100), Rotation Error(x100)",
+            torch.count_nonzero(result.success).item() / len(q_sample),
+            result.solve_time,
+            q_sample.shape[0] / (time.time() - st_time),
+            torch.mean(result.position_error) * 100.0,
+            torch.mean(result.rotation_error) * 100.0,
+        )
+
+
+def demo_full_config_collision_free_ik():
+    tensor_args = TensorDeviceType()
+    world_file = "collision_cage.yml"
+
+    robot_file = "franka.yml"
+    robot_cfg = RobotConfig.from_dict(
+        load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+    )
+    world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file)))
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_cfg,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=20,
+        self_collision_check=True,
+        self_collision_opt=True,
+        tensor_args=tensor_args,
+        use_cuda_graph=True,
+        # use_fixed_samples=True,
+    )
+    ik_solver = IKSolver(ik_config)
+
+    print("Running Single IK")
+    for _ in range(10):
+        q_sample = ik_solver.sample_configs(5000)
+        kin_state = ik_solver.fk(q_sample)
+        goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+
+        st_time = time.time()
+        result = ik_solver.solve_batch(goal)
+        torch.cuda.synchronize()
+        total_time = (time.time() - st_time) / q_sample.shape[0]
+        print(
+            "Success, Solve Time(s), Per-Query Time(s), hz, Pos. Error(x100), Rot. Error(x100)",
+            torch.count_nonzero(result.success).item() / len(q_sample),
+            result.solve_time,
+            total_time,
+            1.0 / total_time,
+            torch.mean(result.position_error) * 100.0,
+            torch.mean(result.rotation_error) * 100.0,
+        )
+
+    print("Running Batch IK (10 goals)")
+    q_sample = ik_solver.sample_configs(10)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+
+    for _ in range(3):
+        st_time = time.time()
+        result = ik_solver.solve_batch(goal)
+        torch.cuda.synchronize()
+        print(
+            "Success, Solve Time(s), Total Time(s)",
torch.count_nonzero(result.success).item() / len(q_sample), + result.solve_time, + time.time() - st_time, + ) + + print("Running Goalset IK (10 goals in 1 set)") + q_sample = ik_solver.sample_configs(10) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position.unsqueeze(0), kin_state.ee_quaternion.unsqueeze(0)) + + for _ in range(3): + st_time = time.time() + result = ik_solver.solve_goalset(goal) + torch.cuda.synchronize() + print( + "Success, Solve Time(s), Total Time(s)", + torch.count_nonzero(result.success).item() / len(result.success), + result.solve_time, + time.time() - st_time, + ) + + print("Running Batch Goalset IK (10 goals in 10 sets)") + q_sample = ik_solver.sample_configs(100) + kin_state = ik_solver.fk(q_sample) + goal = Pose( + kin_state.ee_position.view(10, 10, 3).contiguous(), + kin_state.ee_quaternion.view(10, 10, 4).contiguous(), + ) + + for _ in range(3): + st_time = time.time() + result = ik_solver.solve_batch_goalset(goal) + torch.cuda.synchronize() + print( + "Success, Solve Time(s), Total Time(s)", + torch.count_nonzero(result.success).item() / len(result.success.view(-1)), + result.solve_time, + time.time() - st_time, + ) + + +def demo_full_config_batch_env_collision_free_ik(): + tensor_args = TensorDeviceType() + world_file = ["collision_test.yml", "collision_cubby.yml"] + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), x))) for x in world_file + ] + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + # use_fixed_samples=True, + ) + ik_solver = IKSolver(ik_config) + q_sample = ik_solver.sample_configs(len(world_file)) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + + print("Running Batch Env IK") + for _ in range(3): + st_time = time.time() + result = ik_solver.solve_batch_env(goal) + print(result.success) + torch.cuda.synchronize() + print( + "Success, Solve Time(s), Total Time(s)", + torch.count_nonzero(result.success).item() / len(q_sample), + result.solve_time, + time.time() - st_time, + ) + + q_sample = ik_solver.sample_configs(10 * len(world_file)) + kin_state = ik_solver.fk(q_sample) + goal = Pose( + kin_state.ee_position.view(len(world_file), 10, 3), + kin_state.ee_quaternion.view(len(world_file), 10, 4), + ) + + print("Running Batch Env Goalset IK") + for _ in range(3): + st_time = time.time() + result = ik_solver.solve_batch_env_goalset(goal) + torch.cuda.synchronize() + print( + "Success, Solve Time(s), Total Time(s)", + torch.count_nonzero(result.success).item() / len(result.success.view(-1)), + result.solve_time, + time.time() - st_time, + ) + + +if __name__ == "__main__": + demo_basic_ik() + # demo_full_config_collision_free_ik() + # demo_full_config_batch_env_collision_free_ik() diff --git a/examples/isaac_sim/batch_collision_checker.py b/examples/isaac_sim/batch_collision_checker.py new file mode 100644 index 00000000..619982a3 --- /dev/null +++ b/examples/isaac_sim/batch_collision_checker.py @@ -0,0 +1,171 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library + +# Third Party +from omni.isaac.kit import SimulationApp + +# CuRobo +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] + + +# Third Party +import carb +import numpy as np +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + + +def main(): + usd_help = UsdHelper() + act_distance = 0.2 + + n_envs = 2 + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + my_world.scene.add_default_ground_plane() + + stage = my_world.stage + usd_help.load_stage(stage) + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + # Make a target to follow + target_list = [] + target_material_list = [] + offset_x = 3.5 + radius = 0.1 + pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0]) + + for i in range(n_envs): + if i > 0: + pose.position[0, 0] += offset_x + usd_help.add_subroot("/World", "/World/world_" + str(i), pose) + + target_material = OmniPBR("/World/looks/t_" + str(i), color=np.array([0, 1, 0])) + + target = sphere.VisualSphere( + "/World/world_" + str(i) + "/target", + position=np.array([0.5, 0, 0.5]) + pose.position[0].cpu().numpy(), + orientation=np.array([1, 0, 0, 0]), + radius=radius, + visual_material=target_material, + ) + target_list.append(target) + target_material_list.append(target_material) + + setup_curobo_logger("warn") + + # warmup curobo instance + + tensor_args = TensorDeviceType() + robot_file = "franka.yml" + world_file = ["collision_thin_walls.yml", "collision_test.yml"] + world_cfg_list = [] + for i in range(n_envs): + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file[i])) + ) # .get_mesh_world() + world_cfg.objects[0].pose[2] += 0.1 + world_cfg.randomize_color(r=[0.2, 0.3], b=[0.0, 0.05], g=[0.2, 0.3]) + usd_help.add_world_to_stage(world_cfg, 
base_frame="/World/world_" + str(i)) + world_cfg_list.append(world_cfg) + config = RobotWorldConfig.load_from_config( + robot_file, world_cfg_list, collision_activation_distance=act_distance + ) + model = RobotWorld(config) + i = 0 + max_distance = 0.5 + x_sph = torch.zeros((n_envs, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype) + x_sph[..., 3] = radius + env_query_idx = torch.arange(n_envs, device=tensor_args.device, dtype=torch.int32) + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + continue + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + + if step_index < 20: + continue + sp_buffer = [] + for k in target_list: + sph_position, _ = k.get_local_pose() + sp_buffer.append(sph_position) + + x_sph[..., :3] = tensor_args.to_device(sp_buffer).view(n_envs, 1, 1, 3) + + d, d_vec = model.get_collision_vector(x_sph, env_query_idx=env_query_idx) + + d = d.view(-1).cpu() + + for i in range(d.shape[0]): + p = d[i].item() + p = max(1, p * 5) + if d[i].item() == 0.0: + target_material_list[i].set_color(np.ravel([0, 1, 0])) + elif d[i].item() <= model.contact_distance: + target_material_list[i].set_color(np.array([0, 0, p])) + elif d[i].item() >= model.contact_distance: + target_material_list[i].set_color(np.array([p, 0, 0])) + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/batch_motion_gen_reacher.py b/examples/isaac_sim/batch_motion_gen_reacher.py new file mode 100644 index 00000000..02299e76 --- /dev/null +++ b/examples/isaac_sim/batch_motion_gen_reacher.py @@ -0,0 +1,311 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library + +# Third Party +from omni.isaac.kit import SimulationApp + +# CuRobo +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] + + +# Standard Library +import argparse + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + + +def main(): + usd_help = UsdHelper() + act_distance = 0.2 + + n_envs = 2 + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + my_world.scene.add_default_ground_plane() + + stage = my_world.stage + usd_help.load_stage(stage) + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + # Make a target to follow + target_list = [] + target_material_list = [] + offset_y = 2.5 + radius = 0.1 + pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0]) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + robot_list = [] + + for i in range(n_envs): + if i > 0: + pose.position[0, 1] += offset_y + usd_help.add_subroot("/World", "/World/world_" + str(i), pose) + + target = cuboid.VisualCuboid( + "/World/world_" + str(i) + "/target", + position=np.array([0.5, 0, 0.5]) + pose.position[0].cpu().numpy(), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + target_list.append(target) + r = add_robot_to_scene( + robot_cfg, + my_world, + "/World/world_" + str(i) + "/", + robot_name="robot_" + str(i), + position=pose.position[0].cpu().numpy(), + ) + robot_list.append(r[0]) + setup_curobo_logger("warn") + + # warmup curobo instance + + tensor_args = TensorDeviceType() + robot_file = 
"franka.yml" + + world_file = ["collision_test.yml", "collision_thin_walls.yml"] + world_cfg_list = [] + for i in range(n_envs): + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file[i])) + ) # .get_mesh_world() + world_cfg.objects[0].pose[2] -= 0.02 + world_cfg.randomize_color(r=[0.2, 0.3], b=[0.0, 0.05], g=[0.2, 0.3]) + usd_help.add_world_to_stage(world_cfg, base_frame="/World/world_" + str(i)) + world_cfg_list.append(world_cfg) + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg_list, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.MESH, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=12, + interpolation_dt=0.03, + collision_cache={"obb": 6, "mesh": 6}, + collision_activation_distance=0.025, + acceleration_scale=1.0, + self_collision_check=True, + maximum_trajectory_dt=0.25, + fixed_iters_trajopt=True, + finetune_dt_scale=1.05, + grad_trajopt_iters=300, + minimize_jerk=False, + ) + motion_gen = MotionGen(motion_gen_config) + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + print("warming up...") + motion_gen.warmup( + enable_graph=False, warmup_js_trajopt=False, batch=n_envs, batch_env_mode=True + ) + + config = RobotWorldConfig.load_from_config( + robot_file, world_cfg_list, collision_activation_distance=act_distance + ) + model = RobotWorld(config) + i = 0 + max_distance = 0.5 + x_sph = torch.zeros((n_envs, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype) + x_sph[..., 3] = radius + env_query_idx = torch.arange(n_envs, device=tensor_args.device, dtype=torch.int32) + plan_config = MotionGenPlanConfig( + enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True + ) + prev_goal = None + cmd_plan = [None, None] + art_controllers = [r.get_articulation_controller() for r in robot_list] + cmd_idx = 0 + past_goal = None + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + continue + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + for robot in robot_list: + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + if step_index < 20: + continue + sp_buffer = [] + sq_buffer = [] + for k in target_list: + sph_position, sph_orientation = k.get_local_pose() + sp_buffer.append(sph_position) + sq_buffer.append(sph_orientation) + + ik_goal = Pose( + position=tensor_args.to_device(sp_buffer), + quaternion=tensor_args.to_device(sq_buffer), + ) + if prev_goal is None: + prev_goal = ik_goal.clone() + if past_goal is None: + past_goal = ik_goal.clone() + sim_js_names = robot_list[0].dof_names + sim_js = robot_list[0].get_joints_state() + full_js = JointState( + position=tensor_args.to_device(sim_js.positions).view(1, -1), + velocity=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + joint_names=sim_js_names, + ) + for i in range(1, len(robot_list)): + sim_js = robot_list[i].get_joints_state() + cu_js = JointState( + 
position=tensor_args.to_device(sim_js.positions).view(1, -1), + velocity=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities).view(1, -1) * 0.0, + joint_names=sim_js_names, + ) + full_js = full_js.stack(cu_js) + + prev_distance = ik_goal.distance(prev_goal) + past_distance = ik_goal.distance(past_goal) + + if ( + (torch.sum(prev_distance[0] > 1e-2) or torch.sum(prev_distance[1] > 1e-2)) + and (torch.sum(past_distance[0]) == 0.0 and torch.sum(past_distance[1] == 0.0)) + and torch.norm(full_js.velocity) < 0.01 + and cmd_plan[0] is None + and cmd_plan[1] is None + ): + full_js = full_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + result = motion_gen.plan_batch_env(full_js, ik_goal, plan_config.clone()) + + prev_goal.copy_(ik_goal) + trajs = result.get_paths() + for s in range(len(result.success)): + if result.success[s]: + cmd_plan[s] = motion_gen.get_full_js(trajs[s]) + # cmd_plan = result.get_interpolated_plan() + # cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan[s].joint_names: + idx_list.append(robot_list[s].get_dof_index(x)) + common_js_names.append(x) + + cmd_plan[s] = cmd_plan[s].get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + # print(cmd_plan) + + for s in range(len(cmd_plan)): + if cmd_plan[s] is not None and cmd_idx < len(cmd_plan[s].position): + cmd_state = cmd_plan[s][cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # print(cmd_state.position) + # set desired joint angles obtained from IK: + art_controllers[s].apply_action(art_action) + else: + cmd_plan[s] = None + cmd_idx += 1 + past_goal.copy_(ik_goal) + + for _ in range(2): + my_world.step(render=False) + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/collision_checker.py b/examples/isaac_sim/collision_checker.py new file mode 100644 index 00000000..96dec2e7 --- /dev/null +++ b/examples/isaac_sim/collision_checker.py @@ -0,0 +1,212 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library + +# Third Party +from omni.isaac.kit import SimulationApp + +# CuRobo +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] + + +# Standard Library +import argparse + +# Third Party +import carb +import numpy as np +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +parser = argparse.ArgumentParser() +parser.add_argument( + "--nvblox", action="store_true", help="When True, enables headless mode", default=False +) +args = parser.parse_args() + + +def draw_line(start, gradient): + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + start_list = [start] + end_list = [start + gradient] + + colors = [(1, 0, 0, 0.8)] + + sizes = [10.0] + draw.draw_lines(start_list, end_list, colors, sizes) + + +def main(): + usd_help = UsdHelper() + act_distance = 0.4 + ignore_list = ["/World/target", "/World/defaultGroundPlane"] + + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + my_world.scene.add_default_ground_plane() + + stage = my_world.stage + usd_help.load_stage(stage) + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + radius = 0.1 + pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0]) + + target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0])) + + target = sphere.VisualSphere( + "/World/target", + position=np.array([0.5, 0, 1.0]) + pose.position[0].cpu().numpy(), + orientation=np.array([1, 0, 0, 0]), + radius=radius, + visual_material=target_material, + ) + + setup_curobo_logger("warn") + + # warmup curobo instance + + tensor_args = TensorDeviceType() + robot_file = "franka.yml" + world_file = ["collision_thin_walls.yml", "collision_test.yml"][-1] + collision_checker_type = CollisionCheckerType.MESH + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + world_cfg.objects[0].pose[2] += 0.2 + vis_world_cfg = world_cfg + + if args.nvblox: + world_file = "collision_nvblox.yml" + collision_checker_type = CollisionCheckerType.BLOX + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ) + world_cfg.objects[0].pose[2] += 0.4 + 
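+        # ignore the nvblox map's name when reading obstacles back from the stage,
+        # so its visualization mesh is not re-added as a regular obstacle below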
ignore_list.append(world_cfg.objects[0].name) + vis_world_cfg = world_cfg.get_mesh_world() + # world_cfg = vis_world_cfg + + usd_help.add_world_to_stage(vis_world_cfg, base_frame="/World") + config = RobotWorldConfig.load_from_config( + robot_file, + world_cfg, + collision_activation_distance=act_distance, + collision_checker_type=collision_checker_type, + ) + model = RobotWorld(config) + i = 0 + x_sph = torch.zeros((1, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype) + x_sph[..., 3] = radius + + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + continue + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + + if step_index < 20: + continue + if step_index % 1000 == 0.0: + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + reference_prim_path="/World", + ignore_substring=ignore_list, + ).get_collision_check_world() + + model.update_world(obstacles) + print("Updated World") + + sp_buffer = [] + sph_position, _ = target.get_local_pose() + + x_sph[..., :3] = tensor_args.to_device(sph_position).view(1, 1, 1, 3) + + d, d_vec = model.get_collision_vector(x_sph) + + d = d.view(-1).cpu() + + p = d.item() + p = max(1, p * 5) + if d.item() != 0.0: + draw_line(sph_position, d_vec[..., :3].view(3).cpu().numpy()) + print(d, d_vec) + + else: + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + if d.item() == 0.0: + target_material.set_color(np.ravel([0, 1, 0])) + elif d.item() <= model.contact_distance: + target_material.set_color(np.array([0, 0, p])) + elif d.item() >= model.contact_distance: + target_material.set_color(np.array([p, 0, 0])) + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/helper.py b/examples/isaac_sim/helper.py new file mode 100644 index 00000000..a1eb9a72 --- /dev/null +++ b/examples/isaac_sim/helper.py @@ -0,0 +1,134 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+
+# Standard Library
+from typing import Dict, List
+
+# Third Party
+import numpy as np
+from matplotlib import cm
+from omni.isaac.core import World
+from omni.isaac.core.materials import OmniPBR
+from omni.isaac.core.objects import cuboid
+from omni.isaac.core.robots import Robot
+
+try:
+    # Third Party
+    from omni.isaac.urdf import _urdf  # isaac sim 2022.2
+except ImportError:
+    from omni.importer.urdf import _urdf  # isaac sim 2023.1
+# CuRobo
+from curobo.util_file import get_assets_path, get_filename, get_path_of_dir, join_path
+
+
+############################################################
+def add_robot_to_scene(
+    robot_config: Dict,
+    my_world: World,
+    load_from_usd: bool = False,
+    subroot: str = "",
+    robot_name: str = "robot",
+    position: np.ndarray = np.array([0, 0, 0]),
+):
+    urdf_interface = _urdf.acquire_urdf_interface()
+
+    import_config = _urdf.ImportConfig()
+    import_config.merge_fixed_joints = False
+    import_config.convex_decomp = False
+    import_config.fix_base = True
+    import_config.make_default_prim = False
+    import_config.self_collision = False
+    import_config.create_physics_scene = True
+    import_config.import_inertia_tensor = False
+    import_config.default_drive_strength = 20000
+    import_config.default_position_drive_damping = 500
+    import_config.default_drive_type = _urdf.UrdfJointTargetType.JOINT_DRIVE_POSITION
+    import_config.distance_scale = 1
+    import_config.density = 0.0
+    full_path = join_path(get_assets_path(), robot_config["kinematics"]["urdf_path"])
+    robot_path = get_path_of_dir(full_path)
+    filename = get_filename(full_path)
+    imported_robot = urdf_interface.parse_urdf(robot_path, filename, import_config)
+    dest_path = subroot
+    robot_path = urdf_interface.import_robot(
+        robot_path,
+        filename,
+        imported_robot,
+        import_config,
+        dest_path,
+    )
+
+    robot = my_world.scene.add(
+        Robot(
+            prim_path=robot_path,
+            name=robot_name,
+            position=position,
+        )
+    )
+
+    return robot, robot_path
+
+
+class VoxelManager:
+    def __init__(
+        self,
+        num_voxels: int = 5000,
+        size: float = 0.02,
+        color: List[float] = [1, 1, 1],
+        prefix_path: str = "/World/curobo/voxel_",
+        material_path: str = "/World/looks/v_",
+    ) -> None:
+        self.cuboid_list = []
+        self.cuboid_material_list = []
+        self.disable_idx = num_voxels
+        for i in range(num_voxels):
+            target_material = OmniPBR(material_path + str(i), color=np.ravel(color))
+
+            cube = cuboid.VisualCuboid(
+                prefix_path + str(i),
+                position=np.array([0, 0, -10]),
+                orientation=np.array([1, 0, 0, 0]),
+                size=size,
+                visual_material=target_material,
+            )
+            self.cuboid_list.append(cube)
+            self.cuboid_material_list.append(target_material)
+            cube.set_visibility(True)
+
+    def update_voxels(self, voxel_position: np.ndarray, color_axis: int = 0):
+        max_index = min(voxel_position.shape[0], len(self.cuboid_list))
+
+        jet = cm.get_cmap("hot")  # .reversed()
+        z_val = voxel_position[:, 0]  # colormap is driven by the first position column
+
+        jet_colors = jet(z_val)
+
+        for i in range(max_index):
+            self.cuboid_list[i].set_visibility(True)
+            self.cuboid_list[i].set_local_pose(translation=voxel_position[i])
+            self.cuboid_material_list[i].set_color(jet_colors[i][:3])
+
+        for i in range(max_index, len(self.cuboid_list)):
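+            # park leftover voxels out of view below the ground plane rather than
+            # toggling their visibility (see the commented-out call below)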
self.cuboid_list[i].set_local_pose(translation=np.ravel([0, 0, -10.0])) + + # self.cuboid_list[i].set_visibility(False) + + def clear(self): + for i in range(len(self.cuboid_list)): + self.cuboid_list[i].set_local_pose(translation=np.ravel([0, 0, -10.0])) diff --git a/examples/isaac_sim/ik_reachability.py b/examples/isaac_sim/ik_reachability.py new file mode 100644 index 00000000..6da6c62f --- /dev/null +++ b/examples/isaac_sim/ik_reachability.py @@ -0,0 +1,381 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + +############################################################ + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + { + "headless": args.headless, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +# CuRobo +from curobo.util_file import ( + get_assets_path, + get_filename, + get_path_of_dir, + get_robot_configs_path, + join_path, + load_yaml, +) + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", + "omni.isaac.urdf", +] +[enable_extension(x) for x in ext_list] +simulation_app.update() + +# Standard Library +from typing import Dict + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.robots import Robot + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + 
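+
+# Self-contained sketch (not called by this script): given a batched IK
+# result, keep only the joint solutions whose attempt converged. The
+# result.success / result.js_solution fields follow the same pattern used
+# in main() below.
+def _filter_successful_solutions(result):
+    """Return the joint-space solutions of the successful IK attempts."""
+    return result.js_solution[result.success]
+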
+############################################################ + + +########### OV #################;;;;; + + +def get_pose_grid(n_x, n_y, n_z, max_x, max_y, max_z): + x = np.linspace(-max_x, max_x, n_x) + y = np.linspace(-max_y, max_y, n_y) + z = np.linspace(0, max_z, n_z) + x, y, z = np.meshgrid(x, y, z, indexing="ij") + + position_arr = np.zeros((n_x * n_y * n_z, 3)) + position_arr[:, 0] = x.flatten() + position_arr[:, 1] = y.flatten() + position_arr[:, 2] = z.flatten() + return position_arr + + +def draw_points(pose, success): + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + N = 100 + # if draw.get_num_points() > 0: + draw.clear_points() + cpu_pos = pose.position.cpu().numpy() + b, _ = cpu_pos.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [(cpu_pos[i, 0], cpu_pos[i, 1], cpu_pos[i, 2])] + if success[i].item(): + colors += [(0, 1, 0, 0.25)] + else: + colors += [(1, 0, 0, 0.25)] + sizes = [40.0 for _ in range(b)] + + draw.draw_points(point_list, colors, sizes) + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + target = cuboid.VisualCuboid( + "/World/target", + position=np.array([0.5, 0, 0.5]), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + setup_curobo_logger("warn") + past_pose = None + n_obstacle_cuboids = 30 + n_obstacle_mesh = 10 + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.002 + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].name += "_mesh" + world_cfg1.mesh[0].pose[2] = -10.5 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=20, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=True, + collision_checker_type=CollisionCheckerType.MESH, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + # use_fixed_samples=True, + ) + ik_solver = IKSolver(ik_config) + + # get pose grid: + position_grid_offset = tensor_args.to_device(get_pose_grid(10, 10, 5, 0.5, 0.5, 0.5)) + + # read current ik pose and warmup? 
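+    # What follows is the warmup: forward kinematics of the retract
+    # configuration gives one known-reachable end-effector pose; repeating
+    # that pose across the grid and adding the positional offsets yields the
+    # goal batch, and a single solve_batch call both warms up the solver
+    # (including CUDA graph capture) and produces the first reachability
+    # map (10 * 10 * 5 = 500 poses solved in one call).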
+ fk_state = ik_solver.fk(ik_solver.get_retract_config().view(1, -1)) + goal_pose = fk_state.ee_pose + goal_pose = goal_pose.repeat(position_grid_offset.shape[0]) + goal_pose.position += position_grid_offset + + result = ik_solver.solve_batch(goal_pose) + + print("Curobo is Ready") + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg, base_frame="/World") + + cmd_plan = None + cmd_idx = 0 + my_world.scene.add_default_ground_plane() + i = 0 + spheres = None + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + + step_index = my_world.current_time_step_index + # print(step_index) + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + if step_index < 20: + continue + + if step_index == 50 or step_index % 500 == 0.0: # and cmd_plan is None: + print("Updating world, reading w.r.t.", robot_prim_path) + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + reference_prim_path=robot_prim_path, + ignore_substring=[ + robot_prim_path, + "/World/target", + "/World/defaultGroundPlane", + "/curobo", + ], + ).get_collision_check_world() + print([x.name for x in obstacles.objects]) + ik_solver.update_world(obstacles) + print("Updated World") + carb.log_info("Synced CuRobo world from stage.") + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + if target_pose is None: + target_pose = cube_position + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(ik_solver.kinematics.joint_names) + + if args.visualize_spheres and step_index % 2 == 0: + sph_list = ik_solver.kinematics.get_robot_as_spheres(cu_js.position) + + if spheres is None: + spheres = [] + # create spheres: + + for si, s in enumerate(sph_list[0]): + sp = sphere.VisualSphere( + prim_path="/curobo/robot_sphere_" + str(si), + position=np.ravel(s.position), + radius=float(s.radius), + color=np.array([0, 0.8, 0.2]), + ) + spheres.append(sp) + else: + for si, s in enumerate(sph_list[0]): + spheres[si].set_world_pose(position=np.ravel(s.position)) + spheres[si].set_radius(float(s.radius)) + # print(sim_js.velocities) + if ( + np.linalg.norm(cube_position - target_pose) > 1e-3 + and np.linalg.norm(past_pose - cube_position) == 0.0 + and np.linalg.norm(sim_js.velocities) < 0.2 + ): + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + goal_pose.position[:] = ik_goal.position[:] + position_grid_offset + goal_pose.quaternion[:] = ik_goal.quaternion[:] + result = ik_solver.solve_batch(goal_pose) + 
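+            # result.success is a per-goal boolean tensor and
+            # result.js_solution holds one joint solution per goal; masking
+            # the solutions with the success flags (below) keeps only the
+            # reachable grid points, which draw_points renders in green.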
+ succ = torch.any(result.success) + print( + "IK completed: Poses: " + + str(goal_pose.batch) + + " Time(s): " + + str(result.solve_time) + ) + # get spheres and flags: + draw_points(goal_pose, result.success) + + if succ: + # get all solutions: + + cmd_plan = result.js_solution[result.success] + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. No action is being taken.") + target_pose = cube_position + past_pose = cube_position + if cmd_plan is not None and step_index % 20 == 0 and True: + cmd_state = cmd_plan[cmd_idx] + + robot.set_joint_positions(cmd_state.position.cpu().numpy(), idx_list) + + # set desired joint angles obtained from IK: + # articulation_controller.apply_action(art_action) + cmd_idx += 1 + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + my_world.step(render=True) + robot.set_joint_positions(default_config, idx_list) + simulation_app.close() + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/load_all_robots.py b/examples/isaac_sim/load_all_robots.py new file mode 100644 index 00000000..aaf0c0c3 --- /dev/null +++ b/examples/isaac_sim/load_all_robots.py @@ -0,0 +1,180 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
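+#
+# This example loads every robot configuration returned by
+# get_motion_gen_robot_list(), places the robots on a grid in a single
+# stage, and renders each robot's collision-sphere approximation beside it,
+# as a visual check that the sphere model in each YAML matches the imported
+# URDF. The core per-robot query below boils down to (hedged sketch):
+#
+#   robot_cfg = RobotConfig.from_dict(robot_cfg_dict, tensor_args)
+#   kin_model = CudaRobotModel(robot_cfg.kinematics)
+#   sph_list = kin_model.get_robot_as_spheres(kin_model.cspace.retract_config)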
+# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library + +# Standard Library +import argparse + +# Third Party +from omni.isaac.kit import SimulationApp + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util_file import ( + get_motion_gen_robot_list, + get_robot_configs_path, + get_robot_path, + get_world_configs_path, + join_path, + load_yaml, +) +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +args = parser.parse_args() +simulation_app = SimulationApp( + { + "headless": args.headless, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] + + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + + +def main(): + list_of_robots = get_motion_gen_robot_list() # [:2] + usd_help = UsdHelper() + + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + my_world.scene.add_default_ground_plane() + + stage = my_world.stage + usd_help.load_stage(stage) + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + # Make a target to follow + offset_y = 2 + pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0]) + + robot_cfg_list = [] + robot_list = [] + tensor_args = TensorDeviceType() + spheres = [] + + for i in range(len(list_of_robots)): + if i > 0: + pose.position[0, 1] += offset_y + if i == int(len(list_of_robots) / 2): + pose.position[0, 0] = -offset_y + pose.position[0, 1] = 0 + robot_cfg = load_yaml(join_path(get_robot_configs_path(), list_of_robots[i]))["robot_cfg"] + robot_cfg_list.append(robot_cfg) + r = add_robot_to_scene( + robot_cfg, + my_world, + "/World/world_" + str(i) + "/", + robot_name="/World/world_" + str(i) + "/" "robot_" + str(i), + position=pose.position[0].cpu().numpy(), + ) + + robot_list.append(r[0]) + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + 
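+        # kin_model wraps the GPU kinematics of this robot; its cspace holds
+        # the YAML-defined retract ("home") configuration used below to pose
+        # the collision spheres.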
default_config = kin_model.cspace.retract_config + + sph_list = kin_model.get_robot_as_spheres(default_config) + for si, s in enumerate(sph_list[0]): + sp = sphere.VisualSphere( + prim_path="/curobo/robot_sphere_" + str(i) + "_" + str(si), + position=np.ravel(s.position) + + pose.position[0].cpu().numpy() + + np.ravel([0, 0.5, 0.0]), + radius=float(s.radius), + color=np.array([0, 0.8, 0.2]), + ) + spheres.append(sp) + + setup_curobo_logger("warn") + + while simulation_app.is_running(): + my_world.step(render=True) + + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + continue + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + for ri, robot in enumerate(robot_list): + j_names = robot_cfg_list[ri]["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg_list[ri]["kinematics"]["cspace"]["retract_config"] + + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/motion_gen_reacher.py b/examples/isaac_sim/motion_gen_reacher.py new file mode 100644 index 00000000..9dbdf8df --- /dev/null +++ b/examples/isaac_sim/motion_gen_reacher.py @@ -0,0 +1,336 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
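+#
+# Pipeline sketch (mirroring the code below): build a MotionGenConfig from
+# the robot YAML plus a world model, warm up once outside the sim loop, and
+# on every new target plan a full trajectory and step through it:
+#
+#   result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config)
+#   if result.success.item():
+#       cmd_plan = motion_gen.get_full_js(result.get_interpolated_plan())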
+# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + +############################################################ + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + { + "headless": args.headless, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +# CuRobo +from curobo.util_file import ( + get_assets_path, + get_filename, + get_path_of_dir, + get_robot_configs_path, + join_path, + load_yaml, +) + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] +# simulation_app.update() + +# Standard Library +from typing import Dict + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.robots import Robot + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +############################################################ + + +########### OV #################;;;;; + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + target = cuboid.VisualCuboid( + "/World/target", + position=np.array([0.5, 0, 0.5]), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + setup_curobo_logger("warn") + past_pose = None + n_obstacle_cuboids = 30 + n_obstacle_mesh = 10 + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, robot_prim_path = 
add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.04 + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].name += "_mesh" + world_cfg1.mesh[0].pose[2] = -10.5 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.MESH, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=12, + interpolation_dt=0.03, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + collision_activation_distance=0.025, + acceleration_scale=1.0, + self_collision_check=True, + maximum_trajectory_dt=0.25, + fixed_iters_trajopt=True, + finetune_dt_scale=1.05, + velocity_scale=[0.25, 1, 1, 1, 1.0, 1.0, 1.0, 1.0, 1.0], + ) + motion_gen = MotionGen(motion_gen_config) + print("warming up...") + motion_gen.warmup(enable_graph=False, warmup_js_trajopt=False) + + print("Curobo is Ready") + plan_config = MotionGenPlanConfig( + enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True + ) + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg, base_frame="/World") + + cmd_plan = None + cmd_idx = 0 + my_world.scene.add_default_ground_plane() + i = 0 + spheres = None + + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + + step_index = my_world.current_time_step_index + # print(step_index) + if step_index < 2: + my_world.reset() + robot._articulation_view.initialize() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + if step_index < 20: + continue + + if step_index == 50 or step_index % 1000 == 0.0: + print("Updating world, reading w.r.t.", robot_prim_path) + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + reference_prim_path=robot_prim_path, + ignore_substring=[ + robot_prim_path, + "/World/target", + "/World/defaultGroundPlane", + "/curobo", + ], + ).get_collision_check_world() + + motion_gen.update_world(obstacles) + print("Updated World") + carb.log_info("Synced CuRobo world from stage.") + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + if target_pose is None: + target_pose = cube_position + + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + + if args.visualize_spheres and step_index % 2 == 0: + sph_list = 
motion_gen.kinematics.get_robot_as_spheres(cu_js.position) + + if spheres is None: + spheres = [] + # create spheres: + + for si, s in enumerate(sph_list[0]): + sp = sphere.VisualSphere( + prim_path="/curobo/robot_sphere_" + str(si), + position=np.ravel(s.position), + radius=float(s.radius), + color=np.array([0, 0.8, 0.2]), + ) + spheres.append(sp) + else: + for si, s in enumerate(sph_list[0]): + spheres[si].set_world_pose(position=np.ravel(s.position)) + spheres[si].set_radius(float(s.radius)) + if ( + np.linalg.norm(cube_position - target_pose) > 1e-3 + and np.linalg.norm(past_pose - cube_position) == 0.0 + and np.linalg.norm(sim_js.velocities) < 0.2 + ): + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + + result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = result.success.item() # ik_result.success.item() + if succ: + cmd_plan = result.get_interpolated_plan() + cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. No action is being taken.") + target_pose = cube_position + past_pose = cube_position + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_idx += 1 + for _ in range(2): + my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + simulation_app.close() + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/motion_gen_reacher_nvblox.py b/examples/isaac_sim/motion_gen_reacher_nvblox.py new file mode 100644 index 00000000..81cc207d --- /dev/null +++ b/examples/isaac_sim/motion_gen_reacher_nvblox.py @@ -0,0 +1,313 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
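+#
+# This variant differs from motion_gen_reacher.py mainly in the collision
+# backend: it loads an nvblox world model (collision_nvblox.yml) and selects
+# CollisionCheckerType.BLOX, so collision queries run against a voxelized
+# signed-distance layer instead of cuboid/mesh primitives; the warmup below
+# also enables graph search (enable_graph=True).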
+# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse + +# CuRobo +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + +############################################################ + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + { + "headless": args.headless, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +# CuRobo +from curobo.util_file import ( + get_assets_path, + get_filename, + get_path_of_dir, + get_robot_configs_path, + join_path, + load_yaml, +) + +ext_list = [ + "omni.kit.asset_converter", + "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +# [enable_extension(x) for x in ext_list] +# simulation_app.update() + +# Standard Library +from typing import Dict + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.robots import Robot + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +############################################################ + + +########### OV #################;;;;; + + +############################################################ + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + target = cuboid.VisualCuboid( + "/World/target", + position=np.array([0.5, 0, 0.5]), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + setup_curobo_logger("warn") + past_pose = None + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, _ = 
add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_nvblox.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.04 + + world_cfg.add_obstacle(world_cfg_table.cuboid[0]) + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.BLOX, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=12, + interpolation_dt=0.03, + collision_activation_distance=0.025, + acceleration_scale=1.0, + self_collision_check=True, + maximum_trajectory_dt=0.25, + finetune_dt_scale=1.05, + fixed_iters_trajopt=True, + finetune_trajopt_iters=500, + minimize_jerk=False, + ) + motion_gen = MotionGen(motion_gen_config) + print("warming up...") + motion_gen.warmup(enable_graph=True, warmup_js_trajopt=False) + + print("Curobo is Ready") + plan_config = MotionGenPlanConfig( + enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True + ) + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg.get_mesh_world(), base_frame="/World") + + cmd_plan = None + cmd_idx = 0 + my_world.scene.add_default_ground_plane() + i = 0 + spheres = None + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + + step_index = my_world.current_time_step_index + # print(step_index) + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + if step_index < 20: + continue + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + if target_pose is None: + target_pose = cube_position + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + + if args.visualize_spheres and step_index % 2 == 0: + sph_list = motion_gen.kinematics.get_robot_as_spheres(cu_js.position) + + if spheres is None: + spheres = [] + # create spheres: + + for si, s in enumerate(sph_list[0]): + sp = sphere.VisualSphere( + prim_path="/curobo/robot_sphere_" + str(si), + position=np.ravel(s.position), + radius=float(s.radius), + color=np.array([0, 0.8, 0.2]), + ) + spheres.append(sp) + else: + for si, s in enumerate(sph_list[0]): + spheres[si].set_world_pose(position=np.ravel(s.position)) + spheres[si].set_radius(float(s.radius)) + # print(sim_js.velocities) + if ( + np.linalg.norm(cube_position - target_pose) > 1e-3 + and np.linalg.norm(past_pose - cube_position) == 0.0 + and np.linalg.norm(sim_js.velocities) < 0.2 + ): + # Set EE teleop goals, use cube for simple non-vr init: + 
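+            # (this branch only runs when the cube has moved more than 1 mm
+            # from the last planned goal, is no longer being dragged, and
+            # the arm is close to rest; see the condition above)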
ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + + result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = result.success.item() # ik_result.success.item() + if succ: + cmd_plan = result.get_interpolated_plan() + cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. No action is being taken.") + target_pose = cube_position + past_pose = cube_position + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_idx += 1 + for _ in range(2): + my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + simulation_app.close() + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/mpc_example.py b/examples/isaac_sim/mpc_example.py new file mode 100644 index 00000000..7dbbdff4 --- /dev/null +++ b/examples/isaac_sim/mpc_example.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
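+#
+# MPC loop sketch (mirroring the code below): a Goal built from the retract
+# configuration seeds the solver once; afterwards new targets are copied
+# into the existing goal buffer and every sim step queries one MPC action:
+#
+#   goal_buffer = mpc.setup_solve_single(goal, 1)
+#   mpc.update_goal(goal_buffer)
+#   # per step:
+#   goal_buffer.goal_pose.copy_(ik_goal)  # only when the target moved
+#   mpc_result = mpc.step(current_state, max_attempts=2)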
+# + + +# script running (ubuntu): +# + +############################################################ + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse +import pickle +import shutil +import sys + +## import curobo: + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + +########################################################### + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + {"headless": False, "width": 1920, "height": 1080} +) # _make_simulation_app({"headless": False}) +# Third Party +# Enable the layers and stage windows in the UI +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] +simulation_app.update() +# Standard Library +import os + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import DynamicCuboid, VisualCuboid, cuboid, sphere +from omni.isaac.core.prims.xform_prim import XFormPrim + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension + +# from omni.isaac.cortex.motion_commander import MotionCommand, PosePq, open_gripper, close_gripper +from omni.isaac.core.utils.rotations import euler_angles_to_quat + +########### frame prim ################# +from omni.isaac.core.utils.stage import get_stage_units +from omni.isaac.core.utils.types import ArticulationAction +from omni.isaac.cortex.cortex_object import CortexObject +from omni.isaac.franka import KinematicsSolver +from omni.isaac.franka.franka import Franka +from omni.isaac.franka.tasks import FollowTarget +from pxr import Gf +from scipy.spatial.transform import Rotation as R + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +############################################################ + + +########### OV #################;;;;; + + +########### +EXT_DIR = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)))) +DATA_DIR = os.path.join(EXT_DIR, "data") +########### frame prim #################;;;;; + + +# Standard Library +from typing import Optional + +# Third Party +from helper import add_robot_to_scene +from omni.isaac.core.materials import PhysicsMaterial, VisualMaterial +from omni.isaac.core.utils.prims import add_reference_to_stage, get_prim_at_path +from pxr import Sdf, UsdShade + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.curobolib import geom_cu + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.rollout.cost.self_collision_cost import SelfCollisionCost, SelfCollisionCostConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, 
RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +############################################################ + + +def draw_points(rollouts: torch.Tensor): + if rollouts is None: + return + # Standard Library + import random + + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + N = 100 + # if draw.get_num_points() > 0: + draw.clear_points() + cpu_rollouts = rollouts.cpu().numpy() + b, h, _ = cpu_rollouts.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [ + (cpu_rollouts[i, j, 0], cpu_rollouts[i, j, 1], cpu_rollouts[i, j, 2]) for j in range(h) + ] + colors += [(1.0 - (i + 1.0 / b), 0.3 * (i + 1.0 / b), 0.0, 0.1) for _ in range(h)] + sizes = [10.0 for _ in range(b * h)] + draw.draw_points(point_list, colors, sizes) + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + my_world.scene.add_default_ground_plane() + + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + target = cuboid.VisualCuboid( + "/World/target", + position=np.array([0.5, 0, 0.5]), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + setup_curobo_logger("warn") + past_pose = None + n_obstacle_cuboids = 30 + n_obstacle_mesh = 10 + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + robot_cfg["kinematics"]["collision_sphere_buffer"] += 0.02 + + robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.04 + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].name += "_mesh" + world_cfg1.mesh[0].pose[2] = -10.5 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + init_curobo = False + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].pose[2] = -10.0 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + use_cuda_graph=True, + use_cuda_graph_metrics=True, + 
use_cuda_graph_full_step=False, + self_collision_check=True, + collision_checker_type=CollisionCheckerType.MESH, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + use_mppi=True, + use_lbfgs=False, + use_es=False, + store_rollouts=True, + step_dt=0.02, + ) + + mpc = MpcSolver(mpc_config) + + retract_cfg = mpc.rollout_fn.dynamics_model.retract_config.clone().unsqueeze(0) + joint_names = mpc.rollout_fn.joint_names + + state = mpc.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg, joint_names=joint_names) + ) + current_state = JointState.from_position(retract_cfg, joint_names=joint_names) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + goal = Goal( + current_state=current_state, + goal_state=JointState.from_position(retract_cfg, joint_names=joint_names), + goal_pose=retract_pose, + ) + + goal_buffer = mpc.setup_solve_single(goal, 1) + mpc.update_goal(goal_buffer) + + usd_help.load_stage(my_world.stage) + init_world = False + cmd_state_full = None + step = 0 + while simulation_app.is_running(): + if not init_world: + for _ in range(10): + my_world.step(render=True) + init_world = True + draw_points(mpc.get_visual_rollouts()) + + my_world.step(render=True) + if not my_world.is_playing(): + continue + + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + + if not init_curobo: + init_curobo = True + step += 1 + step_index = step + if step_index % 1000 == 0: + print("Updating world") + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + ignore_substring=[ + robot_prim_path, + "/World/target", + "/World/defaultGroundPlane", + "/curobo", + ], + reference_prim_path=robot_prim_path, + ) + obstacles.add_obstacle(world_cfg_table.cuboid[0]) + mpc.world_coll_checker.load_collision_model(obstacles) + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + 1.0 + + if np.linalg.norm(cube_position - past_pose) > 1e-3: + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + goal_buffer.goal_pose.copy_(ik_goal) + mpc.update_goal(goal_buffer) + past_pose = cube_position + + # if not changed don't call curobo: + + # get robot current state: + sim_js = robot.get_joints_state() + js_names = robot.dof_names + sim_js_names = robot.dof_names + + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(mpc.rollout_fn.joint_names) + if cmd_state_full is None: + current_state.copy_(cu_js) + else: + current_state_partial = cmd_state_full.get_ordered_joint_state( + mpc.rollout_fn.joint_names + ) + current_state.copy_(current_state_partial) + current_state.joint_names = current_state_partial.joint_names + # current_state = 
current_state.get_ordered_joint_state(mpc.rollout_fn.joint_names) + common_js_names = [] + current_state.copy_(cu_js) + + mpc_result = mpc.step(current_state, max_attempts=2) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = True # ik_result.success.item() + cmd_state_full = mpc_result.js_action + common_js_names = [] + idx_list = [] + for x in sim_js_names: + if x in cmd_state_full.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + + cmd_state = cmd_state_full.get_ordered_joint_state(common_js_names) + cmd_state_full = cmd_state + + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + # cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # positions_goal = articulation_action.joint_positions + if step_index % 1000 == 0: + print(mpc_result.metrics.feasible.item(), mpc_result.metrics.pose_error.item()) + + if succ: + # set desired joint angles obtained from IK: + for _ in range(3): + articulation_controller.apply_action(art_action) + + else: + carb.log_warn("No action is being taken.") + + +############################################################ + +if __name__ == "__main__": + main() + simulation_app.close() diff --git a/examples/isaac_sim/mpc_nvblox_example.py b/examples/isaac_sim/mpc_nvblox_example.py new file mode 100644 index 00000000..4627a5fd --- /dev/null +++ b/examples/isaac_sim/mpc_nvblox_example.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + + +# script running (ubuntu): +# + +############################################################ + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse +import pickle +import shutil +import sys + +## import curobo: + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +args = parser.parse_args() + +########################################################### + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + {"headless": False, "width": 1920, "height": 1080} +) # _make_simulation_app({"headless": False}) +# Third Party +# Enable the layers and stage windows in the UI +from omni.isaac.core.utils.extensions import enable_extension + +ext_list = [ + "omni.kit.asset_converter", + # "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +[enable_extension(x) for x in ext_list] +simulation_app.update() +# Standard Library +import os + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import DynamicCuboid, VisualCuboid, cuboid, sphere +from omni.isaac.core.prims.xform_prim import XFormPrim + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension + +# from omni.isaac.cortex.motion_commander import MotionCommand, PosePq, open_gripper, close_gripper +from omni.isaac.core.utils.rotations import euler_angles_to_quat + +########### frame prim ################# +from omni.isaac.core.utils.stage import get_stage_units +from omni.isaac.core.utils.types import ArticulationAction +from omni.isaac.cortex.cortex_object import CortexObject +from omni.isaac.franka import KinematicsSolver +from omni.isaac.franka.franka import Franka +from omni.isaac.franka.tasks import FollowTarget +from pxr import Gf +from scipy.spatial.transform import Rotation as R + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +############################################################ + + +########### OV #################;;;;; + + +########### +EXT_DIR = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(__file__)))) +DATA_DIR = os.path.join(EXT_DIR, "data") +########### frame prim #################;;;;; + + +# Standard Library +from typing import Optional + +# Third Party +from helper import add_robot_to_scene +from omni.isaac.core.materials import PhysicsMaterial, VisualMaterial +from omni.isaac.core.utils.prims import add_reference_to_stage, get_prim_at_path +from pxr import Sdf, UsdShade + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.curobolib import geom_cu + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.rollout.cost.self_collision_cost import SelfCollisionCost, SelfCollisionCostConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, 
RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +############################################################ + + +def draw_points(rollouts: torch.Tensor): + if rollouts is None: + return + # Standard Library + import random + + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + N = 100 + # if draw.get_num_points() > 0: + draw.clear_points() + cpu_rollouts = rollouts.cpu().numpy() + b, h, _ = cpu_rollouts.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [ + (cpu_rollouts[i, j, 0], cpu_rollouts[i, j, 1], cpu_rollouts[i, j, 2]) for j in range(h) + ] + colors += [(1.0 - (i + 1.0 / b), 0.3 * (i + 1.0 / b), 0.0, 0.1) for _ in range(h)] + sizes = [10.0 for _ in range(b * h)] + draw.draw_points(point_list, colors, sizes) + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + my_world.scene.add_default_ground_plane() + + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + target = cuboid.VisualCuboid( + "/World/target", + position=np.array([0.5, 0, 0.5]), + orientation=np.array([0, 1, 0, 0]), + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + setup_curobo_logger("warn") + past_pose = None + n_obstacle_cuboids = 30 + n_obstacle_mesh = 10 + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + robot_cfg["kinematics"]["collision_sphere_buffer"] += 0.02 + + robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.04 + + init_curobo = False + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + # world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_nvblox.yml")) + ) + world_cfg.add_obstacle(world_cfg_table.cuboid[0]) + + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + use_cuda_graph=True, + use_cuda_graph_metrics=True, + use_cuda_graph_full_step=False, + self_collision_check=True, + collision_checker_type=CollisionCheckerType.BLOX, + use_mppi=True, + use_lbfgs=False, + use_es=False, + store_rollouts=True, + step_dt=0.02, + ) + + mpc = MpcSolver(mpc_config) + + retract_cfg = mpc.rollout_fn.dynamics_model.retract_config.clone().unsqueeze(0) + joint_names = mpc.rollout_fn.joint_names + + state = mpc.rollout_fn.compute_kinematics( + 
JointState.from_position(retract_cfg, joint_names=joint_names) + ) + current_state = JointState.from_position(retract_cfg, joint_names=joint_names) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + goal = Goal( + current_state=current_state, + goal_state=JointState.from_position(retract_cfg, joint_names=joint_names), + goal_pose=retract_pose, + ) + + goal_buffer = mpc.setup_solve_single(goal, 1) + mpc.update_goal(goal_buffer) + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg.get_mesh_world(), base_frame="/World") + + init_world = False + cmd_state_full = None + step = 0 + while simulation_app.is_running(): + if not init_world: + for _ in range(10): + my_world.step(render=True) + init_world = True + draw_points(mpc.get_visual_rollouts()) + + my_world.step(render=True) + if not my_world.is_playing(): + continue + + step_index = my_world.current_time_step_index + + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + + if not init_curobo: + init_curobo = True + step += 1 + step_index = step + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + 1.0 + + if np.linalg.norm(cube_position - past_pose) > 1e-3: + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + goal_buffer.goal_pose.copy_(ik_goal) + mpc.update_goal(goal_buffer) + past_pose = cube_position + + # if not changed don't call curobo: + + # get robot current state: + sim_js = robot.get_joints_state() + js_names = robot.dof_names + sim_js_names = robot.dof_names + + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(mpc.rollout_fn.joint_names) + if cmd_state_full is None: + current_state.copy_(cu_js) + else: + current_state_partial = cmd_state_full.get_ordered_joint_state( + mpc.rollout_fn.joint_names + ) + current_state.copy_(current_state_partial) + current_state.joint_names = current_state_partial.joint_names + # current_state = current_state.get_ordered_joint_state(mpc.rollout_fn.joint_names) + common_js_names = [] + current_state.copy_(cu_js) + + mpc_result = mpc.step(current_state, max_attempts=2) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = True # ik_result.success.item() + cmd_state_full = mpc_result.js_action + common_js_names = [] + idx_list = [] + for x in sim_js_names: + if x in cmd_state_full.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + + cmd_state = cmd_state_full.get_ordered_joint_state(common_js_names) + cmd_state_full = cmd_state + # print(ee_translation_goal, ee_orientation_teleop_goal) + + # Compute IK for given EE Teleop goals + # articulation_action, succ = my_controller.compute_inverse_kinematics( + # ee_translation_goal, + # 
ee_orientation_teleop_goal, + # ) + + # create articulation action: + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + # cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # positions_goal = articulation_action.joint_positions + if step_index % 1000 == 0: + print(mpc_result.metrics.feasible.item(), mpc_result.metrics.pose_error.item()) + + if succ: + # set desired joint angles obtained from IK: + for _ in range(3): + articulation_controller.apply_action(art_action) + + else: + carb.log_warn("No action is being taken.") + + +############################################################ + +if __name__ == "__main__": + main() + simulation_app.close() diff --git a/examples/isaac_sim/multi_arm_reacher.py b/examples/isaac_sim/multi_arm_reacher.py new file mode 100644 index 00000000..e4feb8b9 --- /dev/null +++ b/examples/isaac_sim/multi_arm_reacher.py @@ -0,0 +1,379 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel + +# from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +parser = argparse.ArgumentParser() +parser.add_argument( + "--headless", action="store_true", help="When True, enables headless mode", default=False +) +parser.add_argument( + "--visualize_spheres", + action="store_true", + help="When True, visualizes robot spheres", + default=False, +) + +parser.add_argument( + "--robot", type=str, default="dual_ur10e.yml", help="robot configuration to load" +) +args = parser.parse_args() + +############################################################ + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp( + { + "headless": args.headless, + "width": "1920", + "height": "1080", + } +) +# Third Party +from omni.isaac.core.utils.extensions import enable_extension + +# CuRobo +from curobo.util_file import ( + get_assets_path, + get_filename, + get_path_of_dir, + get_robot_configs_path, + join_path, + load_yaml, +) + +ext_list = [ + "omni.kit.asset_converter", + "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", + "omni.isaac.asset_browser", +] +# [enable_extension(x) for x in ext_list] +# simulation_app.update() + +# Standard Library +from typing import Dict + +# Third Party +import carb +import numpy as np +from helper import add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.objects import cuboid, sphere +from 
omni.isaac.core.robots import Robot + +########### OV ################# +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.logger import setup_curobo_logger +from curobo.util.usd_helper import UsdHelper + +############################################################ + + +########### OV #################;;;;; + + +############################################################ + + +def main(): + # assuming obstacles are in objects_path: + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + stage.DefinePrim("/curobo", "Xform") + # my_world.stage.SetDefaultPrim(my_world.stage.GetPrimAtPath("/World")) + stage = my_world.stage + # stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + + # Make a target to follow + + setup_curobo_logger("warn") + past_pose = None + n_obstacle_cuboids = 30 + n_obstacle_mesh = 10 + + # warmup curobo instance + usd_help = UsdHelper() + target_pose = None + + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, robot_prim_path = add_robot_to_scene(robot_cfg, my_world) + + articulation_controller = robot.get_articulation_controller() + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + world_cfg_table.cuboid[0].pose[2] -= 0.02 + + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].name += "_mesh" + world_cfg1.mesh[0].pose[2] = -10.5 + + world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.MESH, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=12, + interpolation_dt=0.03, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + collision_activation_distance=0.025, + acceleration_scale=1.0, + maximum_trajectory_dt=0.2, + fixed_iters_trajopt=True, + ) + motion_gen = MotionGen(motion_gen_config) + print("warming up...") + motion_gen.warmup(enable_graph=True, warmup_js_trajopt=False) + + print("Curobo is Ready") + plan_config = MotionGenPlanConfig( + enable_graph=False, enable_graph_attempt=4, max_attempts=10, enable_finetune_trajopt=True + ) + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg, base_frame="/World") + + cmd_plan = None + cmd_idx = 0 + my_world.scene.add_default_ground_plane() + i = 0 + spheres = None + + # read number of targets in link names: + link_names = motion_gen.kinematics.link_names + ee_link_name = motion_gen.kinematics.ee_link + # get link poses at retract configuration: + + kin_state = motion_gen.kinematics.get_state(motion_gen.get_retract_config().view(1, -1)) + + link_retract_pose = kin_state.link_pose + t_pos = np.ravel(kin_state.ee_pose.to_list()) + target = cuboid.VisualCuboid( + "/World/target", + position=t_pos[:3], + orientation=t_pos[3:], + color=np.array([1.0, 0, 0]), + size=0.05, + ) + + # create new targets for new links: + ee_idx = link_names.index(ee_link_name) + target_links = {} + names = [] + for i in 
link_names: + if i != ee_link_name: + k_pose = np.ravel(link_retract_pose[i].to_list()) + color = np.random.randn(3) * 0.2 + color[0] += 0.5 + color[1] = 0.5 + color[2] = 0.0 + target_links[i] = cuboid.VisualCuboid( + "/World/target_" + i, + position=np.array(k_pose[:3]), + orientation=np.array(k_pose[3:]), + color=color, + size=0.05, + ) + names.append("/World/target_" + i) + i = 0 + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + + step_index = my_world.current_time_step_index + # print(step_index) + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + if step_index < 20: + continue + + if step_index == 50 or step_index % 1000 == 0.0: + print("Updating world, reading w.r.t.", robot_prim_path) + obstacles = usd_help.get_obstacles_from_stage( + # only_paths=[obstacles_path], + reference_prim_path=robot_prim_path, + ignore_substring=[ + robot_prim_path, + "/World/target", + "/World/defaultGroundPlane", + "/curobo", + ] + + names, + ).get_collision_check_world() + + motion_gen.update_world(obstacles) + print("Updated World") + carb.log_info("Synced CuRobo world from stage.") + + # position and orientation of target virtual cube: + cube_position, cube_orientation = target.get_world_pose() + + if past_pose is None: + past_pose = cube_position + if target_pose is None: + target_pose = cube_position + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + + if args.visualize_spheres and step_index % 2 == 0: + sph_list = motion_gen.kinematics.get_robot_as_spheres(cu_js.position) + + if spheres is None: + spheres = [] + # create spheres: + + for si, s in enumerate(sph_list[0]): + sp = sphere.VisualSphere( + prim_path="/curobo/robot_sphere_" + str(si), + position=np.ravel(s.position), + radius=float(s.radius), + color=np.array([0, 0.8, 0.2]), + ) + spheres.append(sp) + else: + for si, s in enumerate(sph_list[0]): + spheres[si].set_world_pose(position=np.ravel(s.position)) + spheres[si].set_radius(float(s.radius)) + # print(sim_js.velocities) + if ( + np.linalg.norm(cube_position - target_pose) > 1e-3 + and np.linalg.norm(past_pose - cube_position) == 0.0 + and np.linalg.norm(sim_js.velocities) < 0.2 + ): + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + # add link poses: + link_poses = {} + for i in target_links.keys(): + c_p, c_rot = target_links[i].get_world_pose() + link_poses[i] = Pose( + position=tensor_args.to_device(c_p), + quaternion=tensor_args.to_device(c_rot), + ) + result = motion_gen.plan_single( + cu_js.unsqueeze(0), ik_goal, plan_config.clone(), 
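+                # link_poses carries one pose target per extra end-effector
+                # link (the target cubes created above), in addition to the
+                # primary ee_link goal, so all arms are constrained in a
+                # single plan_single() call.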
link_poses=link_poses + ) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = result.success.item() # ik_result.success.item() + if succ: + cmd_plan = result.get_interpolated_plan() + cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. No action is being taken.") + target_pose = cube_position + past_pose = cube_position + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_idx += 1 + for _ in range(2): + my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + simulation_app.close() + + +if __name__ == "__main__": + main() diff --git a/examples/isaac_sim/realsense_collision.py b/examples/isaac_sim/realsense_collision.py new file mode 100644 index 00000000..a804fa19 --- /dev/null +++ b/examples/isaac_sim/realsense_collision.py @@ -0,0 +1,260 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") +# Third Party +import cv2 +import numpy as np +import torch +from matplotlib import cm +from nvblox_torch.datasets.realsense_dataset import RealsenseDataloader +from omni.isaac.kit import SimulationApp + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.util_file import get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +simulation_app.update() +# Third Party +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere + + +def draw_points(voxels): + # Third Party + + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_points() + if len(voxels) == 0: + return + + jet = cm.get_cmap("plasma").reversed() + + cpu_pos = voxels[..., :3].view(-1, 3).cpu().numpy() + z_val = cpu_pos[:, 1] + # add smallest and largest values: + # z_val = np.append(z_val, 1.0) + # z_val = np.append(z_val,0.4) + # scale values + # z_val += 0.4 + # z_val[z_val>1.0] = 1.0 + # z_val = 1.0/z_val + # z_val = z_val/1.5 + # z_val[z_val!=z_val] = 0.0 + # z_val[z_val==0.0] = 0.4 + + jet_colors = jet(z_val) + + b, _ = cpu_pos.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [(cpu_pos[i, 0], cpu_pos[i, 1], cpu_pos[i, 2])] + colors += [(jet_colors[i][0], jet_colors[i][1], jet_colors[i][2], 1.0)] + sizes = [10.0 for _ in range(b)] + + draw.draw_points(point_list, colors, sizes) + + +def clip_camera(camera_data): + # clip camera image to bounding box: + h_ratio = 0.15 + w_ratio = 0.15 + depth = camera_data["raw_depth"] + depth_tensor = camera_data["depth"] + h, w = depth_tensor.shape + depth[: int(h_ratio * h), :] = 0.0 + depth[int((1 - h_ratio) * h) :, :] = 0.0 + depth[:, : int(w_ratio * w)] = 0.0 + depth[:, int((1 - w_ratio) * w) :] = 0.0 + + depth_tensor[: int(h_ratio * h), :] = 0.0 + depth_tensor[int(1 - h_ratio * h) :, :] = 0.0 + depth_tensor[:, : int(w_ratio * w)] = 0.0 + depth_tensor[:, int(1 - w_ratio * w) :] = 0.0 + + +def draw_line(start, gradient): + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + start_list = [start] + end_list = [start + gradient] + + colors = [(0.0, 0, 0.8, 0.9)] + + sizes = [10.0] + draw.draw_lines(start_list, end_list, colors, sizes) + + +if __name__ == "__main__": + radius = 0.05 + act_distance = 0.4 + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + my_world.scene.add_default_ground_plane() + # my_world.scene.add_ground_plane(color=np.array([0.2,0.2,0.2])) + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0])) + + target = sphere.VisualSphere( + "/World/target", + position=np.array([0.0, 0, 0.5]), + orientation=np.array([1, 0, 0, 0]), + radius=radius, + visual_material=target_material, + ) + + # Make a target to follow + camera_marker = cuboid.VisualCuboid( + 
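+        # Note: this cuboid is only a draggable handle; its local pose is read
+        # each frame and used as the camera extrinsics when depth frames are
+        # integrated into the nvblox world below.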
"/World/camera_nvblox", + position=np.array([0.0, -0.1, 0.25]), + orientation=np.array([0.843, -0.537, 0.0, 0.0]), + color=np.array([0.1, 0.1, 0.5]), + size=0.03, + ) + collision_checker_type = CollisionCheckerType.BLOX + world_cfg = WorldConfig.from_dict( + { + "blox": { + "world": { + "pose": [0, 0, 0, 1, 0, 0, 0], + "integrator_type": "occupancy", + "voxel_size": 0.03, + } + } + } + ) + + config = RobotWorldConfig.load_from_config( + "franka.yml", + world_cfg, + collision_activation_distance=act_distance, + collision_checker_type=collision_checker_type, + ) + + model = RobotWorld(config) + + realsense_data = RealsenseDataloader(clipping_distance_m=1.0) + data = realsense_data.get_data() + + camera_pose = Pose.from_list([0, 0, 0, 0.707, 0.707, 0, 0]) + i = 0 + tensor_args = TensorDeviceType() + x_sph = torch.zeros((1, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype) + x_sph[..., 3] = radius + while simulation_app.is_running(): + my_world.step(render=True) + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + + sp_buffer = [] + sph_position, _ = target.get_local_pose() + + x_sph[..., :3] = tensor_args.to_device(sph_position).view(1, 1, 1, 3) + + model.world_model.decay_layer("world") + data = realsense_data.get_data() + clip_camera(data) + cube_position, cube_orientation = camera_marker.get_local_pose() + camera_pose = Pose( + position=tensor_args.to_device(cube_position), + quaternion=tensor_args.to_device(cube_orientation), + ) + # print(data["rgba"].shape, data["depth"].shape, data["intrinsics"]) + + data_camera = CameraObservation( # rgb_image = data["rgba_nvblox"], + depth_image=data["depth"], intrinsics=data["intrinsics"], pose=camera_pose + ) + data_camera = data_camera.to(device=model.tensor_args.device) + # print(data_camera.depth_image, data_camera.rgb_image, data_camera.intrinsics) + # print("got new message") + model.world_model.add_camera_frame(data_camera, "world") + # print("added camera frame") + model.world_model.process_camera_frames("world", False) + torch.cuda.synchronize() + model.world_model.update_blox_hashes() + bounding = Cuboid("t", dims=[1, 1, 1], pose=[0, 0, 0, 1, 0, 0, 0]) + voxels = model.world_model.get_voxels_in_bounding_box(bounding, 0.025) + # print(data_camera.depth_image) + depth_image = data["raw_depth"] + color_image = data["raw_rgb"] + depth_colormap = cv2.applyColorMap( + cv2.convertScaleAbs(depth_image, alpha=100), cv2.COLORMAP_VIRIDIS + ) + images = np.hstack((color_image, depth_colormap)) + + cv2.namedWindow("Align Example", cv2.WINDOW_NORMAL) + cv2.imshow("Align Example", images) + key = cv2.waitKey(1) + # Press esc or 'q' to close the image window + if key & 0xFF == ord("q") or key == 27: + cv2.destroyAllWindows() + break + + draw_points(voxels) + d, d_vec = model.get_collision_vector(x_sph) + + p = d.item() + p = max(1, p * 5) + if d.item() == 0.0: + target_material.set_color(np.ravel([0, 1, 0])) + elif d.item() <= model.contact_distance: + target_material.set_color(np.array([0, 0, p])) + elif d.item() >= model.contact_distance: + target_material.set_color(np.array([p, 0, 0])) + + if d.item() != 0.0: + print(d, d_vec) + + draw_line(sph_position, d_vec[..., :3].view(3).cpu().numpy()) + else: + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + + realsense_data.stop_device() + print("finished 
program") + simulation_app.close() diff --git a/examples/isaac_sim/realsense_mpc.py b/examples/isaac_sim/realsense_mpc.py new file mode 100644 index 00000000..77f9cce2 --- /dev/null +++ b/examples/isaac_sim/realsense_mpc.py @@ -0,0 +1,484 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") +# Third Party +import cv2 +import numpy as np +import torch +from matplotlib import cm +from nvblox_torch.datasets.realsense_dataset import RealsenseDataloader +from omni.isaac.kit import SimulationApp + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +simulation_app.update() +# Standard Library +import argparse + +# Third Party +import carb +from helper import VoxelManager, add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.rollout.rollout_base import Goal +from curobo.util.usd_helper import UsdHelper +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +parser = argparse.ArgumentParser() + + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +parser.add_argument( + "--waypoints", action="store_true", help="When True, sets robot in static mode", default=False +) +parser.add_argument( + "--use-debug-draw", + action="store_true", + help="When True, sets robot in static mode", + default=False, +) +args = parser.parse_args() + + +def draw_rollout_points(rollouts: torch.Tensor, clear: bool = False): + if rollouts is None: + return + # Standard Library + import random + + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + N = 100 + if clear: + draw.clear_points() + # if draw.get_num_points() > 0: + # draw.clear_points() + cpu_rollouts = rollouts.cpu().numpy() + b, h, _ = cpu_rollouts.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [ + (cpu_rollouts[i, j, 0], cpu_rollouts[i, j, 1], cpu_rollouts[i, j, 2]) for j in range(h) + ] + colors += [(1.0 - (i + 1.0 / b), 0.3 * (i + 1.0 / b), 0.0, 0.1) for _ in range(h)] + sizes = [10.0 for _ in range(b * h)] + draw.draw_points(point_list, colors, sizes) + + +def draw_points(voxels): + # Third Party + + # Third Party + from omni.isaac.debug_draw import 
_debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_points() + if len(voxels) == 0: + return + + jet = cm.get_cmap("plasma").reversed() + + cpu_pos = voxels[..., :3].view(-1, 3).cpu().numpy() + z_val = cpu_pos[:, 0] + + jet_colors = jet(z_val) + + b, _ = cpu_pos.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [(cpu_pos[i, 0], cpu_pos[i, 1], cpu_pos[i, 2])] + colors += [(jet_colors[i][0], jet_colors[i][1], jet_colors[i][2], 0.8)] + sizes = [20.0 for _ in range(b)] + + draw.draw_points(point_list, colors, sizes) + + +def clip_camera(camera_data): + # clip camera image to bounding box: + h_ratio = 0.05 + w_ratio = 0.05 + depth = camera_data["raw_depth"] + depth_tensor = camera_data["depth"] + h, w = depth_tensor.shape + depth[: int(h_ratio * h), :] = 0.0 + depth[int((1 - h_ratio) * h) :, :] = 0.0 + depth[:, : int(w_ratio * w)] = 0.0 + depth[:, int((1 - w_ratio) * w) :] = 0.0 + + depth_tensor[: int(h_ratio * h), :] = 0.0 + depth_tensor[int(1 - h_ratio * h) :, :] = 0.0 + depth_tensor[:, : int(w_ratio * w)] = 0.0 + depth_tensor[:, int(1 - w_ratio * w) :] = 0.0 + + +def draw_line(start, gradient): + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + start_list = [start] + end_list = [start + gradient] + + colors = [(0.0, 0, 0.8, 0.9)] + + sizes = [10.0] + draw.draw_lines(start_list, end_list, colors, sizes) + + +if __name__ == "__main__": + radius = 0.05 + act_distance = 0.4 + voxel_size = 0.05 + render_voxel_size = 0.02 + clipping_distance = 0.7 + + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + stage = my_world.stage + my_world.scene.add_default_ground_plane() + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0])) + target_material_2 = OmniPBR("/World/looks/t2", color=np.array([0, 1, 0])) + if not args.waypoints: + target = cuboid.VisualCuboid( + "/World/target_1", + position=np.array([0.5, 0.0, 0.4]), + orientation=np.array([0, 1.0, 0, 0]), + size=0.04, + visual_material=target_material, + ) + + else: + target = cuboid.VisualCuboid( + "/World/target_1", + position=np.array([0.4, -0.5, 0.2]), + orientation=np.array([0, 1.0, 0, 0]), + size=0.04, + visual_material=target_material, + ) + + # Make a target to follow + target_2 = cuboid.VisualCuboid( + "/World/target_2", + position=np.array([0.4, 0.5, 0.2]), + orientation=np.array([0.0, 1, 0.0, 0.0]), + size=0.04, + visual_material=target_material_2, + ) + + # Make a target to follow + camera_marker = cuboid.VisualCuboid( + "/World/camera_nvblox", + position=np.array([-0.05, 0.0, 0.45]), + # orientation=np.array([0.793, 0, 0.609,0.0]), + orientation=np.array([0.5, -0.5, 0.5, -0.5]), + # orientation=np.array([0.561, -0.561, 0.431,-0.431]), + color=np.array([0, 0, 1]), + size=0.01, + ) + camera_marker.set_visibility(False) + collision_checker_type = CollisionCheckerType.BLOX + world_cfg = WorldConfig.from_dict( + { + "blox": { + "world": { + "pose": [0, 0, 0, 1, 0, 0, 0], + "integrator_type": "occupancy", + "voxel_size": 0.03, + } + } + } + ) + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + 
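+    # Set a 2 cm buffer on every robot collision sphere (stage units are
+    # meters), presumably so MPC keeps a safety margin against the voxels
+    # reconstructed from the depth camera.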
robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.02 + robot, _ = add_robot_to_scene(robot_cfg, my_world, "/World/world_robot/") + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_wall.yml")) + ) + + world_cfg_table.cuboid[0].pose[2] -= 0.01 + usd_help = UsdHelper() + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg_table.get_mesh_world(), base_frame="/World") + world_cfg.add_obstacle(world_cfg_table.cuboid[0]) + world_cfg.add_obstacle(world_cfg_table.cuboid[1]) + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + use_cuda_graph=True, + use_cuda_graph_metrics=True, + use_cuda_graph_full_step=False, + self_collision_check=True, + collision_checker_type=CollisionCheckerType.BLOX, + use_mppi=True, + use_lbfgs=False, + use_es=False, + store_rollouts=True, + step_dt=0.02, + ) + + mpc = MpcSolver(mpc_config) + + retract_cfg = mpc.rollout_fn.dynamics_model.retract_config.clone().unsqueeze(0) + joint_names = mpc.rollout_fn.joint_names + + state = mpc.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg, joint_names=joint_names) + ) + current_state = JointState.from_position(retract_cfg, joint_names=joint_names) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + goal = Goal( + current_state=current_state, + goal_state=JointState.from_position(retract_cfg, joint_names=joint_names), + goal_pose=retract_pose, + ) + + goal_buffer = mpc.setup_solve_single(goal, 1) + mpc.update_goal(goal_buffer) + + world_model = mpc.world_collision + realsense_data = RealsenseDataloader(clipping_distance_m=clipping_distance) + data = realsense_data.get_data() + + camera_pose = Pose.from_list([0, 0, 0, 0.707, 0.707, 0, 0]) + i = 0 + tensor_args = TensorDeviceType() + target_list = [target, target_2] + target_material_list = [target_material, target_material_2] + for material in target_material_list: + material.set_color(np.array([0.1, 0.1, 0.1])) + target_idx = 0 + cmd_idx = 0 + cmd_plan = None + articulation_controller = robot.get_articulation_controller() + cmd_state_full = None + + cmd_step_idx = 0 + current_error = 0.0 + error_thresh = 0.01 + first_target = False + if not args.use_debug_draw: + voxel_viewer = VoxelManager(100, size=render_voxel_size) + + while simulation_app.is_running(): + my_world.step(render=True) + + if not my_world.is_playing(): + if i % 100 == 0: + print("**** Click Play to start simulation *****") + i += 1 + # if step_index == 0: + # my_world.play() + continue + step_index = my_world.current_time_step_index + if cmd_step_idx == 0: + draw_rollout_points(mpc.get_visual_rollouts(), clear=not args.use_debug_draw) + + if step_index == 0: + my_world.reset() + idx_list = [robot.get_dof_index(x) for x in j_names] + robot.set_joint_positions(default_config, idx_list) + + robot._articulation_view.set_max_efforts( + values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list + ) + + if step_index % 2 == 0.0: + # camera data updation + world_model.decay_layer("world") + data = realsense_data.get_data() + clip_camera(data) + cube_position, cube_orientation = camera_marker.get_local_pose() + camera_pose = Pose( + position=tensor_args.to_device(cube_position), + quaternion=tensor_args.to_device(cube_orientation), + ) + + data_camera = CameraObservation( # rgb_image = data["rgba_nvblox"], + depth_image=data["depth"], intrinsics=data["intrinsics"], pose=camera_pose + ) + data_camera = data_camera.to(device=tensor_args.device) + 
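+            # nvblox update: integrate the new depth frame into the "world"
+            # layer, then refresh the hashes that the BLOX collision checker
+            # queries; decay_layer() above ages out voxels that are no longer
+            # observed, so moved obstacles eventually vanish from the map.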
world_model.add_camera_frame(data_camera, "world") + world_model.process_camera_frames("world", False) + torch.cuda.synchronize() + world_model.update_blox_hashes() + + bounding = Cuboid("t", dims=[1, 1, 1.0], pose=[0, 0, 0, 1, 0, 0, 0]) + voxels = world_model.get_voxels_in_bounding_box(bounding, voxel_size) + if voxels.shape[0] > 0: + voxels = voxels[voxels[:, 2] > voxel_size] + voxels = voxels[voxels[:, 0] > 0.0] + if args.use_debug_draw: + draw_points(voxels) + + else: + voxels = voxels.cpu().numpy() + voxel_viewer.update_voxels(voxels[:, :3]) + else: + if not args.use_debug_draw: + voxel_viewer.clear() + + if True: + depth_image = data["raw_depth"] + color_image = data["raw_rgb"] + depth_colormap = cv2.applyColorMap( + cv2.convertScaleAbs(depth_image, alpha=100), cv2.COLORMAP_VIRIDIS + ) + color_image = cv2.flip(color_image, 1) + depth_colormap = cv2.flip(depth_colormap, 1) + + images = np.hstack((color_image, depth_colormap)) + + cv2.namedWindow("NVBLOX Example", cv2.WINDOW_NORMAL) + cv2.imshow("NVBLOX Example", images) + key = cv2.waitKey(1) + # Press esc or 'q' to close the image window + if key & 0xFF == ord("q") or key == 27: + cv2.destroyAllWindows() + break + + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(mpc.rollout_fn.joint_names) + + if cmd_state_full is None: + current_state.copy_(cu_js) + else: + current_state_partial = cmd_state_full.get_ordered_joint_state( + mpc.rollout_fn.joint_names + ) + current_state.copy_(current_state_partial) + current_state.joint_names = current_state_partial.joint_names + + if current_error <= error_thresh and (not first_target or args.waypoints): + first_target = True + # motion generation: + for ks in range(len(target_material_list)): + if ks == target_idx: + target_material_list[ks].set_color(np.ravel([0, 1.0, 0])) + else: + target_material_list[ks].set_color(np.ravel([0.1, 0.1, 0.1])) + + cube_position, cube_orientation = target_list[target_idx].get_world_pose() + + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + goal_buffer.goal_pose.copy_(ik_goal) + mpc.update_goal(goal_buffer) + target_idx += 1 + if target_idx >= len(target_list): + target_idx = 0 + + if cmd_step_idx == 0: + mpc_result = mpc.step(current_state, max_attempts=2) + current_error = mpc_result.metrics.pose_error.item() + cmd_state_full = mpc_result.js_action + common_js_names = [] + idx_list = [] + for x in sim_js_names: + if x in cmd_state_full.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + + cmd_state = cmd_state_full.get_ordered_joint_state(common_js_names) + cmd_state_full = cmd_state + + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + # cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + articulation_controller.apply_action(art_action) + + if cmd_step_idx == 2: + cmd_step_idx = 0 + + # positions_goal = a + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + 
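+            # Only joint positions are commanded; the velocity row stays
+            # commented out, leaving velocity tracking to the articulation
+            # controller.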
art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + # cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_step_idx += 1 + # for _ in range(2): + # my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + realsense_data.stop_device() + print("finished program") + + simulation_app.close() diff --git a/examples/isaac_sim/realsense_reacher.py b/examples/isaac_sim/realsense_reacher.py new file mode 100644 index 00000000..374605e3 --- /dev/null +++ b/examples/isaac_sim/realsense_reacher.py @@ -0,0 +1,414 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") +# Third Party +import cv2 +import numpy as np +import torch +from matplotlib import cm +from nvblox_torch.datasets.realsense_dataset import RealsenseDataloader +from omni.isaac.kit import SimulationApp + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.camera import CameraObservation +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + +simulation_app = SimulationApp( + { + "headless": False, + "width": "1920", + "height": "1080", + } +) +simulation_app.update() +# Standard Library +import argparse + +# Third Party +import carb +from helper import VoxelManager, add_robot_to_scene +from omni.isaac.core import World +from omni.isaac.core.materials import OmniPBR +from omni.isaac.core.objects import cuboid, sphere +from omni.isaac.core.utils.types import ArticulationAction + +# CuRobo +from curobo.util.usd_helper import UsdHelper + +parser = argparse.ArgumentParser() + + +parser.add_argument("--robot", type=str, default="franka.yml", help="robot configuration to load") +parser.add_argument( + "--use-debug-draw", + action="store_true", + help="When True, sets robot in static mode", + default=False, +) +args = parser.parse_args() + + +def draw_points(voxels): + # Third Party + + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_points() + if len(voxels) == 0: + return + + jet = cm.get_cmap("plasma").reversed() + + cpu_pos = voxels[..., :3].view(-1, 3).cpu().numpy() + z_val = cpu_pos[:, 0] + + jet_colors = jet(z_val) + + b, _ = cpu_pos.shape + point_list = [] + colors = [] + for i in range(b): + # get list of points: + point_list += [(cpu_pos[i, 0], cpu_pos[i, 1], cpu_pos[i, 2])] + colors += [(jet_colors[i][0], jet_colors[i][1], jet_colors[i][2], 0.8)] + sizes = [20.0 for _ in range(b)] + + 
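+    # Voxel centers are colored with the reversed "plasma" colormap keyed on
+    # their x coordinate (cpu_pos[:, 0]), so position along x is easy to read.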
draw.draw_points(point_list, colors, sizes) + + +def clip_camera(camera_data): + # clip camera image to bounding box: + h_ratio = 0.05 + w_ratio = 0.05 + depth = camera_data["raw_depth"] + depth_tensor = camera_data["depth"] + h, w = depth_tensor.shape + depth[: int(h_ratio * h), :] = 0.0 + depth[int((1 - h_ratio) * h) :, :] = 0.0 + depth[:, : int(w_ratio * w)] = 0.0 + depth[:, int((1 - w_ratio) * w) :] = 0.0 + + depth_tensor[: int(h_ratio * h), :] = 0.0 + depth_tensor[int(1 - h_ratio * h) :, :] = 0.0 + depth_tensor[:, : int(w_ratio * w)] = 0.0 + depth_tensor[:, int(1 - w_ratio * w) :] = 0.0 + + +def draw_line(start, gradient): + # Third Party + from omni.isaac.debug_draw import _debug_draw + + draw = _debug_draw.acquire_debug_draw_interface() + # if draw.get_num_points() > 0: + draw.clear_lines() + start_list = [start] + end_list = [start + gradient] + + colors = [(0.0, 0, 0.8, 0.9)] + + sizes = [10.0] + draw.draw_lines(start_list, end_list, colors, sizes) + + +if __name__ == "__main__": + radius = 0.05 + act_distance = 0.4 + voxel_size = 0.05 + render_voxel_size = 0.02 + clipping_distance = 0.7 + my_world = World(stage_units_in_meters=1.0) + stage = my_world.stage + + stage = my_world.stage + my_world.scene.add_default_ground_plane() + + xform = stage.DefinePrim("/World", "Xform") + stage.SetDefaultPrim(xform) + target_material = OmniPBR("/World/looks/t", color=np.array([0, 1, 0])) + target_material_2 = OmniPBR("/World/looks/t2", color=np.array([0, 1, 0])) + + target = cuboid.VisualCuboid( + "/World/target_1", + position=np.array([0.4, -0.5, 0.2]), + orientation=np.array([0, 1.0, 0, 0]), + size=0.04, + visual_material=target_material, + ) + + # Make a target to follow + target_2 = cuboid.VisualCuboid( + "/World/target_2", + position=np.array([0.4, 0.5, 0.2]), + orientation=np.array([0.0, 1, 0.0, 0.0]), + size=0.04, + visual_material=target_material_2, + ) + + # Make a target to follow + camera_marker = cuboid.VisualCuboid( + "/World/camera_nvblox", + position=np.array([-0.05, 0.0, 0.45]), + # orientation=np.array([0.793, 0, 0.609,0.0]), + orientation=np.array([0.5, -0.5, 0.5, -0.5]), + # orientation=np.array([0.561, -0.561, 0.431,-0.431]), + color=np.array([0, 0, 1]), + size=0.01, + ) + camera_marker.set_visibility(False) + + collision_checker_type = CollisionCheckerType.BLOX + world_cfg = WorldConfig.from_dict( + { + "blox": { + "world": { + "pose": [0, 0, 0, 1, 0, 0, 0], + "integrator_type": "occupancy", + "voxel_size": 0.03, + } + } + } + ) + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), args.robot))["robot_cfg"] + + j_names = robot_cfg["kinematics"]["cspace"]["joint_names"] + default_config = robot_cfg["kinematics"]["cspace"]["retract_config"] + + robot, _ = add_robot_to_scene(robot_cfg, my_world, "/World/world_robot/") + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_wall.yml")) + ) + + world_cfg_table.cuboid[0].pose[2] -= 0.01 + usd_help = UsdHelper() + + usd_help.load_stage(my_world.stage) + usd_help.add_world_to_stage(world_cfg_table.get_mesh_world(), base_frame="/World") + world_cfg.add_obstacle(world_cfg_table.cuboid[0]) + world_cfg.add_obstacle(world_cfg_table.cuboid[1]) + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.BLOX, + use_cuda_graph=True, + num_trajopt_seeds=12, + num_graph_seeds=12, + interpolation_dt=0.03, + collision_activation_distance=0.025, 
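+        # collision_activation_distance is assumed to be in meters here: the
+        # collision cost starts acting 2.5 cm away from obstacle surfaces.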
+        acceleration_scale=1.0,
+        self_collision_check=True,
+        maximum_trajectory_dt=0.25,
+        finetune_dt_scale=1.05,
+        fixed_iters_trajopt=True,
+        finetune_trajopt_iters=300,
+        minimize_jerk=True,
+    )
+    motion_gen = MotionGen(motion_gen_config)
+    print("warming up...")
+    motion_gen.warmup(warmup_js_trajopt=False)
+
+    world_model = motion_gen.world_collision
+    realsense_data = RealsenseDataloader(clipping_distance_m=clipping_distance)
+    data = realsense_data.get_data()
+
+    camera_pose = Pose.from_list([0, 0, 0, 0.707, 0.707, 0, 0])
+    i = 0
+    tensor_args = TensorDeviceType()
+    target_list = [target, target_2]
+    target_material_list = [target_material, target_material_2]
+    for material in target_material_list:
+        material.set_color(np.array([0.1, 0.1, 0.1]))
+    target_idx = 0
+    cmd_idx = 0
+    cmd_plan = None
+    articulation_controller = robot.get_articulation_controller()
+    plan_config = MotionGenPlanConfig(
+        enable_graph=False, enable_graph_attempt=4, max_attempts=2, enable_finetune_trajopt=True
+    )
+    if not args.use_debug_draw:
+        voxel_viewer = VoxelManager(100, size=render_voxel_size)
+    cmd_step_idx = 0
+    while simulation_app.is_running():
+        my_world.step(render=True)
+
+        if not my_world.is_playing():
+            if i % 100 == 0:
+                print("**** Click Play to start simulation *****")
+            i += 1
+            continue
+        step_index = my_world.current_time_step_index
+
+        if step_index == 0:
+            my_world.reset()
+            idx_list = [robot.get_dof_index(x) for x in j_names]
+            robot.set_joint_positions(default_config, idx_list)
+
+            robot._articulation_view.set_max_efforts(
+                values=np.array([5000 for i in range(len(idx_list))]), joint_indices=idx_list
+            )
+
+        if step_index % 5 == 0:
+            # camera data update
+            world_model.decay_layer("world")
+            data = realsense_data.get_data()
+            clip_camera(data)
+            cube_position, cube_orientation = camera_marker.get_local_pose()
+            camera_pose = Pose(
+                position=tensor_args.to_device(cube_position),
+                quaternion=tensor_args.to_device(cube_orientation),
+            )
+
+            data_camera = CameraObservation(
+                depth_image=data["depth"], intrinsics=data["intrinsics"], pose=camera_pose
+            )
+            data_camera = data_camera.to(device=tensor_args.device)
+            world_model.add_camera_frame(data_camera, "world")
+            world_model.process_camera_frames("world", False)
+            torch.cuda.synchronize()
+            world_model.update_blox_hashes()
+
+            bounding = Cuboid("t", dims=[1, 1, 1.0], pose=[0, 0, 0, 1, 0, 0, 0])
+            voxels = world_model.get_voxels_in_bounding_box(bounding, voxel_size)
+            if voxels.shape[0] > 0:
+                voxels = voxels[voxels[:, 2] > voxel_size]
+                voxels = voxels[voxels[:, 0] > 0.0]
+                if args.use_debug_draw:
+                    draw_points(voxels)
+                else:
+                    voxels = voxels.cpu().numpy()
+                    voxel_viewer.update_voxels(voxels[:, :3])
+            else:
+                if not args.use_debug_draw:
+                    voxel_viewer.clear()
+
+        depth_image = data["raw_depth"]
+        color_image = data["raw_rgb"]
+        depth_colormap = cv2.applyColorMap(
+            cv2.convertScaleAbs(depth_image, alpha=100), cv2.COLORMAP_VIRIDIS
+        )
+        color_image = cv2.flip(color_image, 1)
+        depth_colormap = cv2.flip(depth_colormap, 1)
+
+        images = np.hstack((color_image, depth_colormap))
+
+        cv2.namedWindow("NVBLOX Example", cv2.WINDOW_NORMAL)
+        cv2.imshow("NVBLOX Example", images)
+        key = cv2.waitKey(1)
+        # Press esc or 'q' to close the image window
+        if key & 0xFF == ord("q") or key == 27:
+            cv2.destroyAllWindows()
+            break
+
+        if cmd_plan is None and step_index % 10 == 0:
+            # motion
generation: + for ks in range(len(target_material_list)): + if ks == target_idx: + target_material_list[ks].set_color(np.ravel([0, 1.0, 0])) + else: + target_material_list[ks].set_color(np.ravel([0.1, 0.1, 0.1])) + + sim_js = robot.get_joints_state() + sim_js_names = robot.dof_names + cu_js = JointState( + position=tensor_args.to_device(sim_js.positions), + velocity=tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=sim_js_names, + ) + cu_js = cu_js.get_ordered_joint_state(motion_gen.kinematics.joint_names) + + cube_position, cube_orientation = target_list[target_idx].get_world_pose() + + # Set EE teleop goals, use cube for simple non-vr init: + ee_translation_goal = cube_position + ee_orientation_teleop_goal = cube_orientation + + # compute curobo solution: + ik_goal = Pose( + position=tensor_args.to_device(ee_translation_goal), + quaternion=tensor_args.to_device(ee_orientation_teleop_goal), + ) + + result = motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, plan_config) + # ik_result = ik_solver.solve_single(ik_goal, cu_js.position.view(1,-1), cu_js.position.view(1,1,-1)) + + succ = result.success.item() # ik_result.success.item() + if succ: + cmd_plan = result.get_interpolated_plan() + cmd_plan = motion_gen.get_full_js(cmd_plan) + # get only joint names that are in both: + idx_list = [] + common_js_names = [] + for x in sim_js_names: + if x in cmd_plan.joint_names: + idx_list.append(robot.get_dof_index(x)) + common_js_names.append(x) + # idx_list = [robot.get_dof_index(x) for x in sim_js_names] + + cmd_plan = cmd_plan.get_ordered_joint_state(common_js_names) + + cmd_idx = 0 + target_idx += 1 + if target_idx >= len(target_list): + target_idx = 0 + + else: + carb.log_warn("Plan did not converge to a solution. No action is being taken.") + if cmd_plan is not None: + cmd_state = cmd_plan[cmd_idx] + + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + cmd_state.velocity.cpu().numpy(), + joint_indices=idx_list, + ) + # set desired joint angles obtained from IK: + articulation_controller.apply_action(art_action) + cmd_step_idx += 1 + if cmd_step_idx == 2: + cmd_idx += 1 + cmd_step_idx = 0 + # for _ in range(2): + # my_world.step(render=False) + if cmd_idx >= len(cmd_plan.position): + cmd_idx = 0 + cmd_plan = None + realsense_data.stop_device() + print("finished program") + + simulation_app.close() diff --git a/examples/isaac_sim/realsense_viewer.py b/examples/isaac_sim/realsense_viewer.py new file mode 100644 index 00000000..7f989bb6 --- /dev/null +++ b/examples/isaac_sim/realsense_viewer.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import cv2 +import numpy as np +from nvblox_torch.datasets.realsense_dataset import RealsenseDataloader + + +def view_realsense(): + realsense_data = RealsenseDataloader(clipping_distance_m=1.0) + # Streaming loop + try: + while True: + data = realsense_data.get_raw_data() + depth_image = data[0] + color_image = data[1] + # Render images: + # depth align to color on left + # depth on right + depth_colormap = cv2.applyColorMap( + cv2.convertScaleAbs(depth_image, alpha=100), cv2.COLORMAP_JET + ) + images = np.hstack((color_image, depth_colormap)) + + cv2.namedWindow("Align Example", cv2.WINDOW_NORMAL) + cv2.imshow("Align Example", images) + key = cv2.waitKey(1) + # Press esc or 'q' to close the image window + if key & 0xFF == ord("q") or key == 27: + cv2.destroyAllWindows() + break + finally: + realsense_data.stop_device() + + +if __name__ == "__main__": + view_realsense() diff --git a/examples/isaac_sim/simple_stacking.py b/examples/isaac_sim/simple_stacking.py new file mode 100644 index 00000000..f3d14384 --- /dev/null +++ b/examples/isaac_sim/simple_stacking.py @@ -0,0 +1,487 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +a = torch.zeros( + 4, device="cuda:0" +) # this is necessary to allow isaac sim to use this torch instance +# Third Party +import numpy as np + +np.set_printoptions(suppress=True) +# Standard Library +import os.path as osp + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp({"headless": False}) # This adds paths for the following importing +# Standard Library +from typing import Optional + +# Third Party +import carb +from omni.isaac.core import World +from omni.isaac.core.controllers import BaseController +from omni.isaac.core.tasks import Stacking as BaseStacking +from omni.isaac.core.utils.extensions import enable_extension +from omni.isaac.core.utils.prims import is_prim_path_valid +from omni.isaac.core.utils.stage import get_stage_units +from omni.isaac.core.utils.string import find_unique_string_name +from omni.isaac.core.utils.types import ArticulationAction +from omni.isaac.core.utils.viewports import set_camera_view +from omni.isaac.franka import Franka + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.sphere_fit import SphereFitType +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.state import JointState +from curobo.util.usd_helper import UsdHelper +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import ( + MotionGen, + MotionGenConfig, + MotionGenPlanConfig, + MotionGenResult, +) +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + +ext_list = [ + "omni.kit.asset_converter", + "omni.kit.livestream.native", + "omni.kit.tool.asset_importer", 
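+    # "omni.kit.livestream.native" above is what enables remote streaming; the
+    # list comprehension below activates each listed extension for its side
+    # effects.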
+ "omni.isaac.asset_browser", + # "omni.kit.window.sequencer.scripts.sequencer_extension", + "omni.kit.window.movie_capture", +] +[enable_extension(x) for x in ext_list] # toggle this for remote streaming +simulation_app.update() + + +class CuroboController(BaseController): + def __init__( + self, + my_world: World, + my_task: BaseStacking, + name: str = "curobo_controller", + ) -> None: + BaseController.__init__(self, name=name) + self._save_log = False + self.my_world = my_world + self.my_task = my_task + n_obstacle_cuboids = 20 + n_obstacle_mesh = 2 + # warmup curobo instance + self.usd_help = UsdHelper() + self.init_curobo = False + self.world_file = "collision_table.yml" + self.cmd_js_names = [ + "panda_joint1", + "panda_joint2", + "panda_joint3", + "panda_joint4", + "panda_joint5", + "panda_joint6", + "panda_joint7", + ] + self.tensor_args = TensorDeviceType() + self.robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + self.robot_cfg["kinematics"][ + "base_link" + ] = "panda_link0" # controls which frame the controller is controlling + + self.robot_cfg["kinematics"][ + "ee_link" + ] = "panda_hand" # controls which frame the controller is controlling + # self.robot_cfg["kinematics"]["cspace"]["max_acceleration"] = 10.0 # controls how fast robot moves + self.robot_cfg["kinematics"]["extra_collision_spheres"] = {"attached_object": 100} + self.robot_cfg["kinematics"]["collision_sphere_buffer"] = 0.0 + self.robot_cfg["kinematics"]["collision_spheres"] = "spheres/franka_collision_mesh.yml" + + world_cfg_table = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ) + self._world_cfg_table = world_cfg_table + + world_cfg1 = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), "collision_table.yml")) + ).get_mesh_world() + world_cfg1.mesh[0].pose[2] = -10.5 + + self._world_cfg = WorldConfig(cuboid=world_cfg_table.cuboid, mesh=world_cfg1.mesh) + + motion_gen_config = MotionGenConfig.load_from_robot_config( + self.robot_cfg, + self._world_cfg, + self.tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.MESH, + use_cuda_graph=True, + num_ik_seeds=40, + num_trajopt_seeds=10, + num_graph_seeds=10, + interpolation_dt=0.01, + collision_cache={"obb": n_obstacle_cuboids, "mesh": n_obstacle_mesh}, + store_ik_debug=self._save_log, + store_trajopt_debug=self._save_log, + state_finite_difference_mode="CENTRAL", + minimize_jerk=True, + acceleration_scale=0.3, + fixed_iters_trajopt=True, + ) + self.motion_gen = MotionGen(motion_gen_config) + print("warming up...") + self.motion_gen.warmup() + + self.plan_config = MotionGenPlanConfig( + enable_graph=False, + max_attempts=10, + enable_graph_attempt=None, + enable_finetune_trajopt=True, + partial_ik_opt=False, + ) + self.usd_help.load_stage(self.my_world.stage) + self.cmd_plan = None + self.cmd_idx = 0 + self.idx_list = None + + def attach_obj( + self, + sim_js: JointState, + js_names: list, + ) -> None: + cube_name = self.my_task.get_cube_prim(self.my_task.target_cube) + + cu_js = JointState( + position=self.tensor_args.to_device(sim_js.positions), + velocity=self.tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=self.tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=self.tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=js_names, + ) + + self.motion_gen.attach_objects_to_robot( + cu_js, + [cube_name], + sphere_fit_type=SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE, + 
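+            # VOXEL_VOLUME_SAMPLE_SURFACE approximates the grasped cube with
+            # spheres sampled over both its volume and surface; the 5 mm
+            # z-offset below likely pads the attach pose away from the table.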
world_objects_pose_offset=Pose.from_list([0, 0, 0.005, 1, 0, 0, 0], self.tensor_args), + ) + + def detach_obj(self) -> None: + self.motion_gen.detach_object_from_robot() + + def plan( + self, + ee_translation_goal: np.array, + ee_orientation_goal: np.array, + sim_js: JointState, + js_names: list, + ) -> MotionGenResult: + ik_goal = Pose( + position=self.tensor_args.to_device(ee_translation_goal), + quaternion=self.tensor_args.to_device(ee_orientation_goal), + ) + cu_js = JointState( + position=self.tensor_args.to_device(sim_js.positions), + velocity=self.tensor_args.to_device(sim_js.velocities) * 0.0, + acceleration=self.tensor_args.to_device(sim_js.velocities) * 0.0, + jerk=self.tensor_args.to_device(sim_js.velocities) * 0.0, + joint_names=js_names, + ) + cu_js = cu_js.get_ordered_joint_state(self.motion_gen.kinematics.joint_names) + result = self.motion_gen.plan_single(cu_js.unsqueeze(0), ik_goal, self.plan_config.clone()) + if self._save_log: # and not result.success.item(): # logging for debugging + UsdHelper.write_motion_gen_log( + result, + {"robot_cfg": self.robot_cfg}, + self._world_cfg, + cu_js, + ik_goal, + join_path("log/usd/", "cube") + "_debug", + write_ik=False, + write_trajopt=True, + visualize_robot_spheres=True, + link_spheres=self.motion_gen.kinematics.kinematics_config.link_spheres, + grid_space=2, + write_robot_usd_path="log/usd/assets", + ) + return result + + def forward( + self, + sim_js: JointState, + js_names: list, + ) -> ArticulationAction: + assert self.my_task.target_position is not None + assert self.my_task.target_cube is not None + + if self.cmd_plan is None: + self.cmd_idx = 0 + # Set EE goals + ee_translation_goal = self.my_task.target_position + ee_orientation_goal = np.array([0, 0, -1, 0]) + # compute curobo solution: + result = self.plan(ee_translation_goal, ee_orientation_goal, sim_js, js_names) + succ = result.success.item() + if succ: + cmd_plan = result.get_interpolated_plan() + self.idx_list = [i for i in range(len(self.cmd_js_names))] + self.cmd_plan = cmd_plan.get_ordered_joint_state(self.cmd_js_names) + else: + carb.log_warn("Plan did not converge to a solution.") + return None + + cmd_state = self.cmd_plan[self.cmd_idx] + self.cmd_idx += 1 + # get full dof state + art_action = ArticulationAction( + cmd_state.position.cpu().numpy(), + # cmd_state.velocity.cpu().numpy(), + joint_indices=self.idx_list, + ) + if self.cmd_idx >= len(self.cmd_plan.position): + self.cmd_idx = 0 + self.cmd_plan = None + + return art_action + + def reached_target(self, observations: dict) -> bool: + curr_ee_position = observations["my_franka"]["end_effector_position"] + if np.linalg.norm( + self.my_task.target_position - curr_ee_position + ) < 0.04 and ( # This is half gripper width, curobo succ threshold is 0.5 cm + self.cmd_plan is None + ): + if self.my_task.cube_in_hand is None: + print("reached picking target: ", self.my_task.target_cube) + else: + print("reached placing target: ", self.my_task.target_cube) + return True + else: + return False + + def reset( + self, + ignore_substring: str, + robot_prim_path: str, + ) -> None: + # init + self.update(ignore_substring, robot_prim_path) + self.init_curobo = True + self.cmd_plan = None + self.cmd_idx = 0 + + def update( + self, + ignore_substring: str, + robot_prim_path: str, + ) -> None: + # print("updating world...") + obstacles = self.usd_help.get_obstacles_from_stage( + ignore_substring=ignore_substring, reference_prim_path=robot_prim_path + ).get_collision_check_world() + # add ground plane as it's not readable: + 
obstacles.add_obstacle(self._world_cfg_table.cuboid[0]) + self.motion_gen.update_world(obstacles) + self._world_cfg = obstacles + + +class MultiModalStacking(BaseStacking): + def __init__( + self, + name: str = "multi_modal_stacking", + offset: Optional[np.ndarray] = None, + ) -> None: + BaseStacking.__init__( + self, + name=name, + cube_initial_positions=np.array( + [ + [0.50, 0.0, 0.1], + [0.50, -0.20, 0.1], + [0.50, 0.20, 0.1], + [0.30, -0.20, 0.1], + [0.30, 0.0, 0.1], + [0.30, 0.20, 0.1], + [0.70, -0.20, 0.1], + [0.70, 0.0, 0.1], + [0.70, 0.20, 0.1], + ] + ) + / get_stage_units(), + cube_initial_orientations=None, + stack_target_position=None, + cube_size=np.array([0.045, 0.045, 0.07]), + offset=offset, + ) + self.cube_list = None + self.target_position = None + self.target_cube = None + self.cube_in_hand = None + + def reset(self) -> None: + self.cube_list = self.get_cube_names() + self.target_position = None + self.target_cube = None + self.cube_in_hand = None + + def update_task(self) -> bool: + # after detaching the cube in hand + assert self.target_cube is not None + assert self.cube_in_hand is not None + self.cube_list.insert(0, self.cube_in_hand) + self.target_cube = None + self.target_position = None + self.cube_in_hand = None + if len(self.cube_list) <= 1: + task_finished = True + else: + task_finished = False + return task_finished + + def get_cube_prim(self, cube_name: str): + for i in range(self._num_of_cubes): + if cube_name == self._cubes[i].name: + return self._cubes[i].prim_path + + def get_place_position(self, observations: dict) -> None: + assert self.target_cube is not None + self.cube_in_hand = self.target_cube + self.target_cube = self.cube_list[0] + ee_to_grasped_cube = ( + observations["my_franka"]["end_effector_position"][2] + - observations[self.cube_in_hand]["position"][2] + ) + self.target_position = observations[self.target_cube]["position"] + [ + 0, + 0, + self._cube_size[2] + ee_to_grasped_cube + 0.02, + ] + self.cube_list.remove(self.target_cube) + + def get_pick_position(self, observations: dict) -> None: + assert self.cube_in_hand is None + self.target_cube = self.cube_list[1] + self.target_position = observations[self.target_cube]["position"] + [ + 0, + 0, + self._cube_size[2] / 2 + 0.092, + ] + self.cube_list.remove(self.target_cube) + + def set_robot(self) -> Franka: + franka_prim_path = find_unique_string_name( + initial_name="/World/Franka", is_unique_fn=lambda x: not is_prim_path_valid(x) + ) + franka_robot_name = find_unique_string_name( + initial_name="my_franka", is_unique_fn=lambda x: not self.scene.object_exists(x) + ) + return Franka( + prim_path=franka_prim_path, name=franka_robot_name, end_effector_prim_name="panda_hand" + ) + + +robot_prim_path = "/World/Franka/panda_link0" +ignore_substring = ["Franka", "TargetCube", "material", "Plane"] +my_world = World(stage_units_in_meters=1.0) +stage = my_world.stage +stage.SetDefaultPrim(stage.GetPrimAtPath("/World")) + +my_task = MultiModalStacking() +my_world.add_task(my_task) +my_world.reset() +robot_name = my_task.get_params()["robot_name"]["value"] +my_franka = my_world.scene.get_object(robot_name) +my_controller = CuroboController(my_world=my_world, my_task=my_task) +articulation_controller = my_franka.get_articulation_controller() +set_camera_view(eye=[2, 0, 1], target=[0.00, 0.00, 0.00], camera_prim_path="/OmniverseKit_Persp") +wait_steps = 8 + +initial_steps = 10 +################################################################ +print("Start simulation...") +my_franka.gripper.open() +for _ 
in range(wait_steps): + my_world.step(render=True) +my_task.reset() +task_finished = False +observations = my_world.get_observations() +my_task.get_pick_position(observations) +robot = my_franka +print("**********************") +# print(robot.get_solver_position_iteration_count(), robot.get_solver_velocity_iteration_count()) +robot.enable_gravity() +robot._articulation_view.set_gains( + kps=np.array([100000000, 10000000]), joint_indices=np.array([0, 2]) +) + +robot._articulation_view.set_max_efforts( + values=np.array([10000, 10000]), joint_indices=np.array([0, 2]) +) +i = 0 +while simulation_app.is_running(): + my_world.step(render=True) # necessary to visualize changes + i += 1 + + if task_finished or i < initial_steps: + continue + + if not my_controller.init_curobo: + my_controller.reset(ignore_substring, robot_prim_path) + + step_index = my_world.current_time_step_index + observations = my_world.get_observations() + sim_js = my_franka.get_joints_state() + + if my_controller.reached_target(observations): + if my_franka.gripper.get_joint_positions()[0] < 0.035: # reached placing target + my_franka.gripper.open() + for _ in range(wait_steps): + my_world.step(render=True) + my_controller.detach_obj() + my_controller.update( + ignore_substring, robot_prim_path + ) # update world collision configuration + task_finished = my_task.update_task() + if task_finished: + print("\nTASK DONE\n") + for _ in range(wait_steps): + my_world.step(render=True) + continue + else: + my_task.get_pick_position(observations) + + else: # reached picking target + my_franka.gripper.close() + for _ in range(wait_steps): + my_world.step(render=True) + sim_js = my_franka.get_joints_state() + my_controller.update(ignore_substring, robot_prim_path) + my_controller.attach_obj(sim_js, my_franka.dof_names) + my_task.get_place_position(observations) + + else: # target position has been set + sim_js = my_franka.get_joints_state() + art_action = my_controller.forward(sim_js, my_franka.dof_names) + if art_action is not None: + articulation_controller.apply_action(art_action) + for _ in range(2): + my_world.step(render=False) + +simulation_app.close() diff --git a/examples/isaac_sim/util/convert_urdf_to_usd.py b/examples/isaac_sim/util/convert_urdf_to_usd.py new file mode 100644 index 00000000..f2c0f2f6 --- /dev/null +++ b/examples/isaac_sim/util/convert_urdf_to_usd.py @@ -0,0 +1,172 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
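# Editor's note on simple_stacking.py above: the pick-and-place loop in brief.
# This is a sketch that only re-uses calls appearing in that file; the control
# flow shown here is a condensed restatement, not additional API:
#
#   if my_controller.reached_target(observations):  # EE within 4 cm, plan consumed
#       my_franka.gripper.close()
#       my_controller.update(ignore_substring, robot_prim_path)  # re-read obstacles from stage
#       my_controller.attach_obj(sim_js, my_franka.dof_names)  # cube spheres now move with the EE
#   else:
#       art_action = my_controller.forward(sim_js, my_franka.dof_names)
#       articulation_controller.apply_action(art_action)  # step one waypoint of the plan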
+# + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Standard Library +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--robot", + type=str, + default="franka.yml", + help="Robot configuration to download", +) +parser.add_argument("--save_usd", default=False, action="store_true") +args = parser.parse_args() + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp({"headless": args.save_usd}) + +# Third Party +import omni.usd +from omni.isaac.core import World +from omni.isaac.core.robots import Robot +from omni.isaac.core.utils.types import ArticulationAction + +try: + # Third Party + from omni.isaac.urdf import _urdf # isaacsim 2022.2 +except ImportError: + from omni.importer.urdf import _urdf # isaac sim 2023.1 + +# CuRobo +from curobo.util.usd_helper import UsdHelper +from curobo.util_file import ( + get_assets_path, + get_filename, + get_path_of_dir, + get_robot_configs_path, + join_path, + load_yaml, +) + + +def save_usd(): + my_world = World(stage_units_in_meters=1.0) + + import_config = _urdf.ImportConfig() + import_config.merge_fixed_joints = False + import_config.convex_decomp = False + import_config.import_inertia_tensor = True + import_config.fix_base = True + import_config.make_default_prim = True + import_config.self_collision = False + import_config.create_physics_scene = True + import_config.import_inertia_tensor = False + import_config.default_drive_strength = 10000 + import_config.default_position_drive_damping = 100 + import_config.default_drive_type = _urdf.UrdfJointTargetType.JOINT_DRIVE_POSITION + import_config.distance_scale = 1 + import_config.density = 0.0 + # Get the urdf file path + robot_config = load_yaml(join_path(get_robot_configs_path(), args.robot)) + urdf_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["urdf_path"]) + asset_path = join_path( + get_assets_path(), robot_config["robot_cfg"]["kinematics"]["asset_root_path"] + ) + urdf_interface = _urdf.acquire_urdf_interface() + full_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["urdf_path"]) + default_config = robot_config["robot_cfg"]["kinematics"]["cspace"]["retract_config"] + j_names = robot_config["robot_cfg"]["kinematics"]["cspace"]["joint_names"] + + robot_path = get_path_of_dir(full_path) + filename = get_filename(full_path) + imported_robot = urdf_interface.parse_urdf(robot_path, filename, import_config) + robot_path = urdf_interface.import_robot( + robot_path, filename, imported_robot, import_config, "" + ) + robot = my_world.scene.add(Robot(prim_path=robot_path, name="robot")) + # robot.disable_gravity() + i = 0 + + my_world.reset() + + usd_help = UsdHelper() + usd_help.load_stage(my_world.stage) + save_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["usd_path"]) + usd_help.write_stage_to_file(save_path, True) + print("Wrote usd file to " + save_path) + simulation_app.close() + + +def debug_usd(): + my_world = World(stage_units_in_meters=1.0) + + import_config = _urdf.ImportConfig() + import_config.merge_fixed_joints = False + import_config.convex_decomp = False + import_config.import_inertia_tensor = True + import_config.fix_base = True + import_config.make_default_prim = True + import_config.self_collision = False + import_config.create_physics_scene = True + import_config.import_inertia_tensor = False + import_config.default_drive_strength = 10000 + import_config.default_position_drive_damping = 100 + 
import_config.default_drive_type = _urdf.UrdfJointTargetType.JOINT_DRIVE_POSITION
+    import_config.distance_scale = 1
+    import_config.density = 0.0
+    # Get the urdf file path
+    robot_config = load_yaml(join_path(get_robot_configs_path(), args.robot))
+    urdf_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["urdf_path"])
+    asset_path = join_path(
+        get_assets_path(), robot_config["robot_cfg"]["kinematics"]["asset_root_path"]
+    )
+    urdf_interface = _urdf.acquire_urdf_interface()
+    full_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["urdf_path"])
+    default_config = robot_config["robot_cfg"]["kinematics"]["cspace"]["retract_config"]
+    j_names = robot_config["robot_cfg"]["kinematics"]["cspace"]["joint_names"]
+
+    robot_path = get_path_of_dir(full_path)
+    filename = get_filename(full_path)
+    imported_robot = urdf_interface.parse_urdf(robot_path, filename, import_config)
+    robot_path = urdf_interface.import_robot(
+        robot_path, filename, imported_robot, import_config, ""
+    )
+    robot = my_world.scene.add(Robot(prim_path=robot_path, name="robot"))
+    # robot.disable_gravity()
+    i = 0
+
+    articulation_controller = robot.get_articulation_controller()
+    my_world.reset()
+
+    while simulation_app.is_running():
+        my_world.step(render=True)
+        if i == 0:
+            idx_list = [robot.get_dof_index(x) for x in j_names]
+            robot.set_joint_positions(default_config, idx_list)
+        i += 1
+        # if dof_n is not None:
+        #     dof_i = [robot.get_dof_index(x) for x in j_names]
+        #
+        #     robot.set_joint_positions(default_config, dof_i)
+        if robot.is_valid():
+            art_action = ArticulationAction(default_config, joint_indices=idx_list)
+            articulation_controller.apply_action(art_action)
+    usd_help = UsdHelper()
+    usd_help.load_stage(my_world.stage)
+    save_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["usd_path"])
+    usd_help.write_stage_to_file(save_path, True)
+    simulation_app.close()
+
+
+if __name__ == "__main__":
+    if args.save_usd:
+        save_usd()
+    else:
+        debug_usd()
diff --git a/examples/isaac_sim/util/dowload_assets.py b/examples/isaac_sim/util/dowload_assets.py
new file mode 100644
index 00000000..b5e38122
--- /dev/null
+++ b/examples/isaac_sim/util/dowload_assets.py
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# This script downloads robot USD assets from Isaac Sim for use in CuRobo.
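# Example invocation (a sketch: this assumes Isaac Sim's bundled Python
# interpreter, e.g. the omni_python / python.sh wrapper, since the omni.isaac
# imports below only resolve inside Isaac Sim; --robot is the argparse flag
# defined below in this file):
#
#   omni_python dowload_assets.py --robot franka.yml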
+ + +# Third Party +import torch + +a = torch.zeros(4, device="cuda:0") + +# Third Party +from omni.isaac.kit import SimulationApp + +simulation_app = SimulationApp({"headless": True}) +# Third Party +from omni.isaac.core import World +from omni.isaac.core.robots import Robot +from omni.isaac.core.utils.nucleus import get_assets_root_path as nucleus_path +from omni.isaac.core.utils.stage import add_reference_to_stage + +# CuRobo +from curobo.util.usd_helper import UsdHelper +from curobo.util_file import get_assets_path, get_robot_configs_path, join_path, load_yaml + +# supported robots: +robots = ["franka.yml", "ur10.yml"] +# Standard Library +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--robot", + type=str, + default="franka.yml", + help="Robot configuration to download", +) +args = parser.parse_args() + +if __name__ == "__main__": + r = args.robot + my_world = World(stage_units_in_meters=1.0) + robot_config = load_yaml(join_path(get_robot_configs_path(), r)) + usd_path = nucleus_path() + robot_config["robot_cfg"]["kinematics"]["isaac_usd_path"] + + usd_help = UsdHelper() + robot_name = r + prim_path = robot_config["robot_cfg"]["kinematics"]["usd_robot_root"] + add_reference_to_stage(usd_path=usd_path, prim_path=prim_path) + robot = my_world.scene.add(Robot(prim_path=prim_path, name=robot_name)) + usd_help.load_stage(my_world.stage) + + my_world.reset() + articulation_controller = robot.get_articulation_controller() + + # create a new stage and add robot to usd path: + save_path = join_path(get_assets_path(), robot_config["robot_cfg"]["kinematics"]["usd_path"]) + usd_help.write_stage_to_file(save_path, True) + my_world.clear() + my_world.clear_instance() + simulation_app.close() diff --git a/examples/kinematics_example.py b/examples/kinematics_example.py new file mode 100644 index 00000000..a32827a5 --- /dev/null +++ b/examples/kinematics_example.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +""" Example computing fk using curobo + +""" +# Third Party +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.types.base import TensorDeviceType +from curobo.types.robot import RobotConfig +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import get_robot_path, join_path, load_yaml + + +def demo_basic_robot(): + tensor_args = TensorDeviceType() + + # load a urdf: + config_file = load_yaml(join_path(get_robot_path(), "franka.yml")) + + urdf_file = config_file["robot_cfg"]["kinematics"][ + "urdf_path" + ] # Send global path starting with "/" + base_link = config_file["robot_cfg"]["kinematics"]["base_link"] + ee_link = config_file["robot_cfg"]["kinematics"]["ee_link"] + robot_cfg = RobotConfig.from_basic(urdf_file, base_link, ee_link, tensor_args) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + + # compute forward kinematics: + + q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + out = kin_model.get_state(q) + # here is the kinematics state: + # print(out) + + +def demo_full_config_robot(): + setup_curobo_logger("info") + tensor_args = TensorDeviceType() + # load a urdf: + config_file = load_yaml(join_path(get_robot_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(config_file, tensor_args) + + kin_model = CudaRobotModel(robot_cfg.kinematics) + + # compute forward kinematics: + q = torch.rand((10, kin_model.get_dof()), **vars(tensor_args)) + out = kin_model.get_state(q) + # here is the kinematics state: + # print(out) + + +if __name__ == "__main__": + demo_basic_robot() + demo_full_config_robot() diff --git a/examples/motion_gen_api_example.py b/examples/motion_gen_api_example.py new file mode 100644 index 00000000..13b1b6ab --- /dev/null +++ b/examples/motion_gen_api_example.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +# CuRobo +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState +from curobo.util.usd_helper import UsdHelper +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + + +def demo_motion_gen_api(): + tensor_args = TensorDeviceType(device=torch.device("cuda:0")) + + interpolation_dt = 0.02 + collision_activation_distance = 0.02 # meters + # create motion gen with a cuboid cache to be able to load obstacles later: + motion_gen_cfg = MotionGenConfig.load_from_robot_config( + "franka.yml", + "collision_table.yml", + tensor_args, + trajopt_tsteps=34, + interpolation_steps=5000, + num_ik_seeds=50, + num_trajopt_seeds=6, + grad_trajopt_iters=500, + trajopt_dt=0.5, + interpolation_dt=interpolation_dt, + evaluate_interpolated_trajectory=True, + js_trajopt_dt=0.5, + js_trajopt_tsteps=34, + collision_activation_distance=collision_activation_distance, + ) + motion_gen = MotionGen(motion_gen_cfg) + + motion_gen.warmup() + + # create world representation: + motion_gen.world_coll_checker.clear_cache() + motion_gen.reset(reset_seed=False) + cuboids = [Cuboid(name="obs_1", pose=[0.9, 0.0, 0.5, 1, 0, 0, 0], dims=[0.1, 0.5, 0.5])] + world = WorldConfig(cuboid=cuboids) + + motion_gen.update_world(world) + + q_start = JointState.from_position( + tensor_args.to_device([[0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0.0]]), + joint_names=[ + "panda_joint1", + "panda_joint2", + "panda_joint3", + "panda_joint4", + "panda_joint5", + "panda_joint6", + "panda_joint7", + ], + ) + + goal_pose = Pose( + position=tensor_args.to_device([[0.5, 0.0, 0.3]]), + quaternion=tensor_args.to_device([[1, 0, 0, 0]]), + ) + + result = motion_gen.plan_single(q_start, goal_pose) + + if result.success.item(): + # get result: + # this contains trajectory with 34 tsteps and the final + # result.optimized_plan + # result.optimized_dt + + # this contains a linearly interpolated trajectory with fixed dt + interpolated_solution = result.get_interpolated_plan() + + UsdHelper.write_trajectory_animation_with_robot_usd( + "franka.yml", + world, + q_start, + interpolated_solution, + dt=result.interpolation_dt, + save_path="demo.usd", + base_frame="/world_base", + ) + else: + print("Failed") + + +if __name__ == "__main__": + demo_motion_gen_api() diff --git a/examples/motion_gen_example.py b/examples/motion_gen_example.py new file mode 100644 index 00000000..bf0ea4da --- /dev/null +++ b/examples/motion_gen_example.py @@ -0,0 +1,366 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
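# Editor's note on demo_motion_gen_api in motion_gen_api_example.py above:
# result.optimized_plan is the optimized trajectory (trajopt_tsteps waypoints)
# with time step result.optimized_dt, while get_interpolated_plan() resamples
# it at the fixed interpolation_dt (0.02 s here). A sketch of consuming the
# interpolated plan (send_command is a hypothetical user-supplied robot
# interface, not part of CuRobo):
#
#   import time
#   plan = result.get_interpolated_plan()
#   for i in range(len(plan.position)):
#       send_command(plan.position[i], plan.velocity[i])
#       time.sleep(result.interpolation_dt)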
+# + +# Standard Library + +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.logger import setup_curobo_logger +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +def plot_traj(trajectory, dt): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(4, 1) + q = trajectory.position.cpu().numpy() + qd = trajectory.velocity.cpu().numpy() + qdd = trajectory.acceleration.cpu().numpy() + qddd = trajectory.jerk.cpu().numpy() + timesteps = [i * dt for i in range(q.shape[0])] + for i in range(q.shape[-1]): + axs[0].plot(timesteps, q[:, i], label=str(i)) + axs[1].plot(timesteps, qd[:, i], label=str(i)) + axs[2].plot(timesteps, qdd[:, i], label=str(i)) + axs[3].plot(timesteps, qddd[:, i], label=str(i)) + + plt.legend() + # plt.savefig("test.png") + plt.show() + + +def plot_iters_traj(trajectory, d_id=1, dof=7, seed=0): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(len(trajectory), 1) + if len(trajectory) == 1: + axs = [axs] + for k in range(len(trajectory)): + q = trajectory[k] + + for i in range(len(q)): + axs[k].plot( + q[i][seed, :-1, d_id].cpu(), + "r+-", + label=str(i), + alpha=0.1 + min(0.9, float(i) / (len(q))), + ) + plt.legend() + plt.show() + + +def plot_iters_traj_3d(trajectory, d_id=1, dof=7, seed=0): + # Third Party + import matplotlib.pyplot as plt + + ax = plt.axes(projection="3d") + c = 0 + h = trajectory[0][0].shape[1] - 1 + x = [x for x in range(h)] + + for k in range(len(trajectory)): + q = trajectory[k] + + for i in range(len(q)): + # ax.plot3D(x,[c for _ in range(h)], q[i][seed, :, d_id].cpu())#, 'r') + ax.scatter3D( + x, [c for _ in range(h)], q[i][seed, :h, d_id].cpu(), c=q[i][seed, :, d_id].cpu() + ) + # @plt.show() + c += 1 + # plt.legend() + plt.show() + + +def demo_motion_gen_mesh(): + PLOT = True + tensor_args = TensorDeviceType() + world_file = "collision_mesh_scene.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=40, + collision_checker_type=CollisionCheckerType.MESH, + use_cuda_graph=False, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = robot_cfg.retract_config + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.5) + result = motion_gen.plan( + start_state, + retract_pose, + enable_graph=False, + enable_opt=True, + max_attempts=1, + num_trajopt_seeds=10, + num_graph_seeds=10, + ) + print(result.status, result.attempts, result.trajopt_time) + traj = result.raw_plan # optimized plan + print("Trajectory Generated: ", result.success) + if PLOT: + plot_traj(traj.cpu().numpy()) + + +def demo_motion_gen(): + # Standard Library + PLOT = False + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config 
= MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + interpolation_dt=0.01, + ) + + motion_gen = MotionGen(motion_gen_config) + + motion_gen.warmup(enable_graph=False) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + result = motion_gen.plan_single(start_state, retract_pose, MotionGenPlanConfig(max_attempts=1)) + traj = result.get_interpolated_plan() + print("Trajectory Generated: ", result.success, result.optimized_dt.item()) + if PLOT: + plot_traj(traj, result.interpolation_dt) + # plt.save("test.png") + # plt.close() + # traj = result.debug_info["opt_solution"].position.view(-1,7) + # plot_traj(traj.cpu().numpy()) + + +def demo_motion_gen_debug(): + PLOT = True + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=24, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=True, + num_trajopt_seeds=1, + num_graph_seeds=1, + store_ik_debug=True, + store_trajopt_debug=True, + trajopt_particle_opt=False, + grad_trajopt_iters=100, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = robot_cfg.cspace.retract_config + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.4) + result = motion_gen.plan(start_state, retract_pose, enable_graph=True, enable_opt=True) + if result.status not in [None, "Opt Fail"]: + return + traj = result.plan.view(-1, 7) # optimized plan + print("Trajectory Generated: ", result.success) + trajectory_iter_steps = result.debug_info["trajopt_result"].debug_info["solver"]["steps"] + + if PLOT: + plot_iters_traj_3d(trajectory_iter_steps, d_id=6) + # plot_traj(traj.cpu().numpy()) + + +def demo_motion_gen_batch(): + PLOT = False + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=30, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=True, + num_trajopt_seeds=4, + num_graph_seeds=1, + num_ik_seeds=30, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.6) + + retract_pose = retract_pose.repeat_seeds(2) + retract_pose.position[0, 0] = -0.3 + 
for _ in range(2): + result = motion_gen.plan_batch( + start_state.repeat_seeds(2), + retract_pose, + MotionGenPlanConfig(enable_graph=False, enable_opt=True), + ) + traj = result.optimized_plan.position.view(2, -1, 7) # optimized plan + print("Trajectory Generated: ", result.success) + if PLOT: + plot_traj(traj[0, : result.path_buffer_last_tstep[0], :].cpu().numpy()) + plot_traj(traj[1, : result.path_buffer_last_tstep[1], :].cpu().numpy()) + + +def demo_motion_gen_api(): + tensor_args = TensorDeviceType(device=torch.device("cuda:0")) + interpolation_dt = 0.02 + # create motion gen with a cuboid cache to be able to load obstacles later: + motion_gen_cfg = MotionGenConfig.load_from_robot_config( + "franka.yml", + "collision_table.yml", + tensor_args, + trajopt_tsteps=34, + interpolation_steps=5000, + num_ik_seeds=50, + num_trajopt_seeds=6, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + grad_trajopt_iters=500, + trajopt_dt=0.5, + interpolation_dt=interpolation_dt, + evaluate_interpolated_trajectory=True, + js_trajopt_dt=0.5, + js_trajopt_tsteps=34, + ) + motion_gen = MotionGen(motion_gen_cfg) + + motion_gen.warmup(warmup_js_trajopt=False) + + # create world representation: + + cuboids = [Cuboid(name="obs_1", pose=[0, 0, 0, 1, 0, 0, 0], dims=[0.1, 0.1, 0.1])] + world = WorldConfig(cuboid=cuboids) + + motion_gen.update_world(world) + + q_start = JointState.from_position( + tensor_args.to_device([[0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0.0]]), + joint_names=[ + "panda_joint1", + "panda_joint2", + "panda_joint3", + "panda_joint4", + "panda_joint5", + "panda_joint6", + "panda_joint7", + ], + ) + + goal_pose = Pose( + position=tensor_args.to_device([[0.5, 0.0, 0.3]]), + quaternion=tensor_args.to_device([[1, 0, 0, 0]]), + ) + print(goal_pose) + + result = motion_gen.plan_single(q_start, goal_pose) + + # get result: + + interpolated_solution = result.get_interpolated_plan() + + print(result.get_interpolated_plan()) + + +def demo_motion_gen_batch_env(n_envs: int = 10): + tensor_args = TensorDeviceType() + world_files = ["collision_table.yml" for _ in range(int(n_envs / 2))] + [ + "collision_test.yml" for _ in range(int(n_envs / 2)) + ] + + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + for world_file in world_files + ] + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_cfg, + tensor_args, + trajopt_tsteps=30, + use_cuda_graph=False, + num_trajopt_seeds=4, + num_ik_seeds=30, + num_batch_ik_seeds=30, + evaluate_interpolated_trajectory=True, + interpolation_dt=0.05, + interpolation_steps=500, + grad_trajopt_iters=30, + ) + motion_gen_batch_env = MotionGen(motion_gen_config) + motion_gen_batch_env.reset() + motion_gen_batch_env.warmup( + enable_graph=False, batch=n_envs, warmup_js_trajopt=False, batch_env_mode=True + ) + retract_cfg = motion_gen_batch_env.get_retract_config().clone() + state = motion_gen_batch_env.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + goal_pose = Pose( + state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze() + ).repeat_seeds(n_envs) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(n_envs) + + goal_pose.position[1, 0] -= 0.2 + + m_config = MotionGenPlanConfig( + False, True, max_attempts=1, enable_graph_attempt=None, enable_finetune_trajopt=False + ) + result = motion_gen_batch_env.plan_batch_env(start_state, goal_pose, m_config) + + print(n_envs, result.total_time, 
result.total_time / n_envs) + + +if __name__ == "__main__": + setup_curobo_logger("error") + demo_motion_gen() + n = [2, 10] + for n_envs in n: + demo_motion_gen_batch_env(n_envs=n_envs) diff --git a/examples/motion_gen_profile.py b/examples/motion_gen_profile.py new file mode 100644 index 00000000..b64f11d7 --- /dev/null +++ b/examples/motion_gen_profile.py @@ -0,0 +1,81 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library + + +# CuRobo + +# Third Party +from torch.profiler import ProfilerActivity, profile + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + + +def plot_traj(trajectory): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(1, 1) + q = trajectory + + for i in range(q.shape[-1]): + axs.plot(q[:, i], label=str(i)) + plt.legend() + plt.show() + + +def demo_motion_gen(): + PLOT = False + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=40, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=False, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = robot_cfg.retract_config + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.5) + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + + # profile: + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + + print("Exporting the trace..") + prof.export_chrome_trace("trace.json") + exit(10) + traj = result.raw_plan # optimized plan + print("Trajectory Generated: ", result.success) + if PLOT: + plot_traj(traj.cpu().numpy()) + + +if __name__ == "__main__": + demo_motion_gen() diff --git a/examples/mpc_example.py b/examples/mpc_example.py new file mode 100644 index 00000000..fef9b8eb --- /dev/null +++ b/examples/mpc_example.py @@ -0,0 +1,131 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +import time + +# Third Party +import numpy as np +import torch + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + + +def plot_traj(trajectory, dof): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(3, 1) + q = trajectory[:, :dof] + qd = trajectory[:, dof : dof * 2] + qdd = trajectory[:, dof * 2 : dof * 3] + + for i in range(q.shape[-1]): + axs[0].plot(q[:, i], label=str(i)) + axs[1].plot(qd[:, i], label=str(i)) + axs[2].plot(qdd[:, i], label=str(i)) + plt.legend() + plt.savefig("test.png") + # plt.show() + + +def demo_full_config_mpc(): + PLOT = True + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_file, + use_cuda_graph=True, + use_cuda_graph_metrics=True, + use_cuda_graph_full_step=False, + use_lbfgs=False, + use_es=False, + use_mppi=True, + store_rollouts=True, + step_dt=0.03, + ) + mpc = MpcSolver(mpc_config) + + # retract_cfg = robot_cfg.cspace.retract_config.view(1, -1) + retract_cfg = mpc.rollout_fn.dynamics_model.retract_config.unsqueeze(0) + joint_names = mpc.joint_names + + state = mpc.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg + 0.5, joint_names=joint_names) + ) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + start_state = JointState.from_position(retract_cfg, joint_names=joint_names) + + goal = Goal( + current_state=start_state, + goal_state=JointState.from_position(retract_cfg, joint_names=joint_names), + goal_pose=retract_pose, + ) + goal_buffer = mpc.setup_solve_single(goal, 1) + + # test_q = tensor_args.to_device( [2.7735, -1.6737, 0.4998, -2.9865, 0.3386, 0.8413, 0.4371]) + # start_state.position[:] = test_q + converged = False + tstep = 0 + traj_list = [] + mpc_time = [] + mpc.update_goal(goal_buffer) + current_state = start_state # .clone() + while not converged: + st_time = time.time() + # current_state.position += 0.1 + print(current_state.position) + result = mpc.step(current_state, 1) + + print(mpc.get_visual_rollouts().shape) + # exit() + torch.cuda.synchronize() + if tstep > 5: + mpc_time.append(time.time() - st_time) + # goal_buffer.current_state.position[:] = result.action.position + # result.action.position += 0.1 + current_state.copy_(result.action) + # goal_buffer.current_state.velocity[:] = result.action.vel + traj_list.append(result.action.get_state_tensor()) + tstep += 1 + # if tstep % 10 == 0: + # print(result.metrics.pose_error.item(), result.solve_time, mpc_time[-1]) + if result.metrics.pose_error.item() < 0.01: + converged = True + if tstep > 1000: + break + print( + "MPC (converged, error, steps, opt_time, mpc_time): ", + converged, + result.metrics.pose_error.item(), + tstep, + result.solve_time, + 
np.mean(mpc_time), + ) + if PLOT: + plot_traj(torch.cat(traj_list, dim=0).cpu().numpy(), dof=retract_cfg.shape[-1]) + + +if __name__ == "__main__": + demo_full_config_mpc() + # demo_full_config_mesh_mpc() diff --git a/examples/nvblox_example.py b/examples/nvblox_example.py new file mode 100644 index 00000000..f0a7913f --- /dev/null +++ b/examples/nvblox_example.py @@ -0,0 +1,120 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library + + +# CuRobo + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +def plot_traj(trajectory): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(1, 1) + q = trajectory + + for i in range(q.shape[-1]): + axs.plot(q[:, i], label=str(i)) + plt.legend() + plt.show() + + +def plot_iters_traj(trajectory, d_id=1, dof=7, seed=0): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(len(trajectory), 1) + if len(trajectory) == 1: + axs = [axs] + for k in range(len(trajectory)): + q = trajectory[k] + + for i in range(len(q)): + axs[k].plot( + q[i][seed, :-1, d_id].cpu(), + "r+-", + label=str(i), + alpha=0.1 + min(0.9, float(i) / (len(q))), + ) + plt.legend() + plt.show() + + +def plot_iters_traj_3d(trajectory, d_id=1, dof=7, seed=0): + # Third Party + import matplotlib.pyplot as plt + + ax = plt.axes(projection="3d") + c = 0 + h = trajectory[0][0].shape[1] - 1 + x = [x for x in range(h)] + + for k in range(len(trajectory)): + q = trajectory[k] + + for i in range(len(q)): + # ax.plot3D(x,[c for _ in range(h)], q[i][seed, :, d_id].cpu())#, 'r') + ax.scatter3D( + x, [c for _ in range(h)], q[i][seed, :h, d_id].cpu(), c=q[i][seed, :, d_id].cpu() + ) + # @plt.show() + c += 1 + # plt.legend() + plt.show() + + +def demo_motion_gen_nvblox(): + PLOT = True + tensor_args = TensorDeviceType() + world_file = "collision_nvblox.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=32, + collision_checker_type=CollisionCheckerType.BLOX, + use_cuda_graph=False, + num_trajopt_seeds=2, + num_graph_seeds=2, + evaluate_interpolated_trajectory=True, + ) + goals = tensor_args.to_device( + [ + [0.5881, 0.0589, 0.3055], + [0.5881, 0.4155, 0.3055], + [0.5881, 0.4155, 0.1238], + [0.5881, -0.4093, 0.1238], + [0.7451, 0.0287, 0.2539], + ] + ).view(-1, 3) + + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + motion_gen.warmup() + print("ready") + # print("Trajectory Generated: ", result.success) + # if PLOT: + # plot_traj(traj.cpu().numpy()) + + +if __name__ == "__main__": + demo_motion_gen_nvblox() diff --git a/examples/torch_layers_example.py 
b/examples/torch_layers_example.py new file mode 100644 index 00000000..3157757a --- /dev/null +++ b/examples/torch_layers_example.py @@ -0,0 +1,229 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +"""This example will use CuRobo's functions as pytorch layers as part of a NN.""" +# Standard Library +import uuid +from typing import Optional + +# Third Party +import numpy as np +import torch +from torch import nn +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +# CuRobo +from curobo.geom.sdf.world import WorldConfig +from curobo.geom.types import Mesh +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.util.usd_helper import UsdHelper +from curobo.util_file import get_assets_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + + +class CuroboTorch(torch.nn.Module): + def __init__(self, robot_world: RobotWorld): + """Build a simple structured NN: + + q_current -> kinematics -> sdf -> features + [features, x_des] -> NN -> kinematics -> sdf -> [sdf, pose_distance] -> NN -> q_out + loss = (fk(q_out) - x_des) + (q_current - q_out) + valid(q_out) + """ + super(CuroboTorch, self).__init__() + + feature_dims = robot_world.kinematics.kinematics_config.link_spheres.shape[0] * 5 + 7 + 1 + q_feature_dims = 7 + final_feature_dims = feature_dims + 1 + 7 + output_dims = robot_world.kinematics.get_dof() + + # build neural network: + self._robot_world = robot_world + + self._feature_mlp = nn.Sequential( + nn.Linear(q_feature_dims, 512), + nn.ReLU6(), + nn.Linear(512, 512), + nn.ReLU6(), + nn.Linear(512, 512), + nn.ReLU6(), + nn.Linear(512, output_dims), + nn.Tanh(), + ) + self._final_mlp = nn.Sequential( + nn.Linear(final_feature_dims, 256), + nn.ReLU6(), + nn.Linear(256, 256), + nn.ReLU6(), + nn.Linear(256, 64), + nn.ReLU6(), + nn.Linear(64, output_dims), + nn.Tanh(), + ) + + def get_features(self, q: torch.Tensor, x_des: Optional[Pose] = None): + kin_state = self._robot_world.get_kinematics(q) + spheres = kin_state.link_spheres_tensor.unsqueeze(2) + q_sdf = self._robot_world.get_collision_distance(spheres) + q_self = self._robot_world.get_self_collision_distance( + kin_state.link_spheres_tensor.unsqueeze(1) + ) + + features = [ + kin_state.link_spheres_tensor.view(q.shape[0], -1), + q_sdf, + q_self, + kin_state.ee_position, + kin_state.ee_quaternion, + ] + if x_des is not None: + pose_distance = self._robot_world.pose_distance(x_des, kin_state.ee_pose) + features.append(pose_distance) + features.append(x_des.position) + features.append(x_des.quaternion) + features = torch.cat(features, dim=-1) + + return features + + def forward(self, q: torch.Tensor, x_des: Pose): + """Forward for neural network + + Args: + q (torch.Tensor): _description_ + x_des (torch.Tensor): _description_ + """ + # get features for input: + in_features = torch.cat([x_des.position, x_des.quaternion], dim=-1) + # pass through initial mlp: + q_mid = self._feature_mlp(in_features) + + q_scale = self._robot_world.bound_scale * q_mid + # get new features: + mid_features = 
self.get_features(q_scale, x_des=x_des) + q_out = self._final_mlp(mid_features) + q_out = self._robot_world.bound_scale * q_out + return q_out + + def loss(self, x_des: Pose, q: torch.Tensor, q_in: torch.Tensor): + kin_state = self._robot_world.get_kinematics(q) + distance = self._robot_world.pose_distance(x_des, kin_state.ee_pose) + d_sdf = self._robot_world.collision_constraint(kin_state.link_spheres_tensor.unsqueeze(1)) + d_self = self._robot_world.self_collision_cost(kin_state.link_spheres_tensor.unsqueeze(1)) + loss = 0.1 * torch.linalg.norm(q_in - q, dim=-1) + distance + 100.0 * (d_self + d_sdf) + return loss + + def val_loss(self, x_des: Pose, q: torch.Tensor, q_in: torch.Tensor): + kin_state = self._robot_world.get_kinematics(q) + distance = self._robot_world.pose_distance(x_des, kin_state.ee_pose) + d_sdf = self._robot_world.collision_constraint(kin_state.link_spheres_tensor.unsqueeze(1)) + d_self = self._robot_world.self_collision_cost(kin_state.link_spheres_tensor.unsqueeze(1)) + loss = 10.0 * (d_self + d_sdf) + distance + return loss + + +if __name__ == "__main__": + update_goal = False + write_usd = False + writer = SummaryWriter("log/runs/ik/" + str(uuid.uuid4())) + robot_file = "franka.yml" + world_file = "collision_table.yml" + config = RobotWorldConfig.load_from_config(robot_file, world_file, pose_weight=[10, 200, 1, 10]) + curobo_fn = RobotWorld(config) + + model = CuroboTorch(curobo_fn) + model.cuda() + with torch.no_grad(): + # q_train = curobo_fn.sample(10000) + q_val = curobo_fn.sample(100) + q_train = curobo_fn.sample(5 * 2048) + usd_list = [] + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-8) + batch_size = 512 + batch_start = torch.arange(0, q_train.shape[0], batch_size) + with torch.no_grad(): + x_des = curobo_fn.get_kinematics(q_train[1:2]).ee_pose + x_des_train = curobo_fn.get_kinematics(q_train).ee_pose + x_des_val = x_des.repeat(q_val.shape[0]) + q_debug = [] + bar = tqdm(range(500)) + for e in bar: + model.train() + for j in range(batch_start.shape[0]): + x_train = q_train[batch_start[j] : batch_start[j] + batch_size] + if x_train.shape[0] != batch_size: + continue + x_des_batch = x_des_train[batch_start[j] : batch_start[j] + batch_size] + q = model.forward(x_train, x_des_batch) + loss = model.loss(x_des_batch, q, x_train) + loss = torch.mean(loss) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + writer.add_scalar("training loss", loss.item(), e) + model.eval() + with torch.no_grad(): + q_pred = model.forward(q_val, x_des_val) + val_loss = model.val_loss(x_des_val, q_pred, q_val) + val_loss = torch.mean(val_loss) + q_debug.append(q_pred[0:1].clone()) + writer.add_scalar("validation loss", val_loss.item(), e) + bar.set_description("t: " + str(val_loss.item())) + if e % 100 == 0 and len(q_debug) > 1: + if write_usd: + q_traj = torch.cat(q_debug, dim=0) + world_model = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ) + gripper_mesh = Mesh( + name="target_gripper", + file_path=join_path( + get_assets_path(), + "robot/franka_description/meshes/visual/hand_ee_link.dae", + ), + color=[0.0, 0.8, 0.1, 1.0], + pose=x_des[0].tolist(), + ) + world_model.add_obstacle(gripper_mesh) + save_name = "e_" + str(e) + UsdHelper.write_trajectory_animation_with_robot_usd( + "franka.yml", + world_model, + JointState(position=q_traj[0]), + JointState(position=q_traj), + dt=1.0, + visualize_robot_spheres=False, + save_path=save_name + ".usd", + base_frame="/" + save_name, + ) + usd_list.append(save_name 
+ ".usd") + if update_goal: + with torch.no_grad(): + rand_perm = torch.randperm(q_val.shape[0]) + q_val = q_val[rand_perm].clone() + x_des = curobo_fn.get_kinematics(q_val[0:1]).ee_pose + x_des_val = x_des.repeat(q_val.shape[0]) + q_debug = [] + + # create loss function: + if write_usd: + UsdHelper.create_grid_usd( + usd_list, + "epoch_grid.usd", + "/world", + max_envs=len(usd_list), + max_timecode=len(q_debug), + x_space=2.0, + y_space=2.0, + x_per_row=int(np.sqrt(len(usd_list))) + 1, + local_asset_path="", + dt=1, + ) diff --git a/examples/trajopt_example.py b/examples/trajopt_example.py new file mode 100644 index 00000000..2d03a466 --- /dev/null +++ b/examples/trajopt_example.py @@ -0,0 +1,122 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +import time + +# Third Party +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.wrap.reacher.trajopt import TrajOptSolver, TrajOptSolverConfig + + +def plot_js(trajectory: JointState): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(4, 1) + q = trajectory.position.cpu().numpy() + qd = trajectory.velocity.cpu().numpy() + qdd = trajectory.acceleration.cpu().numpy() + qddd = None + if trajectory.jerk is not None: + qddd = trajectory.jerk.cpu().numpy() + + for i in range(q.shape[-1]): + axs[0].plot(q[:, i], label=str(i)) + axs[1].plot(qd[:, i], label=str(i)) + axs[2].plot(qdd[:, i], label=str(i)) + if qddd is not None: + axs[3].plot(qddd[:, i], label=str(i)) + plt.legend() + plt.show() + + +def plot_traj(trajectory): + # Third Party + import matplotlib.pyplot as plt + + _, axs = plt.subplots(1, 1) + q = trajectory + + for i in range(q.shape[-1]): + axs.plot(q[:, i], label=str(i)) + plt.legend() + plt.show() + + +def demo_trajopt_collision_free(): + PLOT = True + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + ) + trajopt_solver = TrajOptSolver(trajopt_config) + q_start = trajopt_solver.retract_config + + q_goal = q_start.clone() + 0.1 + # q_goal[...,-1] -=0.2 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + # do single planning: + # print("Running Goal State trajopt") + # js_goal = Goal(goal_state=goal_state, current_state=current_state) + # result = 
trajopt_solver.solve_single(js_goal) + # traj = result.solution.position + # print(result.success, result.cspace_error) + # print(goal_state.position) + # print(traj[...,-1,:]) + # print(torch.linalg.norm((goal_state.position - traj[...,-1,:])).item()) + + # exit() + # if PLOT: + # #plot_traj(traj) + # plot_js(result.solution) + # exit() + print("Running Goal Pose trajopt") + js_goal = Goal(goal_pose=goal_pose, current_state=current_state) + result = trajopt_solver.solve_single(js_goal) + print(result.success) + if PLOT: + plot_js(result.solution) + + # run goalset planning: + print("Running Goal Pose Set trajopt") + # g_set = Pose(kin_state.ee_position, kin_state.ee_quaternion.repeat(2,1).view()) + # js_goal = Goal(goal_pose=goal_pose, current_state=current_state) + # result = trajopt_solver.solve_single(js_goal) + + +if __name__ == "__main__": + # demo_basic_ik() + # demo_full_config_collision_free_ik() + # demo_full_config_batch_env_collision_free_ik() + demo_trajopt_collision_free() diff --git a/examples/usd_example.py b/examples/usd_example.py new file mode 100644 index 00000000..dbc2c05f --- /dev/null +++ b/examples/usd_example.py @@ -0,0 +1,171 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.geom.sdf.world import CollisionCheckerType +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.usd_helper import UsdHelper +from curobo.util_file import ( + get_assets_path, + get_robot_configs_path, + get_world_configs_path, + join_path, + load_yaml, +) +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +def save_curobo_world_to_usd(): + world_file = "collision_table.yml" + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ).get_mesh_world(process=False) + usd_helper = UsdHelper() + usd_helper.create_stage() + + usd_helper.add_obstacles_to_stage(world_cfg) + + usd_helper.write_stage_to_file("test.usd") + + +def get_trajectory(robot_file="franka.yml", dt=1.0 / 60.0): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=24, + collision_checker_type=CollisionCheckerType.PRIMITIVE, + use_cuda_graph=True, + num_trajopt_seeds=2, + num_graph_seeds=2, + evaluate_interpolated_trajectory=True, + interpolation_dt=dt, + self_collision_check=True, + ) + motion_gen = MotionGen(motion_gen_config) + motion_gen.warmup() + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = 
Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1).clone() + 0.5) + # start_state.position[0,2] = 0.5 + # start_state.position[0,1] = 0.5 + # start_state.position[0,0] = 0.5 + # print(start_state.position) + result = motion_gen.plan_single(start_state, retract_pose) + # print(result.plan_state.position) + print(result.success) + # print(result.position_error) + # result = motion_gen.plan( + # start_state, retract_pose, enable_graph=False, enable_opt=True, max_attempts=10 + # ) + traj = result.get_interpolated_plan() # optimized plan + return traj + + +def save_curobo_robot_world_to_usd(robot_file="franka.yml"): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + world_model = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ).get_obb_world() + dt = 1 / 60.0 + + q_traj = get_trajectory(robot_file, dt) + q_start = q_traj[0] + + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_file, world_model, q_start, q_traj, save_path="test.usd" + ) + + +def save_curobo_robot_to_usd(robot_file="franka.yml"): + # print(robot_file) + tensor_args = TensorDeviceType() + robot_cfg_y = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg_y["kinematics"]["use_usd_kinematics"] = True + print( + len(robot_cfg_y["kinematics"]["cspace"]["null_space_weight"]), + len(robot_cfg_y["kinematics"]["cspace"]["retract_config"]), + len(robot_cfg_y["kinematics"]["cspace"]["joint_names"]), + ) + # print(robot_cfg_y) + robot_cfg = RobotConfig.from_dict(robot_cfg_y, tensor_args) + start = JointState.from_position(robot_cfg.cspace.retract_config) + retract_cfg = robot_cfg.cspace.retract_config.clone() + retract_cfg[0] = 0.5 + + # print(retract_cfg) + kin_model = CudaRobotModel(robot_cfg.kinematics) + position = retract_cfg + q_traj = JointState.from_position(position.unsqueeze(0)) + q_traj.joint_names = kin_model.joint_names + # print(q_traj.joint_names) + usd_helper = UsdHelper() + # usd_helper.create_stage( + # "test.usd", timesteps=q_traj.position.shape[0] + 1, dt=dt, interpolation_steps=10 + # ) + world_file = "collision_table.yml" + world_model = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ).get_obb_world() + + # print(q_traj.position.shape) + # usd_helper.load_robot_usd(robot_cfg.kinematics.usd_path, js) + usd_helper.write_trajectory_animation_with_robot_usd( + {"robot_cfg": robot_cfg_y}, + world_model, + start, + q_traj, + save_path="test.usd", + # robot_asset_prim_path="/robot" + ) + + # usd_helper.save() + # usd_helper.write_stage_to_file("test.usda") + + +def read_world_from_usd(file_path: str): + usd_helper = UsdHelper() + usd_helper.load_stage_from_file(file_path) + # world_model = usd_helper.get_obstacles_from_stage(reference_prim_path="/Root/world_obstacles") + world_model = usd_helper.get_obstacles_from_stage( + only_paths=["/world/obstacles"], reference_prim_path="/world" + ) + # print(world_model) + for x in world_model.cuboid: + print(x.name + ":") + print(" pose: ", x.pose) + print(" dims: ", x.dims) + + +def read_robot_from_usd(robot_file: str = "franka.yml"): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg["kinematics"]["use_usd_kinematics"] = True + robot_cfg = RobotConfig.from_dict(robot_cfg, TensorDeviceType()) + + +if __name__ == "__main__": + save_curobo_world_to_usd() diff --git a/examples/world_representation_example.py 
b/examples/world_representation_example.py
new file mode 100644
index 00000000..0fa9b8a0
--- /dev/null
+++ b/examples/world_representation_example.py
@@ -0,0 +1,105 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# CuRobo
+from curobo.geom.types import Capsule, Cuboid, Cylinder, Mesh, Sphere, WorldConfig
+from curobo.util_file import get_assets_path, join_path
+
+
+def approximate_geometry():
+    # CuRobo
+    from curobo.geom.sphere_fit import SphereFitType
+
+    obstacle_capsule = Capsule(
+        name="capsule",
+        radius=0.2,
+        base=[0, 0, 0],
+        tip=[0, 0, 0.5],
+        pose=[0.0, 5, 0.0, 0.043, -0.471, 0.284, 0.834],
+        color=[0, 1.0, 0, 1.0],
+    )
+
+    sph = obstacle_capsule.get_bounding_spheres(
+        500, 0.005, SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE
+    )
+
+    WorldConfig(sphere=sph).save_world_as_mesh("bounding_spheres.obj")
+
+    WorldConfig(capsule=[obstacle_capsule]).save_world_as_mesh("capsule.obj")
+
+
+def doc_example():
+    # describe a cuboid obstacle:
+    obstacle_1 = Cuboid(
+        name="cube_1",
+        pose=[0.0, 0.0, 0.0, 0.043, -0.471, 0.284, 0.834],
+        dims=[0.2, 1.0, 0.2],
+        color=[0.8, 0.0, 0.0, 1.0],
+    )
+
+    # describe a mesh obstacle, importing the mesh from a file:
+    mesh_file = join_path(get_assets_path(), "scene/nvblox/srl_ur10_bins.obj")
+
+    obstacle_2 = Mesh(
+        name="mesh_1",
+        pose=[0.0, 2, 0.5, 0.043, -0.471, 0.284, 0.834],
+        file_path=mesh_file,
+        scale=[0.5, 0.5, 0.5],
+    )
+
+    obstacle_3 = Capsule(
+        name="capsule",
+        radius=0.2,
+        base=[0, 0, 0],
+        tip=[0, 0, 0.5],
+        pose=[0.0, 5, 0.0, 0.043, -0.471, 0.284, 0.834],
+        color=[0, 1.0, 0, 1.0],
+    )
+
+    obstacle_4 = Cylinder(
+        name="cylinder_1",
+        radius=0.2,
+        height=0.5,
+        pose=[0.0, 6, 0.0, 0.043, -0.471, 0.284, 0.834],
+        color=[0, 1.0, 0, 1.0],
+    )
+
+    obstacle_5 = Sphere(
+        name="sphere_1",
+        radius=0.2,
+        pose=[0.0, 7, 0.0, 0.043, -0.471, 0.284, 0.834],
+        color=[0, 1.0, 0, 1.0],
+    )
+
+    world_model = WorldConfig(
+        mesh=[obstacle_2],
+        cuboid=[obstacle_1],
+        capsule=[obstacle_3],
+        cylinder=[obstacle_4],
+        sphere=[obstacle_5],
+    )
+    world_model.randomize_color(r=[0.2, 0.7], g=[0.4, 0.8], b=[0.0, 0.4])
+    file_path = "debug_mesh.obj"
+    world_model.save_world_as_mesh(file_path)
+
+    cuboid_world = WorldConfig.create_obb_world(world_model)
+    cuboid_world.save_world_as_mesh("debug_cuboid_mesh.obj")
+
+    collision_support_world = WorldConfig.create_collision_support_world(world_model)
+    collision_support_world.save_world_as_mesh("debug_collision_mesh.obj")
+
+
+if __name__ == "__main__":
+    approximate_geometry()
+    doc_example()
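One note on the example above: `doc_example` builds a `WorldConfig` from every primitive type and converts it with `WorldConfig.create_obb_world`. A minimal sketch of the same conversion on a single obstacle follows; the identity pose uses the `[x, y, z, qw, qx, qy, qz]` ordering seen in the configs in this tree, and the snippet itself is illustrative, not part of the repo:

# Sketch: convert an exact world into an oriented-bounding-box world,
# the simplified representation that pairs with the PRIMITIVE collision
# checker used in usd_example.py above.
from curobo.geom.types import Cuboid, WorldConfig

world_model = WorldConfig(
    cuboid=[
        Cuboid(name="cube_1", pose=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], dims=[0.2, 1.0, 0.2])
    ]
)
obb_world = WorldConfig.create_obb_world(world_model)
print([x.name for x in obb_world.cuboid])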
sha256:df0dbe0775664436989433c5e785878733520d38c4116a863f601f34dc4788c5
+size 2181094
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..e337471c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,66 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+[build-system]
+requires = ["setuptools>=45", "setuptools_scm>=6.2", "wheel", "torch"]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 100
+extend-exclude = "deprecated/"
+
+# See the `setuptools_scm` documentation for the description of the schemes used below.
+# https://pypi.org/project/setuptools-scm/
+# NOTE: If these values are updated, they need to be also updated in `__init__.py`.
+[tool.setuptools_scm]
+version_scheme = "no-guess-dev"
+local_scheme = "dirty-tag"
+
+[tool.isort]
+# see https://github.com/PyCQA/isort#multi-line-output-modes
+multi_line_output = 2
+import_heading_stdlib = "Standard Library"
+import_heading_thirdparty = "Third Party"
+import_heading_firstparty = "CuRobo"
+import_heading_localfolder = "Local Folder"
+import_heading_isaacgym = "Loading isaacgym before pytorch"
+known_first_party = "CuRobo"
+known_isaacgym = "isaacgym"
+line_length = 100
+sections = "FUTURE, STDLIB, ISAACGYM, THIRDPARTY, FIRSTPARTY, LOCALFOLDER"
+
+[tool.pytest.ini_options]
+norecursedirs = [".git", ".venv", "deprecated", "dist"]
+python_files = ["*_test.py"]
+
+#[tool.pylint.TYPECHECK]
+# List of members which are set dynamically and missed by Pylint inference
+# system, and so shouldn't trigger E1101 when accessed.
+# generated-members=numpy.*, torch.*
+
+[tool.coverage.report]
+# NOTE: Single-quoted strings are required in TOML for regular expressions
+exclude_lines = [
+    "pragma: no cover", # Need to re-enable the standard no cover match
+    '^\s*pass\s*$', # Skip any pass lines
+    "wp.kernel",
+    "torch.jit.script",
+    "abstractmethod",
+    "def backward",
+]
+omit = [
+    "src/curobo/util/error_metrics.py",
+    "src/curobo/rollout/cost/straight_line_cost.py",
+    "src/curobo/rollout/cost/projected_dist_cost.py",
+    "src/curobo/rollout/cost/manipulability_cost.py",
+]
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..c9eb209a
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,124 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+# Additional files that need to be included in the package distribution must be
+# listed in the MANIFEST.in file.
+#
+# References:
+# * https://newbedev.com/how-include-static-files-to-setuptools-python-package
+
+[metadata]
+
+# Configure specific project settings
+name = nvidia_curobo
+author = NVIDIA Seattle Robotics Lab
+description = GPU robot motion toolkit containing CUDA-accelerated kinematics, IK, MPC, global motion planning, and optimization solvers.
+url = https://curobo.org
+license = NVIDIA
+
+# Configure general project settings
+long_description = file: README.md
+long_description_content_type = text/markdown
+license_file = LICENSE
+
+# List of classifiers can be found here:
+# https://pypi.org/classifiers/
+classifiers =
+    License :: Other/Proprietary License
+    Operating System :: OS Independent
+    Intended Audience :: Developers
+    Natural Language :: English
+    Programming Language :: Python :: 3
+    Topic :: Scientific/Engineering :: Robotics
+
+# Change if running only on Windows, Mac or Linux (comma-separated)
+platforms = any
+
+[options]
+install_requires =
+    networkx
+    numpy
+    numpy-quaternion
+    pyyaml
+    setuptools_scm>=6.2
+    torchtyping
+    torch>=1.10
+    trimesh
+    yourdfpy>=0.0.53
+    warp-lang>=0.9.0
+    scipy>=1.7.0
+    tqdm
+    wheel
+    importlib_resources
+
+packages = find_namespace:
+package_dir =
+    = src
+include_package_data = True
+
+[options.packages.find]
+where = src
+exclude =
+    tests
+
+[options.extras_require]
+ci =
+    anybadge
+    black
+    build
+    flake8
+    flake8-docstrings
+    flake8-isort
+    pytest>=6.2.5
+    pytest-cov
+    twine
+    sphinx
+    sphinx_rtd_theme
+    graphviz>=0.20.1
+    sphinxcontrib-youtube
+    sphinxcontrib-video>=0.2.0
+
+# this extra is only available on Python 3.8+
+smooth =
+    trajectory_smoothing @ https://github.com/balakumar-s/trajectory_smoothing/raw/main/dist/trajectory_smoothing-0.3-cp38-cp38-linux_x86_64.whl
+
+usd =
+    usd-core
+
+dev =
+    ipdb
+    ipython
+    black
+    flake8
+    flake8-docstrings
+    flake8-isort
+    pytest>=6.2.5
+    pytest-cov
+
+doc =
+    sphinx
+    sphinx_rtd_theme
+    graphviz>=0.20.1
+    sphinxcontrib-youtube
+    sphinxcontrib-video>=0.2.0
+
+[options.entry_points]
+# Add here console scripts like:
+# console_scripts =
+#     script_name = package.module:function
+
+# NOTE (roflaherty): Flake8 doesn't support pyproject.toml configuration yet.
+[flake8]
+max-line-length = 100
+docstring-convention = google
+exclude = .git,build,deprecated,dist,venv
+ignore =
+    W503
+    E203
+    E731
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..172b9d97
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,90 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+"""curobo package setuptools."""
+
+# NOTE (roflaherty): This file is still needed to allow the package to be
+# installed in editable mode.
+#
+# References:
+# * https://setuptools.pypa.io/en/latest/setuptools.html#setup-cfg-only-projects
+
+# Third Party
+import setuptools
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+print("*********************************")
+print("CuRobo: Compiling CUDA kernels...")
+print(
+    "This can take 20+ minutes. To speed up compilation, set TORCH_CUDA_ARCH_LIST to compile "
+    "only for a specific architecture, e.g., export TORCH_CUDA_ARCH_LIST=7.0+PTX"
+)
+extra_cuda_args = {
+    "nvcc": [
+        "--threads=6",
+        "-O3",
+        "--ftz=true",
+        "--fmad=true",
+        "--prec-div=false",
+        "--prec-sqrt=false",
+    ]
+}
+# create a list of modules to be compiled:
+ext_modules = [
+    CUDAExtension(
+        "curobo.curobolib.lbfgs_step_cu",
+        [
+            "src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp",
+            "src/curobo/curobolib/cpp/lbfgs_step_kernel.cu",
+        ],
+        extra_compile_args=extra_cuda_args,
+    ),
+    CUDAExtension(
+        "curobo.curobolib.kinematics_fused_cu",
+        [
+            "src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp",
+            "src/curobo/curobolib/cpp/kinematics_fused_kernel.cu",
+        ],
+        extra_compile_args=extra_cuda_args,
+    ),
+    CUDAExtension(
+        "curobo.curobolib.geom_cu",
+        [
+            "src/curobo/curobolib/cpp/geom_cuda.cpp",
+            "src/curobo/curobolib/cpp/sphere_obb_kernel.cu",
+            "src/curobo/curobolib/cpp/pose_distance_kernel.cu",
+            "src/curobo/curobolib/cpp/self_collision_kernel.cu",
+        ],
+        extra_compile_args=extra_cuda_args,
+    ),
+    CUDAExtension(
+        "curobo.curobolib.line_search_cu",
+        [
+            "src/curobo/curobolib/cpp/line_search_cuda.cpp",
+            "src/curobo/curobolib/cpp/line_search_kernel.cu",
+            "src/curobo/curobolib/cpp/update_best_kernel.cu",
+        ],
+        extra_compile_args=extra_cuda_args,
+    ),
+    CUDAExtension(
+        "curobo.curobolib.tensor_step_cu",
+        [
+            "src/curobo/curobolib/cpp/tensor_step_cuda.cpp",
+            "src/curobo/curobolib/cpp/tensor_step_kernel.cu",
+        ],
+        extra_compile_args=extra_cuda_args,
+    ),
+]
+
+setuptools.setup(
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension},
+)
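A quick way to confirm the build above succeeded is to import the compiled modules. This smoke test is a sketch, not part of the repo; the module names are taken from the `ext_modules` list in setup.py:

# Sketch: smoke-test the CUDA extensions after `pip install -e .`.
# Module names match the CUDAExtension entries above.
from curobo.curobolib import (  # noqa: F401
    geom_cu,
    kinematics_fused_cu,
    lbfgs_step_cu,
    line_search_cu,
    tensor_step_cu,
)

print("curobolib CUDA extensions imported successfully")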
diff --git a/src/curobo/__init__.py b/src/curobo/__init__.py
new file mode 100644
index 00000000..58d5ff03
--- /dev/null
+++ b/src/curobo/__init__.py
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+"""CuRobo package."""
+
+
+# NOTE (roflaherty): This is inspired by how matplotlib creates its version value.
+# https://github.com/matplotlib/matplotlib/blob/master/lib/matplotlib/__init__.py#L161
+def _get_version():
+    """Return the version string used for __version__."""
+    # Standard Library
+    import pathlib
+
+    root = pathlib.Path(__file__).resolve().parent.parent.parent
+    if (root / ".git").exists() and not (root / ".git/shallow").exists():
+        # Third Party
+        import setuptools_scm
+
+        # See the `setuptools_scm` documentation for the description of the schemes used below.
+        # https://pypi.org/project/setuptools-scm/
+        # NOTE: If these values are updated, they need to be also updated in `pyproject.toml`.
+        return setuptools_scm.get_version(
+            root=root,
+            version_scheme="no-guess-dev",
+            local_scheme="dirty-tag",
+        )
+    else:  # Get the version from the _version.py setuptools_scm file.
+        try:
+            # Standard Library
+            from importlib.metadata import version
+        except ModuleNotFoundError:
+            # NOTE: `importlib.metadata` is part of the standard library in Python 3.8+.
+            # `importlib_metadata` is the backported library for older versions of Python.
+            # Third Party
+            from importlib_metadata import version
+
+        return version("nvidia_curobo")
+
+
+# Set `__version__` attribute
+__version__ = _get_version()
+
+# Remove `_get_version` so it is not added as an attribute
+del _get_version
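As a usage note (not part of the tree), the value resolved by `_get_version` above is exposed as the package attribute:

# Sketch: check which version string the logic above resolved.
import curobo

print(curobo.__version__)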
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
new file mode 100644
index 00000000..30d64a09
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
@@ -0,0 +1,279 @@
[279-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_mobile.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile.urdf
new file mode 100644
index 00000000..d776e8fc
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile.urdf
@@ -0,0 +1,325 @@
[325-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_base.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_base.urdf
new file mode 100644
index 00000000..70b79803
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_base.urdf
@@ -0,0 +1,52 @@
[52-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_x.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_x.urdf
new file mode 100644
index 00000000..d7ac4c82
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_x.urdf
@@ -0,0 +1,303 @@
[303-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy.urdf
new file mode 100644
index 00000000..8892e333
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy.urdf
@@ -0,0 +1,316 @@
[316-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy_tz.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy_tz.urdf
new file mode 100644
index 00000000..f225fe76
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_mobile_xy_tz.urdf
@@ -0,0 +1,332 @@
[332-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_panda_no_gripper.urdf b/src/curobo/content/assets/robot/franka_description/franka_panda_no_gripper.urdf
new file mode 100644
index 00000000..2b9979fe
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda_no_gripper.urdf
@@ -0,0 +1,302 @@
[302-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/franka_test.urdf b/src/curobo/content/assets/robot/franka_description/franka_test.urdf
new file mode 100644
index 00000000..2b841b3a
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/franka_test.urdf
@@ -0,0 +1,312 @@
[312-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/moving_gripper.urdf b/src/curobo/content/assets/robot/franka_description/moving_gripper.urdf
new file mode 100644
index 00000000..88247b3c
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/moving_gripper.urdf
@@ -0,0 +1,71 @@
[71-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/panda.urdf b/src/curobo/content/assets/robot/franka_description/panda.urdf
new file mode 100644
index 00000000..db245ac4
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/panda.urdf
@@ -0,0 +1,342 @@
[342-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/panda_grasp_gripper.urdf b/src/curobo/content/assets/robot/franka_description/panda_grasp_gripper.urdf
new file mode 100644
index 00000000..896ffd80
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/panda_grasp_gripper.urdf
@@ -0,0 +1,26 @@
[26-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/franka_description/panda_gripper.urdf b/src/curobo/content/assets/robot/franka_description/panda_gripper.urdf
new file mode 100644
index 00000000..f1982cdf
--- /dev/null
+++ b/src/curobo/content/assets/robot/franka_description/panda_gripper.urdf
@@ -0,0 +1,26 @@
[26-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/iiwa_allegro_description/allegro.urdf b/src/curobo/content/assets/robot/iiwa_allegro_description/allegro.urdf
new file mode 100644
index 00000000..a7e114a4
--- /dev/null
+++ b/src/curobo/content/assets/robot/iiwa_allegro_description/allegro.urdf
@@ -0,0 +1,546 @@
[546-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa.urdf b/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa.urdf
new file mode 100644
index 00000000..41fb39f8
--- /dev/null
+++ b/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa.urdf
@@ -0,0 +1,414 @@
[414-line URDF body lost in extraction: XML markup stripped; only stray text nodes (Gazebo materials, ros_control transmissions) survived]
diff --git a/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa_allegro.urdf b/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa_allegro.urdf
new file mode 100644
index 00000000..3fcc99d7
--- /dev/null
+++ b/src/curobo/content/assets/robot/iiwa_allegro_description/iiwa_allegro.urdf
@@ -0,0 +1,973 @@
[973-line URDF body lost in extraction: XML markup stripped; only stray text nodes (Gazebo materials, ros_control transmissions) survived]
diff --git a/src/curobo/content/assets/robot/jaco/jaco_7n.urdf b/src/curobo/content/assets/robot/jaco/jaco_7n.urdf
new file mode 100644
index 00000000..392a21d5
--- /dev/null
+++ b/src/curobo/content/assets/robot/jaco/jaco_7n.urdf
@@ -0,0 +1,693 @@
[693-line URDF body lost in extraction: XML markup stripped; only stray text nodes (j2n7s300 transmissions and ft_sensor topics) survived]
diff --git a/src/curobo/content/assets/robot/jaco/jaco_7s.urdf b/src/curobo/content/assets/robot/jaco/jaco_7s.urdf
new file mode 100644
index 00000000..807981d3
--- /dev/null
+++ b/src/curobo/content/assets/robot/jaco/jaco_7s.urdf
@@ -0,0 +1,693 @@
[693-line URDF body lost in extraction: XML markup stripped; only stray text nodes (j2s7s300 transmissions and ft_sensor topics) survived]
diff --git a/src/curobo/content/assets/robot/kinova/README.md b/src/curobo/content/assets/robot/kinova/README.md
new file mode 100644
index 00000000..3470f039
--- /dev/null
+++ b/src/curobo/content/assets/robot/kinova/README.md
@@ -0,0 +1,2 @@
+The files in this directory come from:
+https://github.com/Kinovarobotics/ros_kortex/tree/noetic-devel/kortex_description
\ No newline at end of file
diff --git a/src/curobo/content/assets/robot/kinova/kinova_gen3_7dof.urdf b/src/curobo/content/assets/robot/kinova/kinova_gen3_7dof.urdf
new file mode 100644
index 00000000..03138533
--- /dev/null
+++ b/src/curobo/content/assets/robot/kinova/kinova_gen3_7dof.urdf
@@ -0,0 +1,577 @@
[577-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/LICENSE b/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/LICENSE
new file mode 100644
index 00000000..316f6108
--- /dev/null
+++ b/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/LICENSE
@@ -0,0 +1,23 @@
+Copyright (c) 2013, ROS-Industrial
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+    Redistributions in binary form must reproduce the above copyright notice, this
+    list of conditions and the following disclaimer in the documentation and/or
+    other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/README.md b/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/README.md
new file mode 100644
index 00000000..cd862a9f
--- /dev/null
+++ b/src/curobo/content/assets/robot/kinova/kortex_description/grippers/robotiq_2f_85/README.md
@@ -0,0 +1,13 @@
+This is the README file from Robotiq's package for the 2F-85 gripper. The package's license file can also be found in this folder.
+We used commit `4e3196c1c6a83f54acfa3dd8cf02b575ebba3e53` from [Robotiq's ROS Industrial Github repository](https://github.com/ros-industrial/robotiq)
+
+# Robotiq 85mm 2-Finger-Adaptive-Gripper
+
+This package contains the URDF files describing the 85mm stroke gripper from robotiq, also known as series **C3**.
+
+## Robot Visual
+![85](https://user-images.githubusercontent.com/8356912/49428405-45a6ef00-f7a6-11e8-822b-c6870c39d445.png)
+
+## Robot Collision
+![852](https://user-images.githubusercontent.com/8356912/49428404-450e5880-f7a6-11e8-82a8-564247ebe7fc.png)
+
diff --git a/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12-nominal.urdf b/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12-nominal.urdf
new file mode 100644
index 00000000..c800a06b
--- /dev/null
+++ b/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12-nominal.urdf
@@ -0,0 +1,331 @@
[331-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12x-nominal.urdf b/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12x-nominal.urdf
new file mode 100755
index 00000000..4f933321
--- /dev/null
+++ b/src/curobo/content/assets/robot/techman/tm_description/urdf/tm12x-nominal.urdf
@@ -0,0 +1,331 @@
[331-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/ur_description/dual_ur10e.urdf b/src/curobo/content/assets/robot/ur_description/dual_ur10e.urdf
new file mode 100644
index 00000000..ad5296bf
--- /dev/null
+++ b/src/curobo/content/assets/robot/ur_description/dual_ur10e.urdf
@@ -0,0 +1,556 @@
[556-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/ur_description/quad_ur10e.urdf b/src/curobo/content/assets/robot/ur_description/quad_ur10e.urdf
new file mode 100644
index 00000000..228b3be4
--- /dev/null
+++ b/src/curobo/content/assets/robot/ur_description/quad_ur10e.urdf
@@ -0,0 +1,1035 @@
[1035-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/ur_description/tri_ur10e.urdf b/src/curobo/content/assets/robot/ur_description/tri_ur10e.urdf
new file mode 100644
index 00000000..5af1ca4d
--- /dev/null
+++ b/src/curobo/content/assets/robot/ur_description/tri_ur10e.urdf
@@ -0,0 +1,953 @@
[953-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/ur_description/ur10e.urdf b/src/curobo/content/assets/robot/ur_description/ur10e.urdf
new file mode 100644
index 00000000..d0b38651
--- /dev/null
+++ b/src/curobo/content/assets/robot/ur_description/ur10e.urdf
@@ -0,0 +1,359 @@
[359-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/robot/ur_description/ur5e.urdf b/src/curobo/content/assets/robot/ur_description/ur5e.urdf
new file mode 100644
index 00000000..a8e0dad4
--- /dev/null
+++ b/src/curobo/content/assets/robot/ur_description/ur5e.urdf
@@ -0,0 +1,359 @@
[359-line URDF body lost in extraction: XML markup stripped, content not recoverable]
diff --git a/src/curobo/content/assets/scene/nvblox/srl_ur10_bins.nvblx b/src/curobo/content/assets/scene/nvblox/srl_ur10_bins.nvblx
new file mode 100644
index 00000000..78fb0b84
--- /dev/null
+++ b/src/curobo/content/assets/scene/nvblox/srl_ur10_bins.nvblx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93892dddfc6a65dedc4ef311e773eba7273b29f0a2cdcb76637013fdadf22897
+size 87195648
diff --git a/src/curobo/content/assets/scene/nvblox/srl_ur10_bins_color.png b/src/curobo/content/assets/scene/nvblox/srl_ur10_bins_color.png
new file mode 100644
index 00000000..7867a44e
--- /dev/null
+++ b/src/curobo/content/assets/scene/nvblox/srl_ur10_bins_color.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b82b0a4873d7bf1796893a47f9b41ab7cd32751d8caa102cfea3f401f75dd70
+size 1773918
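The YAML files that follow bind the URDF and mesh assets above into robot configurations. A sketch of loading one and running forward kinematics follows, using the loading pattern from the examples earlier in this diff; treating `get_state` as the forward-kinematics entry point of `CudaRobotModel` (and `ee_position`/`ee_quaternion` as fields of its returned state) is an assumption here:

# Sketch: load a robot config YAML and run FK on its retract configuration.
# get_state() and the state fields below are assumed, not confirmed by this diff.
from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel
from curobo.types.base import TensorDeviceType
from curobo.types.robot import RobotConfig
from curobo.util_file import get_robot_configs_path, join_path, load_yaml

tensor_args = TensorDeviceType()
robot_cfg = RobotConfig.from_dict(
    load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"], tensor_args
)
kin_model = CudaRobotModel(robot_cfg.kinematics)
state = kin_model.get_state(robot_cfg.cspace.retract_config.view(1, -1))
print(state.ee_position, state.ee_quaternion)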
+##
+
+robot_cfg:
+  kinematics:
+    use_usd_kinematics: False
+    urdf_path: "robot/ur_description/dual_ur10e.urdf"
+    asset_root_path: "robot/ur_description"
+    isaac_usd_path: "/Isaac/Robots/UR10/ur10_long_suction.usd"
+    usd_robot_root: "/ur10"
+    usd_path: "robot/ur_description/dual_ur10e.usd"
+    base_link: "base_fixture_link"
+    ee_link: "tool0"
+    link_names: ["tool1", "tool0"]
+    collision_link_names: [
+      'shoulder_link', 'upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link', 'wrist_3_link', 'tool0',
+      'shoulder_link_1', 'upper_arm_link_1', 'forearm_link_1', 'wrist_1_link_1', 'wrist_2_link_1', 'wrist_3_link_1', 'tool1']
+    collision_spheres: 'spheres/dual_ur10e.yml'
+    collision_sphere_buffer: 0.005
+
+    self_collision_ignore: {
+      "upper_arm_link": ["shoulder_link", "forearm_link"],
+      "forearm_link": ["wrist_1_link"],
+      "wrist_1_link": ["wrist_2_link", "wrist_3_link"],
+      "wrist_2_link": ["wrist_3_link", "tool0"],
+      "wrist_3_link": ["tool0"],
+
+      "upper_arm_link_1": ["shoulder_link_1", "forearm_link_1"],
+      "forearm_link_1": ["wrist_1_link_1"],
+      "wrist_1_link_1": ["wrist_2_link_1", "wrist_3_link_1"],
+      "wrist_2_link_1": ["wrist_3_link_1", "tool1"],
+      "wrist_3_link_1": ["tool1"],
+    }
+    self_collision_buffer: {
+      'shoulder_link': 0.05,
+      'shoulder_link_1': 0.05,
+    }
+    lock_joints: null
+
+  cspace:
+    joint_names: [
+      'shoulder_pan_joint', 'shoulder_lift_joint', 'elbow_joint', 'wrist_1_joint', 'wrist_2_joint', 'wrist_3_joint',
+      'shoulder_pan_joint_1', 'shoulder_lift_joint_1', 'elbow_joint_1', 'wrist_1_joint_1', 'wrist_2_joint_1', 'wrist_3_joint_1']
+    retract_config: [-1.57, -2.2, 1.9, -1.383, -1.57, 0.00,
+                     -1.57, -2.2, 1.9, -1.383, -1.57, 0.00]
+    null_space_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                        1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+    cspace_distance_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+
+    max_jerk: 500.0
+    max_acceleration: 15.0
diff --git a/src/curobo/content/configs/robot/franka.yml b/src/curobo/content/configs/robot/franka.yml
new file mode 100644
index 00000000..d9f9aaab
--- /dev/null
+++ b/src/curobo/content/configs/robot/franka.yml
@@ -0,0 +1,113 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
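A recurring hazard in hand-edited configs like dual_ur10e.yml above is an ignore rule that names a link missing from collision_link_names; the rule is then silently dead. A minimal guard, assuming only PyYAML and a file laid out like the one above (the path is an example):

import yaml

def check_self_collision_links(path: str) -> list:
    # Every link an ignore rule mentions must be a declared collision link.
    with open(path) as f:
        kin = yaml.safe_load(f)["robot_cfg"]["kinematics"]
    known = set(kin["collision_link_names"])
    unknown = []
    for link, ignored in (kin.get("self_collision_ignore") or {}).items():
        unknown += [name for name in [link, *ignored] if name not in known]
    return unknown

print(check_self_collision_links("dual_ur10e.yml"))  # [] when all names resolve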
+##
+
+robot_cfg:
+  kinematics:
+    use_usd_kinematics: False
+    isaac_usd_path: "/Isaac/Robots/Franka/franka.usd"
+    usd_path: "robot/non_shipping/franka/franka_panda_meters1.usda"
+    usd_robot_root: "/panda"
+    usd_flip_joints: {
+      "panda_joint1": "Z",
+      "panda_joint2": "Z",
+      "panda_joint3": "Z",
+      "panda_joint4": "Z",
+      "panda_joint5": "Z",
+      "panda_joint6": "Z",
+      "panda_joint7": "Z",
+      "panda_finger_joint1": "Y",
+      "panda_finger_joint2": "Y",
+    }
+    usd_flip_joint_limits: ["panda_finger_joint2"]
+    urdf_path: "robot/franka_description/franka_panda.urdf"
+    asset_root_path: "robot/franka_description"
+    base_link: "base_link"
+    ee_link: "panda_hand"
+    collision_link_names: [
+      "panda_link0",
+      "panda_link1",
+      "panda_link2",
+      "panda_link3",
+      "panda_link4",
+      "panda_link5",
+      "panda_link6",
+      "panda_link7",
+      "panda_hand",
+      "panda_leftfinger",
+      "panda_rightfinger",
+      "attached_object",
+    ]
+    collision_spheres: "spheres/franka_mesh.yml"
+    collision_sphere_buffer: 0.0025
+    extra_collision_spheres: {"attached_object": 4}
+    use_global_cumul: True
+    self_collision_ignore: {
+      "panda_link0": ["panda_link1", "panda_link2"],
+      "panda_link1": ["panda_link2", "panda_link3", "panda_link4"],
+      "panda_link2": ["panda_link3", "panda_link4"],
+      "panda_link3": ["panda_link4", "panda_link6"],
+      "panda_link4": ["panda_link5", "panda_link6", "panda_link7", "panda_link8"],
+      "panda_link5": ["panda_link6", "panda_link7", "panda_hand", "panda_leftfinger", "panda_rightfinger"],
+      "panda_link6": ["panda_link7", "panda_hand", "attached_object", "panda_leftfinger", "panda_rightfinger"],
+      "panda_link7": ["panda_hand", "attached_object", "panda_leftfinger", "panda_rightfinger"],
+      "panda_hand": ["panda_leftfinger", "panda_rightfinger", "attached_object"],
+      "panda_leftfinger": ["panda_rightfinger", "attached_object"],
+      "panda_rightfinger": ["attached_object"],
+    }
+
+    self_collision_buffer: {
+      "panda_link0": 0.1,
+      "panda_link1": 0.05,
+      "panda_link2": 0.0,
+      "panda_link3": 0.0,
+      "panda_link4": 0.0,
+      "panda_link5": 0.0,
+      "panda_link6": 0.0,
+      "panda_link7": 0.0,
+      "panda_hand": 0.02,
+      "panda_leftfinger": 0.01,
+      "panda_rightfinger": 0.01,
+      "attached_object": 0.0,
+    }
+
+    mesh_link_names: [
+      "panda_link0",
+      "panda_link1",
+      "panda_link2",
+      "panda_link3",
+      "panda_link4",
+      "panda_link5",
+      "panda_link6",
+      "panda_link7",
+      "panda_hand",
+      "panda_leftfinger",
+      "panda_rightfinger",
+    ]
+    lock_joints: {"panda_finger_joint1": 0.04, "panda_finger_joint2": -0.04}
+    extra_links: {"attached_object": {"parent_link_name": "panda_hand",
+      "link_name": "attached_object", "fixed_transform": [0, 0, 0, 1, 0, 0, 0],
+      "joint_type": "FIXED", "joint_name": "attach_joint"}}
+  cspace:
+    joint_names: ["panda_joint1", "panda_joint2", "panda_joint3", "panda_joint4", "panda_joint5",
+                  "panda_joint6", "panda_joint7", "panda_finger_joint1", "panda_finger_joint2"]
+    retract_config: [0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0.0, 0.04, -0.04]
+    null_space_weight: [1, 1, 1, 1, 1, 1, 1, 1, 1]
+    cspace_distance_weight: [1, 1, 1, 1, 1, 1, 1, 1, 1]
+    max_acceleration: 15.0
+    max_jerk: 500.0
\ No newline at end of file
diff --git a/src/curobo/content/configs/robot/franka_mobile.yml b/src/curobo/content/configs/robot/franka_mobile.yml
new file mode 100644
index 00000000..ef048a5e
--- /dev/null
+++ b/src/curobo/content/configs/robot/franka_mobile.yml
@@ -0,0 +1,116 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+
+robot_cfg:
+  kinematics:
+    use_usd_kinematics: False
+    isaac_usd_path: "/Isaac/Robots/Franka/franka.usd"
+    usd_path: "robot/franka_description/franka_panda_meters.usda"
+    usd_robot_root: "/panda"
+    usd_flip_joints: {
+      "panda_joint1": "Z",
+      "panda_joint2": "Z",
+      "panda_joint3": "Z",
+      "panda_joint4": "Z",
+      "panda_joint5": "Z",
+      "panda_joint6": "Z",
+      "panda_joint7": "Z",
+      "panda_finger_joint1": "Y",
+      "panda_finger_joint2": "Y",
+    }
+    usd_flip_joint_limits: ["panda_finger_joint2"]
+    urdf_path: "robot/franka_description/franka_panda_mobile_xy_tz.urdf"
+    asset_root_path: "robot/franka_description"
+    base_link: "base_link"
+    ee_link: "panda_hand"
+    # link_names: null
+    collision_link_names: [
+      "panda_link0",
+      "panda_link1",
+      "panda_link2",
+      "panda_link3",
+      "panda_link4",
+      "panda_link5",
+      "panda_link6",
+      "panda_link7",
+      "panda_hand",
+      "panda_leftfinger",
+      "panda_rightfinger",
+      "attached_object",
+    ]
+    collision_spheres: "spheres/franka_mesh.yml"
+    collision_sphere_buffer: 0.002 #0.02
+    extra_collision_spheres: {"attached_object": 4}
+    use_global_cumul: True
+    self_collision_ignore: {
+      "panda_link0": ["panda_link1", "panda_link2"],
+      "panda_link1": ["panda_link2", "panda_link3", "panda_link4"],
+      "panda_link2": ["panda_link3", "panda_link4"],
+      "panda_link3": ["panda_link4", "panda_link6"],
+      "panda_link4": ["panda_link5", "panda_link6", "panda_link7", "panda_link8"],
+      "panda_link5": ["panda_link6", "panda_link7", "panda_hand", "panda_leftfinger", "panda_rightfinger"],
+      "panda_link6": ["panda_link7", "panda_hand", "attached_object", "panda_leftfinger", "panda_rightfinger"],
+      "panda_link7": ["panda_hand", "attached_object", "panda_leftfinger", "panda_rightfinger"],
+      "panda_hand": ["panda_leftfinger", "panda_rightfinger", "attached_object"],
+      "panda_leftfinger": ["panda_rightfinger", "attached_object"],
+      "panda_rightfinger": ["attached_object"],
+    }
+
+    self_collision_buffer: {
+      "panda_link0": 0.1,
+      "panda_link1": 0.05,
+      "panda_link2": 0.0,
+      "panda_link3": 0.0,
+      "panda_link4": 0.0,
+      "panda_link5": 0.0,
+      "panda_link6": 0.0,
+      "panda_link7": 0.0,
+      "panda_hand": 0.02,
+      "panda_leftfinger": 0.01,
+      "panda_rightfinger": 0.01,
+      "attached_object": 0.0,
+    }
+
+    mesh_link_names: [
+      "panda_link0",
+      "panda_link1",
+      "panda_link2",
+      "panda_link3",
+      "panda_link4",
+      "panda_link5",
+      "panda_link6",
+      "panda_link7",
+      "panda_hand",
+      "panda_leftfinger",
+      "panda_rightfinger",
+    ]
+    lock_joints: {"panda_finger_joint1": 0.04, "panda_finger_joint2": -0.04}
+    extra_links: {"attached_object": {"parent_link_name": "panda_hand",
+      "link_name": "attached_object", "fixed_transform": [0, 0, 0, 1, 0, 0, 0],
+      "joint_type": "FIXED", "joint_name": "attach_joint"}}
+  cspace:
+    joint_names: [
+      "base_x", "base_y", "base_z",
+      "panda_joint1", "panda_joint2", "panda_joint3", "panda_joint4", "panda_joint5",
+      "panda_joint6", "panda_joint7", "panda_finger_joint1",
"panda_finger_joint2"] + retract_config: [0,0,0,0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, -0.04] + null_space_weight: [1,1,1,1,1,1,1,1,1,1,1,1] + cspace_distance_weight: [1,1,1,1,1,1,1,1,1,1,1,1] + max_acceleration: 15.0 #15.0 + max_jerk: 500.0 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/iiwa.yml b/src/curobo/content/configs/robot/iiwa.yml new file mode 100644 index 00000000..585541af --- /dev/null +++ b/src/curobo/content/configs/robot/iiwa.yml @@ -0,0 +1,81 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +robot_cfg: + kinematics: + base_link: base_link + collision_link_names: + - iiwa7_link_1 + - iiwa7_link_2 + - iiwa7_link_3 + - iiwa7_link_4 + - iiwa7_link_5 + - iiwa7_link_6 + collision_sphere_buffer: 0.005 + collision_spheres: spheres/iiwa_allegro.yml + ee_link: iiwa7_link_ee + self_collision_buffer: + iiwa7_link_1: -0.02 + iiwa7_link_2: 0.0 + iiwa7_link_3: 0.0 + iiwa7_link_4: 0.0 + iiwa7_link_5: 0.0 + iiwa7_link_6: 0.0 + + self_collision_ignore: + { + iiwa7_link_1: [iiwa7_link_2, iiwa7_link_3], + iiwa7_link_2: [iiwa7_link_3, iiwa7_link_4], + iiwa7_link_3: [iiwa7_link_4, iiwa7_link_5], + iiwa7_link_4: [iiwa7_link_5, iiwa7_link_6], + iiwa7_link_5: [iiwa7_link_6], + } + urdf_path: robot/iiwa_allegro_description/iiwa.urdf + asset_root_path: robot/iiwa_allegro_description + + cspace: + joint_names: + [ + "iiwa7_joint_1", + "iiwa7_joint_2", + "iiwa7_joint_3", + "iiwa7_joint_4", + "iiwa7_joint_5", + "iiwa7_joint_6", + "iiwa7_joint_7", + ] + cspace_distance_weight: + - 1.0 + - 0.9 + - 0.8 + - 0.8 + - 0.7 + - 0.6 + - 0.5 + + null_space_weight: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + + retract_config: + - 0.0 + - -0.78 + - 0.0 + - -1.4 + - 0.0 + - 0.6 + - 0.0 + max_acceleration: 15.0 + max_jerk: 500.0 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/iiwa_allegro.yml b/src/curobo/content/configs/robot/iiwa_allegro.yml new file mode 100644 index 00000000..881c7859 --- /dev/null +++ b/src/curobo/content/configs/robot/iiwa_allegro.yml @@ -0,0 +1,195 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## +robot_cfg: + kinematics: + base_link: base_link + collision_link_names: + - iiwa7_link_1 + - iiwa7_link_2 + - iiwa7_link_3 + - iiwa7_link_4 + - iiwa7_link_5 + - iiwa7_link_6 + - palm_link + - index_link_1 + - index_link_2 + - index_link_3 + - middle_link_1 + - middle_link_2 + - middle_link_3 + - ring_link_1 + - ring_link_2 + - ring_link_3 + - thumb_link_2 + - thumb_link_3 + collision_sphere_buffer: 0.005 + collision_spheres: spheres/iiwa_allegro.yml + ee_link: index_link_3 + link_names: + - palm_link + - index_link_3 + - middle_link_3 + - ring_link_3 + - thumb_link_3 + self_collision_buffer: + iiwa7_link_1: -0.02 + iiwa7_link_2: 0.0 + iiwa7_link_3: 0.0 + iiwa7_link_4: 0.0 + iiwa7_link_5: 0.0 + iiwa7_link_6: 0.0 + index_link_1: -0.0 + index_link_2: -0.0 + index_link_3: 0.0 + middle_link_1: 0.0 + middle_link_2: 0.0 + middle_link_3: 0.0 + palm_link: 0.0 + ring_link_1: 0.0 + ring_link_2: 0.0 + ring_link_3: 0.0 + thumb_link_2: 0.0 + thumb_link_3: 0.0 + self_collision_ignore: { + iiwa7_link_1: [iiwa7_link_2, iiwa7_link_3], + iiwa7_link_2: [iiwa7_link_3, iiwa7_link_4], + iiwa7_link_3: [iiwa7_link_4, iiwa7_link_5], + iiwa7_link_4: [iiwa7_link_5, iiwa7_link_6], + iiwa7_link_5: [iiwa7_link_6, palm_link], + iiwa7_link_6: [palm_link], + palm_link: + [ + index_link_1, + index_link_2, + middle_link_1, + middle_link_2, + ring_link_1, + ring_link_2, + thumb_link_2, + ], + index_link_1: [index_link_2, middle_link_1, middle_link_2], + index_link_2: [index_link_3], + #index_link_3: [iiwa7_link_2], + middle_link_1: [middle_link_2, ring_link_1, ring_link_2], + middle_link_2: [middle_link_3], + #middle_link_3: [], + ring_link_1: [ring_link_2], + ring_link_2: [ring_link_3], + #ring_link_3: [iiwa7_link_2], + thumb_link_2: [thumb_link_3], + } + #thumb_link_3: [iiwa7_link_2], + urdf_path: robot/iiwa_allegro_description/iiwa_allegro.urdf + asset_root_path: robot/iiwa_allegro_description + usd_path: robot/iiwa_allegro_description/iiwa_allegro.usda + usd_robot_root: /iiwa_allegro + cspace: + joint_names: + [ + "iiwa7_joint_1", + "iiwa7_joint_2", + "iiwa7_joint_3", + "iiwa7_joint_4", + "iiwa7_joint_5", + "iiwa7_joint_6", + "iiwa7_joint_7", + "index_joint_0", + "index_joint_1", + "index_joint_2", + "index_joint_3", + "middle_joint_0", + "middle_joint_1", + "middle_joint_2", + "middle_joint_3", + "ring_joint_0", + "ring_joint_1", + "ring_joint_2", + "ring_joint_3", + "thumb_joint_0", + "thumb_joint_1", + "thumb_joint_2", + "thumb_joint_3", + ] + cspace_distance_weight: + - 1.0 + - 0.9 + - 0.8 + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 0.1 + - 1.1 + - 1.1 + - 0.1 + - 0.1 + + null_space_weight: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 # + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + retract_config: + - 0.0 + - -0.78 + - 0.0 + - -1.4 + - 0.0 + - 0.6 + - 2.1 + - 0.0 + - 0.7242903113365173 + - 0.7242903113365173 + - 0.7242903113365173 + - 0.0 + - 0.7242903113365173 + - 0.7242903113365173 + - 0.7242903113365173 + - 0.0 + - 0.7242903113365173 + - 0.7242903113365173 + - 0.9249972105026245 + - 0.4101402759552002 + - 0.7242903113365173 + - 0.7417430877685547 + - 0.7242903113365173 + max_acceleration: 15.0 + max_jerk: 500.0 diff --git a/src/curobo/content/configs/robot/isaac_sim_description/jaco_7s.yaml b/src/curobo/content/configs/robot/isaac_sim_description/jaco_7s.yaml new file mode 100644 index 00000000..e3f75f49 --- /dev/null +++ 
b/src/curobo/content/configs/robot/isaac_sim_description/jaco_7s.yaml @@ -0,0 +1,146 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +# The robot description defines the generalized coordinates and how to map those +# to the underlying URDF dofs. + +api_version: 1.0 + +# Defines the generalized coordinates. Each generalized coordinate is assumed +# to have an entry in the URDF. +# Lula will only use these joints to control the robot position. +cspace: + - j2s7s300_joint_1 + - j2s7s300_joint_2 + - j2s7s300_joint_3 + - j2s7s300_joint_4 + - j2s7s300_joint_5 + - j2s7s300_joint_6 + - j2s7s300_joint_7 +default_q: [ + 0.0,1.7,0.0,0.5236,0.0,1.1345,0.0 +] + +# Most dimensions of the cspace have a direct corresponding element +# in the URDF. This list of rules defines how unspecified coordinates +# should be extracted or how values in the URDF should be overwritten. + +cspace_to_urdf_rules: + - {name: j2s7s300_joint_finger_1, rule: fixed, value: 0.0} + - {name: j2s7s300_joint_finger_2, rule: fixed, value: 1e-04} + - {name: j2s7s300_joint_finger_3, rule: fixed, value: 0.0} + - {name: j2s7s300_joint_finger_tip_1, rule: fixed, value: 0.0} + - {name: j2s7s300_joint_finger_tip_2, rule: fixed, value: 0.0} + - {name: j2s7s300_joint_finger_tip_3, rule: fixed, value: 0.0} + +# Lula uses collision spheres to define the robot geometry in order to avoid +# collisions with external obstacles. If no spheres are specified, Lula will +# not be able to avoid obstacles. 
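That comment is the whole point of the sphere decomposition: once the robot body is a set of spheres, the distance from any world point to the robot is a cheap minimum over per-sphere distances. A numpy sketch using the two j2s7s300_link_base spheres listed just below:

import numpy as np

# Rows of (x, y, z, radius), copied from the j2s7s300_link_base entries below.
spheres = np.array([
    [0.0, -0.003, 0.103, 0.048],
    [0.0, 0.0, 0.162, 0.047],
])

def signed_distance(point: np.ndarray) -> float:
    # Distance to each sphere surface; negative means the point is inside.
    d = np.linalg.norm(spheres[:, :3] - point, axis=1) - spheres[:, 3]
    return float(d.min())

print(signed_distance(np.array([0.0, 0.0, 0.25])))  # ~0.04, clear of the base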
+ +collision_spheres: + - j2s7s300_link_base: + - "center": [0.0, -0.003, 0.103] + "radius": 0.048 + - "center": [0.0, -0.0, 0.162] + "radius": 0.047 + - j2s7s300_link_1: + - "center": [-0.004, -0.002, -0.056] + "radius": 0.05 + - "center": [0.0, 0.001, -0.119] + "radius": 0.05 + - j2s7s300_link_2: + - "center": [0.0, -0.162, -0.001] + "radius": 0.049 + - "center": [-0.0, -0.0, 0.0] + "radius": 0.05 + - "center": [0.0, -0.108, -0.001] + "radius": 0.049 + - "center": [0.0, -0.054, -0.0] + "radius": 0.05 + - j2s7s300_link_3: + - "center": [-0.004, -0.0, -0.202] + "radius": 0.05 + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [-0.003, -0.0, -0.161] + "radius": 0.05 + - "center": [-0.002, -0.0, -0.121] + "radius": 0.05 + - "center": [-0.001, -0.0, -0.081] + "radius": 0.05 + - "center": [-0.001, -0.0, -0.04] + "radius": 0.05 + - j2s7s300_link_4: + - "center": [0.002, 0.21, -0.013] + "radius": 0.04 + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [0.001, 0.172, -0.01] + "radius": 0.042 + - "center": [0.001, 0.132, -0.008] + "radius": 0.044 + - "center": [0.001, 0.09, -0.005] + "radius": 0.046 + - "center": [0.0, 0.046, -0.003] + "radius": 0.048 + - j2s7s300_link_5: + - "center": [-0.001, 0.013, -0.106] + "radius": 0.04 + - "center": [-0.003, 0.003, -0.044] + "radius": 0.04 + - "center": [-0.002, 0.008, -0.075] + "radius": 0.04 + - j2s7s300_link_6: + - "center": [-0.001, 0.094, 0.0] + "radius": 0.04 + - "center": [0.0, -0.0, -0.017] + "radius": 0.04 + - "center": [-0.001, 0.047, -0.008] + "radius": 0.04 + - j2s7s300_link_7: + - "center": [-0.01, -0.017, -0.069] + "radius": 0.045 + - "center": [0.001, 0.023, -0.076] + "radius": 0.038 + - "center": [0.016, -0.022, -0.071] + "radius": 0.038 + - "center": [-0.0, 0.0, -0.031] + "radius": 0.04 + - j2s7s300_link_finger_tip_1: + - "center": [0.009, -0.004, -0.0] + "radius": 0.017 + - "center": [0.03, -0.007, -0.002] + "radius": 0.015 + - j2s7s300_link_finger_tip_2: + - "center": [0.009, -0.004, -0.0] + "radius": 0.017 + - "center": [0.032, -0.002, -0.001] + "radius": 0.015 + - j2s7s300_link_finger_2: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 + - j2s7s300_link_finger_3: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 + - j2s7s300_link_finger_tip_3: + - "center": [0.032, -0.004, 0.001] + "radius": 0.015 + - "center": [0.015, -0.006, -0.002] + "radius": 0.015 + - j2s7s300_link_finger_1: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 diff --git a/src/curobo/content/configs/robot/isaac_sim_description/tm12.yaml b/src/curobo/content/configs/robot/isaac_sim_description/tm12.yaml new file mode 100644 index 00000000..21714adf --- /dev/null +++ b/src/curobo/content/configs/robot/isaac_sim_description/tm12.yaml @@ -0,0 +1,103 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +# The robot description defines the generalized coordinates and how to map those +# to the underlying URDF dofs. 
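Since every cspace entry is assumed to have a matching joint in the URDF, a cross-check is cheap insurance against renames. A standard-library sketch; the tm12.urdf path is a placeholder for wherever the matching URDF lives:

import xml.etree.ElementTree as ET

import yaml

def missing_cspace_joints(desc_path: str, urdf_path: str) -> list:
    # Movable (non-fixed) joints declared in the URDF.
    root = ET.parse(urdf_path).getroot()
    movable = {j.get("name") for j in root.iter("joint") if j.get("type") != "fixed"}
    cspace = yaml.safe_load(open(desc_path))["cspace"]
    return [name for name in cspace if name not in movable]

print(missing_cspace_joints("tm12.yaml", "tm12.urdf"))  # [] when they agree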
+ +api_version: 1.0 + +# Defines the generalized coordinates. Each generalized coordinate is assumed +# to have an entry in the URDF. +# Lula will only use these joints to control the robot position. +cspace: + - shoulder_1_joint + - shoulder_2_joint + - elbow_joint + - wrist_1_joint + - wrist_2_joint + - wrist_3_joint +default_q: [ + -0.1,-0.8,1.5,-0.2,0.2,0.2 +] + +# Most dimensions of the cspace have a direct corresponding element +# in the URDF. This list of rules defines how unspecified coordinates +# should be extracted or how values in the URDF should be overwritten. + +cspace_to_urdf_rules: + +# Lula uses collision spheres to define the robot geometry in order to avoid +# collisions with external obstacles. If no spheres are specified, Lula will +# not be able to avoid obstacles. + +collision_spheres: + - link_1: + - "center": [-0.0, -0.0, 0.018] + "radius": 0.12 + - "center": [0.0, 0.0, -0.13] + "radius": 0.09 + - "center": [-0.017, -0.18, 0.02] + "radius": 0.12 + - link_2: + - "center": [0.116, 0.006, -0.182] + "radius": 0.075 + - "center": [0.638, -0.004, -0.192] + "radius": 0.08 + - "center": [0.19, 0.004, -0.183] + "radius": 0.075 + - "center": [0.265, 0.003, -0.184] + "radius": 0.075 + - "center": [0.34, 0.001, -0.186] + "radius": 0.075 + - "center": [0.414, 0.0, -0.187] + "radius": 0.075 + - "center": [0.489, -0.001, -0.189] + "radius": 0.075 + - "center": [0.563, -0.003, -0.19] + "radius": 0.075 + - link_3: + - "center": [0.012, 0.004, -0.076] + "radius": 0.08 + - "center": [0.55, -0.001, -0.046] + "radius": 0.07 + - "center": [0.088, 0.003, -0.061] + "radius": 0.06 + - "center": [0.165, 0.002, -0.056] + "radius": 0.06 + - "center": [0.242, 0.001, -0.052] + "radius": 0.06 + - "center": [0.319, 0.001, -0.047] + "radius": 0.06 + - "center": [0.396, -0.0, -0.043] + "radius": 0.06 + - "center": [0.473, -0.001, -0.038] + "radius": 0.06 + - link_4: + - "center": [0.0, 0.0, 0.0] + "radius": 0.07 + - link_6: + - "center": [0.003, -0.002, -0.028] + "radius": 0.06 + - "center": [-0.001, 0.075, 0.009] + "radius": 0.05 + - "center": [-0.0, 0.078, -0.028] + "radius": 0.05 + - "center": [-0.031, 0.128, 0.008] + "radius": 0.03 + - "center": [-0.006, 0.146, 0.0] + "radius": 0.03 + - "center": [0.025, 0.125, 0.007] + "radius": 0.03 + - "center": [-0.005, 0.128, 0.003] + "radius": 0.03 + - link_5: + - "center": [0.0, 0.0, 0.0] + "radius": 0.08 diff --git a/src/curobo/content/configs/robot/isaac_sim_description/ur5e.yaml b/src/curobo/content/configs/robot/isaac_sim_description/ur5e.yaml new file mode 100644 index 00000000..8b30d446 --- /dev/null +++ b/src/curobo/content/configs/robot/isaac_sim_description/ur5e.yaml @@ -0,0 +1,101 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +# The robot description defines the generalized coordinates and how to map those +# to the underlying URDF dofs. + +api_version: 1.0 + +# Defines the generalized coordinates. Each generalized coordinate is assumed +# to have an entry in the URDF. +# Lula will only use these joints to control the robot position. 
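In practice this split means a planner reports values only for the cspace joints, and the fixed cspace_to_urdf_rules fill in the remaining dofs. A sketch of that resolution using the finger rules from jaco_7s.yaml above (tip joints omitted for brevity):

cspace = [f"j2s7s300_joint_{i}" for i in range(1, 8)]
fixed_rules = {  # subset of the cspace_to_urdf_rules shown earlier
    "j2s7s300_joint_finger_1": 0.0,
    "j2s7s300_joint_finger_2": 1e-04,
    "j2s7s300_joint_finger_3": 0.0,
}

def to_urdf_positions(q: list) -> dict:
    assert len(q) == len(cspace)
    positions = dict(zip(cspace, q))
    positions.update(fixed_rules)  # fixed-value rules complete the state
    return positions

print(to_urdf_positions([0.0, 1.7, 0.0, 0.5236, 0.0, 1.1345, 0.0]))  # default_q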
+cspace: + - shoulder_pan_joint + - shoulder_lift_joint + - elbow_joint + - wrist_1_joint + - wrist_2_joint + - wrist_3_joint +default_q: [ + 0.0,0.0002,0.0,0.0002,0.0,0.0 +] + +# Most dimensions of the cspace have a direct corresponding element +# in the URDF. This list of rules defines how unspecified coordinates +# should be extracted or how values in the URDF should be overwritten. + +cspace_to_urdf_rules: + +# Lula uses collision spheres to define the robot geometry in order to avoid +# collisions with external obstacles. If no spheres are specified, Lula will +# not be able to avoid obstacles. + +collision_spheres: + - shoulder_link: + - "center": [0.0, 0.0, 0.0] + "radius": 0.1 + - upper_arm_link: + - "center": [-0.416, -0.0, 0.143] + "radius": 0.078 + - "center": [-0.015, 0.0, 0.134] + "radius": 0.077 + - "center": [-0.14, 0.0, 0.138] + "radius": 0.062 + - "center": [-0.285, -0.001, 0.139] + "radius": 0.061 + - "center": [-0.376, 0.001, 0.138] + "radius": 0.077 + - "center": [-0.222, 0.001, 0.139] + "radius": 0.061 + - "center": [-0.055, 0.008, 0.14] + "radius": 0.07 + - "center": [-0.001, -0.002, 0.143] + "radius": 0.076 + - forearm_link: + - "center": [-0.01, 0.002, 0.031] + "radius": 0.072 + - "center": [-0.387, 0.0, 0.014] + "radius": 0.057 + - "center": [-0.121, -0.0, 0.006] + "radius": 0.057 + - "center": [-0.206, 0.001, 0.007] + "radius": 0.057 + - "center": [-0.312, -0.001, 0.006] + "radius": 0.056 + - "center": [-0.057, 0.003, 0.008] + "radius": 0.065 + - "center": [-0.266, 0.0, 0.006] + "radius": 0.057 + - "center": [-0.397, -0.001, -0.018] + "radius": 0.052 + - "center": [-0.164, -0.0, 0.007] + "radius": 0.057 + - wrist_1_link: + - "center": [-0.0, 0.0, -0.009] + "radius": 0.047 + - "center": [-0.0, 0.0, -0.052] + "radius": 0.047 + - "center": [-0.002, 0.027, -0.001] + "radius": 0.045 + - "center": [0.001, -0.01, 0.0] + "radius": 0.046 + - wrist_2_link: + - "center": [0.0, -0.01, -0.001] + "radius": 0.047 + - "center": [0.0, 0.008, -0.001] + "radius": 0.047 + - "center": [0.001, -0.001, -0.036] + "radius": 0.047 + - "center": [0.001, -0.03, -0.0] + "radius": 0.047 + - wrist_3_link: + - "center": [0.001, 0.001, -0.029] + "radius": 0.043 diff --git a/src/curobo/content/configs/robot/jaco7.yml b/src/curobo/content/configs/robot/jaco7.yml new file mode 100644 index 00000000..d7e76c89 --- /dev/null +++ b/src/curobo/content/configs/robot/jaco7.yml @@ -0,0 +1,217 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
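Note that two sphere layouts are in play: these isaac_sim_description files store collision_spheres as a list of single-key {link: spheres} items, while the robot_cfg files use a plain mapping (jaco7.yml below keeps the spheres inline; franka.yml points at a spheres/*.yml file instead). A small normalizer for the inline variants:

import yaml

def sphere_counts(path: str) -> dict:
    data = yaml.safe_load(open(path))
    cs = data.get("collision_spheres")
    if cs is None:  # robot_cfg layout nests the spheres under kinematics
        cs = data["robot_cfg"]["kinematics"]["collision_spheres"]
    if isinstance(cs, list):  # Lula layout: list of {link: [spheres]} items
        merged = {}
        for item in cs:
            merged.update(item)
        cs = merged
    # A string here would be a relative spheres/*.yml path; not handled in this sketch.
    return {link: len(spheres) for link, spheres in cs.items()}

print(sphere_counts("ur5e.yaml"))  # e.g. {'shoulder_link': 1, 'upper_arm_link': 8, ...}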
+## + +robot_cfg: + + kinematics: + + urdf_path: "robot/jaco/jaco_7s.urdf" + asset_root_path: "robot/jaco" + usd_path: "robot/jaco/jaco_7s.usda" + isaac_usd_path: "/Isaac/Robots/Kinova/Jaco2/J2N7S300/j2n7s300_instanceable.usd" + usd_robot_root: "/jaco" + #usd_robot_root: "/robot" + usd_flip_joints: { + j2s7s300_joint_1: "Z", + j2s7s300_joint_2: "Z", + j2s7s300_joint_3: "Z", + j2s7s300_joint_4: "Z", + j2s7s300_joint_5: "Z", + j2s7s300_joint_6: "Z", + j2s7s300_joint_7 : "Z", + j2s7s300_joint_finger_1: "Z", + j2s7s300_joint_finger_2: "Z", + j2s7s300_joint_finger_3: "Z", + j2s7s300_joint_finger_tip_1: "Z", + j2s7s300_joint_finger_tip_2: "Z", + j2s7s300_joint_finger_tip_3: "Z", + } + base_link: "root" + ee_link: "j2s7s300_end_effector" + link_names: null + + collision_link_names: [ + "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2", + "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7", + "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2", + "j2s7s300_link_finger_tip_3", + "j2s7s300_link_finger_3", + "j2s7s300_link_finger_2", + "j2s7s300_link_finger_1", + ] + + collision_spheres: + j2s7s300_link_base: + - "center": [0.0, -0.003, 0.103] + "radius": 0.048 + - "center": [0.0, -0.0, 0.162] + "radius": 0.047 + j2s7s300_link_1: + - "center": [-0.004, -0.002, -0.056] + "radius": 0.05 + - "center": [0.0, 0.001, -0.119] + "radius": 0.05 + j2s7s300_link_2: + - "center": [0.0, -0.162, -0.001] + "radius": 0.049 + - "center": [-0.0, -0.0, 0.0] + "radius": 0.05 + - "center": [0.0, -0.108, -0.001] + "radius": 0.049 + - "center": [0.0, -0.054, -0.0] + "radius": 0.05 + j2s7s300_link_3: + - "center": [-0.004, -0.0, -0.202] + "radius": 0.05 + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [-0.003, -0.0, -0.161] + "radius": 0.05 + - "center": [-0.002, -0.0, -0.121] + "radius": 0.05 + - "center": [-0.001, -0.0, -0.081] + "radius": 0.05 + - "center": [-0.001, -0.0, -0.04] + "radius": 0.05 + j2s7s300_link_4: + - "center": [0.002, 0.21, -0.013] + "radius": 0.04 + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [0.001, 0.172, -0.01] + "radius": 0.042 + - "center": [0.001, 0.132, -0.008] + "radius": 0.044 + - "center": [0.001, 0.09, -0.005] + "radius": 0.046 + - "center": [0.0, 0.046, -0.003] + "radius": 0.048 + j2s7s300_link_5: + - "center": [-0.001, 0.013, -0.106] + "radius": 0.04 + - "center": [-0.003, 0.003, -0.044] + "radius": 0.04 + - "center": [-0.002, 0.008, -0.075] + "radius": 0.04 + j2s7s300_link_6: + - "center": [-0.001, 0.094, 0.0] + "radius": 0.04 + - "center": [0.0, -0.0, -0.017] + "radius": 0.04 + - "center": [-0.001, 0.047, -0.008] + "radius": 0.04 + j2s7s300_link_7: + - "center": [-0.01, -0.017, -0.069] + "radius": 0.045 + - "center": [0.001, 0.023, -0.076] + "radius": 0.038 + - "center": [0.016, -0.022, -0.071] + "radius": 0.038 + - "center": [-0.0, 0.0, -0.031] + "radius": 0.04 + j2s7s300_link_finger_tip_1: + - "center": [0.009, -0.004, -0.0] + "radius": 0.017 + - "center": [0.03, -0.007, -0.002] + "radius": 0.015 + j2s7s300_link_finger_tip_2: + - "center": [0.009, -0.004, -0.0] + "radius": 0.017 + - "center": [0.032, -0.002, -0.001] + "radius": 0.015 + j2s7s300_link_finger_2: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 + j2s7s300_link_finger_3: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 + j2s7s300_link_finger_tip_3: + - "center": [0.032, -0.004, 0.001] + "radius": 0.015 + - "center": [0.015, 
-0.006, -0.002] + "radius": 0.015 + j2s7s300_link_finger_1: + - "center": [0.011, -0.01, -0.0] + "radius": 0.021 + - "center": [0.032, -0.01, 0.001] + "radius": 0.02 + + collision_sphere_buffer: 0.005 + self_collision_ignore: { + "j2s7s300_link_base":["j2s7s300_link_1"], + "j2s7s300_link_1":["j2s7s300_link_2"], + "j2s7s300_link_2":["j2s7s300_link_3"], + "j2s7s300_link_3":["j2s7s300_link_4"], + "j2s7s300_link_4":["j2s7s300_link_5"], + "j2s7s300_link_5":["j2s7s300_link_6"], + "j2s7s300_link_6":["j2s7s300_link_7"], + "j2s7s300_link_7":[ + "j2s7s300_link_finger_tip_1", + "j2s7s300_link_finger_tip_2", + "j2s7s300_link_finger_tip_3", + "j2s7s300_link_finger_1", + "j2s7s300_link_finger_2", + "j2s7s300_link_finger_3", + ], + + "j2s7s300_link_finger_3":["j2s7s300_link_finger_tip_3","j2s7s300_link_finger_2", + "j2s7s300_link_finger_1"], + "j2s7s300_link_finger_2":["j2s7s300_link_finger_tip_2", "j2s7s300_link_finger_1", + "j2s7s300_link_finger_3"], + "j2s7s300_link_finger_1":["j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_2", + "j2s7s300_link_finger_3"], + + } # Dict[str, List[str]] + self_collision_buffer: {} # Dict[str, float] + + mesh_link_names: [ + "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2", + "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7", + "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2", + "j2s7s300_link_finger_tip_3", + "j2s7s300_link_finger_3", + "j2s7s300_link_finger_2", + "j2s7s300_link_finger_1", + ] # List[str] + lock_joints: {"j2s7s300_joint_finger_1": 0, + "j2s7s300_joint_finger_2": 0, + "j2s7s300_joint_finger_3": 0, + "j2s7s300_joint_finger_tip_1": 0, + "j2s7s300_joint_finger_tip_2": 0, + "j2s7s300_joint_finger_tip_3": 0, + + } + + cspace: + joint_names: + - j2s7s300_joint_1 + - j2s7s300_joint_2 + - j2s7s300_joint_3 + - j2s7s300_joint_4 + - j2s7s300_joint_5 + - j2s7s300_joint_6 + - j2s7s300_joint_7 + - j2s7s300_joint_finger_1 + - j2s7s300_joint_finger_2 + - j2s7s300_joint_finger_3 + - j2s7s300_joint_finger_tip_1 + - j2s7s300_joint_finger_tip_2 + - j2s7s300_joint_finger_tip_3 + retract_config: [0.0,4.0,0.0,5,0.0,3.0,0.0, 0,0,0,0,0,0] + null_space_weight: [1,1,1,1,1,1,1, 1,1,1,1,1,1] # List[str] + cspace_distance_weight: [1,1,1,1,1,1,1,1,1,1,1,1,1] # List[str] + + max_acceleration: 15.0 + max_jerk: 500.0 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/kinova_gen3.yml b/src/curobo/content/configs/robot/kinova_gen3.yml new file mode 100644 index 00000000..3af7ee82 --- /dev/null +++ b/src/curobo/content/configs/robot/kinova_gen3.yml @@ -0,0 +1,92 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
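jaco7.yml above locks all six finger joints at zero: the fingers stay in the kinematic model, so their collision spheres keep moving with the wrist, but they drop out of the optimized space. The bookkeeping, in sketch form:

joint_names = [f"j2s7s300_joint_{i}" for i in range(1, 8)] \
    + [f"j2s7s300_joint_finger_{i}" for i in (1, 2, 3)] \
    + [f"j2s7s300_joint_finger_tip_{i}" for i in (1, 2, 3)]
lock_joints = {name: 0.0 for name in joint_names[7:]}  # fingers and tips

active = [j for j in joint_names if j not in lock_joints]
assert len(active) == 7  # the planner only ever sees the arm joints

def full_config(q_active: list) -> dict:
    state = dict(zip(active, q_active))
    state.update(lock_joints)  # locked joints pinned at their lock value
    return state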
+## +robot_cfg: + kinematics: + base_link: base_link + collision_link_names: + - base_link + - shoulder_link + - half_arm_1_link + - half_arm_2_link + - forearm_link + - spherical_wrist_1_link + - spherical_wrist_2_link + - bracelet_link + - robotiq_arg2f_base_link + - left_outer_finger + - left_inner_finger_pad + - right_outer_finger + - right_inner_finger_pad + collision_sphere_buffer: 0.005 + collision_spheres: spheres/kinova_gen3.yml + ee_link: tool_frame + self_collision_buffer: + base_link: 0.0 + forearm_link: 0.0 + half_arm_1_link: 0.0 + half_arm_2_link: 0.0 + shoulder_link: 0.0 + spherical_wrist_1_link : 0.0 + spherical_wrist_2_link : 0.0 + bracelet_link: 0.0 + robotiq_arg2f_base_link: 0.0 + left_outer_finger: 0.0 + left_inner_finger_pad: 0.0 + right_outer_finger: 0.0 + right_inner_finger_pad: 0.0 + left_outer_knuckle: 0.0 + self_collision_ignore: { + base_link: [shoulder_link], + shoulder_link: [half_arm_1_link], + half_arm_1_link: [half_arm_2_link], + half_arm_2_link: [forearm_link], + forearm_link: [spherical_wrist_1_link], + spherical_wrist_1_link: [spherical_wrist_2_link], + spherical_wrist_2_link: [bracelet_link], + bracelet_link: [robotiq_arg2f_base_link], + robotiq_arg2f_base_link: [left_outer_finger, right_outer_finger, left_inner_finger_pad, right_inner_finger_pad], + left_outer_finger: [right_outer_finger, right_inner_finger_pad, left_inner_finger_pad], + right_outer_finger: [right_inner_finger_pad, left_inner_finger_pad], + left_inner_finger_pad: [right_inner_finger_pad], + + } + urdf_path: robot/kinova/kinova_gen3_7dof.urdf + asset_root_path: robot/kinova + + cspace: + joint_names: ['joint_1', 'joint_2', 'joint_3', 'joint_4', 'joint_5', 'joint_6', 'joint_7'] + cspace_distance_weight: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + + null_space_weight: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + retract_config: + - 0.0 + - -0.8 + - 0.0 + - 1.5 + - 0.0 + - 0.4 + - 0.0 + max_acceleration: 25.0 + max_jerk: 500.0 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/quad_ur10e.yml b/src/curobo/content/configs/robot/quad_ur10e.yml new file mode 100644 index 00000000..a90fd085 --- /dev/null +++ b/src/curobo/content/configs/robot/quad_ur10e.yml @@ -0,0 +1,85 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
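The retract_config and null_space_weight entries in these files define a weighted pull toward a home pose, used to bias redundant dofs in IK and to regularize planning. One common form is a diagonal quadratic; whether cuRobo's internal cost matches this exactly is an assumption here:

import numpy as np

q_retract = np.array([0.0, -0.8, 0.0, 1.5, 0.0, 0.4, 0.0])  # kinova_gen3.yml
weight = np.ones(7)  # null_space_weight, all 1.0 for this robot

def retract_cost(q: np.ndarray) -> float:
    e = weight * (q - q_retract)
    return 0.5 * float(e @ e)  # zero exactly at the retract configuration

print(retract_cost(q_retract + 0.1))  # 0.035 for a uniform 0.1 rad offset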
+##
+
+robot_cfg:
+  kinematics:
+    use_usd_kinematics: False
+    urdf_path: "robot/ur_description/quad_ur10e.urdf"
+    asset_root_path: "robot/ur_description"
+    isaac_usd_path: "/Isaac/Robots/UR10/ur10_long_suction.usd"
+    usd_robot_root: "/ur10"
+    usd_path: "robot/ur_description/ur10e.usd"
+    base_link: "base_fixture_link"
+    ee_link: "tool0"
+    link_names: ["tool0", "tool1", "tool2", "tool3"]
+    collision_link_names: [
+      'shoulder_link', 'upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link', 'wrist_3_link', 'tool0',
+      'shoulder_link_1', 'upper_arm_link_1', 'forearm_link_1', 'wrist_1_link_1', 'wrist_2_link_1', 'wrist_3_link_1', 'tool1',
+      'shoulder_link_2', 'upper_arm_link_2', 'forearm_link_2', 'wrist_1_link_2', 'wrist_2_link_2', 'wrist_3_link_2', 'tool2',
+      'shoulder_link_3', 'upper_arm_link_3', 'forearm_link_3', 'wrist_1_link_3', 'wrist_2_link_3', 'wrist_3_link_3', 'tool3']
+    collision_spheres: 'spheres/quad_ur10e.yml'
+    collision_sphere_buffer: 0.005
+
+    self_collision_ignore: {
+      "upper_arm_link": ["shoulder_link", "forearm_link"],
+      "forearm_link": ["wrist_1_link"],
+      "wrist_1_link": ["wrist_2_link", "wrist_3_link"],
+      "wrist_2_link": ["wrist_3_link", "tool0"],
+      "wrist_3_link": ["tool0"],
+
+      "upper_arm_link_1": ["shoulder_link_1", "forearm_link_1"],
+      "forearm_link_1": ["wrist_1_link_1"],
+      "wrist_1_link_1": ["wrist_2_link_1", "wrist_3_link_1"],
+      "wrist_2_link_1": ["wrist_3_link_1", "tool1"],
+      "wrist_3_link_1": ["tool1"],
+
+      "upper_arm_link_2": ["shoulder_link_2", "forearm_link_2"],
+      "forearm_link_2": ["wrist_1_link_2"],
+      "wrist_1_link_2": ["wrist_2_link_2", "wrist_3_link_2"],
+      "wrist_2_link_2": ["wrist_3_link_2", "tool2"],
+      "wrist_3_link_2": ["tool2"],
+
+      "upper_arm_link_3": ["shoulder_link_3", "forearm_link_3"],
+      "forearm_link_3": ["wrist_1_link_3"],
+      "wrist_1_link_3": ["wrist_2_link_3", "wrist_3_link_3"],
+      "wrist_2_link_3": ["wrist_3_link_3", "tool3"],
+      "wrist_3_link_3": ["tool3"],
+    }
+    self_collision_buffer: {
+      'shoulder_link': 0.05,
+      'shoulder_link_1': 0.05,
+      'shoulder_link_2': 0.05,
+      'shoulder_link_3': 0.05,
+    }
+    lock_joints: null
+
+  cspace:
+    joint_names: [
+      'shoulder_pan_joint', 'shoulder_lift_joint', 'elbow_joint', 'wrist_1_joint', 'wrist_2_joint', 'wrist_3_joint',
+      'shoulder_pan_joint_1', 'shoulder_lift_joint_1', 'elbow_joint_1', 'wrist_1_joint_1', 'wrist_2_joint_1', 'wrist_3_joint_1',
+      'shoulder_pan_joint_2', 'shoulder_lift_joint_2', 'elbow_joint_2', 'wrist_1_joint_2', 'wrist_2_joint_2', 'wrist_3_joint_2',
+      'shoulder_pan_joint_3', 'shoulder_lift_joint_3', 'elbow_joint_3', 'wrist_1_joint_3', 'wrist_2_joint_3', 'wrist_3_joint_3']
+    retract_config: [-1.57, -2.2, 1.9, -1.383, -1.57, 0.00,
+                     -1.57, -2.2, 1.9, -1.383, -1.57, 0.00,
+                     -1.57, -2.2, 1.9, -1.383, -1.57, 0.00,
+                     -1.57, -2.2, 1.9, -1.383, -1.57, 0.00]
+    null_space_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                        1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                        1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                        1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+    cspace_distance_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+    max_jerk: 500.0
+    max_acceleration: 15.0
diff --git a/src/curobo/content/configs/robot/spheres/dual_ur10e.yml b/src/curobo/content/configs/robot/spheres/dual_ur10e.yml
new file mode 100644
index 00000000..62e1cb96
--- /dev/null
+++ b/src/curobo/content/configs/robot/spheres/dual_ur10e.yml
@@ -0,0 +1,110 @@
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +robot: 'UR10' +collision_spheres: + shoulder_link: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + tool0: + - center: [0, 0, 0.03] + radius: 0.05 + + + shoulder_link_1: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link_1: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link_1: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link_1: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link_1: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link_1: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + + tool1: + - center: [0, 0, 0.03] + radius: 0.05 + \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/franka.yml b/src/curobo/content/configs/robot/spheres/franka.yml new file mode 100644 index 00000000..262ba164 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka.yml @@ -0,0 +1,156 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
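spheres/dual_ur10e.yml above is literally one UR10e sphere set repeated with a _1 suffix on every link name. The copy could be generated instead of maintained by hand; a sketch, with the single-arm source file name being hypothetical:

import copy

import yaml

def duplicate_arm(path: str, suffix: str = "_1") -> dict:
    doc = yaml.safe_load(open(path))
    spheres = doc["collision_spheres"]
    # Re-emit every per-link sphere block under a suffixed link name.
    second = {f"{k}{suffix}": copy.deepcopy(v) for k, v in spheres.items()}
    doc["collision_spheres"] = {**spheres, **second}
    return doc

print(yaml.safe_dump(duplicate_arm("spheres/ur10e_single.yml")))  # hypothetical input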
+## +robot: 'Franka Panda' +urdf_path: "urdf/franka_description/franka_panda_dyn.urdf" +collision_spheres: + panda_link0: + - "center": [-0.08, 0.0, 0.05] + "radius": 0.06 + - "center": [-0.0, 0.0, 0.05] + "radius": 0.08 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.06 + - "center": [0.0, -0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.06 + - "center": [0.0, 0.0, 0.08] + "radius": 0.06 + - "center": [0.0, -0.12, 0.0] + "radius": 0.06 + - "center": [0.0, -0.17, 0.0] + "radius": 0.06 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.055 + - "center": [0.08, 0.02, 0.0] + "radius": 0.055 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.055 + - "center": [0.0, 0.0, 0.06] + "radius": 0.055 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.06 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.055 + panda_link5: + - "center": [0.0, 0.055, 0.0] + "radius": 0.05 + - "center": [0.0, 0.085, 0.0] + "radius": 0.055 + - "center": [0.0, 0.000, -0.22] + "radius": 0.05 + - "center": [0.0, 0.05, -0.18] + "radius": 0.045 + - "center": [0.015, 0.08, -0.14] + "radius": 0.03 + - "center": [0.015, 0.085, -0.11] + "radius": 0.03 + - "center": [0.015, 0.09, -0.08] + "radius": 0.03 + - "center": [0.015, 0.095, -0.05] + "radius": 0.03 + - "center": [-0.015, 0.08, -0.14] + "radius": 0.03 + - "center": [-0.015, 0.085, -0.11] + "radius": 0.03 + - "center": [-0.015, 0.09, -0.08] + "radius": 0.03 + - "center": [-0.015, 0.095, -0.05] + "radius": 0.03 + panda_link6: + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [0.08, 0.035, 0.0] + "radius": 0.05 + - "center": [0.08, -0.01, 0.0] + "radius": 0.05 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.05 + - "center": [0.02, 0.04, 0.08] + "radius": 0.025 + - "center": [0.04, 0.02, 0.08] + "radius": 0.025 + - "center": [0.04, 0.06, 0.085] + "radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.075, 0.01] + "radius": 0.03 + - "center": [0.0, -0.045, 0.01] + "radius": 0.03 + - "center": [0.0, -0.015, 0.01] + "radius": 0.03 + - "center": [0.0, 0.015, 0.01] + "radius": 0.03 + - "center": [0.0, 0.045, 0.01] + "radius": 0.03 + - "center": [0.0, 0.075, 0.01] + "radius": 0.03 + - "center": [0.0, 0.065, -0.02] + "radius": 0.03 + - "center": [0.0, -0.075, 0.03] + "radius": 0.03 + - "center": [0.0, -0.045, 0.03] + "radius": 0.03 + - "center": [0.0, -0.015, 0.03] + "radius": 0.03 + - "center": [0.0, 0.015, 0.03] + "radius": 0.03 + - "center": [0.0, 0.045, 0.03] + "radius": 0.03 + - "center": [0.0, 0.075, 0.03] + "radius": 0.03 + - "center": [0.0, -0.075, 0.05] + "radius": 0.02 + - "center": [0.0, -0.045, 0.05] + "radius": 0.02 + - "center": [0.0, -0.015, 0.05] + "radius": 0.02 + - "center": [0.0, 0.015, 0.05] + "radius": 0.02 + - "center": [0.0, 0.045, 0.05] + "radius": 0.02 + - "center": [0.0, 0.075, 0.05] + "radius": 0.02 + panda_leftfinger: + - "center": [0.0, 0.01, 0.04] + "radius": 0.01 + - "center": [0.0, 0.01, 0.014] + "radius": 0.01 + panda_rightfinger: + - "center": [0.0, -0.01, 0.04] + "radius": 0.01 + - "center": [0.0, -0.01, 0.014] + "radius": 0.01 + attached_object: + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + 
"radius": -10.0 + \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/franka_collision_mesh.yml b/src/curobo/content/configs/robot/spheres/franka_collision_mesh.yml new file mode 100644 index 00000000..74c2d372 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka_collision_mesh.yml @@ -0,0 +1,142 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +collision_spheres: + panda_link0: + - "center": [0.0, 0.0, 0.085] + "radius": 0.03 #0.07 + - "center": [-0.1, -0.0, 0.085] + "radius": 0.03 #0.07 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.06 + - "center": [0.0, -0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.06 + - "center": [0.0, 0.0, 0.08] + "radius": 0.06 + - "center": [0.0, -0.12, 0.0] + "radius": 0.06 + - "center": [0.0, -0.17, 0.0] + "radius": 0.06 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.055 + - "center": [0.08, 0.02, 0.0] + "radius": 0.055 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.06 + - "center": [0.0, 0.0, 0.06] + "radius": 0.06 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.06 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.06 + panda_link5: + - "center": [0.0, 0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.08, 0.0] + "radius": 0.06 + - "center": [0.0, 0.000, -0.22] + "radius": 0.06 + - "center": [0.0, 0.05, -0.18] + "radius": 0.06 + - "center": [0.01, 0.08, -0.14] + "radius": 0.06 + - "center": [0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [0.01, 0.095, -0.05] + "radius": 0.025 + - "center": [-0.01, 0.08, -0.14] + "radius": 0.025 + - "center": [-0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [-0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [-0.01, 0.095, -0.05] + "radius": 0.025 + panda_link6: + - "center": [0.0, 0.0, 0.012] + "radius": 0.052 + - "center": [0.08, 0.03, 0.0] + "radius": 0.05 + - "center": [0.08, -0.01, 0.0] + "radius": 0.05 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.045 + - "center": [0.02, 0.04, 0.08] + "radius": 0.025 + - "center": [0.04, 0.02, 0.08] + "radius": 0.025 + - "center": [0.04, 0.06, 0.085] + "radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.075, 0.01] + "radius": 0.024 + - "center": [0.0, -0.045, 0.01] + "radius": 0.024 + - "center": [0.0, -0.015, 0.01] + "radius": 0.024 + - "center": [0.0, 0.015, 0.01] + "radius": 0.024 + - "center": [0.0, 0.045, 0.01] + "radius": 0.024 + - "center": [0.0, 0.075, 0.01] + "radius": 0.024 + - "center": [0.0, -0.08, 0.03] + "radius": 0.022 + - "center": [0.0, -0.045, 0.03] + "radius": 0.022 + - "center": [0.0, -0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.045, 0.03] + "radius": 0.022 + - "center": [0.0, 0.08, 0.03] + "radius": 0.022 + - "center": [0.0, -0.08, 
0.045] + "radius": 0.022 + - "center": [0.0, -0.045, 0.045] + "radius": 0.022 + - "center": [0.0, -0.015, 0.045] + "radius": 0.022 + - "center": [0.0, 0.015, 0.045] + "radius": 0.022 + - "center": [0.0, 0.045, 0.045] + "radius": 0.022 + - "center": [0.0, 0.08, 0.045] + "radius": 0.022 + panda_leftfinger: + - "center": [0.0, 0.01, 0.043] + "radius": 0.0125 # 0.01 + - "center": [0.0, 0.02, 0.015] + "radius": 0.0125 + panda_rightfinger: + - "center": [0.0, -0.01, 0.043] + "radius": 0.0125 # 0.01 + - "center": [0.0, -0.02, 0.015] + "radius": 0.0125 diff --git a/src/curobo/content/configs/robot/spheres/franka_mesh.yml b/src/curobo/content/configs/robot/spheres/franka_mesh.yml new file mode 100644 index 00000000..e4b578c5 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka_mesh.yml @@ -0,0 +1,144 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +collision_spheres: + panda_link0: + - "center": [0.0, 0.0, 0.085] + "radius": 0.03 #0.07 + - "center": [-0.1, -0.0, 0.085] + "radius": 0.03 #0.07 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.055 + - "center": [0.0, -0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.055 + - "center": [0.0, 0.0, 0.08] + "radius": 0.055 + - "center": [0.0, -0.12, 0.0] + "radius": 0.055 + - "center": [0.0, -0.17, 0.0] + "radius": 0.055 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.052 + - "center": [0.08, 0.02, 0.0] + "radius": 0.052 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.052 + - "center": [0.0, 0.0, 0.06] + "radius": 0.052 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.055 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.052 + panda_link5: + - "center": [0.0, 0.03, 0.0] + "radius": 0.05 + - "center": [0.0, 0.082, 0.0] + "radius": 0.05 + - "center": [0.0, 0.000, -0.22] + "radius": 0.05 + - "center": [0.0, 0.052, -0.18] + "radius": 0.04 + - "center": [0.01, 0.08, -0.14] + "radius": 0.022 + - "center": [0.01, 0.085, -0.11] + "radius": 0.022 + - "center": [0.01, 0.09, -0.08] + "radius": 0.022 + - "center": [0.01, 0.095, -0.05] + "radius": 0.022 + - "center": [-0.01, 0.08, -0.14] + "radius": 0.022 + - "center": [-0.01, 0.085, -0.11] + "radius": 0.022 + - "center": [-0.01, 0.09, -0.08] + "radius": 0.022 + - "center": [-0.01, 0.095, -0.05] + "radius": 0.022 + - "center": [0.0, -0.009, 0.0] + "radius": 0.05 + panda_link6: + - "center": [0.085, 0.035, 0.0] + "radius": 0.045 + - "center": [0.085, 0.0, 0.0] + "radius": 0.045 + - "center": [0.085, -0.015, 0.0] + "radius": 0.045 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.045 + - "center": [0.02, 0.04, 0.08] + "radius": 0.024 + - "center": [0.04, 0.02, 0.08] + "radius": 0.024 + - "center": [0.04, 0.06, 0.085] + "radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.075, 0.01] + "radius": 0.023 + - "center": [0.0, -0.045, 0.01] + "radius": 
0.023 + - "center": [0.0, -0.015, 0.01] + "radius": 0.023 + - "center": [0.0, 0.015, 0.01] + "radius": 0.023 + - "center": [0.0, 0.045, 0.01] + "radius": 0.023 + - "center": [0.0, 0.075, 0.01] + "radius": 0.023 + - "center": [0.0, -0.08, 0.03] + "radius": 0.022 + - "center": [0.0, -0.045, 0.03] + "radius": 0.022 + - "center": [0.0, -0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.045, 0.03] + "radius": 0.022 + - "center": [0.0, 0.08, 0.03] + "radius": 0.022 + - "center": [0.0, -0.08, 0.045] + "radius": 0.022 + - "center": [0.0, -0.045, 0.045] + "radius": 0.022 + - "center": [0.0, -0.015, 0.045] + "radius": 0.022 + - "center": [0.0, 0.015, 0.045] + "radius": 0.022 + - "center": [0.0, 0.045, 0.045] + "radius": 0.022 + - "center": [0.0, 0.08, 0.045] + "radius": 0.022 + panda_leftfinger: + - "center": [0.0, 0.01, 0.043] + "radius": 0.011 # 25 + - "center": [0.0, 0.02, 0.015] + "radius": 0.011 # 25 + panda_rightfinger: + - "center": [0.0, -0.01, 0.043] + "radius": 0.011 #25 + - "center": [0.0, -0.02, 0.015] + "radius": 0.011 #25 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/franka_mesh_inside.yml b/src/curobo/content/configs/robot/spheres/franka_mesh_inside.yml new file mode 100644 index 00000000..14abaa17 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka_mesh_inside.yml @@ -0,0 +1,146 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
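Given per-link sphere sets like franka_mesh.yml above, the self-collision test between two links reduces to all-pairs sphere overlap, with each side padded by its self_collision_buffer from the robot config. A numpy sketch; sphere centers are assumed already placed in a common frame by forward kinematics:

import numpy as np

def links_collide(a: np.ndarray, b: np.ndarray,
                  buffer_a: float = 0.0, buffer_b: float = 0.0) -> bool:
    # a, b: (n, 4) and (m, 4) arrays of x, y, z, radius in one frame.
    dist = np.linalg.norm(a[:, None, :3] - b[None, :, :3], axis=-1)
    radii = (a[:, 3] + buffer_a)[:, None] + (b[None, :, 3] + buffer_b)
    return bool((dist < radii).any())  # any overlapping pair means collision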
+## +collision_spheres: + panda_link0: + - "center": [0.0, 0.0, 0.1] + "radius": 0.02 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.06 + - "center": [0.0, -0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.06 + - "center": [0.0, 0.0, 0.08] + "radius": 0.06 + - "center": [0.0, -0.12, 0.0] + "radius": 0.06 + - "center": [0.0, -0.17, 0.0] + "radius": 0.06 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.055 + - "center": [0.08, 0.02, 0.0] + "radius": 0.055 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.055 + - "center": [0.0, 0.0, 0.06] + "radius": 0.055 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.06 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.055 + panda_link5: + - "center": [0.0, 0.055, 0.0] + "radius": 0.06 + - "center": [0.0, 0.075, 0.0] + "radius": 0.06 + - "center": [0.0, 0.000, -0.22] + "radius": 0.06 + - "center": [0.0, 0.05, -0.18] + "radius": 0.05 + - "center": [0.01, 0.08, -0.14] + "radius": 0.025 + - "center": [0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [0.01, 0.095, -0.05] + "radius": 0.025 + - "center": [-0.01, 0.08, -0.14] + "radius": 0.025 + - "center": [-0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [-0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [-0.01, 0.095, -0.05] + "radius": 0.025 + panda_link6: + - "center": [0.0, 0.0, 0.0] + "radius": 0.06 + - "center": [0.08, 0.03, 0.0] + "radius": 0.06 + - "center": [0.08, -0.01, 0.0] + "radius": 0.06 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.05 + - "center": [0.02, 0.04, 0.08] + "radius": 0.025 + - "center": [0.04, 0.02, 0.08] + "radius": 0.025 + - "center": [0.04, 0.06, 0.085] + "radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.075, 0.01] + "radius": 0.028 + - "center": [0.0, -0.045, 0.01] + "radius": 0.028 + - "center": [0.0, -0.015, 0.01] + "radius": 0.028 + - "center": [0.0, 0.015, 0.01] + "radius": 0.028 + - "center": [0.0, 0.045, 0.01] + "radius": 0.028 + - "center": [0.0, 0.075, 0.01] + "radius": 0.028 + - "center": [0.0, -0.075, 0.03] + "radius": 0.026 + - "center": [0.0, -0.045, 0.03] + "radius": 0.026 + - "center": [0.0, -0.015, 0.03] + "radius": 0.026 + - "center": [0.0, 0.015, 0.03] + "radius": 0.026 + - "center": [0.0, 0.045, 0.03] + "radius": 0.026 + - "center": [0.0, 0.075, 0.03] + "radius": 0.026 + - "center": [0.0, -0.075, 0.05] + "radius": 0.024 + - "center": [0.0, -0.045, 0.05] + "radius": 0.024 + - "center": [0.0, -0.015, 0.05] + "radius": 0.024 + - "center": [0.0, 0.015, 0.05] + "radius": 0.024 + - "center": [0.0, 0.045, 0.05] + "radius": 0.024 + - "center": [0.0, 0.075, 0.05] + "radius": 0.024 + #panda_leftfinger: + # - "center": [0.0, 0.01, 0.043] + # "radius": 0.01 + # - "center": [0.0, 0.02, 0.015] + # "radius": 0.01 + #panda_rightfinger: + # - "center": [0.0, -0.01, 0.043] + # "radius": 0.01 + # - "center": [0.0, -0.02, 0.015] + # "radius": 0.01 + panda_leftfinger: + - "center": [0.0, 0.0075, 0.0] + "radius": 0.0108 + panda_rightfinger: + - "center": [0.0, -0.0075, 0.0] + "radius": 0.0108 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/franka_mesh_mb.yml b/src/curobo/content/configs/robot/spheres/franka_mesh_mb.yml new file mode 100644 index 
00000000..3e96972f --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka_mesh_mb.yml @@ -0,0 +1,159 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +collision_spheres: + panda_link0: + - "center": [0.0, 0.0, 0.1] + "radius": 0.03 + - "center": [-0.1, -0.0, 0.1] + "radius": 0.03 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.06 + - "center": [0.0, -0.03, 0.0] + "radius": 0.06 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.06 + - "center": [0.0, 0.0, 0.08] + "radius": 0.06 + - "center": [0.0, -0.12, 0.0] + "radius": 0.06 + - "center": [0.0, -0.17, 0.0] + "radius": 0.06 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.055 + - "center": [0.08, 0.02, 0.0] + "radius": 0.055 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.055 + - "center": [0.0, 0.0, 0.06] + "radius": 0.055 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.06 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.055 + panda_link5: + - "center": [0.0, 0.03, 0.0] + "radius": 0.05 + - "center": [0.0, 0.08, 0.0] + "radius": 0.05 + - "center": [0.0, 0.000, -0.22] + "radius": 0.05 + - "center": [0.0, 0.05, -0.18] + "radius": 0.045 + - "center": [0.01, 0.08, -0.14] + "radius": 0.025 + - "center": [0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [0.01, 0.095, -0.05] + "radius": 0.025 + - "center": [-0.01, 0.08, -0.14] + "radius": 0.025 + - "center": [-0.01, 0.085, -0.11] + "radius": 0.025 + - "center": [-0.01, 0.09, -0.08] + "radius": 0.025 + - "center": [-0.01, 0.095, -0.05] + "radius": 0.025 + panda_link6: + - "center": [0.0, 0.0, 0.012] + "radius": 0.052 + - "center": [0.08, 0.03, 0.0] + "radius": 0.05 + - "center": [0.08, -0.01, 0.0] + "radius": 0.05 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.045 + - "center": [0.02, 0.04, 0.08] + "radius": 0.025 + - "center": [0.04, 0.02, 0.08] + "radius": 0.025 + - "center": [0.04, 0.06, 0.085] + "radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.075, 0.01] + "radius": 0.024 + - "center": [0.0, -0.045, 0.01] + "radius": 0.024 + - "center": [0.0, -0.015, 0.01] + "radius": 0.024 + - "center": [0.0, 0.015, 0.01] + "radius": 0.024 + - "center": [0.0, 0.045, 0.01] + "radius": 0.024 + - "center": [0.0, 0.073, 0.01] + "radius": 0.02 + - "center": [0.0, -0.08, 0.03] + "radius": 0.022 + - "center": [0.0, -0.045, 0.03] + "radius": 0.022 + - "center": [0.0, -0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.015, 0.03] + "radius": 0.022 + - "center": [0.0, 0.045, 0.03] + "radius": 0.022 + - "center": [0.0, 0.075, 0.03] + "radius": 0.02 + - "center": [0.0, -0.08, 0.045] + "radius": 0.02 + - "center": [0.0, -0.045, 0.045] + "radius": 0.02 + - "center": [0.0, -0.015, 0.045] + "radius": 0.02 + - "center": [0.0, 0.015, 0.045] + "radius": 0.02 + - "center": [0.0, 0.045, 0.045] + "radius": 0.02 + - 
"center": [0.0, 0.078, 0.045] + "radius": 0.02 + #- "center": [0.0, -0.02, 0.1] + # "radius": 0.015 + #- "center": [0.0, 0.02, 0.1] + # "radius": 0.015 + #- "center": [0.0, 0.0, 0.1] + # "radius": 0.015 + panda_leftfinger: + - "center": [0.0, 0.01, 0.043] + "radius": 0.01 # 0.01 + - "center": [0.0, 0.02, 0.015] + "radius": 0.005 + panda_rightfinger: + + - "center": [0.0, -0.01, 0.043] + "radius": 0.01 # 0.01 + - "center": [0.0, -0.02, 0.015] + "radius": 0.005 + attached_object: + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + - "center": [0.0, 0.0, 0.0] + "radius": -10.0 + \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/franka_real_robot.yml b/src/curobo/content/configs/robot/spheres/franka_real_robot.yml new file mode 100644 index 00000000..a85ed93a --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/franka_real_robot.yml @@ -0,0 +1,141 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +robot: 'Franka Panda' +urdf_path: "urdf/franka_description/franka_panda_dyn.urdf" +collision_spheres: + panda_link0: + - "center": [-0.08, 0.0, 0.05] + "radius": 0.06 + - "center": [-0.0, 0.0, 0.05] + "radius": 0.08 + panda_link1: + - "center": [0.0, -0.08, 0.0] + "radius": 0.1 + - "center": [0.0, -0.03, 0.0] + "radius": 0.1 + - "center": [0.0, 0.0, -0.12] + "radius": 0.06 + - "center": [0.0, 0.0, -0.17] + "radius": 0.06 + panda_link2: + - "center": [0.0, 0.0, 0.03] + "radius": 0.1 + - "center": [0.0, 0.0, 0.08] + "radius": 0.1 + - "center": [0.0, -0.12, 0.0] + "radius": 0.06 + - "center": [0.0, -0.17, 0.0] + "radius": 0.06 + panda_link3: + - "center": [0.0, 0.0, -0.06] + "radius": 0.05 + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.08, 0.06, 0.0] + "radius": 0.055 + - "center": [0.08, 0.02, 0.0] + "radius": 0.055 + panda_link4: + - "center": [0.0, 0.0, 0.02] + "radius": 0.055 + - "center": [0.0, 0.0, 0.06] + "radius": 0.055 + - "center": [-0.08, 0.095, 0.0] + "radius": 0.06 + - "center": [-0.08, 0.06, 0.0] + "radius": 0.055 + panda_link5: + - "center": [0.0, 0.055, 0.0] + "radius": 0.05 + - "center": [0.0, 0.085, 0.0] + "radius": 0.055 + - "center": [0.0, 0.000, -0.22] + "radius": 0.05 + - "center": [0.0, 0.05, -0.18] + "radius": 0.045 + - "center": [0.015, 0.08, -0.14] + "radius": 0.03 + - "center": [0.015, 0.085, -0.11] + "radius": 0.03 + - "center": [0.015, 0.09, -0.08] + "radius": 0.03 + - "center": [0.015, 0.095, -0.05] + "radius": 0.03 + - "center": [-0.015, 0.08, -0.14] + "radius": 0.03 + - "center": [-0.015, 0.085, -0.11] + "radius": 0.03 + - "center": [-0.015, 0.09, -0.08] + "radius": 0.03 + - "center": [-0.015, 0.095, -0.05] + "radius": 0.03 + panda_link6: + - "center": [0.0, 0.0, 0.0] + "radius": 0.05 + - "center": [0.08, 0.035, 0.0] + "radius": 0.052 + - "center": [0.08, -0.01, 0.0] + "radius": 0.05 + panda_link7: + - "center": [0.0, 0.0, 0.07] + "radius": 0.05 + - "center": [0.02, 0.04, 0.08] + "radius": 0.025 + - "center": [0.04, 0.02, 0.08] + "radius": 0.025 + - "center": [0.04, 0.06, 0.085] + 
"radius": 0.02 + - "center": [0.06, 0.04, 0.085] + "radius": 0.02 + panda_hand: + - "center": [0.0, -0.08, 0.01] + "radius": 0.03 + - "center": [0.0, -0.045, 0.01] + "radius": 0.03 + - "center": [0.0, -0.015, 0.01] + "radius": 0.03 + - "center": [0.0, 0.015, 0.01] + "radius": 0.03 + - "center": [0.0, 0.045, 0.01] + "radius": 0.03 + - "center": [0.0, 0.08, 0.01] + "radius": 0.03 + - "center": [0.0, 0.065, -0.02] + "radius": 0.05 + - "center": [0.0, -0.08, 0.05] + "radius": 0.05 + - "center": [0.0, -0.045, 0.05] + "radius": 0.05 + - "center": [0.0, -0.015, 0.05] + "radius": 0.05 + - "center": [0.0, 0.015, 0.05] + "radius": 0.05 + - "center": [0.0, 0.045, 0.05] + "radius": 0.05 + - "center": [0.0, 0.08, 0.05] + "radius": 0.05 + - "center": [0.0, 0.08, 0.08] + "radius": 0.05 + - "center": [0.0, -0.08, 0.08] + "radius": 0.05 + - "center": [0.0, 0.05, 0.08] + "radius": 0.05 + - "center": [0.0, -0.05, 0.08] + "radius": 0.05 + - "center": [0.0, 0.0, 0.08] + "radius": 0.05 + panda_leftfinger: + - "center": [0.0, 0.01, 0.034] + "radius": 0.02 + panda_rightfinger: + - "center": [0.0, -0.01, 0.034] + "radius": 0.02 + diff --git a/src/curobo/content/configs/robot/spheres/iiwa_allegro.yml b/src/curobo/content/configs/robot/spheres/iiwa_allegro.yml new file mode 100644 index 00000000..4927c9c3 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/iiwa_allegro.yml @@ -0,0 +1,157 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## + +collision_spheres: + iiwa7_link_0: + - "center": [0.0, 0.0, 0.05] + "radius": 0.10 + iiwa7_link_1: + - "center": [0.0, 0.0, 0.0] + "radius": 0.08 + - "center": [0.0, -0.05, 0.1] + "radius": 0.07 + - "center": [0.0, -0.05, 0.18] + "radius": 0.08 + iiwa7_link_2: + - "center": [0.0, 0.0, 0.0] + "radius": 0.08 + - "center": [0.0, 0.02, 0.06] + "radius": 0.07 + - "center": [0.0, 0.1, 0.03] + "radius": 0.07 + - "center": [0.0, 0.18, 0.0] + "radius": 0.08 + iiwa7_link_3: + - "center": [0.0, 0.0, 0.08] + "radius": 0.08 + - "center": [0.0, 0.06, 0.16] + "radius": 0.07 + - "center": [0.0, 0.05, 0.22] + "radius": 0.07 + iiwa7_link_4: + - "center": [0.0, 0.0, 0.0] + "radius": 0.08 + - "center": [0.0, 0.0, 0.05] + "radius": 0.07 + - "center": [0.0, 0.07, 0.05] + "radius": 0.06 + - "center": [0.0, 0.11, 0.03] + "radius": 0.06 + - "center": [0.0, 0.15, 0.01] + "radius": 0.07 + iiwa7_link_5: + - "center": [0.0, 0.0, 0.02] + "radius": 0.08 + - "center": [0.0, 0.03, 0.07] + "radius": 0.06 + - "center": [0.0, 0.08, 0.13] + "radius": 0.05 + iiwa7_link_6: + - "center": [0.0, 0.0, 0.0] + "radius": 0.06 + - "center": [0., 0., 0.05] + "radius": 0.08 + - "center": [0., -0.04, 0.075] + "radius": 0.06 + - "center": [0., 0.08, 0.06] + "radius": 0.065 + - "center": [0., 0.16, 0.06] + "radius": 0.05 + palm_link: + - "center": [0.0, 0.01, -0.04] + "radius": 0.07 + index_link_1: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + - "center": [0.04, 0.0, 0.0] + "radius": 0.016 + index_link_2: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + index_link_3: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.015 + - "center": [0.04, 0.0, 0.0] + "radius": 0.01 + - "center": [0.0475, 0.0025, 0.0] + "radius": 0.01 + - "center": [0.055, 0.005, 0.0] + "radius": 0.01 + middle_link_1: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + - "center": [0.04, 0.0, 0.0] + "radius": 0.016 + middle_link_2: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + middle_link_3: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.015 + - "center": [0.04, 0.0, 0.0] + "radius": 0.01 + - "center": [0.0475, 0.0025, 0.0] + "radius": 0.01 + - "center": [0.055, 0.005, 0.0] + "radius": 0.01 + ring_link_1: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + - "center": [0.04, 0.0, 0.0] + "radius": 0.016 + ring_link_2: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + ring_link_3: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.015 + - "center": [0.04, 0.0, 0.0] + "radius": 0.01 + - "center": [0.0475, 0.0025, 0.0] + "radius": 0.01 + - "center": [0.055, 0.005, 0.0] + "radius": 0.01 + thumb_link_2: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + thumb_link_3: + - "center": [0.0, 0.0, 0.0] + "radius": 0.016 + - "center": [0.02, 0.0, 0.0] + "radius": 0.016 + - "center": [0.04, 0.0, 0.0] + "radius": 0.015 + - "center": [0.05, 0.0, 0.0] + "radius": 0.01 + - "center": [0.06, 0.003, 0.0] + "radius": 0.01 + - "center": [0.07, 0.0055, 0.0] + "radius": 0.01 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/kinova_gen3.yml 
b/src/curobo/content/configs/robot/spheres/kinova_gen3.yml new file mode 100644 index 00000000..7ee59bca --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/kinova_gen3.yml @@ -0,0 +1,76 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +collision_spheres: + base_link: + - "center": [0.0, 0.0, 0.1] + "radius": 0.05 + + shoulder_link: + - "center": [0.0, 0.0, -0.1] + "radius": 0.06 + - "center": [0.0, 0.0, -0.15] + "radius": 0.05 + half_arm_1_link: + - "center": [0.0, -0.0, 0.0] + "radius": 0.055 + - "center": [0.0, -0.07, 0.0] + "radius": 0.055 + - "center": [0.0, -0.15,0.0] + "radius": 0.055 + half_arm_2_link: + - "center": [0.0, -0.0, 0.0] + "radius": 0.055 + - "center": [0.0, -0.0, -0.07] + "radius": 0.055 + - "center": [0.0, -0.0,-0.15] + "radius": 0.055 + - "center": [0.0, -0.0,-0.21] + "radius": 0.055 + forearm_link: + - "center": [0.0, -0.0, 0.0] + "radius": 0.055 + - "center": [0.0, -0.07, -0.0] + "radius": 0.055 + - "center": [0.0, -0.17,-0.0] + "radius": 0.055 + spherical_wrist_1_link: + - "center": [0.0, -0.0, 0.0] + "radius": 0.055 + - "center": [0.0, -0.0, -0.085] + "radius": 0.055 + spherical_wrist_2_link: + - "center": [0.0, -0.0, 0.0] + "radius": 0.05 + - "center": [0.0, -0.085, -0.0] + "radius": 0.05 + bracelet_link: + - "center": [0.0, -0.0, -0.05] + "radius": 0.04 + - "center": [0.0, -0.05, -0.05] + "radius": 0.04 + robotiq_arg2f_base_link: + - "center": [0.0, -0.0, 0.04] + "radius": 0.04 + left_outer_finger: + - "center": [0.0, -0.01, 0.02] + "radius": 0.03 + left_inner_finger_pad: + - "center": [0.0, -0.0, 0.0] + "radius": 0.01 + right_outer_finger: + - "center": [0.0, -0.01, 0.02] + "radius": 0.03 + right_inner_finger_pad: + - "center": [0.0, -0.0, 0.0] + "radius": 0.01 + + + \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/quad_ur10e.yml b/src/curobo/content/configs/robot/spheres/quad_ur10e.yml new file mode 100644 index 00000000..90749eb0 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/quad_ur10e.yml @@ -0,0 +1,214 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
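Self-collision checking ultimately reduces to sphere-pair tests between links. A sketch of the underlying arithmetic, with a buffer term standing in for the per-link `self_collision_buffer` values that appear in the robot configs later in this diff (the sign convention here is an assumption, not cuRobo's exact kernel):

```python
import numpy as np

def sphere_pair_distance(c0, r0, c1, r1, buffer=0.0):
    """Signed clearance between two spheres; negative means contact.
    A positive buffer inflates the effective radii, a negative one shrinks them."""
    gap = float(np.linalg.norm(np.asarray(c0) - np.asarray(c1)))
    return gap - (r0 + r1 + buffer)

print(sphere_pair_distance([0.0, 0.0, 0.0], 0.05, [0.0, 0.0, 0.12], 0.05))  # -> 0.02
```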
+## +robot: 'UR10' +collision_spheres: + shoulder_link: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + tool0: + - center: [0, 0, 0.03] + radius: 0.05 + + + shoulder_link_1: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link_1: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link_1: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link_1: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link_1: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link_1: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + + tool1: + - center: [0, 0, 0.03] + radius: 0.05 + + + + shoulder_link_2: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link_2: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link_2: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link_2: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link_2: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link_2: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + + tool2: + - center: [0, 0, 0.03] + radius: 0.05 + + + + shoulder_link_3: + - center: [0, 0, 0] + radius: 0.05 + - center: [0.0, -0.18, 0] + radius: 0.05 + upper_arm_link_3: + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 
0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link_3: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link_3: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link_3: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link_3: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + + tool3: + - center: [0, 0, 0.03] + radius: 0.05 + + + \ No newline at end of file diff --git a/src/curobo/content/configs/robot/spheres/single.yml b/src/curobo/content/configs/robot/spheres/single.yml new file mode 100644 index 00000000..fc2e4f28 --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/single.yml @@ -0,0 +1,16 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +robot: 'single' +collision_spheres: + base: + - "center": [-0.08, 0.0, 0.05] + "radius": 0.06 + diff --git a/src/curobo/content/configs/robot/spheres/ur10e.yml b/src/curobo/content/configs/robot/spheres/ur10e.yml new file mode 100644 index 00000000..4ced8ffc --- /dev/null +++ b/src/curobo/content/configs/robot/spheres/ur10e.yml @@ -0,0 +1,63 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
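In the UR10 sphere file below, the `upper_arm_link` centers march from 0 to -0.613 m in equal steps: a cylindrical link covered by a row of equally spaced spheres (the radii at the two ends differ in the actual file). A sketch of generating such a row; the helper name is hypothetical:

```python
import numpy as np

def line_of_spheres(p0, p1, n, radius):
    """n spheres evenly spaced along the segment p0 -> p1 (link frame)."""
    return [{"center": [float(c) for c in p], "radius": radius}
            for p in np.linspace(np.asarray(p0), np.asarray(p1), n)]

# Reproduces the ~0.102 m spacing of the upper_arm_link centers below:
for s in line_of_spheres([0.0, 0.0, 0.18], [-0.613, 0.0, 0.18], 7, 0.05):
    print(s)
```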
+## +robot: 'UR10' +collision_spheres: + shoulder_link: + - center: [0, 0, 0.0] + radius: 0.02 + upper_arm_link: + - center: [-0, -0, 0.18] + radius: 0.09 + - center: [-0.102167, 0, 0.18] + radius: 0.05 + - center: [-0.204333, -8.88178e-19, 0.18] + radius: 0.05 + - center: [-0.3065, -1.33227e-18, 0.18] + radius: 0.05 + - center: [-0.408667, -1.77636e-18, 0.18] + radius: 0.05 + - center: [-0.510833, 0, 0.18] + radius: 0.05 + - center: [ -0.613, 0,0.18] + radius: 0.07 + forearm_link: + - center: [-0, 0, 0.03] + radius: 0.05 + - center: [-0.0951667, 0, 0.03] + radius: 0.05 + - center: [-0.190333, 0, 0.03] + radius: 0.05 + - center: [-0.2855, 0, 0.03] + radius: 0.05 + - center: [-0.380667, 0,0.03] + radius: 0.05 + - center: [-0.475833, 0,0.03] + radius: 0.05 + - center: [-0.571, -1.19904e-17, 0.03] + radius: 0.05 + wrist_1_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_2_link: + - center: [0, 0, 0] + radius: 0.05 + wrist_3_link: + - center: [0, 0, 0] + radius: 0.05 + - center: [0, 0, 0.06] + radius: 0.08 + + tool0: + - center: [0, 0, 0.03] + radius: 0.05 + camera_mount: + - center: [0, 0.11, -0.01] + radius: 0.06 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/template.yml b/src/curobo/content/configs/robot/template.yml new file mode 100644 index 00000000..0846e225 --- /dev/null +++ b/src/curobo/content/configs/robot/template.yml @@ -0,0 +1,46 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +robot_cfg: + kinematics: + usd_path: "FILL_THIS" + usd_robot_root: "/robot" + isaac_usd_path: "" + usd_flip_joints: {} + usd_flip_joint_limits: [] + + urdf_path: "FILL_THIS" + asset_root_path: "" + + base_link: "FILL_THIS" + ee_link: "FILL_THIS" + link_names: null + lock_joints: null + extra_links: null + + + collision_link_names: null # List[str] + collision_spheres: null # + collision_sphere_buffer: 0.005 + extra_collision_spheres: {} + self_collision_ignore: null # Dict[str, List[str]] + self_collision_buffer: null # Dict[str, float] + + use_global_cumul: True + mesh_link_names: null # List[str] + + cspace: + joint_names: [] # List[str] + retract_config: null # List[float] + null_space_weight: null # List[str] + cspace_distance_weight: null # List[str] + max_jerk: 500.0 + max_acceleration: 15.0 diff --git a/src/curobo/content/configs/robot/tm12.yml b/src/curobo/content/configs/robot/tm12.yml new file mode 100644 index 00000000..01db09c2 --- /dev/null +++ b/src/curobo/content/configs/robot/tm12.yml @@ -0,0 +1,140 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
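`template.yml` above shows the full schema with `FILL_THIS` placeholders; `tm12.yml` below is a concrete instance. A sketch of reading out the fields a planner needs; the nesting of `cspace:` varies between the files in this diff, so the lookup is deliberately defensive (path illustrative):

```python
import yaml

with open("tm12.yml") as f:  # illustrative path
    robot_cfg = yaml.safe_load(f)["robot_cfg"]

kin = robot_cfg["kinematics"]
cspace = robot_cfg.get("cspace") or kin.get("cspace")
print(kin["base_link"], "->", kin["ee_link"])
print("collision links:", kin["collision_link_names"])
print("joints:", cspace["joint_names"])
```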
+## + +robot_cfg: + kinematics: + use_usd_kinematics: False + usd_path: "robot/non_shipping/techman/tm12.usd" + usd_robot_root: "/tm12" + usd_flip_joints: + { + "shoulder_1_joint": "Z", + "shoulder_2_joint": "Z", + "elbow_joint": "Z", + "wrist_1_joint": "Z", + "wrist_2_joint": "Z", + "wrist_3_joint": "Z", + } + urdf_path: "robot/techman/tm_description/urdf/tm12-nominal.urdf" + asset_root_path: "robot/techman/tm_description" + + base_link: "base" + ee_link: "tool0" + link_names: null + + collision_link_names: [ + "link_1", + "link_2", + "link_3", + "link_4", + "link_5", + "link_6", + ] # List[str] + + collision_spheres: + link_1: + - "center": [-0.0, -0.0, 0.018] + "radius": 0.1 + - "center": [-0.017, -0.18, 0.02] + "radius": 0.1 + link_2: + - "center": [0.116, 0.006, -0.182] + "radius": 0.075 + - "center": [0.638, -0.004, -0.192] + "radius": 0.08 + - "center": [0.19, 0.004, -0.183] + "radius": 0.075 + - "center": [0.265, 0.003, -0.184] + "radius": 0.075 + - "center": [0.34, 0.001, -0.186] + "radius": 0.075 + - "center": [0.414, 0.0, -0.187] + "radius": 0.075 + - "center": [0.489, -0.001, -0.189] + "radius": 0.075 + - "center": [0.563, -0.003, -0.19] + "radius": 0.075 + link_3: + - "center": [0.012, 0.004, -0.076] + "radius": 0.08 + - "center": [0.55, -0.001, -0.046] + "radius": 0.07 + - "center": [0.088, 0.003, -0.061] + "radius": 0.06 + - "center": [0.165, 0.002, -0.056] + "radius": 0.06 + - "center": [0.242, 0.001, -0.052] + "radius": 0.06 + - "center": [0.319, 0.001, -0.047] + "radius": 0.06 + - "center": [0.396, -0.0, -0.043] + "radius": 0.06 + - "center": [0.473, -0.001, -0.038] + "radius": 0.06 + link_4: + - "center": [0.0, 0.0, 0.0] + "radius": 0.07 + + link_5: + - "center": [0.0, 0.0, 0.0] + "radius": 0.06 + link_6: + - "center": [0.003, -0.002, -0.028] + "radius": 0.06 + - "center": [-0.001, 0.075, 0.009] + "radius": 0.05 + - "center": [-0.0, 0.078, -0.028] + "radius": 0.05 + - "center": [-0.031, 0.128, 0.008] + "radius": 0.03 + - "center": [-0.006, 0.146, 0.0] + "radius": 0.03 + - "center": [0.025, 0.125, 0.007] + "radius": 0.03 + - "center": [-0.005, 0.128, 0.003] + "radius": 0.03 + collision_sphere_buffer: 0.0 + + self_collision_ignore: { + "link_1": ["link_2"], + "link_2": ["link_3"], + "link_3": ["link_4", "link_5"], + "link_4": ["link_5", "link_6"], + "link_5": ["link_6", "link_4"], + } + self_collision_buffer: { + link_1: -0.01, + link_2: -0.02, + link_3: -0.01, + link_4: -0.02, + link_6: -0.02, + link_5: -0.02, + } # Dict[str, float] + + mesh_link_names: ["link_0", "link_1", "link_2", "link_3", "link_4", "link_5", "link_6"] # List[str] + lock_joints: null + add_object_link: False + + cspace: + joint_names: [ + "shoulder_1_joint", + "shoulder_2_joint", + "elbow_joint", + "wrist_1_joint", + "wrist_2_joint", + "wrist_3_joint", + ] # List[str] + retract_config: [0.0, -0.5, 1.9, -0.2, 1.25, 0.0] # List[float] + null_space_weight: [1,1,1,1,1,1] # List[str] + cspace_distance_weight: [1,1,1,1,1,1] # List[str] + + max_acceleration: 13.5 + max_jerk: 2000.0 \ No newline at end of file diff --git a/src/curobo/content/configs/robot/tri_ur10e.yml b/src/curobo/content/configs/robot/tri_ur10e.yml new file mode 100644 index 00000000..e43b2291 --- /dev/null +++ b/src/curobo/content/configs/robot/tri_ur10e.yml @@ -0,0 +1,75 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +robot_cfg: + kinematics: + use_usd_kinematics: False + urdf_path: "robot/ur_description/tri_ur10e.urdf" + asset_root_path: "robot/ur_description" + isaac_usd_path: "/Isaac/Robots/UR10/ur10_long_suction.usd" + usd_robot_root: "/ur10" + usd_path: "robot/ur_description/tri_ur10e.usd" + base_link: "base_fixture_link" + ee_link: "tool0" + link_names: ["tool1","tool2"] + collision_link_names: [ + 'shoulder_link','upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link' ,'wrist_3_link', 'tool0', + 'shoulder_link_1','upper_arm_link_1', 'forearm_link_1', 'wrist_1_link_1', 'wrist_2_link_1' ,'wrist_3_link_1', 'tool1', + 'shoulder_link_2','upper_arm_link_2', 'forearm_link_2', 'wrist_1_link_2', 'wrist_2_link_2' ,'wrist_3_link_2', 'tool2'] + collision_spheres: 'spheres/quad_ur10e.yml' + collision_sphere_buffer: 0.005 + + + self_collision_ignore: { + "upper_arm_link": ["shoulder_link","forearm_link"], + "forearm_link": ["wrist_1_link"], + "wrist_1_link": ["wrist_2_link","wrist_3_link"], + "wrist_2_link": ["wrist_3_link", "tool0"], + "wrist_3_link": ["tool0"], + + "upper_arm_link_1": ["shoulder_link_1","forearm_link_1"], + "forearm_link_1": ["wrist_1_link_1"], + "wrist_1_link_1": ["wrist_2_link_1","wrist_3_link_1"], + "wrist_2_link_1": ["wrist_3_link_1", "tool1"], + "wrist_3_link_1": ["tool1"], + + "upper_arm_link_2": ["shoulder_link_2","forearm_link_2"], + "forearm_link_2": ["wrist_1_link_2"], + "wrist_1_link_2": ["wrist_2_link_2","wrist_3_link_2"], + "wrist_2_link_2": ["wrist_3_link_2", "tool2"], + "wrist_3_link_2": ["tool2"], + + + + } + self_collision_buffer: { + 'shoulder_link': 0.05, + 'shoulder_link_1': 0.05, + 'shoulder_link_2': 0.05, + } + lock_joints: null + + cspace: + joint_names: [ + 'shoulder_pan_joint', 'shoulder_lift_joint', 'elbow_joint', 'wrist_1_joint', 'wrist_2_joint', 'wrist_3_joint', + 'shoulder_pan_joint_1', 'shoulder_lift_joint_1', 'elbow_joint_1', 'wrist_1_joint_1', 'wrist_2_joint_1', 'wrist_3_joint_1', + 'shoulder_pan_joint_2', 'shoulder_lift_joint_2', 'elbow_joint_2', 'wrist_1_joint_2', 'wrist_2_joint_2', 'wrist_3_joint_2'] + retract_config: [-1.57, -2.2, 1.9, -1.383, -1.57, 0.00, + -1.57, -2.2, 1.9, -1.383, -1.57, 0.00, + -1.57, -2.2, 1.9, -1.383, -1.57, 0.00] + null_space_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + cspace_distance_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] #[2.5, 1.75, 1.5, 1.25, 0.7, 0.4] + max_jerk: 500.0 + max_acceleration: 15.0 diff --git a/src/curobo/content/configs/robot/ur10e.yml b/src/curobo/content/configs/robot/ur10e.yml new file mode 100644 index 00000000..b58eeef0 --- /dev/null +++ b/src/curobo/content/configs/robot/ur10e.yml @@ -0,0 +1,51 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto.
Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +robot_cfg: + kinematics: + use_usd_kinematics: False + urdf_path: "robot/ur_description/ur10e.urdf" + asset_root_path: "robot/ur_description" + isaac_usd_path: "/Isaac/Robots/UR10/ur10_long_suction.usd" + usd_robot_root: "/ur10" + usd_path: "robot/ur_description/ur10e.usd" + base_link: "base_link" + ee_link: "tool0" + link_names: null + collision_link_names: ['shoulder_link','upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link' ,'wrist_3_link', 'tool0'] + collision_spheres: 'spheres/ur10e.yml' + collision_sphere_buffer: 0.02 + self_collision_ignore: { + "upper_arm_link": ["forearm_link", "shoulder_link"], + "forearm_link": ["wrist_1_link"], + "wrist_1_link": ["wrist_2_link","wrist_3_link"], + "wrist_2_link": ["wrist_3_link", "tool0"], + "wrist_3_link": ["tool0"], + "camera_mount": ["tool0", "wrist_3_link"], + } + self_collision_buffer: { + 'shoulder_link': 0.07, + 'upper_arm_link': 0, + 'forearm_link': 0, + 'wrist_1_link': 0, + 'wrist_2_link': 0, + 'wrist_3_link' : 0, + 'tool0': 0, + } + lock_joints: null + + cspace: + joint_names: ['shoulder_pan_joint', 'shoulder_lift_joint', 'elbow_joint', 'wrist_1_joint', 'wrist_2_joint', 'wrist_3_joint'] + retract_config: [0.0, -2.2, 1.9, -1.383, -1.57, 0.00] + null_space_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + cspace_distance_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] #[2.5, 1.75, 1.5, 1.25, 0.7, 0.4] + max_jerk: 500.0 + max_acceleration: 15.0 diff --git a/src/curobo/content/configs/robot/ur5e.yml b/src/curobo/content/configs/robot/ur5e.yml new file mode 100644 index 00000000..37c9a03f --- /dev/null +++ b/src/curobo/content/configs/robot/ur5e.yml @@ -0,0 +1,117 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited.
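Keys in `self_collision_ignore` must match `collision_link_names` exactly; a misspelled key silently drops that pair's exemption rather than raising an error. A small validation sketch that catches such mismatches (illustrative path):

```python
import yaml

with open("ur10e.yml") as f:  # illustrative path
    kin = yaml.safe_load(f)["robot_cfg"]["kinematics"]

links = set(kin["collision_link_names"])
for key, partners in kin["self_collision_ignore"].items():
    for name in (key, *partners):
        if name not in links:
            print(f"warning: '{name}' is not in collision_link_names")
```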
+## + +robot_cfg: + kinematics: + usd_path: "robot/ur_description/ur5e.usd" + usd_robot_root: "/robot" + isaac_usd_path: "" + usd_flip_joints: {} + usd_flip_joint_limits: [] + + urdf_path: "robot/ur_description/ur5e.urdf" + asset_root_path: "robot/ur_description" + + base_link: "base_link" + ee_link: "tool0" + link_names: null + lock_joints: null + extra_links: null + + + collision_link_names: ['shoulder_link','upper_arm_link', 'forearm_link', 'wrist_1_link', 'wrist_2_link' ,'wrist_3_link'] # List[str] + collision_spheres: + shoulder_link: + - "center": [0.0, 0.0, 0.0] + "radius": 0.1 + upper_arm_link: + - "center": [-0.416, -0.0, 0.143] + "radius": 0.078 + - "center": [-0.015, 0.0, 0.134] + "radius": 0.077 + - "center": [-0.14, 0.0, 0.138] + "radius": 0.062 + - "center": [-0.285, -0.001, 0.139] + "radius": 0.061 + - "center": [-0.376, 0.001, 0.138] + "radius": 0.077 + - "center": [-0.222, 0.001, 0.139] + "radius": 0.061 + - "center": [-0.055, 0.008, 0.14] + "radius": 0.07 + - "center": [-0.001, -0.002, 0.143] + "radius": 0.076 + forearm_link: + - "center": [-0.01, 0.002, 0.031] + "radius": 0.072 + - "center": [-0.387, 0.0, 0.014] + "radius": 0.057 + - "center": [-0.121, -0.0, 0.006] + "radius": 0.057 + - "center": [-0.206, 0.001, 0.007] + "radius": 0.057 + - "center": [-0.312, -0.001, 0.006] + "radius": 0.056 + - "center": [-0.057, 0.003, 0.008] + "radius": 0.065 + - "center": [-0.266, 0.0, 0.006] + "radius": 0.057 + - "center": [-0.397, -0.001, -0.018] + "radius": 0.052 + - "center": [-0.164, -0.0, 0.007] + "radius": 0.057 + wrist_1_link: + - "center": [-0.0, 0.0, -0.009] + "radius": 0.047 + - "center": [-0.0, 0.0, -0.052] + "radius": 0.047 + - "center": [-0.002, 0.027, -0.001] + "radius": 0.045 + - "center": [0.001, -0.01, 0.0] + "radius": 0.046 + wrist_2_link: + - "center": [0.0, -0.01, -0.001] + "radius": 0.047 + - "center": [0.0, 0.008, -0.001] + "radius": 0.047 + - "center": [0.001, -0.001, -0.036] + "radius": 0.047 + - "center": [0.001, -0.03, -0.0] + "radius": 0.047 + wrist_3_link: + - "center": [0.001, 0.001, -0.029] + "radius": 0.043 + + collision_sphere_buffer: 0.005 + extra_collision_spheres: {} + self_collision_ignore: { + "upper_arm_link": ["forearm_link", "shoulder_link"], + "forearm_link": ["wrist_1_link", "wrist_2_link", "wrist_3_link"], + "wrist_1_link": ["wrist_2_link","wrist_3_link"], + "wrist_2_link": ["wrist_3_link"], + } + self_collision_buffer: {'upper_arm_link': 0, + 'forearm_link': 0, + 'wrist_1_link': 0, + 'wrist_2_link': 0, + 'wrist_3_link' : 0, + } + + use_global_cumul: True + mesh_link_names: null # List[str] + + cspace: + joint_names: ['shoulder_pan_joint', 'shoulder_lift_joint', 'elbow_joint', 'wrist_1_joint', 'wrist_2_joint', 'wrist_3_joint'] + retract_config: [0.0, -2.2, 1.9, -1.383, -1.57, 0.00] + null_space_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + cspace_distance_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + max_jerk: 500.0 + max_acceleration: 15.0 diff --git a/src/curobo/content/configs/task/base_cfg.yml b/src/curobo/content/configs/task/base_cfg.yml new file mode 100644 index 00000000..3418452e --- /dev/null +++ b/src/curobo/content/configs/task/base_cfg.yml @@ -0,0 +1,60 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. 
Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + + +world_collision_checker_cfg: + cache: null #{"cube": 41, "capsule": 0, "sphere": 0} + checker_type: "PRIMITIVE" # ["PRIMITIVE", "BLOX", "MESH"] + max_distance: 0.1 + + +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + weight: [0.0, 0.0] + vec_convergence: [0.0, 0.00] # orientation, position + terminal: False + + + bound_cfg: + weight: 000.0 + activation_distance: [0.0,0.0,0.0,0.0] + +constraint: + primitive_collision_cfg: + weight: 2000.0 + use_sweep: False + classify: True + self_collision_cfg: + weight: 1000.0 + classify: True + bound_cfg: + weight: [5000.0, 5000.0, 5000.0,5000.0] + activation_distance: [0.0,0.0,0.0,0.0] # for position, velocity, acceleration and jerk + + +convergence: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + weight: [1.0, 1.0] + vec_convergence: [0.0, 0.00] # orientation, position + terminal: False + + cspace_cfg: + weight: 1.0 + terminal: True + run_weight: 1.0 + null_space_cfg: + weight: 1.0 + terminal: True + run_weight: 1.0 + + \ No newline at end of file diff --git a/src/curobo/content/configs/task/finetune_trajopt.yml b/src/curobo/content/configs/task/finetune_trajopt.yml new file mode 100644 index 00000000..95b4a3ed --- /dev/null +++ b/src/curobo/content/configs/task/finetune_trajopt.yml @@ -0,0 +1,97 @@ + +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
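In the trajectory-optimization configs that follow, `base_dt`, `max_dt`, and `base_ratio` collapse to a constant timestep (`base_dt == max_dt == 0.2`, `base_ratio == 1.0`), so the optimized horizon spans a fixed duration:

```python
# Constant-dt case from the trajopt configs below: 32 steps at 0.2 s each.
horizon, dt = 32, 0.2
print(f"trajectory spans {horizon * dt:.1f} s")  # -> 6.4 s
```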
+## + +model: + horizon: 32 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 1.0 + enable: False + dt_traj_params: + base_dt: 0.2 + base_ratio: 1.0 + max_dt: 0.2 + vel_scale: 1.0 + control_space: 'POSITION' + state_finite_difference_mode: "CENTRAL" + + teleport_mode: False + return_full_act_buffer: True +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps + run_vec_weight: [1.00,1.0,1.0,1.0,1.0,1.0] # running weight orientation, position + weight: [5000,30000.0,50,70] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation + terminal: True + run_weight: 0.0 + use_metric: True + + cspace_cfg: + weight: 10000.0 + terminal: True + run_weight: 0.00 #1 + + bound_cfg: + weight: [5000.0, 5000.0, 5000.0, 5000.0] # four values: position, velocity, acceleration, jerk + smooth_weight: [0.0,1000.0, 500.0, 0.0] # [vel, acc, jerk,] + run_weight_velocity: 0.0 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk + null_space_weight: [1.0] + + primitive_collision_cfg: + weight: 50000.0 + use_sweep: True + sweep_steps: 6 + classify: False + use_sweep_kernel: True + use_speed_metric: True + speed_dt: 0.01 # used only for speed metric + activation_distance: 0.025 + + self_collision_cfg: + weight: 5000.0 + classify: False + + + +lbfgs: + n_iters: 300 #125 #200 #250 #250 # 150 #25 + inner_iters: 25 #25 # 25 + cold_start_n_iters: 300 #125 #200 #250 #150 #25 + min_iters: 25 + line_search_scale: [0.01,0.3,0.7,1.0] + fixed_iters: False + cost_convergence: 0.01 + cost_delta_threshold: 10.0 #10.0 # 5.0 #2.0 + cost_relative_threshold: 0.999 #0.999 + epsilon: 0.01 + history: 4 #15 + use_cuda_graph: True + n_envs: 1 + store_debug: False + use_cuda_kernel: True + stable_mode: True + line_search_type: "approx_wolfe" + use_cuda_line_search_kernel: True + use_cuda_update_best_kernel: True + use_temporal_smooth: False + sync_cuda_time: True + step_scale: 1.0 + last_best: 5 + use_coo_sparse: True + debug_info: + visual_traj : null #'ee_pos_seq' diff --git a/src/curobo/content/configs/task/gradient_ik.yml b/src/curobo/content/configs/task/gradient_ik.yml new file mode 100644 index 00000000..b14b4d3b --- /dev/null +++ b/src/curobo/content/configs/task/gradient_ik.yml @@ -0,0 +1,81 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited.
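The `lbfgs:` blocks configure a limited-memory quasi-Newton solver: `history` bounds how many curvature pairs are kept, and `line_search_scale` lists the candidate step fractions tried per iteration. For orientation, a textbook two-loop recursion in NumPy; this is the standard algorithm, not cuRobo's fused CUDA implementation:

```python
import numpy as np

def lbfgs_direction(grad, s_hist, y_hist):
    """Two-loop recursion: approximate -(H^-1) @ grad from the stored
    curvature pairs (s_i, y_i); `history` bounds len(s_hist)."""
    q = grad.astype(float).copy()
    rhos = [1.0 / float(y @ s) for s, y in zip(s_hist, y_hist)]
    alphas = [0.0] * len(s_hist)
    for i in reversed(range(len(s_hist))):   # newest to oldest
        alphas[i] = rhos[i] * float(s_hist[i] @ q)
        q -= alphas[i] * y_hist[i]
    if s_hist:                               # scale by initial Hessian estimate
        q *= float(s_hist[-1] @ y_hist[-1]) / float(y_hist[-1] @ y_hist[-1])
    for i in range(len(s_hist)):             # oldest to newest
        beta = rhos[i] * float(y_hist[i] @ q)
        q += (alphas[i] - beta) * s_hist[i]
    return -q

# Toy check on f(x) = 0.5 * x @ A @ x, whose gradient is A @ x:
A = np.diag([1.0, 10.0])
g = lambda x: A @ x
x0, x1 = np.array([1.0, 1.0]), np.array([0.5, 0.2])
print(lbfgs_direction(g(x1), [x1 - x0], [g(x1) - g(x0)]))
```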
+## + +model: + horizon: 1 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 0.0 + + enable: False + dt_traj_params: + base_dt: 0.02 + base_ratio: 1.0 + max_dt: 0.25 + vel_scale: 1.0 + control_space: 'POSITION' + teleport_mode: True +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + #weight: [100, 500, 10, 20] + weight: [500, 5000, 30, 30] + vec_convergence: [0.00, 0.000] + terminal: False + use_metric: True + + cspace_cfg: + weight: 0.000 + bound_cfg: + weight: 50.0 + activation_distance: [0.1] + null_space_weight: [1.0] + primitive_collision_cfg: + weight: 1000.0 + use_sweep: False + classify: False + activation_distance: 0.025 + self_collision_cfg: + weight: 1000.0 + classify: False + + +lbfgs: + n_iters: 80 #60 + inner_iters: 20 + cold_start_n_iters: 80 + min_iters: 20 + line_search_scale: [0.01, 0.3, 0.7, 1.0] #[0.01,0.4, 0.9, 1.0] # # + fixed_iters: True + cost_convergence: 1e-7 + cost_delta_threshold: 1e-6 #0.0001 + cost_relative_threshold: 1.0 + epsilon: 0.01 # used only in stable_mode + history: 4 + horizon: 1 + use_cuda_graph: True + n_envs: 1 + store_debug: False + use_cuda_kernel: True + stable_mode: True + line_search_type: "approx_wolfe" #"wolfe" + use_cuda_line_search_kernel: True + use_cuda_update_best_kernel: True + sync_cuda_time: True + step_scale: 1.0 + use_coo_sparse: True + last_best: 10 + debug_info: + visual_traj : null #'ee_pos_seq' + + diff --git a/src/curobo/content/configs/task/gradient_mpc.yml b/src/curobo/content/configs/task/gradient_mpc.yml new file mode 100644 index 00000000..58b49363 --- /dev/null +++ b/src/curobo/content/configs/task/gradient_mpc.yml @@ -0,0 +1,99 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
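`state_filter_cfg` blends each incoming robot state with the previous filtered estimate. A sketch under the common first-order convention that a coefficient of 1.0 trusts the new measurement entirely and 0.0 holds the previous value (the exact convention inside cuRobo is an assumption here):

```python
def blend(new, old, coeff):
    # coeff = 1.0 -> pass-through; coeff = 0.0 -> hold the old estimate
    return coeff * new + (1.0 - coeff) * old

# gradient_mpc below sets the position/velocity coefficients to 0.0 with
# enable: True, i.e. maximal smoothing of those channels:
print(blend(0.50, 0.40, 0.0))  # -> 0.4
```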
+## + +model: + horizon: 40 + state_filter_cfg: + filter_coeff: + position: 0.0 + velocity: 0.0 + acceleration: 0.0 + enable: True + dt_traj_params: + base_dt: 0.02 + base_ratio: 0.5 + max_dt: 0.02 + vel_scale: 1.0 + control_space: 'POSITION' + teleport_mode: False + state_finite_difference_mode: "CENTRAL" + +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps + run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + weight: [500,2000.0,10,10] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0,0.0] #[0.001,0.0001] #[0.01, 0.001] # orientation, position + terminal: True + run_weight: 0.1 + use_metric: True + + cspace_cfg: + weight: 00.0 + terminal: True + run_weight: 1.0 + + bound_cfg: + weight: [50.0, 0.0,0.0,0.0] + activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk + smooth_weight: [0.0, 100.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc] + run_weight_velocity: 0.0 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + primitive_collision_cfg: + weight: 500.0 + use_sweep: True + sweep_steps: 4 + classify: False + use_sweep_kernel: True + use_speed_metric: False + speed_dt: 0.1 # used only for speed metric + activation_distance: 0.025 + self_collision_cfg: + weight: 500.0 + classify: False + + null_space_cfg: + weight: 0.1 + terminal: True + run_weight: 1.0 + use_null_space: True + + stop_cfg: + weight: 10.0 #50.0 + max_nlimit: 0.5 #0.2 + +lbfgs: + n_iters: 150 #125 #@200 #250 #250 # 150 #25 + inner_iters: 25 + cold_start_n_iters: 500 #125 #200 #250 #$150 #25 + min_iters: 50 + line_search_scale: [0.01,0.25, 0.75,1.0] #[0.01,0.25,0.7, 1.0] # [0.01, 0.8, 1.0] # + fixed_iters: True + cost_convergence: 0.01 + cost_delta_threshold: 0.0001 + epsilon: 0.01 + history: 6 #15 + use_cuda_graph: True + n_envs: 1 + store_debug: False + use_cuda_kernel: True + stable_mode: True + line_search_type: "approx_wolfe" #"strong_wolfe" #"strong_wolfe" + use_cuda_line_search_kernel: True + use_cuda_update_best_kernel: True + sync_cuda_time: True + use_temporal_smooth: False + last_best: 26 + step_scale: 1.0 + use_coo_sparse: True + debug_info: + visual_traj : null #'ee_pos_seq' diff --git a/src/curobo/content/configs/task/gradient_trajopt.yml b/src/curobo/content/configs/task/gradient_trajopt.yml new file mode 100644 index 00000000..fb4a84fd --- /dev/null +++ b/src/curobo/content/configs/task/gradient_trajopt.yml @@ -0,0 +1,98 @@ + +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
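`state_finite_difference_mode: "CENTRAL"` means velocities and accelerations are recovered from the position waypoints by central differences. A NumPy sketch of the idea; `np.gradient` stands in here as the closest off-the-shelf analogue of the fixed-stencil kernels:

```python
import numpy as np

def central_derivatives(positions, dt):
    """Velocity and acceleration from a (horizon, dof) position trajectory."""
    vel = np.gradient(positions, dt, axis=0)
    acc = np.gradient(vel, dt, axis=0)
    return vel, acc

q = np.sin(np.linspace(0.0, np.pi, 32))[:, None]  # horizon 32, one joint
vel, acc = central_derivatives(q, 0.2)
print(vel.shape, acc.shape)  # -> (32, 1) (32, 1)
```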
+## + +model: + horizon: 32 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 1.0 + enable: False + dt_traj_params: + base_dt: 0.2 + base_ratio: 1.0 + max_dt: 0.2 + vel_scale: 1.0 + control_space: 'POSITION' + state_finite_difference_mode: "CENTRAL" + teleport_mode: False + return_full_act_buffer: True + +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps + run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position + weight: [2000,20000.0,30,30] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation + terminal: True + run_weight: 0.0 #0.05 + use_metric: True + + cspace_cfg: + weight: 10000.0 + terminal: True + run_weight: 0.00 #1 + + bound_cfg: + weight: [5000.0, 5000.0, 5000.0,5000.0] # four values: position, velocity, acceleration, jerk + smooth_weight: [0.0,3000.0,10.0, 0.0] # [vel, acc, jerk,] + run_weight_velocity: 0.00 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk + null_space_weight: [1.0] + + primitive_collision_cfg: + weight: 10000.0 + use_sweep: True + sweep_steps: 4 + classify: False + use_sweep_kernel: True + use_speed_metric: True + speed_dt: 0.01 # used only for speed metric + activation_distance: 0.025 + + + self_collision_cfg: + weight: 5000.0 + classify: False + + + +lbfgs: + n_iters: 100 #125 #200 #250 #250 # 150 #25 + inner_iters: 25 #25 # 25 + cold_start_n_iters: 100 #125 #200 #250 #150 #25 + min_iters: 25 + line_search_scale: [0.01,0.3,0.7,1.0] #[0.01,0.2, 0.3,0.5,0.7,0.9, 1.0] + fixed_iters: False + cost_convergence: 0.01 + cost_delta_threshold: 1.0 + cost_relative_threshold: 0.9999 + epsilon: 0.01 + history: 4 #15 #14 + use_cuda_graph: True + n_envs: 1 + store_debug: False + use_cuda_kernel: True + stable_mode: True + line_search_type: "approx_wolfe" + use_cuda_line_search_kernel: True + use_cuda_update_best_kernel: True + use_temporal_smooth: False + sync_cuda_time: True + step_scale: 1.0 #1.0 + last_best: 5 + use_coo_sparse: True + debug_info: + visual_traj : null #'ee_pos_seq' diff --git a/src/curobo/content/configs/task/graph.yml b/src/curobo/content/configs/task/graph.yml new file mode 100644 index 00000000..9e8f9a7b --- /dev/null +++ b/src/curobo/content/configs/task/graph.yml @@ -0,0 +1,53 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited.
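In the geometric-planner config below, `node_similarity_distance` decides when a freshly sampled configuration counts as a duplicate of an existing graph node. A sketch of that test:

```python
import numpy as np

def is_novel(node, nodes, node_similarity_distance=0.1):
    """True if `node` is farther than the threshold from every stored node."""
    if len(nodes) == 0:
        return True
    dists = np.linalg.norm(np.asarray(nodes) - np.asarray(node), axis=1)
    return bool(dists.min() > node_similarity_distance)

print(is_novel([0.0, 0.05], [[0.0, 0.0], [1.0, 1.0]]))  # -> False, too close
```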
+## +model: + horizon: 1 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 0.0 + enable: True + dt_traj_params: + base_dt: 0.02 + base_ratio: 1.0 + max_dt: 0.02 + vel_scale: 1.0 + control_space: 'POSITION' + teleport_mode: True + state_finite_difference_mode: "CENTRAL" + + + +graph: + max_nodes: 5000 # node list + steer_delta_buffer: 500 # for steering + sample_pts: 1500 + node_similarity_distance: 0.1 + + rejection_ratio: 20 #$20 + k_nn: 15 + max_buffer: 10000 + max_cg_buffer: 1000 + vertex_n : 30 + + graph_max_attempts: 10 + graph_min_attempts: 1 + init_nodes: 30 + + c_max: 1.25 + use_bias_node: True + compute_metrics: False + interpolation_steps: 1000 + interpolation_type: "linear" + seed: 0 + interpolation_deviation: 0.05 + interpolation_acceleration_scale: 0.25 \ No newline at end of file diff --git a/src/curobo/content/configs/task/particle_ik.yml b/src/curobo/content/configs/task/particle_ik.yml new file mode 100644 index 00000000..6d36a0e9 --- /dev/null +++ b/src/curobo/content/configs/task/particle_ik.yml @@ -0,0 +1,81 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +model: + horizon: 1 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 0.0 + enable: False + dt_traj_params: + base_dt: 0.02 + base_ratio: 1.0 + max_dt: 0.25 + vel_scale: 1.0 + control_space: 'POSITION' + teleport_mode: True +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + weight: [30, 50, 10, 10] #[20.0, 100.0] + vec_convergence: [0.00, 0.000] # orientation, position + terminal: False + use_metric: True + cspace_cfg: + weight: 0.00 + bound_cfg: + weight: 5000.0 + activation_distance: [0.05] + primitive_collision_cfg: + weight: 500.0 + use_sweep: False + classify: False + activation_distance: 0.035 + self_collision_cfg: + weight: 500.0 + classify: False + + +mppi: + init_cov : 1.0 #0.15 #.5 #.5 + gamma : 1.0 + n_iters : 4 + cold_start_n_iters: 4 + step_size_mean : 0.9 + step_size_cov : 0.1 + beta : 0.01 + alpha : 1 + num_particles : 25 #10000 + update_cov : True + cov_type : "DIAG_A" # + kappa : 0.01 + null_act_frac : 0.0 + sample_mode : 'BEST' + base_action : 'REPEAT' + squash_fn : 'CLAMP' + n_envs : 1 + use_cuda_graph : True + seed : 0 + store_debug : False + random_mean : False + sample_per_env : True + sync_cuda_time : False + use_coo_sparse : True + sample_params: + fixed_samples: True + sample_ratio: {'halton':1.0, 'halton-knot':0.0, 'random':0.0, 'random-knot':0.0} + seed: 23 + filter_coeffs: [0.0, 0.0, 1.0] + n_knots: 5 + debug_info: + visual_traj : null #'state_seq' diff --git a/src/curobo/content/configs/task/particle_mpc.yml b/src/curobo/content/configs/task/particle_mpc.yml new file mode 100644 index 00000000..eec1adf2 --- /dev/null +++ b/src/curobo/content/configs/task/particle_mpc.yml @@ -0,0 +1,107 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. 
Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +model: + horizon: 30 + state_filter_cfg: + filter_coeff: + position: 0.1 + velocity: 0.1 + acceleration: 0.0 + enable: True + dt_traj_params: + base_dt: 0.01 + base_ratio: 0.5 + max_dt: 0.04 + vel_scale: 1.0 + control_space: 'ACCELERATION' + teleport_mode: False + state_finite_difference_mode: "CENTRAL" + + +cost: + pose_cfg: + vec_weight: [1.0,1.0, 1.0, 1.0, 1.0, 1.0] + run_vec_weight: [1.0,1.0, 1.0, 1.0, 1.0, 1.0] + weight: [60,300.0,20,20] #[150.0, 2000.0, 30, 40] + vec_convergence: [0.0, 0.00] # orientation, position + terminal: True + run_weight: 1.0 + use_metric: True + + cspace_cfg: + weight: 500.0 + terminal: True + run_weight: 1.0 + + bound_cfg: + weight: [5000.0, 5000.0,5000.0,000.0] + activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk + smooth_weight: [0.0, 50.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc] + run_weight_velocity: 0.0 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + null_space_weight: [10.0] + + primitive_collision_cfg: + weight: 100000.0 + use_sweep: True + sweep_steps: 4 + classify: False + use_sweep_kernel: True + use_speed_metric: False + speed_dt: 0.1 # used only for speed metric + activation_distance: 0.025 + + self_collision_cfg: + weight: 50000.0 + classify: False + + + stop_cfg: + weight: 100.0 #50.0 + max_nlimit: 0.25 #0.2 + + + +mppi: + init_cov : 0.05 #.5 #.5 + gamma : 0.98 + n_iters : 5 + cold_start_n_iters: 5 + step_size_mean : 0.9 + step_size_cov : 0.01 + beta : 0.1 + alpha : 1 + num_particles : 400 #10000 + update_cov : True + cov_type : "DIAG_A" # + kappa : 0.0001 + null_act_frac : 0.05 + sample_mode : 'BEST' + base_action : 'REPEAT' + squash_fn : 'CLAMP' + n_envs : 1 + use_cuda_graph : True + seed : 0 + store_debug : False + random_mean : True + sample_per_env : False + sync_cuda_time : True + use_coo_sparse : True + sample_params: + fixed_samples: True + sample_ratio: {'halton':0.3, 'halton-knot':0.7, 'random':0.0, 'random-knot':0.0} + seed: 0 + filter_coeffs: [0.3, 0.3, 0.4] + n_knots: 5 + debug_info: + visual_traj : 'ee_pos_seq' diff --git a/src/curobo/content/configs/task/particle_trajopt.yml b/src/curobo/content/configs/task/particle_trajopt.yml new file mode 100644 index 00000000..07b466d0 --- /dev/null +++ b/src/curobo/content/configs/task/particle_trajopt.yml @@ -0,0 +1,104 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
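The `particle_*` files configure an MPPI-style optimizer: `num_particles` perturbations of the current mean are rolled out, weighted by exponentiated negative cost (temperature `beta`), and averaged back into the mean with `step_size_mean`. A toy single-waypoint sketch of the textbook update; covariance adaptation (`cov_type: "DIAG_A"`) is omitted:

```python
import numpy as np

rng = np.random.default_rng(0)

def mppi_update(mean, var, cost_fn, num_particles=25, beta=0.01, step_size_mean=0.9):
    noise = rng.normal(0.0, np.sqrt(var), size=(num_particles,) + mean.shape)
    samples = mean + noise
    costs = np.array([cost_fn(s) for s in samples])
    w = np.exp(-(costs - costs.min()) / beta)   # exponentiated-cost weights
    w /= w.sum()
    best = (w[:, None] * samples).sum(axis=0)   # cost-weighted average
    return (1.0 - step_size_mean) * mean + step_size_mean * best

mean = np.ones(6)                 # one waypoint in a 6-DOF cspace
cost = lambda q: float(q @ q)     # toy quadratic cost
for _ in range(4):                # n_iters: 4, as in particle_ik.yml above
    mean = mppi_update(mean, 1.0, cost)
print(np.round(mean, 3))
```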
+## + +model: + horizon: 32 + state_filter_cfg: + filter_coeff: + position: 1.0 + velocity: 1.0 + acceleration: 1.0 + enable: False + dt_traj_params: + base_dt: 0.2 + base_ratio: 1.0 + max_dt: 0.2 + vel_scale: 1.0 + control_space: 'POSITION' + teleport_mode: False + return_full_act_buffer: True + state_finite_difference_mode: "CENTRAL" + +cost: + pose_cfg: + vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight + weight: [250.0, 5000.0, 40, 40] + vec_convergence: [0.0,0.0,1000.0,1000.0] + terminal: True + run_weight: 0.00 + use_metric: True + + cspace_cfg: + weight: 500.0 + terminal: True + run_weight: 0.001 + + bound_cfg: + weight: [0.1, 0.1,0.0,0.0] + activation_distance: [0.0,0.0,0.0,0.0] #-#0.01 + smooth_weight: [0.0,100.0,1.0,0.0] + run_weight_velocity: 0.0 + run_weight_acceleration: 1.0 + run_weight_jerk: 1.0 + + primitive_collision_cfg: + weight: 5000.0 + use_sweep: True + classify: False + sweep_steps: 4 + use_sweep_kernel: True + use_speed_metric: True + speed_dt: 0.01 # used only for speed metric + activation_distance: 0.025 + + self_collision_cfg: + weight: 500.0 + classify: False + + + + +mppi: + init_cov : 0.1 #0.5 + gamma : 1.0 + n_iters : 2 + cold_start_n_iters: 2 + step_size_mean : 0.9 + step_size_cov : 0.01 + beta : 0.01 + alpha : 1 + num_particles : 25 # 100 + update_cov : True + cov_type : "DIAG_A" # + kappa : 0.001 + null_act_frac : 0.0 + sample_mode : 'BEST' + base_action : 'REPEAT' + squash_fn : 'CLAMP' + n_envs : 1 + use_cuda_graph : True + seed : 0 + store_debug : False + random_mean : True + sample_per_env : True + sync_cuda_time : False + use_coo_sparse : True + sample_params: + fixed_samples: True + sample_ratio: {'halton':0.0, 'halton-knot':0.0, 'random':0.0, 'random-knot':0.0, "stomp": 1.0} + seed: 25 + filter_coeffs: [0.3, 0.3, 0.4] + n_knots: 5 + debug_info: + visual_traj : null #'ee_pos_seq' + + diff --git a/src/curobo/content/configs/world/collision_base.yml b/src/curobo/content/configs/world/collision_base.yml new file mode 100644 index 00000000..6a3fbfd7 --- /dev/null +++ b/src/curobo/content/configs/world/collision_base.yml @@ -0,0 +1,14 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + table: + dims: [0.1, 0.1, 0.8] # x, y, z + pose: [0.0, 0.0, -0.45, 1, 0, 0, 0.0] # x, y, z, qw, qx, qy, qz diff --git a/src/curobo/content/configs/world/collision_cage.yml b/src/curobo/content/configs/world/collision_cage.yml new file mode 100644 index 00000000..4d344801 --- /dev/null +++ b/src/curobo/content/configs/world/collision_cage.yml @@ -0,0 +1,116 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
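The world configs describe obstacles as cuboids with `dims: [x, y, z]` and `pose: [x, y, z, qw, qx, qy, qz]`. The collision checker needs a signed distance to each box; a sketch for the identity-rotation case, checked against the `table` cuboid in collision_base.yml above:

```python
import numpy as np

def cuboid_sdf(point, dims, pose):
    """Signed distance from a world point to an axis-aligned cuboid.
    Rotation is ignored (identity quaternion assumed) to keep the sketch short."""
    p = np.asarray(point, dtype=float) - np.asarray(pose[:3], dtype=float)
    q = np.abs(p) - np.asarray(dims, dtype=float) / 2.0
    outside = float(np.linalg.norm(np.maximum(q, 0.0)))
    inside = float(min(q.max(), 0.0))
    return outside + inside

# 0.25 m above the top of the table from collision_base.yml:
print(cuboid_sdf([0.0, 0.0, 0.2], [0.1, 0.1, 0.8], [0.0, 0.0, -0.45, 1, 0, 0, 0]))
```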
+## + + cuboid: + cube0: + dims: + - 0.44759031573031055 + - 0.6041613589568638 + - 0.04 + pose: + - 0.5368277748379187 + - 0.5609131160539919 + - -0.21207545174379683 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube1: + dims: + - 0.04 + - 0.6441613589568638 + - 0.507432552596414 + pose: + - 0.4043908894165481 + - 0.7665743675779062 + - 0.02164082455441016 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube2: + dims: + - 0.04 + - 0.6441613589568638 + - 0.507432552596414 + pose: + - 0.7010117134542583 + - 0.37958547530685705 + - 0.02164082455441016 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube3: + dims: + - 0.5275903157303106 + - 0.04 + - 0.04 + pose: + - 0.2811999632260791 + - 0.36497846872527095 + - -0.07378708334618755 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube4: + dims: + - 0.5275903157303106 + - 0.04 + - 0.04 + pose: + - 0.2811999632260791 + - 0.36497846872527095 + - 0.1293126754499218 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube5: + dims: + - 0.44759031573031055 + - 0.42891295126980467 + - 0.04 + pose: + - 0.6381200845475716 + - 0.6385520586046123 + - 0.25535710085261715 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube6: + dims: + - 0.44759031573031055 + - 0.04 + - 0.467432552596414 + pose: + - 0.7924555864497582 + - 0.7568477633827128 + - 0.0016408245544101419 + - 0.8967553371515242 + - 0.0 + - 0.0 + - -0.44252668313928384 + cube7: + dims: + - 0.25 + - 0.2 + - 0.5 + pose: + - -0.05 + - 0.0 + - -0.25 + - 1.0 + - 0.0 + - 0.0 + - 0.0 diff --git a/src/curobo/content/configs/world/collision_cubby.yml b/src/curobo/content/configs/world/collision_cubby.yml new file mode 100644 index 00000000..d075ba2a --- /dev/null +++ b/src/curobo/content/configs/world/collision_cubby.yml @@ -0,0 +1,58 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + table: + dims: [5.2, 5.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + + + + + cube4: + dims: + - 1.5 + - 0.1 + - 0.9 + pose: + - -0.65 + - -0.7248229665483708 + - 0.552010009365394 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 + + cube51: + dims: + - 0.10037689095815354 + - 0.2 + - 0.9 + pose: + - -0.33 + - 0.3 + - 0.53552010009365394 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 + cube6: + dims: + - 0.1 + - 0.8 + - 1.6 + pose: + - 0.4 + - -0.1 + - 0.3 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 diff --git a/src/curobo/content/configs/world/collision_floor_plan.yml b/src/curobo/content/configs/world/collision_floor_plan.yml new file mode 100644 index 00000000..d11859f3 --- /dev/null +++ b/src/curobo/content/configs/world/collision_floor_plan.yml @@ -0,0 +1,71 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. 
Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + table: + dims: [5.2, 5.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 0, 0, 0, 1.0] # x, y, z, qx, qy, qz, qw + + + + + cube4: + dims: + - 1.5 + - 0.1 + - 0.9 + pose: + - -0.65 + - -0.7248229665483708 + - 0.552010009365394 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 + + cube51: + dims: + - 0.10037689095815354 + - 0.6 + - 0.9 + pose: + - -0.5 + - 0.3 + - 0.53552010009365394 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 + cube6: + dims: + - 0.1 + - 1.8 + - 1.6 + pose: + - 0.4 + - -0.1 + - 0.3 + - 0.9982952839725183 + - 0.0 + - 0.0 + - 0.05836545209478731 + cube61: + dims: + - 1.5 + - 0.1 + - 1.6 + pose: + - -0.4 + - 0.25 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 diff --git a/src/curobo/content/configs/world/collision_handover.yml b/src/curobo/content/configs/world/collision_handover.yml new file mode 100644 index 00000000..8a2e3ce1 --- /dev/null +++ b/src/curobo/content/configs/world/collision_handover.yml @@ -0,0 +1,17 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + table: + dims: [2.0, 2.0, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + hand: + dims: [0.05, 0.18, 0.15] # x, y, z + pose: [0.0, 0.0, -0.3, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw diff --git a/src/curobo/content/configs/world/collision_mesh_scene.yml b/src/curobo/content/configs/world/collision_mesh_scene.yml new file mode 100644 index 00000000..cf63f5dc --- /dev/null +++ b/src/curobo/content/configs/world/collision_mesh_scene.yml @@ -0,0 +1,14 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +mesh: + base_scene: + pose: [1.5, 0.080, 1.6, 0.043, -0.471, 0.284, 0.834] + file_path: "scene/nvblox/srl_ur10_bins.obj" diff --git a/src/curobo/content/configs/world/collision_nvblox.yml b/src/curobo/content/configs/world/collision_nvblox.yml new file mode 100644 index 00000000..661f3d8c --- /dev/null +++ b/src/curobo/content/configs/world/collision_nvblox.yml @@ -0,0 +1,19 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## +blox: + world: + pose: [1.5, 0.080, 1.55, 0.043, -0.471, 0.284, 0.834] + map_path: "scene/nvblox/srl_ur10_bins.nvblx" + mesh_file_path: "scene/nvblox/srl_ur10_bins.obj" + integrator_type: "tsdf" + voxel_size: 0.03 + + diff --git a/src/curobo/content/configs/world/collision_nvblox_online.yml b/src/curobo/content/configs/world/collision_nvblox_online.yml new file mode 100644 index 00000000..26b08086 --- /dev/null +++ b/src/curobo/content/configs/world/collision_nvblox_online.yml @@ -0,0 +1,17 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +blox: + world: + pose: [0,0,0,1,0,0,0] + integrator_type: "tsdf" + voxel_size: 0.03 + + \ No newline at end of file diff --git a/src/curobo/content/configs/world/collision_nvblox_ur10.yml b/src/curobo/content/configs/world/collision_nvblox_ur10.yml new file mode 100644 index 00000000..6589ab42 --- /dev/null +++ b/src/curobo/content/configs/world/collision_nvblox_ur10.yml @@ -0,0 +1,36 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +blox: + world_map: + pose: [-0.38, 0.280, 1.05,0.043, -0.471, 0.284, 0.834] + map_path: "scene/nvblox/srl_ur10_bins.nvblx" + mesh_file_path: "scene/nvblox/srl_ur10_bins.obj" + +cuboid: + case: + dims: [0.9, 0.7, 0.4] # x, y, z + pose: [0.0, 0.0, -0.6, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + + stand: + dims: [0.3, 0.3, 0.8] # x, y, z + pose: [0.0, 0.0, -0.4, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + table: + dims: [4.0, 4.0, 0.2] # x, y, z + pose: [0.0, 0.0, -0.85, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + + safety_wall: + dims: [2.0, 0.2, 2.0] # x, y, z + pose: [0.0, -1.2, 0.0, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + + + #safety_wall_ceiling: + # dims: [2.0, 2.0, 0.1] # x, y, z + # pose: [0.0, 0.0, 1.4, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw diff --git a/src/curobo/content/configs/world/collision_pillar_wall.yml b/src/curobo/content/configs/world/collision_pillar_wall.yml new file mode 100644 index 00000000..a50e4d5d --- /dev/null +++ b/src/curobo/content/configs/world/collision_pillar_wall.yml @@ -0,0 +1,60 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## + +## +cuboid: + table: + dims: [2.2, 2.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + color: + - 0.6 + - 0.6 + - 0.8 + - 1.0 + cube6: + dims: + - 0.05 + - 0.05 + - 0.6 + pose: + - 0.4 + - 0.0 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + color: + - 0.8 + - 0.0 + - 0.0 + - 1.0 + + cube4: + dims: + - 0.05 + - 0.05 + - 0.6 + pose: + - -0.6 + - 0.0 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + color: + - 0.8 + - 0.0 + - 0.0 + - 1.0 + + \ No newline at end of file diff --git a/src/curobo/content/configs/world/collision_primitives_3d.yml b/src/curobo/content/configs/world/collision_primitives_3d.yml new file mode 100644 index 00000000..beb60673 --- /dev/null +++ b/src/curobo/content/configs/world/collision_primitives_3d.yml @@ -0,0 +1,39 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + #cube1: + # dims: [0.7, 0.1, 0.4] # x, y, z + # pose: [0.6, 0.2, 0.1, 0, 0, 0, 1.0] # x + cube2: + dims: [0.3, 0.1, 0.5] # x, y, z + pose: [0.4, -0.3, 0.2, 1, 0, 0, 0.0] # x + cube3: + dims: [2.0, 2.0, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x +sphere: + sphere1: + position: [0.5,0.1,0.1] + radius: 0.1 + sphere2: + position: [-0.5,0.1,0.1] + radius: 0.1 + +capsule: + capsule1: + radius: 0.1 + base: [0.0,0.0,0.1] + tip: [0.0,0.0,0.5] + pose: [0.5,0.0,0.0,1.0,0.0,0.0,0.0] + capsule2: + radius: 0.1 + base: [0.0,0.0,0.1] + tip: [0.0,0.0,0.5] + pose: [0.0,0.5,0.0,1.0,0.0,0.0,0.0] \ No newline at end of file diff --git a/src/curobo/content/configs/world/collision_table.yml b/src/curobo/content/configs/world/collision_table.yml new file mode 100644 index 00000000..169a95fa --- /dev/null +++ b/src/curobo/content/configs/world/collision_table.yml @@ -0,0 +1,14 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## +cuboid: + table: + dims: [5.0, 5.0, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw diff --git a/src/curobo/content/configs/world/collision_test.yml b/src/curobo/content/configs/world/collision_test.yml new file mode 100644 index 00000000..9aa25c31 --- /dev/null +++ b/src/curobo/content/configs/world/collision_test.yml @@ -0,0 +1,37 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. 
+## + +## +cuboid: + table: + dims: [2.2, 2.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] + + cube6: + dims: + - 0.1 + - 0.1 + - 1.5 + pose: + - 0.4 + - -0.1 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + color: + - 1.0 + - 0.0 + - 0.0 + - 1.0 + + \ No newline at end of file diff --git a/src/curobo/content/configs/world/collision_thin_walls.yml b/src/curobo/content/configs/world/collision_thin_walls.yml new file mode 100644 index 00000000..12186e0a --- /dev/null +++ b/src/curobo/content/configs/world/collision_thin_walls.yml @@ -0,0 +1,72 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +## +cuboid: + table: + dims: [2.2, 2.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # + + cube6: + dims: + - 0.05 + - 0.01 + - 1.5 + pose: + - 0.4 + - -0.1 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + + cube62: + dims: + - 0.05 + - 0.01 + - 1.5 + pose: + - 0.0 + - 0.4 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + cube63: + dims: + - 0.05 + - 0.01 + - 1.5 + pose: + - 0.0 + - -0.4 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + + #cube8: + # dims: + # - 0.9 + # - 0.1 + # - 0.02 + # pose: + # - 0.0 + # - 0.0 + # - 0.9 + # - 1.0 + # - 0.0 + # - 0.0 + # - 0.0 + \ No newline at end of file diff --git a/src/curobo/content/configs/world/collision_wall.yml b/src/curobo/content/configs/world/collision_wall.yml new file mode 100644 index 00000000..a295b032 --- /dev/null +++ b/src/curobo/content/configs/world/collision_wall.yml @@ -0,0 +1,42 @@ +## +## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## +## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +## property and proprietary rights in and to this material, related +## documentation and any modifications thereto. Any use, reproduction, +## disclosure or distribution of this material and related documentation +## without an express license agreement from NVIDIA CORPORATION or +## its affiliates is strictly prohibited. +## + +## +cuboid: + table: + dims: [2.2, 2.2, 0.2] # x, y, z + pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw + color: + - 0.6 + - 0.6 + - 0.8 + - 1.0 + + cube4: + dims: + - 0.05 + - 2.0 + - 2.0 + pose: + - -0.5 + - 0.0 + - 0.3 + - 1.0 + - 0.0 + - 0.0 + - 0.0 + color: + - 0.6 + - 0.6 + - 0.8 + - 1.0 + + \ No newline at end of file diff --git a/src/curobo/cuda_robot_model/__init__.py b/src/curobo/cuda_robot_model/__init__.py new file mode 100644 index 00000000..5da8758a --- /dev/null +++ b/src/curobo/cuda_robot_model/__init__.py @@ -0,0 +1,43 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +This module contains code for cuda accelerated kinematics. 
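+
+A minimal usage sketch is below. This is illustrative only: ``franka.yml`` is an assumed
+example robot configuration file name and the zero joint configuration is a placeholder.
+
+.. code-block:: python
+
+    import torch
+
+    from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig
+
+    # load a robot configuration file and build the kinematics representation:
+    config = CudaRobotModelConfig.from_robot_yaml_file("franka.yml")
+    robot_model = CudaRobotModel(config)
+
+    # query forward kinematics at a placeholder joint configuration:
+    q = torch.zeros(
+        (1, robot_model.get_dof()),
+        device=config.tensor_args.device,
+        dtype=config.tensor_args.dtype,
+    )
+    state = robot_model.get_state(q)  # end-effector pose, link poses, robot spheres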
+
+Kinematics in CuRobo currently supports single-axis actuated joints, where each joint can be
+actuated as a prismatic or revolute joint. Continuous joints are approximated as revolute joints
+with limits at [-6, +6] radians. Mimic joints are not supported, so convert mimic joints to
+independent joints.
+
+CuRobo loads a robot's kinematic tree from :class:`types.KinematicsTensorConfig`. This config is
+generated using :class:`cuda_robot_generator.CudaRobotGenerator`. A parser base class
+:class:`kinematics_parser.KinematicsParser` is provided to help with parsing kinematics from
+standard formats. Kinematics parsing from URDF is implemented in
+:class:`urdf_kinematics_parser.UrdfKinematicsParser`. An experimental USD kinematics parser is
+provided in :class:`usd_kinematics_parser.UsdKinematicsParser`; it is missing an additional
+transform between the joint origin and link origin, so it might not work for all robots.
+
+In addition to parsing data from a kinematics file (URDF, USD), CuRobo also needs a sphere
+representation of the robot that approximates the volume of the robot's links with spheres.
+Several other parameters are also needed to represent kinematics in CuRobo. A tutorial on
+setting up a robot is provided in :ref:`tut_robot_configuration`.
+
+Once a robot configuration file is set up, you can pass it to
+:class:`cuda_robot_model.CudaRobotModelConfig` to generate an instance of the kinematics
+configuration. :class:`cuda_robot_model.CudaRobotModel` takes this configuration and provides
+access to kinematics computations.
+
+.. note::
+    :class:`cuda_robot_model.CudaRobotModel` creates memory tensors that are used by CUDA kernels
+    while :class:`cuda_robot_model.CudaRobotModelConfig` contains only the robot kinematics
+    configuration. To reduce memory overhead, you can pass one instance of
+    :class:`cuda_robot_model.CudaRobotModelConfig` to many instances of
+    :class:`cuda_robot_model.CudaRobotModel`.
+"""
diff --git a/src/curobo/cuda_robot_model/cuda_robot_generator.py b/src/curobo/cuda_robot_model/cuda_robot_generator.py
new file mode 100644
index 00000000..ac70fb60
--- /dev/null
+++ b/src/curobo/cuda_robot_model/cuda_robot_generator.py
@@ -0,0 +1,888 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# +from __future__ import annotations + +# Standard Library +import copy +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +# Third Party +import importlib_resources +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.cuda_robot_model.kinematics_parser import LinkParams +from curobo.cuda_robot_model.types import ( + CSpaceConfig, + JointLimits, + JointType, + KinematicsTensorConfig, + SelfCollisionKinematicsConfig, +) +from curobo.cuda_robot_model.urdf_kinematics_parser import UrdfKinematicsParser +from curobo.curobolib.kinematics import get_cuda_kinematics +from curobo.geom.types import tensor_sphere +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.util.logger import log_error, log_info, log_warn +from curobo.util_file import get_assets_path, get_robot_configs_path, join_path, load_yaml + +try: + # CuRobo + from curobo.cuda_robot_model.usd_kinematics_parser import UsdKinematicsParser +except ImportError: + log_warn( + "USDParser failed to import, install curobo with pip install .[usd] " + + "or pip install usd-core" + ) + + +@dataclass +class CudaRobotGeneratorConfig: + """Create Cuda Robot Model Configuration.""" + + #: Name of base link for kinematic tree. + base_link: str + + #: Name of end-effector link to compute pose. + ee_link: str + + #: Device to load cuda robot model. + tensor_args: TensorDeviceType = TensorDeviceType() + + #: Name of link names to compute pose in addition to ee_link. + link_names: Optional[List[str]] = None + + #: Name of links to compute sphere positions for use in collision checking. + collision_link_names: Optional[List[str]] = None + + #: Collision spheres that fill the volume occupied by the links of the robot. + #: Collision spheres can be generated for robot using https://docs.omniverse.nvidia.com/app_isaacsim/app_isaacsim/advanced_tutorials/tutorial_motion_generation_robot_description_editor.html#collision-spheres + collision_spheres: Union[None, str, Dict[str, Any]] = None + + #: Radius buffer to add to collision spheres as padding. + collision_sphere_buffer: float = 0.0 + + #: Compute jacobian of link poses. Currently not supported. + compute_jacobian: bool = False + + #: Padding to add for self collision between links. Some robots use a large padding + #: for self collision avoidance (https://github.com/ros-planning/panda_moveit_config/pull/35#issuecomment-671333863) + self_collision_buffer: Optional[Dict[str, float]] = None + + #: Self-collision ignore + self_collision_ignore: Optional[Dict[str, List[str]]] = None + + #: debug config + debug: Optional[Dict[str, Any]] = None + + #: Enabling this flag writes out the cumulative transformation matrix to global memory. This + #: allows for reusing the cumulative matrix during backward of kinematics (15% speedup over + #: recomputing cumul in backward). + use_global_cumul: bool = True + + #: Path of meshes of robot links. Currently not used as we represent robot link geometry with + #: collision spheres. + asset_root_path: str = "" + + #: Names of links to load meshes for visualization. This is only used for exporting + #: visualizations. + mesh_link_names: Optional[List[str]] = None + + #: Set this to true to add mesh_link_names to link_names when computing kinematics. + load_link_names_with_mesh: bool = False + + #: Path to load robot urdf. + urdf_path: Optional[str] = None + + #: Path to load robot usd. 
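+    #: The path is resolved relative to the content assets directory in ``__post_init__``
+    #: (same handling as :attr:`urdf_path`).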
+ usd_path: Optional[str] = None + + #: Root prim of robot in usd. + usd_robot_root: Optional[str] = None + + #: Path of robot in Isaac server. + isaac_usd_path: Optional[str] = None + + #: Load Kinematics chain from usd. + use_usd_kinematics: bool = False + + #: Joints to flip axis when loading from USD + usd_flip_joints: Optional[List[str]] = None + + #: Flip joint limits in USD. + usd_flip_joint_limits: Optional[List[str]] = None + + #: Lock active joints in the kinematic tree. This will convert the joint to a fixed joint with + #: joint angle given from this dictionary. + lock_joints: Optional[Dict[str, float]] = None + + extra_links: Optional[Dict[str, LinkParams]] = None + + #: this is deprecated + add_object_link: bool = False + + use_external_assets: bool = False + + #: Create n collision spheres for links with name + extra_collision_spheres: Optional[Dict[str, int]] = None + + #: cspace config + cspace: Union[None, CSpaceConfig, Dict[str, List[Any]]] = None + + def __post_init__(self): + # add root path: + if self.urdf_path is not None: + self.urdf_path = join_path(get_assets_path(), self.urdf_path) + if self.usd_path is not None: + self.usd_path = join_path(get_assets_path(), self.usd_path) + if self.asset_root_path != "": + if self.use_external_assets: + with importlib_resources.files("content") as path: + content_dir_posix = path + self.asset_root_path = join_path(content_dir_posix, self.asset_root_path) + + else: + self.asset_root_path = join_path(get_assets_path(), self.asset_root_path) + elif self.urdf_path is not None: + self.asset_root_path = os.path.dirname(self.urdf_path) + + if self.collision_spheres is None and ( + self.collision_link_names is not None and len(self.collision_link_names) > 0 + ): + log_error("collision link names are provided without robot collision spheres") + if self.load_link_names_with_mesh: + if self.link_names is None: + self.link_names = copy.deepcopy(self.mesh_link_names) + else: + for i in self.mesh_link_names: + if i not in self.link_names: + self.link_names.append(i) + if self.link_names is None: + self.link_names = [self.ee_link] + if self.collision_link_names is None: + self.collision_link_names = [] + if self.ee_link not in self.link_names: + self.link_names.append(self.ee_link) + if self.collision_spheres is not None: + if isinstance(self.collision_spheres, str): + coll_yml = join_path(get_robot_configs_path(), self.collision_spheres) + coll_params = load_yaml(coll_yml) + + self.collision_spheres = coll_params["collision_spheres"] + if self.extra_collision_spheres is not None: + for k in self.extra_collision_spheres.keys(): + new_spheres = [ + {"center": [0.0, 0.0, 0.0], "radius": -10.0} + for n in range(self.extra_collision_spheres[k]) + ] + self.collision_spheres[k] = new_spheres + if self.use_usd_kinematics and self.usd_path is None: + log_error("usd_path is required to load kinematics from usd") + if self.usd_flip_joints is None: + self.usd_flip_joints = {} + if self.usd_flip_joint_limits is None: + self.usd_flip_joint_limits = [] + if self.extra_links is None: + self.extra_links = {} + else: + for k in self.extra_links.keys(): + if isinstance(self.extra_links[k], dict): + self.extra_links[k] = LinkParams.from_dict(self.extra_links[k]) + if isinstance(self.cspace, Dict): + self.cspace = CSpaceConfig(**self.cspace, tensor_args=self.tensor_args) + + +class CudaRobotGenerator(CudaRobotGeneratorConfig): + def __init__(self, config: CudaRobotGeneratorConfig) -> None: + super().__init__(**vars(config)) + self.cpu_tensor_args = 
self.tensor_args.cpu() + + self._self_collision_data = None + self.non_fixed_joint_names = [] + self._n_dofs = 1 + + self.initialize_tensors() + + @property + def kinematics_config(self): + return self._kinematics_config + + @property + def self_collision_config(self): + return self._self_collision_data + + @property + def kinematics_parser(self): + return self._kinematics_parser + + @profiler.record_function("robot_generator/initialize_tensors") + def initialize_tensors(self): + self._joint_limits = None + self._self_collision_data = None + self.lock_jointstate = None + self.lin_jac, self.ang_jac = None, None + + self._link_spheres_tensor = torch.empty( + (0, 4), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._link_sphere_idx_map = torch.empty( + (0), dtype=torch.int16, device=self.tensor_args.device + ) + self.total_spheres = 0 + self.self_collision_distance = ( + torch.zeros( + (self.total_spheres, self.total_spheres), + dtype=self.tensor_args.dtype, + device=self.tensor_args.device, + ) + - torch.inf + ) + self.self_collision_offset = torch.zeros( + (self.total_spheres), dtype=self.tensor_args.dtype, device=self.tensor_args.device + ) + # create a mega list of all links that we need: + other_links = copy.deepcopy(self.link_names) + + for i in self.collision_link_names: + if i not in self.link_names: + other_links.append(i) + for i in self.extra_links: + p_name = self.extra_links[i].parent_link_name + if p_name not in self.link_names and p_name not in other_links: + other_links.append(p_name) + + # other_links = list(set(self.link_names + self.collision_link_names)) + + # load kinematics parser based on file type: + # NOTE: Also add option to load from data buffers. + if self.use_usd_kinematics: + self._kinematics_parser = UsdKinematicsParser( + self.usd_path, + flip_joints=self.usd_flip_joints, + flip_joint_limits=self.usd_flip_joint_limits, + extra_links=self.extra_links, + usd_robot_root=self.usd_robot_root, + ) + else: + self._kinematics_parser = UrdfKinematicsParser( + self.urdf_path, mesh_root=self.asset_root_path, extra_links=self.extra_links + ) + + if self.lock_joints is None: + self._build_kinematics(self.base_link, self.ee_link, other_links, self.link_names) + else: + self._build_kinematics_with_lock_joints( + self.base_link, self.ee_link, other_links, self.link_names, self.lock_joints + ) + if self.cspace is None: + jpv = self._get_joint_position_velocity_limits() + self.cspace = CSpaceConfig.load_from_joint_limits( + jpv["position"][1, :], jpv["position"][0, :], self.joint_names, self.tensor_args + ) + + self.cspace.inplace_reindex(self.joint_names) + self._update_joint_limits() + self._ee_idx = self.link_names.index(self.ee_link) + + # create kinematics tensor: + self._kinematics_config = KinematicsTensorConfig( + fixed_transforms=self._fixed_transform, + link_map=self._link_map, + joint_map=self._joint_map, + joint_map_type=self._joint_map_type, + store_link_map=self._store_link_map, + link_chain_map=self._link_chain_map, + link_names=self.link_names, + link_spheres=self._link_spheres_tensor, + link_sphere_idx_map=self._link_sphere_idx_map, + n_dof=self._n_dofs, + joint_limits=self._joint_limits, + non_fixed_joint_names=self.non_fixed_joint_names, + total_spheres=self.total_spheres, + link_name_to_idx_map=self._name_to_idx_map, + joint_names=self.joint_names, + debug=self.debug, + ee_idx=self._ee_idx, + mesh_link_names=self.mesh_link_names, + cspace=self.cspace, + base_link=self.base_link, + ee_link=self.ee_link, + 
lock_jointstate=self.lock_jointstate, + ) + if self.asset_root_path != "": + self._kinematics_parser.add_absolute_path_to_link_meshes(self.asset_root_path) + + def add_link(self, link_params: LinkParams): + self.extra_links[link_params.link_name] = link_params + + def add_fixed_link( + self, + link_name: str, + parent_link_name: str, + joint_name: Optional[str] = None, + transform: Optional[Pose] = None, + ): + if transform is None: + transform = ( + Pose.from_list([0, 0, 0, 1, 0, 0, 0], self.tensor_args) + .get_matrix() + .view(4, 4) + .cpu() + .numpy() + ) + if joint_name is None: + joint_name = link_name + "_j_" + parent_link_name + link_params = LinkParams( + link_name=link_name, + parent_link_name=parent_link_name, + joint_name=joint_name, + fixed_transform=transform, + joint_type=JointType.FIXED, + ) + self.add_link(link_params) + + @profiler.record_function("robot_generator/build_chain") + def _build_chain(self, base_link, ee_link, other_links, link_names): + self._n_dofs = 0 + self._controlled_joints = [] + self._bodies = [] + + self._name_to_idx_map = dict() + self.base_link = base_link + self.ee_link = ee_link + self.joint_names = [] + self._fixed_transform = [] + chain_link_names = self._kinematics_parser.get_chain(base_link, ee_link) + self._add_body_to_tree(chain_link_names[0], base=True) + for i, l_name in enumerate(chain_link_names[1:]): + self._add_body_to_tree(l_name) + # check if all links are in the built tree: + + for i in other_links: + if i in self._name_to_idx_map: + continue + if i not in self.extra_links.keys(): + chain_l_names = self._kinematics_parser.get_chain(base_link, i) + + for k in chain_l_names: + if k in chain_link_names: + continue + # if link name is not in chain, add to chain + chain_link_names.append(k) + # add to tree: + self._add_body_to_tree(k, base=False) + for i in self.extra_links.keys(): + if i not in chain_link_names: + self._add_body_to_tree(i, base=False) + chain_link_names.append(i) + + self.non_fixed_joint_names = self.joint_names.copy() + return chain_link_names + + @profiler.record_function("robot_generator/build_kinematics_tensors") + def _build_kinematics_tensors(self, base_link, ee_link, link_names, chain_link_names): + link_map = [0 for i in range(len(self._bodies))] + store_link_map = [] # [-1 for i in range(len(self._bodies))] + + joint_map = [ + -1 if i not in self._controlled_joints else i for i in range(len(self._bodies)) + ] + joint_map_type = [ + -1 if i not in self._controlled_joints else i for i in range(len(self._bodies)) + ] + all_joint_names = [] + j_count = 0 + ordered_link_names = [] + # add body 0 details: + if self._bodies[0].link_name in link_names: + store_link_map.append(chain_link_names.index(self._bodies[0].link_name)) + ordered_link_names.append(self._bodies[0].link_name) + # get joint types: + for i in range(1, len(self._bodies)): + body = self._bodies[i] + parent_name = body.parent_link_name + link_map[i] = self._name_to_idx_map[parent_name] + all_joint_names.append(body.joint_name) + if body.link_name in link_names: + store_link_map.append(chain_link_names.index(body.link_name)) + ordered_link_names.append(body.link_name) + if i in self._controlled_joints: + joint_map[i] = j_count + joint_map_type[i] = body.joint_type.value + j_count += 1 + self.link_names = ordered_link_names + # do a for loop to get link matrix: + link_chain_map = torch.eye( + len(chain_link_names), dtype=torch.int16, device=self.cpu_tensor_args.device + ) + + # iterate and set true: + for i in range(len(chain_link_names)): + chain_l_names 
= self._kinematics_parser.get_chain(base_link, chain_link_names[i]) + for k in chain_l_names: + link_chain_map[i, chain_link_names.index(k)] = 1.0 + + self._link_map = torch.as_tensor( + link_map, device=self.tensor_args.device, dtype=torch.int16 + ) + self._joint_map = torch.as_tensor( + joint_map, device=self.tensor_args.device, dtype=torch.int16 + ) + self._joint_map_type = torch.as_tensor( + joint_map_type, device=self.tensor_args.device, dtype=torch.int8 + ) + self._store_link_map = torch.as_tensor( + store_link_map, device=self.tensor_args.device, dtype=torch.int16 + ) + self._link_chain_map = link_chain_map.to(device=self.tensor_args.device) + self._fixed_transform = torch.cat((self._fixed_transform), dim=0).to( + device=self.tensor_args.device + ) + self._all_joint_names = all_joint_names + + @profiler.record_function("robot_generator/build_kinematics") + def _build_kinematics(self, base_link, ee_link, other_links, link_names): + chain_link_names = self._build_chain(base_link, ee_link, other_links, link_names) + self._build_kinematics_tensors(base_link, ee_link, link_names, chain_link_names) + if self.collision_spheres is not None and len(self.collision_link_names) > 0: + self._build_collision_model( + self.collision_spheres, self.collision_link_names, self.collision_sphere_buffer + ) + + @profiler.record_function("robot_generator/build_kinematics_with_lock_joints") + def _build_kinematics_with_lock_joints( + self, + base_link, + ee_link, + other_links, + link_names, + lock_joints: Dict[str, float], + ): + chain_link_names = self._build_chain(base_link, ee_link, other_links, link_names) + # find links attached to lock joints: + lock_joint_names = list(lock_joints.keys()) + + joint_data = self._get_joint_links(lock_joint_names) + + lock_links = list( + set( + [joint_data[j]["parent"] for j in joint_data.keys()] + + [joint_data[j]["child"] for j in joint_data.keys()] + ) + ) + new_link_names = link_names + lock_links + + # rebuild kinematic tree with link names added to link pose computation: + self._build_kinematics_tensors(base_link, ee_link, new_link_names, chain_link_names) + if self.collision_spheres is not None and len(self.collision_link_names) > 0: + self._build_collision_model( + self.collision_spheres, self.collision_link_names, self.collision_sphere_buffer + ) + # do forward kinematics and get transform for locked joints: + q = torch.zeros( + (1, self._n_dofs), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + # set lock joints in the joint angles: + l_idx = torch.as_tensor( + [self.joint_names.index(l) for l in lock_joints.keys()], + dtype=torch.long, + device=self.tensor_args.device, + ) + l_val = self.tensor_args.to_device([lock_joints[l] for l in lock_joints.keys()]) + + q[0, l_idx] = l_val + kinematics_config = KinematicsTensorConfig( + fixed_transforms=self._fixed_transform, + link_map=self._link_map, + joint_map=self._joint_map, + joint_map_type=self._joint_map_type, + store_link_map=self._store_link_map, + link_chain_map=self._link_chain_map, + link_names=self.link_names, + link_spheres=self._link_spheres_tensor, + link_sphere_idx_map=self._link_sphere_idx_map, + n_dof=self._n_dofs, + joint_limits=self._joint_limits, + non_fixed_joint_names=self.non_fixed_joint_names, + total_spheres=self.total_spheres, + ) + link_poses = self._get_link_poses(q, lock_links, kinematics_config) + + # remove lock links from store map: + store_link_map = [chain_link_names.index(l) for l in link_names] + self._store_link_map = torch.as_tensor( + store_link_map, 
device=self.tensor_args.device, dtype=torch.int16 + ) + self.link_names = link_names + + # compute a fixed transform for fixing joints: + with profiler.record_function("cuda_robot_generator/fix_locked_joints"): + # convert tensors to cpu: + self._joint_map_type = self._joint_map_type.to(device=self.cpu_tensor_args.device) + self._joint_map = self._joint_map.to(device=self.cpu_tensor_args.device) + + for j in lock_joint_names: + w_parent = lock_links.index(joint_data[j]["parent"]) + w_child = lock_links.index(joint_data[j]["child"]) + parent_t_child = ( + link_poses.get_index(0, w_parent) + .inverse() + .multiply(link_poses.get_index(0, w_child)) + ) + # Make this joint as fixed + i = self._all_joint_names.index(j) + 1 + self._joint_map_type[i] = -1 + self._joint_map[i:] -= 1 + self._joint_map[i] = -1 + self._n_dofs -= 1 + self._fixed_transform[i] = parent_t_child.get_matrix() + self._controlled_joints.remove(i) + self.joint_names.remove(j) + self._joint_map[self._joint_map < -1] = -1 + self._joint_map = self._joint_map.to(device=self.tensor_args.device) + self._joint_map_type = self._joint_map_type.to(device=self.tensor_args.device) + if len(self.lock_joints.keys()) > 0: + self.lock_jointstate = JointState( + position=l_val, joint_names=list(self.lock_joints.keys()) + ) + + @profiler.record_function("robot_generator/build_collision_model") + def _build_collision_model( + self, + collision_spheres: Dict, + collision_link_names: List[str], + collision_sphere_buffer: float = 0.0, + ): + """ + + Args: + collision_spheres (_type_): _description_ + collision_link_names (_type_): _description_ + collision_sphere_buffer (float, optional): _description_. Defaults to 0.0. + """ + + # We create all tensors on cpu and then finally move them to gpu + coll_link_spheres = [] + # we store as [n_link, 7] + link_sphere_idx_map = [] + cpu_tensor_args = self.tensor_args.cpu() + with profiler.record_function("robot_generator/build_collision_spheres"): + for j_idx, j in enumerate(collision_link_names): + # print(j_idx) + n_spheres = len(collision_spheres[j]) + link_spheres = torch.zeros( + (n_spheres, 4), dtype=cpu_tensor_args.dtype, device=cpu_tensor_args.device + ) + # find link index in global map: + l_idx = self._name_to_idx_map[j] + + for i in range(n_spheres): + link_spheres[i, :] = tensor_sphere( + collision_spheres[j][i]["center"], + collision_spheres[j][i]["radius"], + tensor_args=cpu_tensor_args, + tensor=link_spheres[i, :], + ) + link_sphere_idx_map.append(l_idx) + coll_link_spheres.append(link_spheres) + self.total_spheres += n_spheres + + self._link_spheres_tensor = torch.cat(coll_link_spheres, dim=0) + new_radius = self._link_spheres_tensor[..., 3] + collision_sphere_buffer + flag = torch.logical_and(new_radius > -1.0, new_radius <= 0.0) + new_radius[flag] = 0.001 + self._link_spheres_tensor[:, 3] = new_radius + self._link_sphere_idx_map = torch.as_tensor( + link_sphere_idx_map, dtype=torch.int16, device=cpu_tensor_args.device + ) + + # build self collision distance tensor: + self.self_collision_distance = ( + torch.zeros( + (self.total_spheres, self.total_spheres), + dtype=cpu_tensor_args.dtype, + device=cpu_tensor_args.device, + ) + - torch.inf + ) + self.self_collision_offset = torch.zeros( + (self.total_spheres), dtype=cpu_tensor_args.dtype, device=cpu_tensor_args.device + ) + + with profiler.record_function("robot_generator/self_collision_distance"): + # iterate through each link: + for j_idx, j in enumerate(collision_link_names): + ignore_links = [] + if j in 
self.self_collision_ignore.keys(): + ignore_links = self.self_collision_ignore[j] + link1_idx = self._name_to_idx_map[j] + link1_spheres_idx = torch.nonzero(self._link_sphere_idx_map == link1_idx) + + rad1 = self._link_spheres_tensor[link1_spheres_idx, 3] + if j not in self.self_collision_buffer.keys(): + self.self_collision_buffer[j] = 0.0 + c1 = self.self_collision_buffer[j] + self.self_collision_offset[link1_spheres_idx] = c1 + for _, i_name in enumerate(collision_link_names): + if i_name == j or i_name in ignore_links: + continue + if i_name not in collision_link_names: + log_error("Self Collision Link name not found in collision_link_names") + # find index of this link name: + if i_name not in self.self_collision_buffer.keys(): + self.self_collision_buffer[i_name] = 0.0 + c2 = self.self_collision_buffer[i_name] + link2_idx = self._name_to_idx_map[i_name] + # update collision distance between spheres from these two links: + link2_spheres_idx = torch.nonzero(self._link_sphere_idx_map == link2_idx) + rad2 = self._link_spheres_tensor[link2_spheres_idx, 3] + + for k1 in range(len(rad1)): + sp1 = link1_spheres_idx[k1] + for k2 in range(len(rad2)): + sp2 = link2_spheres_idx[k2] + self.self_collision_distance[sp1, sp2] = rad1[k1] + rad2[k2] + c1 + c2 + + with profiler.record_function("robot_generator/self_collision_min"): + d_mat = self.self_collision_distance + self.self_collision_distance = torch.minimum(d_mat, d_mat.transpose(0, 1)) + + if self.debug is not None and "self_collision_experimental" in self.debug: + use_experimental_kernel = self.debug["self_collision_experimental"] + self.self_collision_distance = self.self_collision_distance.to( + device=self.tensor_args.device + ) + ( + self._self_coll_thread_locations, + self._self_coll_idx, + valid_data, + checks_per_thread, + ) = self._create_self_collision_thread_data(self.self_collision_distance) + self._self_coll_matrix = (self.self_collision_distance != -(torch.inf)).to( + dtype=torch.uint8 + ) + + use_experimental_kernel = True + # convert all tensors to gpu: + self._link_sphere_idx_map = self._link_sphere_idx_map.to(device=self.tensor_args.device) + self._link_spheres_tensor = self._link_spheres_tensor.to(device=self.tensor_args.device) + self.self_collision_offset = self.self_collision_offset.to(device=self.tensor_args.device) + self._self_collision_data = SelfCollisionKinematicsConfig( + offset=self.self_collision_offset, + distance_threshold=self.self_collision_distance, + thread_location=self._self_coll_thread_locations, + thread_max=self._self_coll_idx, + collision_matrix=self._self_coll_matrix, + experimental_kernel=valid_data and use_experimental_kernel, + checks_per_thread=checks_per_thread, + ) + + @profiler.record_function("robot_generator/create_self_collision_thread_data") + def _create_self_collision_thread_data(self, collision_threshold): + coll_cpu = collision_threshold.cpu() + max_checks_per_thread = 512 + thread_loc = torch.zeros((2 * 32 * max_checks_per_thread), dtype=torch.int16) - 1 + n_spheres = coll_cpu.shape[0] + sl_idx = 0 + skip_count = 0 + all_val = 0 + valid_data = True + for i in range(n_spheres): + if not valid_data: + break + if torch.max(coll_cpu[i]) == -torch.inf: + print("skip", i) + for j in range(i + 1, n_spheres): + if sl_idx > thread_loc.shape[0] - 1: + valid_data = False + log_warn( + "Self Collision checks are greater than " + + str(32 * max_checks_per_thread) + + ", using slower kernel" + ) + break + if coll_cpu[i, j] != -torch.inf: + thread_loc[sl_idx] = i + sl_idx += 1 + thread_loc[sl_idx] 
= j + sl_idx += 1 + else: + skip_count += 1 + all_val += 1 + log_info("Self Collision threads, skipped %: " + str(100 * float(skip_count) / all_val)) + log_info("Self Collision count: " + str(sl_idx / (2))) + log_info("Self Collision per thread: " + str(sl_idx / (2 * 1024))) + + max_checks_per_thread = 512 + val = sl_idx / (2 * 1024) + if val < 1: + max_checks_per_thread = 1 + elif val < 2: + max_checks_per_thread = 2 + elif val < 4: + max_checks_per_thread = 4 + elif val < 8: + max_checks_per_thread = 8 + elif val < 32: + max_checks_per_thread = 32 + elif val < 64: + max_checks_per_thread = 64 + elif val < 128: + max_checks_per_thread = 128 + elif val < 512: + max_checks_per_thread = 512 + else: + raise ValueError("Self Collision not supported") + + if max_checks_per_thread < 2: + max_checks_per_thread = 2 + log_info("Self Collision using: " + str(max_checks_per_thread)) + + return ( + thread_loc.to(device=collision_threshold.device), + sl_idx, + valid_data, + max_checks_per_thread, + ) + + @profiler.record_function("robot_generator/add_body_to_tree") + def _add_body_to_tree(self, link_name, base=False): + body_idx = len(self._bodies) + + rigid_body_params = self._kinematics_parser.get_link_parameters(link_name, base=base) + self._bodies.append(rigid_body_params) + if rigid_body_params.joint_type != JointType.FIXED: + self._controlled_joints.append(body_idx) + self.joint_names.append(rigid_body_params.joint_name) + self._n_dofs = self._n_dofs + 1 + self._fixed_transform.append( + torch.as_tensor( + rigid_body_params.fixed_transform, + device=self.cpu_tensor_args.device, + dtype=self.cpu_tensor_args.dtype, + ).unsqueeze(0) + ) + self._name_to_idx_map[rigid_body_params.link_name] = body_idx + + def _get_joint_links(self, joint_names: List[str]): + j_data = {} + for j in joint_names: + for b in self._bodies: + if b.joint_name == j: + j_data[j] = {"parent": b.parent_link_name, "child": b.link_name} + return j_data + + @profiler.record_function("robot_generator/get_link_poses") + def _get_link_poses( + self, q: torch.Tensor, link_names: List[str], kinematics_config: KinematicsTensorConfig + ) -> Pose: + if q.is_contiguous(): + q_in = q.view(-1) + else: + q_in = q.reshape(-1) # .reshape(-1) + # q_in = q.reshape(-1) + link_pos_seq = torch.zeros( + (1, len(self.link_names), 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + link_quat_seq = torch.zeros( + (1, len(self.link_names), 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + batch_robot_spheres = torch.zeros( + (1, self.total_spheres, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + grad_out_q = torch.zeros( + (1 * q.shape[-1]), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + global_cumul_mat = torch.zeros( + (1, self._link_map.shape[0], 4, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + link_pos_seq, link_quat_seq, _ = get_cuda_kinematics( + # self._link_mat_seq, # data will be stored here + link_pos_seq, + link_quat_seq, + batch_robot_spheres, + global_cumul_mat, + q_in, + kinematics_config.fixed_transforms, + kinematics_config.link_spheres, + kinematics_config.link_map, # tells which link is attached to which link i + kinematics_config.joint_map, # tells which joint is attached to a link i + kinematics_config.joint_map_type, # joint type + kinematics_config.store_link_map, + kinematics_config.link_sphere_idx_map, # sphere idx map + kinematics_config.link_chain_map, + grad_out_q, + self.use_global_cumul, + ) + 
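+        # The fused kernel call above computed poses for all links in self.link_names;
+        # below we gather only the subset requested by the caller, in the order given
+        # by link_names.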
position = torch.zeros( + (q.shape[0], len(link_names), 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + quaternion = torch.zeros( + (q.shape[0], len(link_names), 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + for li, l in enumerate(link_names): + i = self.link_names.index(l) + position[:, li, :] = link_pos_seq[:, i, :] + quaternion[:, li, :] = link_quat_seq[:, i, :] + return Pose(position=position, quaternion=quaternion) + + @property + def get_joint_limits(self): + return self._joint_limits + + @profiler.record_function("robot_generator/get_joint_limits") + def _get_joint_position_velocity_limits(self): + joint_limits = {"position": [[], []], "velocity": [[], []]} + + for idx in self._controlled_joints: + joint_limits["position"][0].append(self._bodies[idx].joint_limits[0]) + joint_limits["position"][1].append(self._bodies[idx].joint_limits[1]) + joint_limits["velocity"][0].append(self._bodies[idx].joint_velocity_limits[0]) + joint_limits["velocity"][1].append(self._bodies[idx].joint_velocity_limits[1]) + for k in joint_limits: + joint_limits[k] = torch.as_tensor( + joint_limits[k], device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + return joint_limits + + @profiler.record_function("robot_generator/update_joint_limits") + def _update_joint_limits(self): + joint_limits = self._get_joint_position_velocity_limits() + joint_limits["jerk"] = torch.cat( + [-1.0 * self.cspace.max_jerk.unsqueeze(0), self.cspace.max_jerk.unsqueeze(0)] + ) + joint_limits["acceleration"] = torch.cat( + [ + -1.0 * self.cspace.max_acceleration.unsqueeze(0), + self.cspace.max_acceleration.unsqueeze(0), + ] + ) + self._joint_limits = JointLimits(joint_names=self.joint_names, **joint_limits) diff --git a/src/curobo/cuda_robot_model/cuda_robot_model.py b/src/curobo/cuda_robot_model/cuda_robot_model.py new file mode 100644 index 00000000..cb82f7aa --- /dev/null +++ b/src/curobo/cuda_robot_model/cuda_robot_model.py @@ -0,0 +1,386 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+from __future__ import annotations

+# Standard Library
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.cuda_robot_model.cuda_robot_generator import (
+    CudaRobotGenerator,
+    CudaRobotGeneratorConfig,
+)
+from curobo.cuda_robot_model.kinematics_parser import KinematicsParser
+from curobo.cuda_robot_model.types import (
+    CudaRobotModelState,
+    KinematicsTensorConfig,
+    SelfCollisionKinematicsConfig,
+)
+from curobo.curobolib.kinematics import get_cuda_kinematics
+from curobo.geom.types import Mesh, Sphere
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.util.logger import log_error
+from curobo.util_file import get_robot_path, join_path, load_yaml
+
+
+@dataclass
+class CudaRobotModelConfig:
+    tensor_args: TensorDeviceType
+    link_names: List[str]
+    kinematics_config: KinematicsTensorConfig
+    self_collision_config: Optional[SelfCollisionKinematicsConfig] = None
+    kinematics_parser: Optional[KinematicsParser] = None
+    compute_jacobian: bool = False
+    use_global_cumul: bool = False
+    generator_config: Optional[CudaRobotGeneratorConfig] = None
+
+    def get_joint_limits(self):
+        return self.kinematics_config.joint_limits
+
+    @staticmethod
+    def from_basic_urdf(
+        urdf_path: str,
+        base_link: str,
+        ee_link: str,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> CudaRobotModelConfig:
+        """Load a cuda robot model from only a URDF file. This does not support collision queries.
+
+        Args:
+            urdf_path : Path of urdf file.
+            base_link : Name of base link.
+            ee_link : Name of end-effector link.
+            tensor_args : Device to load robot model. Defaults to TensorDeviceType().
+
+        Returns:
+            CudaRobotModelConfig: cuda robot model configuration.
+        """
+        config = CudaRobotGeneratorConfig(base_link, ee_link, tensor_args, urdf_path=urdf_path)
+        return CudaRobotModelConfig.from_config(config)
+
+    @staticmethod
+    def from_basic_usd(
+        usd_path: str,
+        usd_robot_root: str,
+        base_link: str,
+        ee_link: str,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> CudaRobotModelConfig:
+        """Load a cuda robot model from only a USD file. This does not support collision queries.
+
+        Args:
+            usd_path : Path of usd file.
+            usd_robot_root : Path of root prim of robot in the usd stage.
+            base_link : Name of base link.
+            ee_link : Name of end-effector link.
+            tensor_args : Device to load robot model. Defaults to TensorDeviceType().
+
+        Returns:
+            CudaRobotModelConfig: cuda robot model configuration.
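+
+        Example:
+            A minimal sketch; the stage path, robot root prim, and link names below are
+            assumed placeholders::
+
+                config = CudaRobotModelConfig.from_basic_usd(
+                    "robot/franka.usd", "/panda", "base_link", "panda_hand"
+                )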
+        """
+        config = CudaRobotGeneratorConfig(
+            base_link=base_link,
+            ee_link=ee_link,
+            tensor_args=tensor_args,
+            usd_path=usd_path,
+            usd_robot_root=usd_robot_root,
+            use_usd_kinematics=True,
+        )
+        return CudaRobotModelConfig.from_config(config)
+
+    @staticmethod
+    def from_robot_yaml_file(
+        file_path: str,
+        ee_link: Optional[str] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        config_file = load_yaml(join_path(get_robot_path(), file_path))["robot_cfg"]["kinematics"]
+        if ee_link is not None:
+            config_file["ee_link"] = ee_link
+        return CudaRobotModelConfig.from_config(
+            CudaRobotGeneratorConfig(**config_file, tensor_args=tensor_args)
+        )
+
+    @staticmethod
+    def from_data_dict(
+        data_dict: Dict[str, Any], tensor_args: TensorDeviceType = TensorDeviceType()
+    ):
+        return CudaRobotModelConfig.from_config(
+            CudaRobotGeneratorConfig(**data_dict, tensor_args=tensor_args)
+        )
+
+    @staticmethod
+    def from_config(config: CudaRobotGeneratorConfig):
+        # create a config generator and load all values
+        generator = CudaRobotGenerator(config)
+        return CudaRobotModelConfig(
+            tensor_args=generator.tensor_args,
+            link_names=generator.link_names,
+            kinematics_config=generator.kinematics_config,
+            self_collision_config=generator.self_collision_config,
+            kinematics_parser=generator.kinematics_parser,
+            use_global_cumul=generator.use_global_cumul,
+            compute_jacobian=generator.compute_jacobian,
+            generator_config=config,
+        )
+
+    @property
+    def cspace(self):
+        return self.kinematics_config.cspace
+
+
+class CudaRobotModel(CudaRobotModelConfig):
+    """
+    CUDA Accelerated Robot Model
+
+    NOTE: Currently, degrees of freedom are counted only for links that are part of the
+    requested kinematic chains. E.g., for robots with many serial chains, add all links of
+    the robot to ``link_names`` to get the correct dof. This is not an issue if you are
+    loading collision spheres, as those will cover the full geometry of the robot.
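+
+    A single config can be shared across several model instances to reduce memory overhead
+    (see the module docstring note). A minimal sketch, with ``franka.yml`` as an assumed
+    example file::
+
+        config = CudaRobotModelConfig.from_robot_yaml_file("franka.yml")
+        model_a = CudaRobotModel(config)
+        model_b = CudaRobotModel(config)  # shares the kinematics tensors of model_a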
+ """ + + def __init__(self, config: CudaRobotModelConfig): + super().__init__(**vars(config)) + self._batch_size = 0 + self.update_batch_size(1, reset_buffers=True) + + def update_batch_size(self, batch_size, force_update=False, reset_buffers=False): + if batch_size == 0: + log_error("batch size is zero") + if force_update and self._batch_size == batch_size and self.compute_jacobian: + self.lin_jac = self.lin_jac.detach() # .requires_grad_(True) + self.ang_jac = self.ang_jac.detach() # .requires_grad_(True) + elif self._batch_size != batch_size or reset_buffers: + self._batch_size = batch_size + self._link_pos_seq = torch.zeros( + (self._batch_size, len(self.link_names), 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._link_quat_seq = torch.zeros( + (self._batch_size, len(self.link_names), 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + self._batch_robot_spheres = torch.zeros( + (self._batch_size, self.kinematics_config.total_spheres, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._grad_out_q = torch.zeros( + (self._batch_size, self.get_dof()), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._global_cumul_mat = torch.zeros( + (self._batch_size, self.kinematics_config.link_map.shape[0], 4, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + if self.compute_jacobian: + self.lin_jac = torch.zeros( + [batch_size, 3, self.kinematics_config.n_dofs], + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.ang_jac = torch.zeros( + [batch_size, 3, self.kinematics_config.n_dofs], + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + @profiler.record_function("cuda_robot_model/forward_kinematics") + def forward(self, q, link_name=None, calculate_jacobian=False): + # pos, rot = self.compute_forward_kinematics(q, qd, link_name) + if len(q.shape) > 2: + raise ValueError("q shape should be [batch_size, dof]") + batch_size = q.shape[0] + self.update_batch_size(batch_size, force_update=q.requires_grad) + + # do fused forward: + link_pos_seq, link_quat_seq, link_spheres_tensor = self._cuda_forward(q) + + if len(self.link_names) == 1: + ee_pos = link_pos_seq.squeeze(1) + ee_quat = link_quat_seq.squeeze(1) + else: + link_idx = self.kinematics_config.ee_idx + if link_name is not None: + link_idx = self.link_names.index(link_name) + ee_pos = link_pos_seq.contiguous()[..., link_idx, :] + ee_quat = link_quat_seq.contiguous()[..., link_idx, :] + lin_jac = ang_jac = None + + # compute jacobians? 
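+        # NOTE: analytic jacobians are not computed here; requesting them raises
+        # NotImplementedError below. Gradients with respect to q are instead provided by
+        # the backward pass of the fused CUDA kinematics kernel (see use_global_cumul).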
+        if calculate_jacobian:
+            raise NotImplementedError("jacobian computation is not implemented")
+        return (
+            ee_pos,
+            ee_quat,
+            lin_jac,
+            ang_jac,
+            link_pos_seq,
+            link_quat_seq,
+            link_spheres_tensor,
+        )
+
+    def get_state(self, q, link_name=None, calculate_jacobian=False) -> CudaRobotModelState:
+        out = self.forward(q, link_name, calculate_jacobian)
+        state = CudaRobotModelState(
+            out[0],
+            out[1],
+            None,
+            None,
+            out[4],
+            out[5],
+            out[6],
+            self.link_names,
+        )
+        return state
+
+    def get_robot_as_mesh(self, q: torch.Tensor):
+        # get all link meshes:
+        m_list = [self.get_link_mesh(l) for l in self.mesh_link_names]
+        pose = self.get_link_poses(q, self.mesh_link_names)
+        for li, l in enumerate(self.mesh_link_names):
+            m_list[li].pose = pose.get_index(0, li).tolist()
+
+        return m_list
+
+    def get_robot_as_spheres(self, q: torch.Tensor, filter_valid: bool = True):
+        state = self.get_state(q)
+
+        # state has sphere position and radius
+        sph_all = state.get_link_spheres().cpu().numpy()
+
+        sph_traj = []
+        for j in range(sph_all.shape[0]):
+            sph = sph_all[j, :, :]
+            if filter_valid:
+                sph_list = [
+                    Sphere(
+                        name="robot_curobo_sphere_" + str(i),
+                        pose=[sph[i, 0], sph[i, 1], sph[i, 2], 1, 0, 0, 0],
+                        radius=sph[i, 3],
+                    )
+                    for i in range(sph.shape[0])
+                    if (sph[i, 3] > 0.0)
+                ]
+            else:
+                sph_list = [
+                    Sphere(
+                        name="robot_curobo_sphere_" + str(i),
+                        pose=[sph[i, 0], sph[i, 1], sph[i, 2], 1, 0, 0, 0],
+                        radius=sph[i, 3],
+                    )
+                    for i in range(sph.shape[0])
+                ]
+            sph_traj.append(sph_list)
+        return sph_traj
+
+    def get_link_poses(self, q: torch.Tensor, link_names: List[str]) -> Pose:
+        state = self.get_state(q)
+        position = torch.zeros(
+            (q.shape[0], len(link_names), 3),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+        quaternion = torch.zeros(
+            (q.shape[0], len(link_names), 4),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+
+        for li, l in enumerate(link_names):
+            i = self.link_names.index(l)
+            position[:, li, :] = state.links_position[:, i, :]
+            quaternion[:, li, :] = state.links_quaternion[:, i, :]
+        return Pose(position=position, quaternion=quaternion)
+
+    def _cuda_forward(self, q):
+        link_pos, link_quat, robot_spheres = get_cuda_kinematics(
+            self._link_pos_seq,
+            self._link_quat_seq,
+            self._batch_robot_spheres,
+            self._global_cumul_mat,
+            q,
+            self.kinematics_config.fixed_transforms,
+            self.kinematics_config.link_spheres,
+            self.kinematics_config.link_map,  # tells which link is attached to which link i
+            self.kinematics_config.joint_map,  # tells which joint is attached to a link i
+            self.kinematics_config.joint_map_type,  # joint type
+            self.kinematics_config.store_link_map,
+            self.kinematics_config.link_sphere_idx_map,  # sphere idx map
+            self.kinematics_config.link_chain_map,
+            self._grad_out_q,
+            self.use_global_cumul,
+        )
+        return link_pos, link_quat, robot_spheres
+
+    @property
+    def all_articulated_joint_names(self):
+        return self.kinematics_config.non_fixed_joint_names
+
+    def get_self_collision_config(self) -> SelfCollisionKinematicsConfig:
+        return self.self_collision_config
+
+    def get_link_mesh(self, link_name: str) -> Mesh:
+        mesh = self.kinematics_parser.get_link_mesh(link_name)
+        mesh.pose = [0, 0, 0, 1, 0, 0, 0]
+        return mesh
+
+    def get_link_transform(self, link_name: str) -> Pose:
+        mat = self.kinematics_config.fixed_transforms[
+            self.kinematics_config.link_name_to_idx_map[link_name]
+        ]
+        pose = Pose(position=mat[:3, 3], rotation=mat[:3, :3])
+        return pose
+
+    def
get_all_link_transforms(self) -> Pose: + pose = Pose( + self.kinematics_config.fixed_transforms[:, :3, 3], + rotation=self.kinematics_config.fixed_transforms[:, :3, :3], + ) + return pose + + def get_dof(self) -> int: + return self.kinematics_config.n_dof + + @property + def joint_names(self) -> List[str]: + return self.kinematics_config.joint_names + + @property + def total_spheres(self) -> int: + return self.kinematics_config.total_spheres + + @property + def lock_jointstate(self): + return self.kinematics_config.lock_jointstate + + @property + def ee_link(self): + return self.kinematics_config.ee_link + + @property + def base_link(self): + return self.kinematics_config.base_link + + def update_kinematics_config(self, new_kin_config: KinematicsTensorConfig): + self.kinematics_config.copy(new_kin_config) diff --git a/src/curobo/cuda_robot_model/kinematics_parser.py b/src/curobo/cuda_robot_model/kinematics_parser.py new file mode 100644 index 00000000..a2474984 --- /dev/null +++ b/src/curobo/cuda_robot_model/kinematics_parser.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +# Third Party +import numpy as np + +# CuRobo +from curobo.cuda_robot_model.types import JointType +from curobo.geom.types import Mesh +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util.logger import log_error + + +@dataclass +class LinkParams: + link_name: str + joint_name: str + joint_type: JointType + fixed_transform: np.ndarray + parent_link_name: Optional[str] = None + joint_limits: Optional[List[float]] = None + joint_axis: Optional[np.ndarray] = None + joint_id: Optional[int] = None + joint_velocity_limits: List[float] = field(default_factory=lambda: [-2.0, 2.0]) + + @staticmethod + def from_dict(dict_data): + dict_data["joint_type"] = JointType[dict_data["joint_type"]] + dict_data["fixed_transform"] = ( + Pose.from_list(dict_data["fixed_transform"], tensor_args=TensorDeviceType()) + .get_numpy_matrix() + .reshape(4, 4) + ) + + return LinkParams(**dict_data) + + +class KinematicsParser: + def __init__(self, extra_links: Optional[Dict[str, LinkParams]] = None) -> None: + self._parent_map = {} + self.extra_links = extra_links + self.build_link_parent() + # add extra links to parent: + if self.extra_links is not None and len(list(self.extra_links.keys())) > 0: + for i in self.extra_links: + self._parent_map[i] = self.extra_links[i].parent_link_name + + def build_link_parent(self): + """Build a map of parent links to each link in the kinematic tree. + + NOTE: Use this function to fill self._parent_map. + """ + pass + + def get_chain(self, base_link: str, ee_link: str) -> List[str]: + """Get list of links attaching ee_link to base_link. + + Args: + base_link (str): Name of base link. + ee_link (str): Name of end-effector link. + + Returns: + List[str]: List of link names starting from base_link to ee_link. 
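+
+        Example (hypothetical link names, assuming link_1 is the parent of tool0
+        and a child of base_link):
+            get_chain("base_link", "tool0") -> ["base_link", "link_1", "tool0"]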
+ """ + chain_links = [ee_link] + link = ee_link + while link != base_link: + link = self._parent_map[link] + # add link to chain: + chain_links.append(link) + chain_links.reverse() + return chain_links + + def _get_from_extra_links(self, link_name: str) -> LinkParams: + if self.extra_links is None: + return None + if link_name in self.extra_links.keys(): + return self.extra_links[link_name] + return None + + def get_link_parameters(self, link_name: str, base: bool = False) -> LinkParams: + pass + + def add_absolute_path_to_link_meshes(self, mesh_dir: str = ""): + pass + + def get_link_mesh(self, link_name: str) -> Mesh: + pass diff --git a/src/curobo/cuda_robot_model/types.py b/src/curobo/cuda_robot_model/types.py new file mode 100644 index 00000000..b54fa1a1 --- /dev/null +++ b/src/curobo/cuda_robot_model/types.py @@ -0,0 +1,388 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +from __future__ import annotations + +# Standard Library +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +# Third Party +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.state import JointState +from curobo.types.tensor import T_DOF +from curobo.util.tensor_util import copy_if_not_none + + +class JointType(Enum): + FIXED = -1 + X_PRISM = 0 + Y_PRISM = 1 + Z_PRISM = 2 + X_ROT = 3 + Y_ROT = 4 + Z_ROT = 5 + + +@dataclass +class JointLimits: + joint_names: List[str] + position: torch.Tensor + velocity: torch.Tensor + acceleration: torch.Tensor + jerk: torch.Tensor + effort: Optional[torch.Tensor] = None + tensor_args: TensorDeviceType = TensorDeviceType() + + @staticmethod + def from_data_dict(data: Dict, tensor_args: TensorDeviceType = TensorDeviceType()): + p = tensor_args.to_device(data["position"]) + v = tensor_args.to_device(data["velocity"]) + a = tensor_args.to_device(data["acceleration"]) + j = tensor_args.to_device(data["jerk"]) + e = None + if "effort" in data and data["effort"] is not None: + e = tensor_args.to_device(data["effort"]) + + return JointLimits(data["joint_names"], p, v, a, j, e) + + def clone(self) -> JointLimits: + return JointLimits( + self.joint_names.copy(), + self.position.clone(), + self.velocity.clone(), + self.acceleration.clone(), + self.jerk.clone(), + self.effort.clone() if self.effort is not None else None, + self.tensor_args, + ) + + +@dataclass +class CSpaceConfig: + joint_names: List[str] + retract_config: Optional[T_DOF] = None + cspace_distance_weight: Optional[T_DOF] = None + null_space_weight: Optional[T_DOF] = None # List[float] + tensor_args: TensorDeviceType = TensorDeviceType() + max_acceleration: Union[float, List[float]] = 10.0 + max_jerk: Union[float, List[float]] = 500.0 + velocity_scale: Union[float, List[float]] = 1.0 + acceleration_scale: Union[float, List[float]] = 1.0 + jerk_scale: Union[float, List[float]] = 1.0 + + def __post_init__(self): + if self.retract_config is not None: + self.retract_config = self.tensor_args.to_device(self.retract_config) + if self.cspace_distance_weight is not None: 
+ self.cspace_distance_weight = self.tensor_args.to_device(self.cspace_distance_weight) + if self.null_space_weight is not None: + self.null_space_weight = self.tensor_args.to_device(self.null_space_weight) + if isinstance(self.max_acceleration, float): + self.max_acceleration = self.tensor_args.to_device( + [self.max_acceleration for _ in self.joint_names] + ) + + if isinstance(self.velocity_scale, float) or len(self.velocity_scale) == 1: + self.velocity_scale = self.tensor_args.to_device( + [self.velocity_scale for _ in self.joint_names] + ).view(-1) + + if isinstance(self.acceleration_scale, float) or len(self.acceleration_scale) == 1: + self.acceleration_scale = self.tensor_args.to_device( + [self.acceleration_scale for _ in self.joint_names] + ).view(-1) + + if isinstance(self.jerk_scale, float) or len(self.jerk_scale) == 1: + self.jerk_scale = self.tensor_args.to_device( + [self.jerk_scale for _ in self.joint_names] + ).view(-1) + + if isinstance(self.max_acceleration, List): + self.max_acceleration = self.tensor_args.to_device(self.max_acceleration) + if isinstance(self.max_jerk, float): + self.max_jerk = [self.max_jerk for _ in self.joint_names] + if isinstance(self.max_jerk, List): + self.max_jerk = self.tensor_args.to_device(self.max_jerk) + if isinstance(self.velocity_scale, List): + self.velocity_scale = self.tensor_args.to_device(self.velocity_scale) + + if isinstance(self.acceleration_scale, List): + self.acceleration_scale = self.tensor_args.to_device(self.acceleration_scale) + if isinstance(self.jerk_scale, List): + self.jerk_scale = self.tensor_args.to_device(self.jerk_scale) + + def inplace_reindex(self, joint_names: List[str]): + new_index = [self.joint_names.index(j) for j in joint_names] + if self.retract_config is not None: + self.retract_config = self.retract_config[new_index].clone() + if self.cspace_distance_weight is not None: + self.cspace_distance_weight = self.cspace_distance_weight[new_index].clone() + if self.null_space_weight is not None: + self.null_space_weight = self.null_space_weight[new_index].clone() + self.max_acceleration = self.max_acceleration[new_index].clone() + self.max_jerk = self.max_jerk[new_index].clone() + self.velocity_scale = self.velocity_scale[new_index].clone() + self.acceleration_scale = self.acceleration_scale[new_index].clone() + self.jerk_scale = self.jerk_scale[new_index].clone() + joint_names = [self.joint_names[n] for n in new_index] + self.joint_names = joint_names + + def clone(self) -> CSpaceConfig: + return CSpaceConfig( + joint_names=self.joint_names.copy(), + retract_config=copy_if_not_none(self.retract_config), + null_space_weight=copy_if_not_none(self.null_space_weight), + cspace_distance_weight=copy_if_not_none(self.cspace_distance_weight), + tensor_args=self.tensor_args, + max_jerk=self.max_jerk.clone(), + max_acceleration=self.max_acceleration.clone(), + velocity_scale=self.velocity_scale.clone(), + acceleration_scale=self.acceleration_scale.clone(), + jerk_scale=self.jerk_scale.clone(), + ) + + def scale_joint_limits(self, joint_limits: JointLimits): + if self.velocity_scale is not None: + joint_limits.velocity = joint_limits.velocity * self.velocity_scale + if self.acceleration_scale is not None: + joint_limits.acceleration = joint_limits.acceleration * self.acceleration_scale + if self.jerk_scale is not None: + joint_limits.jerk = joint_limits.jerk * self.jerk_scale + + return joint_limits + + @staticmethod + def load_from_joint_limits( + joint_position_upper: torch.Tensor, + joint_position_lower: torch.Tensor, + 
joint_names: List[str],
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        retract_config = ((joint_position_upper + joint_position_lower) / 2).flatten()
+        n_dof = retract_config.shape[-1]
+        null_space_weight = torch.ones(n_dof, **vars(tensor_args))
+        cspace_distance_weight = torch.ones(n_dof, **vars(tensor_args))
+        return CSpaceConfig(
+            joint_names,
+            retract_config,
+            cspace_distance_weight,
+            null_space_weight,
+            tensor_args=tensor_args,
+        )
+
+
+@dataclass
+class KinematicsTensorConfig:
+    fixed_transforms: torch.Tensor
+    link_map: torch.Tensor
+    joint_map: torch.Tensor
+    joint_map_type: torch.Tensor
+    store_link_map: torch.Tensor
+    link_chain_map: torch.Tensor
+    link_names: List[str]
+    joint_limits: JointLimits
+    non_fixed_joint_names: List[str]
+    n_dof: int
+    mesh_link_names: Optional[List[str]] = None
+    joint_names: Optional[List[str]] = None
+    lock_jointstate: Optional[JointState] = None
+    link_spheres: Optional[torch.Tensor] = None
+    link_sphere_idx_map: Optional[torch.Tensor] = None
+    link_name_to_idx_map: Optional[Dict[str, int]] = None
+    total_spheres: int = 0
+    debug: Optional[Any] = None
+    ee_idx: int = 0
+    cspace: Optional[CSpaceConfig] = None
+    base_link: str = "base_link"
+    ee_link: str = "ee_link"
+
+    def __post_init__(self):
+        if self.cspace is None and self.joint_limits is not None:
+            self.load_cspace_cfg_from_kinematics()
+        if self.joint_limits is not None and self.cspace is not None:
+            self.joint_limits = self.cspace.scale_joint_limits(self.joint_limits)
+
+    def load_cspace_cfg_from_kinematics(self):
+        tensor_args = self.joint_limits.tensor_args
+        retract_config = (
+            (self.joint_limits.position[1] + self.joint_limits.position[0]) / 2
+        ).flatten()
+        null_space_weight = torch.ones(self.n_dof, **vars(tensor_args))
+        cspace_distance_weight = torch.ones(self.n_dof, **vars(tensor_args))
+        joint_names = self.joint_names
+        self.cspace = CSpaceConfig(
+            joint_names,
+            retract_config,
+            cspace_distance_weight,
+            null_space_weight,
+            tensor_args=tensor_args,
+            max_acceleration=self.joint_limits.acceleration[1],
+            max_jerk=self.joint_limits.jerk[1],
+        )
+
+    def get_sphere_index_from_link_name(self, link_name: str) -> torch.Tensor:
+        link_idx = self.link_name_to_idx_map[link_name]
+        # get sphere index for this link:
+        link_spheres_idx = torch.nonzero(self.link_sphere_idx_map == link_idx).view(-1)
+        return link_spheres_idx
+
+    def update_link_spheres(
+        self, link_name: str, sphere_position_radius: torch.Tensor, start_sph_idx: int = 0
+    ):
+        """Update sphere parameters of a link.
+
+        NOTE: This currently does not update self collision distances.
+
+        Args:
+            link_name: name of the link whose spheres are updated.
+            sphere_position_radius: tensor of shape [n, 4] with x, y, z, radius per sphere.
+            start_sph_idx: index of the first sphere of the link to update. Defaults to 0.
+        """
+        # get sphere indices from link name:
+        link_sphere_index = self.get_sphere_index_from_link_name(link_name)[
+            start_sph_idx : start_sph_idx + sphere_position_radius.shape[0]
+        ]
+        # update sphere data:
+        self.link_spheres[link_sphere_index, :] = sphere_position_radius
+
+    def get_link_spheres(self, link_name: str):
+        link_sphere_index = self.get_sphere_index_from_link_name(link_name)
+        return self.link_spheres[link_sphere_index, :]
+
+    def attach_object(
+        self,
+        sphere_radius: Optional[float] = None,
+        sphere_tensor: Optional[torch.Tensor] = None,
+        link_name: str = "attached_object",
+    ) -> bool:
+        """Attach an object to the robot by updating the spheres of a placeholder link.
+
+        Provide sphere_tensor to overwrite positions and radii of the placeholder spheres,
+        or sphere_radius to only change their radii.
+
+        Args:
+            sphere_radius: radius to set for all spheres of the link. Defaults to None.
+            sphere_tensor: tensor of shape [n, 4] with x, y, z, radius per sphere that
+                replaces the link's spheres. Defaults to None.
+            link_name: name of the placeholder link. Defaults to "attached_object".
+
+        Raises:
+            ValueError: if link_name is not found in the robot's collision spheres.
+            ValueError: if sphere_tensor shape does not match the existing spheres.
+
+        Returns:
+            True on success.
+        """
+        if link_name not in self.link_name_to_idx_map.keys():
+            raise ValueError(link_name + " not found in spheres")
+        curr_spheres = self.get_link_spheres(link_name)
+
+        if sphere_radius is not None:
+            curr_spheres[:, 3] = sphere_radius
+        if sphere_tensor is not None:
+            if sphere_tensor.shape != curr_spheres.shape and sphere_tensor.shape[0] != 1:
+                raise ValueError("sphere_tensor shape does not match current spheres")
+            curr_spheres[:, :] = sphere_tensor
+        self.update_link_spheres(link_name, curr_spheres)
+        return True
+
+    def detach_object(self, link_name: str = "attached_object") -> bool:
+        """Detach object from robot end-effector by deactivating the link's spheres.
+
+        The spheres are moved to the origin and their radius is set to a negative
+        value, which marks them as inactive.
+
+        Args:
+            link_name: name of the placeholder link. Defaults to "attached_object".
+
+        Raises:
+            ValueError: if link_name is not found in the robot's collision spheres.
+
+        Returns:
+            True on success.
+        """
+        if link_name not in self.link_name_to_idx_map.keys():
+            raise ValueError(link_name + " not found in spheres")
+        curr_spheres = self.get_link_spheres(link_name)
+        curr_spheres[:, 3] = -100.0
+        curr_spheres[:, :3] = 0.0
+        self.update_link_spheres(link_name, curr_spheres)
+
+        return True
+
+    def get_number_of_spheres(self, link_name: str) -> int:
+        """Get number of spheres for a link.
+
+        Args:
+            link_name: name of link
+        """
+        return self.get_link_spheres(link_name).shape[0]
+
+
+@dataclass
+class SelfCollisionKinematicsConfig:
+    """Dataclass that stores self collision attributes to pass to cuda kernel."""
+
+    offset: Optional[torch.Tensor] = None
+    distance_threshold: Optional[torch.Tensor] = None
+    thread_location: Optional[torch.Tensor] = None
+    thread_max: Optional[int] = None
+    collision_matrix: Optional[torch.Tensor] = None
+    experimental_kernel: bool = True
+    checks_per_thread: int = 32
+
+
+@dataclass(frozen=True)
+class CudaRobotModelState:
+    """Dataclass that stores kinematics information."""
+
+    #: End-effector position stored as x,y,z in meters [b, 3]. End-effector is defined by
+    #: :py:attr:`curobo.cuda_robot_model.cuda_robot_generator.CudaRobotGeneratorConfig.ee_link`.
+    ee_position: torch.Tensor
+
+    #: End-effector orientation stored as quaternion qw, qx, qy, qz [b, 4]. End-effector is
+    #: defined by :py:attr:`CudaRobotModelConfig.ee_link`.
+    ee_quaternion: torch.Tensor
+
+    #: Linear Jacobian. Currently not supported.
+    lin_jacobian: Optional[torch.Tensor] = None
+
+    #: Angular Jacobian. Currently not supported.
+    ang_jacobian: Optional[torch.Tensor] = None
+
+    #: Position of links specified by link_names (:py:attr:`CudaRobotModelConfig.link_names`).
+    links_position: Optional[torch.Tensor] = None
+
+    #: Quaternions of links specified by link names (:py:attr:`CudaRobotModelConfig.link_names`).
+    links_quaternion: Optional[torch.Tensor] = None
+
+    #: Position of spheres specified by collision spheres (:py:attr:`CudaRobotModelConfig.collision_spheres`)
+    #: in x, y, z, r format [b, n, 4].
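+    #: Spheres with a non-positive radius are inactive placeholders (e.g., after
+    #: :py:meth:`KinematicsTensorConfig.detach_object` sets their radius to a negative value).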
+    link_spheres_tensor: Optional[torch.Tensor] = None
+
+    link_names: Optional[List[str]] = None
+
+    @property
+    def ee_pose(self):
+        return Pose(self.ee_position, self.ee_quaternion)
+
+    def get_link_spheres(self):
+        return self.link_spheres_tensor
+
+    @property
+    def link_pose(self):
+        link_poses = None
+        if self.link_names is not None:
+            link_poses = {}
+            link_pos = self.links_position.contiguous()
+            link_quat = self.links_quaternion.contiguous()
+            for i, v in enumerate(self.link_names):
+                link_poses[v] = Pose(link_pos[..., i, :], link_quat[..., i, :])
+        return link_poses
diff --git a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
new file mode 100644
index 00000000..bbe71558
--- /dev/null
+++ b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
@@ -0,0 +1,167 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+from typing import Dict, Optional
+
+# Third Party
+import numpy as np
+import yourdfpy
+from lxml import etree
+
+# CuRobo
+from curobo.cuda_robot_model.kinematics_parser import KinematicsParser, LinkParams
+from curobo.cuda_robot_model.types import JointType
+from curobo.geom.types import Mesh as CuroboMesh
+from curobo.types.base import TensorDeviceType
+from curobo.util.logger import log_error, log_warn
+from curobo.util_file import join_path
+
+
+class UrdfKinematicsParser(KinematicsParser):
+    def __init__(
+        self,
+        urdf_path,
+        load_meshes: bool = False,
+        mesh_root: str = "",
+        extra_links: Optional[Dict[str, LinkParams]] = None,
+    ) -> None:
+        # load robot from urdf:
+        self._robot = yourdfpy.URDF.load(
+            urdf_path,
+            load_meshes=load_meshes,
+            build_scene_graph=False,
+            mesh_dir=mesh_root,
+            filename_handler=yourdfpy.filename_handler_null,
+        )
+        super().__init__(extra_links)
+
+    def build_link_parent(self):
+        self._parent_map = {}
+        for j in self._robot.joint_map:
+            self._parent_map[self._robot.joint_map[j].child] = self._robot.joint_map[j].parent
+
+    def _find_parent_joint_of_link(self, link_name):
+        for j_idx, j in enumerate(self._robot.joint_map):
+            if self._robot.joint_map[j].child == link_name:
+                return j_idx, j
+        log_error("Link is not attached to any joint")
+
+    def _get_joint_name(self, idx):
+        joint = self._robot.joint_names[idx]
+        return joint
+
+    def get_link_parameters(self, link_name: str, base=False) -> LinkParams:
+        link_params = self._get_from_extra_links(link_name)
+        if link_params is not None:
+            return link_params
+        body_params = {}
+        body_params["link_name"] = link_name
+
+        if base:
+            body_params["parent_link_name"] = None
+            joint_transform = np.eye(4)
+            joint_name = "base_joint"
+            joint_type = "fixed"
+            joint_limits = None
+            joint_axis = None
+            body_params["joint_id"] = 0
+        else:
+            body_params["parent_link_name"] = self._parent_map[link_name]
+
+            jid, joint_name = self._find_parent_joint_of_link(link_name)
+            body_params["joint_id"] = jid
+            joint = self._robot.joint_map[joint_name]
+            joint_transform = joint.origin
+            if joint_transform is None:
+                joint_transform = np.eye(4)
+            joint_type = joint.type
+            joint_limits = None
+            joint_axis = None
+            if
joint_type != "fixed": + if joint_type != "continuous": + joint_limits = { + "effort": joint.limit.effort, + "lower": joint.limit.lower, + "upper": joint.limit.upper, + "velocity": joint.limit.velocity, + } + else: + # log_warn("Converting continuous joint to revolute") + joint_type = "revolute" + joint_limits = { + "effort": joint.limit.effort, + "lower": -3.14 * 2, + "upper": 3.14 * 2, + "velocity": joint.limit.velocity, + } + + joint_axis = joint.axis + if (joint_axis < 0).any(): + log_warn("Joint axis has negative value (-1). This is not supported.") + joint_axis = np.abs(joint_axis) + body_params["joint_limits"] = [joint_limits["lower"], joint_limits["upper"]] + body_params["joint_velocity_limits"] = [ + -1.0 * joint_limits["velocity"], + joint_limits["velocity"], + ] + + body_params["fixed_transform"] = joint_transform + body_params["joint_name"] = joint_name + + body_params["joint_axis"] = joint_axis + + if joint_type == "fixed": + joint_type = JointType.FIXED + elif joint_type == "prismatic": + if joint_axis[0] == 1: + joint_type = JointType.X_PRISM + if joint_axis[1] == 1: + joint_type = JointType.Y_PRISM + if joint_axis[2] == 1: + joint_type = JointType.Z_PRISM + elif joint_type == "revolute": + if joint_axis[0] == 1: + joint_type = JointType.X_ROT + if joint_axis[1] == 1: + joint_type = JointType.Y_ROT + if joint_axis[2] == 1: + joint_type = JointType.Z_ROT + else: + log_error("Joint type not supported") + + body_params["joint_type"] = joint_type + link_params = LinkParams(**body_params) + + return link_params + + def add_absolute_path_to_link_meshes(self, mesh_dir: str = ""): + # read all link meshes and update their mesh paths by prepending mesh_dir + links = self._robot.link_map + for k in links.keys(): + # read visual and collision + vis = links[k].visuals + for i in range(len(vis)): + m = vis[i].geometry.mesh + if m is not None: + m.filename = join_path(mesh_dir, m.filename) + col = links[k].collisions + for i in range(len(col)): + m = col[i].geometry.mesh + if m is not None: + m.filename = join_path(mesh_dir, m.filename) + + def get_urdf_string(self): + txt = etree.tostring(self._robot.write_xml(), method="xml", encoding="unicode") + return txt + + def get_link_mesh(self, link_name): + m = self._robot.link_map[link_name].visuals[0].geometry.mesh + return CuroboMesh(name=link_name, pose=None, scale=m.scale, file_path=m.filename) diff --git a/src/curobo/cuda_robot_model/usd_kinematics_parser.py b/src/curobo/cuda_robot_model/usd_kinematics_parser.py new file mode 100644 index 00000000..d9dc978a --- /dev/null +++ b/src/curobo/cuda_robot_model/usd_kinematics_parser.py @@ -0,0 +1,213 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+# Standard Library
+from typing import Dict, List, Optional, Tuple
+
+# Third Party
+import numpy as np
+from pxr import Usd, UsdPhysics
+
+# CuRobo
+from curobo.cuda_robot_model.kinematics_parser import KinematicsParser, LinkParams
+from curobo.cuda_robot_model.types import JointType
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.util.logger import log_error
+
+
+class UsdKinematicsParser(KinematicsParser):
+    """An experimental kinematics parser from USD.
+
+    NOTE: A more complete solution will be available in a future release. The current
+    implementation does not account for link geometry transformations after a joint.
+    """
+
+    def __init__(
+        self,
+        usd_path: str,
+        flip_joints: Dict[str, str] = {},
+        flip_joint_limits: List[str] = [],
+        usd_robot_root: str = "robot",
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+        extra_links: Optional[Dict[str, LinkParams]] = None,
+    ) -> None:
+        # load the usd file into a stage:
+        self._flip_joints = flip_joints
+        self._flip_joint_limits = flip_joint_limits
+        self._stage = Usd.Stage.Open(usd_path)
+        self._usd_robot_root = usd_robot_root
+        self._parent_joint_map = {}
+        self.tensor_args = tensor_args
+        super().__init__(extra_links)
+
+    @property
+    def robot_prim_root(self):
+        return self._usd_robot_root
+
+    def build_link_parent(self):
+        self._parent_map = {}
+        all_joints = [
+            x
+            for x in self._stage.Traverse()
+            if (x.IsA(UsdPhysics.Joint) and str(x.GetPath()).startswith(self._usd_robot_root))
+        ]
+        for l in all_joints:
+            parent, child = get_links_for_joint(l)
+            if child is not None and parent is not None:
+                self._parent_map[child.GetName()] = parent.GetName()
+                self._parent_joint_map[child.GetName()] = l  # store joint prim
+
+    def get_link_parameters(self, link_name: str, base: bool = False) -> LinkParams:
+        """Get link parameters from the usd stage.
+
+        NOTE: USD kinematics "X" axis joints map to "Z" in URDF. Specifically, a joint
+        with uniform token physics:axis = "X" matches only "Z" in URDF. This is because
+        usd files assume the Y axis points up while urdf files assume the Z axis points up.
+
+        Args:
+            link_name (str): Name of link.
+            base (bool, optional): flag to specify base link. Defaults to False.
+
+        Returns:
+            LinkParams: obtained link parameters.
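+
+        Example (hypothetical prim): a revolute joint with physics:axis = "Z" yields
+        joint_type = JointType.Z_ROT, with joint limits read from the joint's
+        lower/upper limit attributes (converted to radians).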
+ """ + link_params = self._get_from_extra_links(link_name) + if link_params is not None: + return link_params + joint_limits = None + joint_axis = None + if base: + parent_link_name = None + joint_transform = np.eye(4) + joint_name = "base_joint" + joint_type = JointType.FIXED + + else: + parent_link_name = self._parent_map[link_name] + joint_prim = self._parent_joint_map[link_name] # joint prim connects link + joint_transform = self._get_joint_transform(joint_prim) + joint_axis = None + joint_name = joint_prim.GetName() + if joint_prim.IsA(UsdPhysics.FixedJoint): + joint_type = JointType.FIXED + elif joint_prim.IsA(UsdPhysics.RevoluteJoint): + j_prim = UsdPhysics.RevoluteJoint(joint_prim) + joint_axis = j_prim.GetAxisAttr().Get() + joint_limits = np.radians( + np.ravel([j_prim.GetLowerLimitAttr().Get(), j_prim.GetUpperLimitAttr().Get()]) + ) + if joint_name in self._flip_joints.keys(): + joint_axis = self._flip_joints[joint_name] + if joint_axis == "X": + joint_type = JointType.X_ROT + elif joint_axis == "Y": + joint_type = JointType.Y_ROT + elif joint_axis == "Z": + joint_type = JointType.Z_ROT + else: + log_error("Joint axis not supported" + str(joint_axis)) + + elif joint_prim.IsA(UsdPhysics.PrismaticJoint): + j_prim = UsdPhysics.PrismaticJoint(joint_prim) + + joint_axis = j_prim.GetAxisAttr().Get() + joint_limits = np.ravel( + [j_prim.GetLowerLimitAttr().Get(), j_prim.GetUpperLimitAttr().Get()] + ) + if joint_name in self._flip_joints.keys(): + joint_axis = self._flip_joints[joint_name] + if joint_name in self._flip_joint_limits: + joint_limits = np.ravel( + [-1.0 * j_prim.GetUpperLimitAttr().Get(), j_prim.GetLowerLimitAttr().Get()] + ) + if joint_axis == "X": + joint_type = JointType.X_PRISM + elif joint_axis == "Y": + joint_type = JointType.Y_PRISM + elif joint_axis == "Z": + joint_type = JointType.Z_PRISM + else: + log_error("Joint axis not supported" + str(joint_axis)) + else: + log_error("Joint type not supported") + link_params = LinkParams( + link_name=link_name, + joint_name=joint_name, + joint_type=joint_type, + fixed_transform=joint_transform, + parent_link_name=parent_link_name, + joint_limits=joint_limits, + ) + return link_params + + def _get_joint_transform(self, prim: Usd.Prim): + j_prim = UsdPhysics.Joint(prim) + position = np.ravel(j_prim.GetLocalPos0Attr().Get()) + quatf = j_prim.GetLocalRot0Attr().Get() + quat = np.zeros(4) + quat[0] = quatf.GetReal() + quat[1:] = quatf.GetImaginary() + + # create a homogenous transformation matrix: + transform_0 = Pose(self.tensor_args.to_device(position), self.tensor_args.to_device(quat)) + + position = np.ravel(j_prim.GetLocalPos1Attr().Get()) + quatf = j_prim.GetLocalRot1Attr().Get() + quat = np.zeros(4) + quat[0] = quatf.GetReal() + quat[1:] = quatf.GetImaginary() + + # create a homogenous transformation matrix: + transform_1 = Pose(self.tensor_args.to_device(position), self.tensor_args.to_device(quat)) + transform = ( + transform_0.multiply(transform_1.inverse()).get_matrix().cpu().view(4, 4).numpy() + ) + + # get attached link transform: + + return transform + + +def get_links_for_joint(prim: Usd.Prim) -> Tuple[Optional[Usd.Prim], Optional[Usd.Prim]]: + """Get all link prims from the given joint prim. + + Note: + This assumes that the `body0_rel_targets` and `body1_rel_targets` are configured such + that the parent link is specified in `body0_rel_targets` and the child links is specified + in `body1_rel_targets`. 
+ """ + stage = prim.GetStage() + joint_api = UsdPhysics.Joint(prim) + + rel0_targets = joint_api.GetBody0Rel().GetTargets() + if len(rel0_targets) > 1: + raise NotImplementedError( + "`get_links_for_joint` does not currently handle more than one relative" + f" body target in the joint. joint_prim: {prim}, body0_rel_targets:" + f" {rel0_targets}" + ) + link0_prim = None + if len(rel0_targets) != 0: + link0_prim = stage.GetPrimAtPath(rel0_targets[0]) + + rel1_targets = joint_api.GetBody1Rel().GetTargets() + if len(rel1_targets) > 1: + raise NotImplementedError( + "`get_links_for_joint` does not currently handle more than one relative" + f" body target in the joint. joint_prim: {prim}, body1_rel_targets:" + f" {rel0_targets}" + ) + link1_prim = None + if len(rel1_targets) != 0: + link1_prim = stage.GetPrimAtPath(rel1_targets[0]) + + return (link0_prim, link1_prim) diff --git a/src/curobo/curobolib/__init__.py b/src/curobo/curobolib/__init__.py new file mode 100644 index 00000000..d9f07477 --- /dev/null +++ b/src/curobo/curobolib/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +"""CuRoboLib Module.""" diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp new file mode 100644 index 00000000..cd69a6f9 --- /dev/null +++ b/src/curobo/curobolib/cpp/geom_cuda.cpp @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ +#include +#include + +#include +#include + +// CUDA forward declarations + +std::vector self_collision_distance( + torch::Tensor out_distance, torch::Tensor out_vec, + torch::Tensor sparse_index, + const torch::Tensor robot_spheres, // batch_size x n_spheres x 4 + const torch::Tensor collision_offset, // n_spheres x n_spheres + const torch::Tensor weight, + const torch::Tensor collision_matrix, // n_spheres x n_spheres + const torch::Tensor thread_locations, const int locations_size, + const int batch_size, const int nspheres, const bool compute_grad = false, + const int ndpt = 8, // Does this need to match template? 
+    const bool debug = false);
+
+// CUDA forward declarations
+
+std::vector<torch::Tensor> swept_sphere_obb_clpt(
+    const torch::Tensor sphere_position, // batch_size, 3
+    torch::Tensor distance,              // batch_size, 1
+    torch::Tensor
+        closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
+    torch::Tensor sparsity_idx, const torch::Tensor weight,
+    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
+    const torch::Tensor obb_accel,     // n_boxes, 4, 4
+    const torch::Tensor obb_bounds,    // n_boxes, 3
+    const torch::Tensor obb_pose,      // n_boxes, 4, 4
+    const torch::Tensor obb_enable,    // n_boxes, 4,
+    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
+    const torch::Tensor env_query_idx, // n_boxes, 4, 4
+    const int max_nobs, const int batch_size, const int horizon,
+    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
+    const bool transform_back, const bool compute_distance,
+    const bool use_batch_env);
+
+std::vector<torch::Tensor>
+sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
+                torch::Tensor distance,
+                torch::Tensor closest_point, // batch size, 3
+                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                const torch::Tensor activation_distance,
+                const torch::Tensor obb_accel,     // n_boxes, 4, 4
+                const torch::Tensor obb_bounds,    // n_boxes, 3
+                const torch::Tensor obb_pose,      // n_boxes, 4, 4
+                const torch::Tensor obb_enable,    // n_boxes, 4, 4
+                const torch::Tensor n_env_obb,     // n_boxes, 4, 4
+                const torch::Tensor env_query_idx, // n_boxes, 4, 4
+                const int max_nobs, const int batch_size, const int horizon,
+                const int n_spheres, const bool transform_back,
+                const bool compute_distance, const bool use_batch_env);
+
+std::vector<torch::Tensor> pose_distance(
+    torch::Tensor out_distance, torch::Tensor out_position_distance,
+    torch::Tensor out_rotation_distance,
+    torch::Tensor distance_p_vector, // batch size, 3
+    torch::Tensor distance_q_vector, // batch size, 4
+    torch::Tensor out_gidx,
+    const torch::Tensor current_position, // batch_size, 3
+    const torch::Tensor goal_position,    // n_boxes, 3
+    const torch::Tensor current_quat, const torch::Tensor goal_quat,
+    const torch::Tensor vec_weight, // n_boxes, 4, 4
+    const torch::Tensor weight,     // n_boxes, 4, 4
+    const torch::Tensor vec_convergence, const torch::Tensor run_weight,
+    const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
+    const int batch_size, const int horizon, const int mode,
+    const int num_goals = 1, const bool compute_grad = false,
+    const bool write_distance = true, const bool use_metric = false);
+
+std::vector<torch::Tensor>
+backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+                       const torch::Tensor grad_distance,   // batch_size, 3
+                       const torch::Tensor grad_p_distance, // n_boxes, 3
+                       const torch::Tensor grad_q_distance,
+                       const torch::Tensor pose_weight,
+                       const torch::Tensor grad_p_vec, // n_boxes, 4, 4
+                       const torch::Tensor grad_q_vec, const int batch_size,
+                       const bool use_distance = false);
+
+// C++ interface
+
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
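+// Input validation helpers: every tensor handed to a wrapper below must be a
+// contiguous CUDA tensor before it is passed on to the CUDA implementations.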
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<torch::Tensor> self_collision_distance_wrapper(
+    torch::Tensor out_distance, torch::Tensor out_vec,
+    torch::Tensor sparse_index,
+    const torch::Tensor robot_spheres,    // batch_size x n_spheres x 4
+    const torch::Tensor collision_offset, // n_spheres
+    const torch::Tensor weight,
+    const torch::Tensor collision_matrix, // n_spheres
+    const torch::Tensor thread_locations, const int thread_locations_size,
+    const int batch_size, const int nspheres, const bool compute_grad = false,
+    const int ndpt = 8, const bool debug = false) {
+
+  CHECK_INPUT(out_distance);
+  CHECK_INPUT(out_vec);
+  CHECK_INPUT(robot_spheres);
+  CHECK_INPUT(collision_offset);
+  CHECK_INPUT(sparse_index);
+  CHECK_INPUT(weight);
+  CHECK_INPUT(thread_locations);
+  CHECK_INPUT(collision_matrix);
+  const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());
+
+  return self_collision_distance(
+      out_distance, out_vec, sparse_index, robot_spheres, collision_offset,
+      weight, collision_matrix, thread_locations, thread_locations_size,
+      batch_size, nspheres, compute_grad, ndpt, debug);
+}
+
+std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
+    const torch::Tensor sphere_position, // batch_size, 4
+    torch::Tensor distance,
+    torch::Tensor closest_point, // batch size, 3
+    torch::Tensor sparsity_idx, const torch::Tensor weight,
+    const torch::Tensor activation_distance,
+    const torch::Tensor obb_accel,     // n_boxes, 4, 4
+    const torch::Tensor obb_bounds,    // n_boxes, 3
+    const torch::Tensor obb_pose,      // n_boxes, 4, 4
+    const torch::Tensor obb_enable,    // n_boxes, 4, 4
+    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
+    const torch::Tensor env_query_idx, // n_boxes, 4, 4
+    const int max_nobs, const int batch_size, const int horizon,
+    const int n_spheres, const bool transform_back,
+    const bool compute_distance, const bool use_batch_env) {
+
+  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+  CHECK_INPUT(distance);
+  CHECK_INPUT(closest_point);
+  CHECK_INPUT(sphere_position);
+  CHECK_INPUT(sparsity_idx);
+  CHECK_INPUT(weight);
+  CHECK_INPUT(activation_distance);
+  CHECK_INPUT(obb_accel);
+  return sphere_obb_clpt(
+      sphere_position, distance, closest_point, sparsity_idx, weight,
+      activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
+      n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
+      transform_back, compute_distance, use_batch_env);
+}
+
+std::vector<torch::Tensor> swept_sphere_obb_clpt_wrapper(
+    const torch::Tensor sphere_position, // batch_size, 4
+    torch::Tensor distance,
+    torch::Tensor closest_point, // batch size, 3
+    torch::Tensor sparsity_idx, const torch::Tensor weight,
+    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
+    const torch::Tensor obb_accel,     // n_boxes, 4, 4
+    const torch::Tensor obb_bounds,    // n_boxes, 3
+    const torch::Tensor obb_pose,      // n_boxes, 4, 4
+    const torch::Tensor obb_enable,    // n_boxes, 4, 4
+    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
+    const torch::Tensor env_query_idx, // n_boxes, 4, 4
+    const int max_nobs, const int batch_size, const int horizon,
+    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
+    const bool transform_back, const bool compute_distance,
+    const bool use_batch_env) {
+  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+  CHECK_INPUT(distance);
+  CHECK_INPUT(closest_point);
+  CHECK_INPUT(sphere_position);
+
+  return swept_sphere_obb_clpt(
+      sphere_position, distance, closest_point, sparsity_idx, weight,
+      activation_distance, speed_dt, obb_accel, obb_bounds, obb_pose,
+      obb_enable, n_env_obb, env_query_idx, max_nobs, batch_size, horizon,
+      n_spheres, sweep_steps, enable_speed_metric, transform_back,
+      compute_distance, use_batch_env);
+}
+
+std::vector<torch::Tensor> pose_distance_wrapper(
+    torch::Tensor out_distance, torch::Tensor out_position_distance,
+    torch::Tensor out_rotation_distance,
+    torch::Tensor distance_p_vector, // batch size, 3
+    torch::Tensor distance_q_vector, // batch size, 4
+    torch::Tensor out_gidx,
+    const torch::Tensor current_position, // batch_size, 3
+    const torch::Tensor goal_position,    // n_boxes, 3
+    const torch::Tensor current_quat, const torch::Tensor goal_quat,
+    const torch::Tensor vec_weight, // n_boxes, 4, 4
+    const torch::Tensor weight, const torch::Tensor vec_convergence,
+    const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
+    const torch::Tensor batch_pose_idx, const int batch_size,
+    const int horizon, const int mode, const int num_goals = 1,
+    const bool compute_grad = false, const bool write_distance = false,
+    const bool use_metric = false) {
+
+  CHECK_INPUT(out_distance);
+  CHECK_INPUT(out_position_distance);
+  CHECK_INPUT(out_rotation_distance);
+  CHECK_INPUT(distance_p_vector);
+  CHECK_INPUT(distance_q_vector);
+  CHECK_INPUT(current_position);
+  CHECK_INPUT(goal_position);
+  CHECK_INPUT(current_quat);
+  CHECK_INPUT(goal_quat);
+  CHECK_INPUT(batch_pose_idx);
+  const at::cuda::OptionalCUDAGuard guard(current_position.device());
+
+  return pose_distance(
+      out_distance, out_position_distance, out_rotation_distance,
+      distance_p_vector, distance_q_vector, out_gidx, current_position,
+      goal_position, current_quat, goal_quat, vec_weight, weight,
+      vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
+      horizon, mode, num_goals, compute_grad, write_distance, use_metric);
+}
+
+std::vector<torch::Tensor> backward_pose_distance_wrapper(
+    torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+    const torch::Tensor grad_distance,   // batch_size, 3
+    const torch::Tensor grad_p_distance, // n_boxes, 3
+    const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
+    const torch::Tensor grad_p_vec, // n_boxes, 4, 4
+    const torch::Tensor grad_q_vec, const int batch_size,
+    const bool use_distance) {
+  CHECK_INPUT(out_grad_p);
+  CHECK_INPUT(out_grad_q);
+  CHECK_INPUT(grad_distance);
+  CHECK_INPUT(grad_p_distance);
+  CHECK_INPUT(grad_q_distance);
+
+  const at::cuda::OptionalCUDAGuard guard(grad_distance.device());
+
+  return backward_pose_distance(
+      out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
+      pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
+  m.def("pose_distance_backward", &backward_pose_distance_wrapper,
+        "Pose Distance Backward (curobolib)");
+
+  m.def("closest_point", &sphere_obb_clpt_wrapper,
+        "Closest Point OBB (curobolib)");
+  m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
+        "Swept Closest Point OBB (curobolib)");
+
+  m.def("self_collision_distance", &self_collision_distance_wrapper,
+        "Self Collision Distance (curobolib)");
+}
diff --git a/src/curobo/curobolib/cpp/helper_math.h b/src/curobo/curobolib/cpp/helper_math.h
new file mode 100644
index 00000000..f0140aec
--- /dev/null
+++ b/src/curobo/curobolib/cpp/helper_math.h
@@ -0,0 +1,1453 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+/*
+ * This file implements common mathematical operations on vector types
+ * (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ * The syntax is modeled on the Cg standard library.
+ *
+ * This is part of the Helper library includes
+ *
+ * Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include <cuda_runtime.h>
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline float fminf(float a, float b)
+{
+    return a < b ? a : b;
+}
+
+inline float fmaxf(float a, float b)
+{
+    return a > b ? a : b;
+}
+
+inline int max(int a, int b)
+{
+    return a > b ? a : b;
+}
+
+inline int min(int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline float rsqrtf(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 make_float2(float s)
+{
+    return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+    return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)
+{
+    return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+    return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+    return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+    return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)
+{
+    return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+    return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+    return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline
__host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + 
b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} 
+inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - 
b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float 
b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float 
b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 
fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, 
float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint 
dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + +//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), 
fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif diff --git a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp new file mode 100644 index 00000000..c73bff93 --- /dev/null +++ b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */
+#include <torch/extension.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <vector>
+
+// CUDA forward declarations
+
+std::vector<torch::Tensor>
+matrix_to_quaternion(torch::Tensor out_quat,
+                     const torch::Tensor in_rot // batch_size, 3
+);
+
+std::vector<torch::Tensor> kin_fused_forward(
+    torch::Tensor link_pos, torch::Tensor link_quat,
+    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+    const torch::Tensor robot_spheres, const torch::Tensor link_map,
+    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+    const int batch_size, const int n_spheres,
+    const bool use_global_cumul = false);
+
+std::vector<torch::Tensor> kin_fused_backward_16t(
+    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+    const torch::Tensor link_map, const torch::Tensor joint_map,
+    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+    const bool use_global_cumul = false);
+// C++ interface
+
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<torch::Tensor> kin_forward_wrapper(
+    torch::Tensor link_pos, torch::Tensor link_quat,
+    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+    const torch::Tensor robot_spheres, const torch::Tensor link_map,
+    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+    const int batch_size, const int n_spheres,
+    const bool use_global_cumul = false) {
+
+  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
+
+  // TODO: add check input
+  return kin_fused_forward(
+      link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
+      fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
+      store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
+}
+
+std::vector<torch::Tensor> kin_backward_wrapper(
+    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+    const torch::Tensor link_map, const torch::Tensor joint_map,
+    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+    const bool use_global_cumul = false) {
+  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
+
+  return kin_fused_backward_16t(
+      grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
+      global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
+      joint_map, joint_map_type, store_link_map, link_sphere_map,
+      link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
+}
+
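+// NOTE (illustrative sketch, not part of the original file): the wrappers
+// above leave input validation as a TODO. One way to fill it in with the
+// CHECK_INPUT macro defined above would be a small helper such as this
+// hypothetical check_kin_inputs, called at the top of each wrapper:
+//
+//   static void check_kin_inputs(const torch::Tensor &joint_vec,
+//                                const torch::Tensor &fixed_transform,
+//                                const torch::Tensor &robot_spheres) {
+//     CHECK_INPUT(joint_vec);       // must be a CUDA tensor and contiguous
+//     CHECK_INPUT(fixed_transform); // aborts with a readable message if not
+//     CHECK_INPUT(robot_spheres);
+//   }
+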
+std::vector<torch::Tensor>
+matrix_to_quaternion_wrapper(torch::Tensor out_quat,
+                             const torch::Tensor in_rot // batch_size, 3
+) {
+  const at::cuda::OptionalCUDAGuard guard(in_rot.device());
+  CHECK_INPUT(in_rot);
+  CHECK_INPUT(out_quat);
+  return matrix_to_quaternion(out_quat, in_rot);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
+  m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
+  m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
+        "Rotation Matrix to Quaternion (CUDA)");
+}
diff --git a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
new file mode 100644
index 00000000..024df07b
--- /dev/null
+++ b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
@@ -0,0 +1,1255 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+#include <vector>
+
+#include "helper_math.h"
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAStream.h>
+
+#define M 4
+
+#define FIXED -1
+#define X_PRISM 0
+#define Y_PRISM 1
+#define Z_PRISM 2
+#define X_ROT 3
+#define Y_ROT 4
+#define Z_ROT 5
+
+#define MAX_BATCH_PER_BLOCK 32   // tunable parameter for improving occupancy
+#define MAX_BW_BATCH_PER_BLOCK 8 // tunable parameter for improving occupancy
+
+#define MAX_TOTAL_LINKS \
+  750 // limited by shared memory size. We need to fit 16 * float32 per link
+
+namespace Curobo {
+
+namespace Kinematics {
+
+template <typename psum_t>
+__device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
+                                                float3 scale, psum_t &sum_out) {
+  sum_out += scale.x * (a.y * b.z - a.z * b.y) +
+             scale.y * (a.z * b.x - a.x * b.z) +
+             scale.z * (a.x * b.y - a.y * b.x);
+}
+
+__device__ __forceinline__ void normalize_quaternion(float *q) {
+  // get length:
+  float length = 1.0 / norm4df(q[0], q[1], q[2], q[3]);
+  if (q[0] < 0.0) {
+    length = -1.0 * length;
+  }
+
+  q[1] = length * q[1];
+  q[2] = length * q[2];
+  q[3] = length * q[3];
+  q[0] = length * q[0];
+}
+/**
+ * @brief get quaternion from transformation matrix
+ *
+ * @param t transformation matrix 4x4
+ * @param q quaternion in wxyz format
+ */
+__device__ __forceinline__ void mat_to_quat(float *t, float *q) {
+  float n;
+  float n_sqrt;
+  if (t[10] < 0.0) {
+    if (t[0] > t[5]) {
+      n = 1 + t[0] - t[5] - t[10];
+      n_sqrt = 0.5 * rsqrtf(n);
+      q[1] = n * n_sqrt;
+      q[2] = (t[1] + t[4]) * n_sqrt;
+      q[3] = (t[8] + t[2]) * n_sqrt;
+      q[0] = -1 * (t[6] - t[9]) * n_sqrt; // * -1 ; // this is the wrong one?
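+      // The four branches of this conversion pick the largest of the four
+      // candidates 1 +/- t[0] +/- t[5] +/- t[10], keeping n well away from
+      // zero so that rsqrtf(n) stays numerically stable for all rotations.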
+ } else { + n = 1 - t[0] + t[5] - t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[1] + t[4]) * n_sqrt; + q[2] = n * n_sqrt; + q[3] = (t[6] + t[9]) * n_sqrt; + q[0] = -1 * (t[8] - t[2]) * n_sqrt; + } + } else { + if (t[0] < -1 * t[5]) { + n = 1 - t[0] - t[5] + t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[8] + t[2]) * n_sqrt; + q[2] = (t[6] + t[9]) * n_sqrt; + q[3] = n * n_sqrt; + q[0] = -1 * (t[1] - t[4]) * n_sqrt; + } else { + n = 1 + t[0] + t[5] + t[10]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[6] - t[9]) * n_sqrt; + q[2] = (t[8] - t[2]) * n_sqrt; + q[3] = (t[1] - t[4]) * n_sqrt; + q[0] = -1 * n * n_sqrt; + } + } + normalize_quaternion(q); +} + +/** + * @brief get quaternion from transformation matrix + * + * @param t # rotation matrix 3x3 + * @param q quaternion in wxyz format + */ +__device__ __forceinline__ void rot_to_quat(float *t, float *q) { + float n; + float n_sqrt; + if (t[8] < 0.0) { + if (t[0] > t[4]) { + n = 1 + t[0] - t[4] - t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = n * n_sqrt; + q[2] = (t[1] + t[3]) * n_sqrt; + q[3] = (t[6] + t[2]) * n_sqrt; + q[0] = -1 * (t[5] - t[7]) * n_sqrt; // * -1 ; // this is the wrong one? + } else { + n = 1 - t[0] + t[4] - t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[1] + t[3]) * n_sqrt; + q[2] = n * n_sqrt; + q[3] = (t[5] + t[7]) * n_sqrt; + q[0] = -1 * (t[6] - t[2]) * n_sqrt; + } + } else { + if (t[0] < -1 * t[4]) { + n = 1 - t[0] - t[4] + t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[6] + t[2]) * n_sqrt; + q[2] = (t[5] + t[7]) * n_sqrt; + q[3] = n * n_sqrt; + q[0] = -1 * (t[1] - t[3]) * n_sqrt; + } else { + n = 1 + t[0] + t[4] + t[8]; + n_sqrt = 0.5 * rsqrtf(n); + q[1] = (t[5] - t[7]) * n_sqrt; + q[2] = (t[6] - t[2]) * n_sqrt; + q[3] = (t[1] - t[3]) * n_sqrt; + q[0] = -1 * n * n_sqrt; + } + } + normalize_quaternion(q); +} + +__device__ __forceinline__ void rot_mul(float *r1, float *r2, float *r_out) { + for (int i = 0; i < 9; i++) { + r_out[i] = 0.0; + } +#pragma unroll + for (int k = 0; k < 3; k++) { +#pragma unroll + for (int j = 0; j < 3; j++) { +#pragma unroll + for (int i = 0; i < 3; i++) { + r_out[i * 3 + j] += r1[i * 3 + k] * r2[k * 3 + j]; + } + } + } +} +__device__ __forceinline__ void rot_inverse_rot_mul(float *r1, float *r2, + float *r_out) { + // multiply two matrices: + r_out[0] = r1[0] * r2[0] + r1[4] * r2[4] + r1[8] * r2[8]; + r_out[1] = r1[0] * r2[1] + r1[4] * r2[5] + r1[8] * r2[9]; + r_out[2] = r1[0] * r2[2] + r1[4] * r2[6] + r1[8] * r2[10]; + + r_out[3] = r1[1] * r2[0] + r1[5] * r2[4] + r1[9] * r2[8]; + r_out[4] = r1[1] * r2[1] + r1[5] * r2[5] + r1[9] * r2[9]; + r_out[5] = r1[1] * r2[2] + r1[5] * r2[6] + r1[9] * r2[10]; + + r_out[6] = r1[2] * r2[0] + r1[6] * r2[4] + r1[10] * r2[8]; + r_out[7] = r1[2] * r2[1] + r1[6] * r2[5] + r1[10] * r2[9]; + r_out[8] = r1[2] * r2[2] + r1[6] * r2[6] + r1[10] * r2[10]; +} + +template +__device__ __forceinline__ void +transform_sphere(const float *transform_mat, const scalar_t *sphere, float *C) { + float4 sphere_pos = *(float4 *)&sphere[0]; + int st_idx = 0; +#pragma unroll 3 + for (int i = 0; i < 3; i++) { + st_idx = i * 4; + // do dot product: + // C[i] = transform_mat[st_idx] * sphere_pos.x + transform_mat[st_idx+1] * + // sphere_pos.y + transform_mat[st_idx+2] * sphere_pos.z + + // transform_mat[st_idx + 3]; + float4 tm = *(float4 *)&transform_mat[st_idx]; + C[i] = + tm.x * sphere_pos.x + tm.y * sphere_pos.y + tm.z * sphere_pos.z + tm.w; + } + C[3] = sphere_pos.w; +} + +template +__device__ __forceinline__ void fixed_joint_fn(const scalar_t *fixedTransform, + float *JM) { + JM[0] = 
fixedTransform[0]; + JM[1] = fixedTransform[M]; + JM[2] = fixedTransform[M * 2]; + JM[3] = fixedTransform[M * 3]; +} + +// prism_fn withOUT control flow +template +__device__ __forceinline__ void prism_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM, const int xyz) { + // int _and = (col_idx & (col_idx>>1)) & 0x1; // 1 for thread 3, 0 for all + // other threads (0,1,2) + // + // float f1 = (1-_and) + _and * angle; // 1 for threads 0,1,2; angle for + // thread 3 int addr_offset = (1-_and) * col_idx + _and * xyz; // col_idx + // for threads 0,1,2; xyz for thread 3 + // + // JM[0] = fixedTransform[0 + addr_offset] * f1 + _and * + // fixedTransform[3];//FT_0[1]; JM[1] = fixedTransform[M + addr_offset] + // * f1 + _and * fixedTransform[M + 3];//FT_1[1]; JM[2] = fixedTransform[M + // + M + addr_offset] * f1 + _and * fixedTransform[M + M + 3];//FT_2[1]; + // JM[3] = fixedTransform[M + M + M + addr_offset] * (1-_and) + _and * 1; // + // first three threads will get fixedTransform[3M+col_idx], the last thread + // will get 1 + + if (col_idx <= 2) { + fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); + } else { + JM[0] = fixedTransform[0 + xyz] * angle + fixedTransform[3]; // FT_0[1]; + JM[1] = fixedTransform[M + xyz] * angle + fixedTransform[M + 3]; // FT_1[1]; + JM[2] = fixedTransform[M + M + xyz] * angle + + fixedTransform[M + M + 3]; // FT_2[1]; + JM[3] = 1; + } +} + +template +__device__ __forceinline__ void xprism_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + + switch (col_idx) { + case 0: + case 1: + case 2: + fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); + break; + case 3: + JM[0] = fixedTransform[0] * angle + fixedTransform[3]; // FT_0[1]; + JM[1] = fixedTransform[M] * angle + fixedTransform[M + 3]; // FT_1[1]; + JM[2] = + fixedTransform[M + M] * angle + fixedTransform[M + M + 3]; // FT_2[1]; + JM[3] = 1; + break; + } +} + +template +__device__ __forceinline__ void yprism_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + + switch (col_idx) { + case 0: + case 1: + case 2: + fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); + break; + case 3: + JM[0] = fixedTransform[1] * angle + fixedTransform[3]; // FT_0[1]; + JM[1] = fixedTransform[M + 1] * angle + fixedTransform[M + 3]; // FT_1[1]; + JM[2] = fixedTransform[M + M + 1] * angle + + fixedTransform[M + M + 3]; // FT_2[1]; + JM[3] = 1; + break; + } +} + +template +__device__ __forceinline__ void zprism_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + + switch (col_idx) { + case 0: + case 1: + case 2: + fixed_joint_fn(&fixedTransform[col_idx], &JM[0]); + break; + case 3: + JM[0] = fixedTransform[2] * angle + fixedTransform[3]; // FT_0[1]; + JM[1] = fixedTransform[M + 2] * angle + fixedTransform[M + 3]; // FT_1[1]; + JM[2] = fixedTransform[M + M + 2] * angle + + fixedTransform[M + M + 3]; // FT_2[1]; + JM[3] = 1; + break; + } +} + +// In the following versions of rot_fn, some non-nan values may become nan as we +// add multiple values instead of using if-else/switch-case. + +// version with no control flow +template +__device__ __forceinline__ void xrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + // we found no change in convergence between fast approximate and IEEE sin, + // cos functions using fast approximate method saves 5 registers per thread. 
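+  // Branchless column selection: the four threads of a batch each build one
+  // column of this joint's transform. The bit arithmetic below routes
+  // threads 1 and 2 to the rotated (cos/sin) columns and threads 0 and 3 to
+  // the fixed columns, so the four threads never diverge the way an
+  // if/switch would.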
+ float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int bit1 = col_idx & 0x1; + int bit2 = (col_idx & 0x2) >> 1; + int _xor = bit1 ^ bit2; // 0 for threads 0 and 3, 1 for threads 1 and 2 + int col_idx_by_2 = + col_idx / 2; // 0 for threads 0 and 1, 1 for threads 2 and 3 + + float f1 = (1 - col_idx_by_2) * cos + + col_idx_by_2 * n_sin; // thread 1 get cos , thread 2 gets n_sin + float f2 = (1 - col_idx_by_2) * sin + + col_idx_by_2 * cos; // thread 1 get sin, thread 2 gets cos + + f1 = _xor * f1 + (1 - _xor) * 1; // threads 1 and 2 will get f1; the other + // two threads will get 1 + f2 = _xor * + f2; // threads 1 and 2 will get f2, the other two threads will get 0.0 + float f3 = 1 - _xor; + + int addr_offset = + _xor + (1 - _xor) * + col_idx; // 1 for threads 1 and 2, col_idx for threads 0 and 3 + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + f3; // threads 1 and 2 get 0.0, remaining two get fixedTransform[3M]; +} + +// version with no control flow +template +__device__ __forceinline__ void yrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int col_idx_per_2 = + col_idx % 2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. + int col_idx_by_2 = + col_idx / 2; // threads 0 and 1 will be 0 and threads 2 and 3 will be 1. + + float f1 = (1 - col_idx_by_2) * cos + + col_idx_by_2 * sin; // thread 0 get cos , thread 2 gets sin + float f2 = (1 - col_idx_by_2) * n_sin + + col_idx_by_2 * cos; // thread 0 get n_sin, thread 2 gets cos + + f1 = (1 - col_idx_per_2) * f1 + + col_idx_per_2 * 1; // threads 0 and 2 will get f1; the other two + // threads will get 1 + f2 = (1 - col_idx_per_2) * + f2; // threads 0 and 2 will get f2, the other two threads will get 0.0 + float f3 = + col_idx_per_2; // threads 0 and 2 will be 0 and threads 1 and 3 will be 1. + + int addr_offset = + col_idx_per_2 * + col_idx; // threads 0 and 2 will get 0, the other two will get col_idx. + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[2]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 2]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 2]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + f3; // threads 0 and 2 threads get 0.0, remaining two get + // fixedTransform[3M]; +} + +// version with no control flow +template +__device__ __forceinline__ void zrot_fn(const scalar_t *fixedTransform, + const float angle, const int col_idx, + float *JM) { + float cos = __cosf(angle); + float sin = __sinf(angle); + float n_sin = -1 * sin; + + int col_idx_by_2 = + col_idx / 2; // first two threads will be 0 and the next two will be 1. + int col_idx_per_2 = + col_idx % 2; // first thread will be 0 and the second thread will be 1. 
+ float f1 = (1 - col_idx_per_2) * cos + + col_idx_per_2 * n_sin; // thread 0 get cos , thread 1 gets n_sin + float f2 = (1 - col_idx_per_2) * sin + + col_idx_per_2 * cos; // thread 0 get sin, thread 1 gets cos + + f1 = (1 - col_idx_by_2) * f1 + + col_idx_by_2 * 1; // first two threads get f1, other two threads get 1 + f2 = (1 - col_idx_by_2) * + f2; // first two threads get f2, other two threads get 0.0 + + int addr_offset = + col_idx_by_2 * + col_idx; // first 2 threads will get 0, the other two will get col_idx. + + JM[0] = fixedTransform[0 + addr_offset] * f1 + f2 * fixedTransform[1]; + JM[1] = fixedTransform[M + addr_offset] * f1 + f2 * fixedTransform[M + 1]; + JM[2] = + fixedTransform[M + M + addr_offset] * f1 + f2 * fixedTransform[M + M + 1]; + JM[3] = fixedTransform[M + M + M + addr_offset] * + col_idx_by_2; // first two threads get 0.0, remaining two get + // fixedTransform[3M]; +} + +template +__device__ __forceinline__ void +rot_backward_translation(const float3 &vec, float *cumul_mat, float *l_pos, + const float3 &loc_grad, psum_t &grad_q) { + + float3 e_pos, j_pos; + + e_pos.x = cumul_mat[3]; + e_pos.y = cumul_mat[4 + 3]; + e_pos.z = cumul_mat[4 + 4 + 3]; + + // compute position gradient: + j_pos = *(float3 *)&l_pos[0] - e_pos; // - e_pos; + scale_cross_sum(vec, j_pos, loc_grad, grad_q); // cross product +} + +template +__device__ __forceinline__ void +prism_backward_translation(const float3 vec, const float3 grad_vec, + psum_t &grad_q) { + grad_q += dot(vec, grad_vec); +} + +template +__device__ __forceinline__ void +rot_backward_rotation(const float3 vec, const float3 grad_vec, psum_t &grad_q) { + grad_q += dot(vec, grad_vec); +} + +template +__device__ __forceinline__ void +z_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, + float3 &loc_grad_orientation, psum_t &grad_q) { + float3 vec = + make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q); +} + +template +__device__ __forceinline__ void +x_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, + float3 &loc_grad_orientation, psum_t &grad_q) { + float3 vec = + make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q); +} +template +__device__ __forceinline__ void +y_rot_backward(float *link_cumul_mat, float *l_pos, float3 &loc_grad_position, + float3 &loc_grad_orientation, psum_t &grad_q) { + float3 vec = + make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]); + + // get rotation vector: + rot_backward_translation(vec, &link_cumul_mat[0], &l_pos[0], + loc_grad_position, grad_q); + + rot_backward_rotation(vec, loc_grad_orientation, grad_q); +} + +template +__device__ __forceinline__ void +xyz_prism_backward_translation(float *cumul_mat, float3 &loc_grad, + psum_t &grad_q, int xyz) { + prism_backward_translation( + make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]), + loc_grad, grad_q); +} + +template +__device__ __forceinline__ void x_prism_backward_translation(float *cumul_mat, + float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), loc_grad, grad_q); +} 
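+// Backward rule per joint type: a prismatic joint moves the point linearly
+// along its axis, so dL/dq = dot(axis, dL/dp) (prism_backward_translation);
+// a revolute joint contributes dot(axis x (p - o), dL/dp) for position plus
+// dot(axis, dL/dw) for orientation, with o the joint origin read from the
+// cumulative transform. The y/z prism variants below differ only in which
+// column of cumul_mat supplies the axis.
+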
+ +template +__device__ __forceinline__ void y_prism_backward_translation(float *cumul_mat, + float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), loc_grad, grad_q); +} + +template +__device__ __forceinline__ void z_prism_backward_translation(float *cumul_mat, + float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + prism_backward_translation( + make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q); +} + +__device__ __forceinline__ void +xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, + float &grad_q, int xyz) { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]), + &cumul_mat[0], &l_pos[0], loc_grad, grad_q); +} + +template +__device__ __forceinline__ void +x_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[0], cumul_mat[4], cumul_mat[8]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q); +} + +template +__device__ __forceinline__ void +y_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[1], cumul_mat[5], cumul_mat[9]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q); +} + +template +__device__ __forceinline__ void +z_rot_backward_translation(float *cumul_mat, float *l_pos, float3 &loc_grad, + psum_t &grad_q) { + // get rotation vector: + rot_backward_translation( + make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), &cumul_mat[0], + &l_pos[0], loc_grad, grad_q); +} + +// An optimized version of kin_fused_warp_kernel. +// This one should be about 10% faster. 
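+// Thread layout: four consecutive threads cooperate on one batch item, each
+// owning one column of the 4x4 transforms; the cumulative link transforms of
+// all batch items in a block are staged in dynamic shared memory (cumul_mat).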
+template +__global__ void +kin_fused_warp_kernel2(scalar_t *link_pos, // batchSize xz store_n_links x M x M + scalar_t *link_quat, // batchSize x store_n_links x M x M + scalar_t *b_robot_spheres, // batchSize x nspheres x M + scalar_t *global_cumul_mat, + const scalar_t *q, // batchSize x njoints + const scalar_t *fixedTransform, // nlinks x M x M + const scalar_t *robot_spheres, // nspheres x M + const int8_t *jointMapType, // nlinks + const int16_t *jointMap, // nlinks + const int16_t *linkMap, // nlinks + const int16_t *storeLinkMap, // store_n_links + const int16_t *linkSphereMap, // nspheres + const int batchSize, const int nspheres, + const int nlinks, const int njoints, + const int store_n_links) { + extern __shared__ float cumul_mat[]; + + int t = blockDim.x * blockIdx.x + threadIdx.x; + const int batch = t / 4; + if (batch >= batchSize) + return; + + int col_idx = threadIdx.x % 4; + const int local_batch = threadIdx.x / 4; + const int matAddrBase = local_batch * nlinks * M * M; + // read all fixed transforms to local cache: + + // copy base link transform: + *(float4 *)&cumul_mat[matAddrBase + col_idx * M] = + *(float4 *)&fixedTransform[col_idx * M]; + + if (use_global_cumul) { + *(float4 *)&global_cumul_mat[batch * nlinks * 16 + col_idx * M] = + *(float4 *)&cumul_mat[matAddrBase + col_idx * M]; + } + for (int8_t l = 1; l < nlinks; l++) // TODO: add base link transform + { + + // get one row of fixedTransform + int ftAddrStart = l * M * M; + int inAddrStart = matAddrBase + linkMap[l] * M * M; + int outAddrStart = matAddrBase + l * M * M; + + // row index: + // check joint type and use one of the helper functions: + float JM[M]; + int j_type = jointMapType[l]; + float angle = q[batch * njoints + jointMap[l]]; + if (j_type == FIXED) { + fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); + } else if (j_type <= Z_PRISM) { + prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); + } else if (j_type == X_ROT) { + xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else if (j_type == Y_ROT) { + yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else if (j_type == Z_ROT) { + zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else { + assert(jointMapType[l] > -2 && + jointMapType[l] < 6); // joint type not supported + } + +#pragma unroll 4 + for (int i = 0; i < M; i++) { + cumul_mat[outAddrStart + (i * M) + col_idx] = + dot(*(float4 *)&cumul_mat[inAddrStart + (i * M)], *(float4 *)JM); + } + + if (use_global_cumul) { + *(float4 *)&global_cumul_mat[batch * nlinks * 16 + l * 16 + col_idx * M] = + *(float4 *)&cumul_mat[outAddrStart + col_idx * M]; + } + } + // write out link: + + // do robot_spheres + + const int batchAddrs = batch * nspheres * 4; + // read cumul mat index to run for this thread: + int16_t read_cumul_idx = -1; + int16_t spheres_perthread = (nspheres + 3) / 4; + for (int16_t i = 0; i < spheres_perthread; i++) { + const int16_t sph_idx = col_idx * spheres_perthread + i; + // const int8_t sph_idx = + // i * 4 + col_idx; // different order such that adjacent + // spheres are in neighboring threads + if (sph_idx >= nspheres) { + break; + } + // read cumul idx: + read_cumul_idx = linkSphereMap[sph_idx]; + float spheres_mem[4]; + const int16_t sphAddrs = sph_idx * 4; + + transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], + &robot_spheres[sphAddrs], &spheres_mem[0]); + *(float4 *)&b_robot_spheres[batchAddrs + sphAddrs] = + *(float4 *)&spheres_mem[0]; + } + + // write position and rotation, we 
convert rotation matrix to a quaternion and + // write it out + for (int16_t i = 0; i < store_n_links; i++) { + int16_t l_map = storeLinkMap[i]; + int l_outAddrStart = + (batch * store_n_links); // * 7) + i * 7;// + (t % M) * M; + int outAddrStart = matAddrBase + l_map * M * M; + + float quat[4]; + // TODO: spread the work to different threads. For now all the threads will + // do the same work. + mat_to_quat( + &cumul_mat[outAddrStart], + &quat[0]); // get quaternion, all the 4 threads will do the same work + link_quat[l_outAddrStart * 4 + i * 4 + col_idx] = + quat[col_idx]; // one thread will write one element to memory + + if (col_idx < 3) { + // threads 0,1,2 will execute the following store + link_pos[l_outAddrStart * 3 + i * 3 + col_idx] = + cumul_mat[outAddrStart + 3 + (col_idx)*4]; + } + } +} + +// kin_fused_backward_kernel3 uses 16 threads per batch, instead of 4 per batch +// as in kin_fused_backward_kernel2. +template +__global__ void kin_fused_backward_kernel3( + scalar_t *grad_out_link_q, // batchSize * njoints + const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16 + const scalar_t *grad_nlinks_quat, + const scalar_t *grad_spheres, // batchSize * nspheres * 4 + const scalar_t *global_cumul_mat, + const scalar_t *q, // batchSize * njoints + const scalar_t *fixedTransform, // nlinks * 16 + const scalar_t *robotSpheres, // batchSize * nspheres * 4 + const int8_t *jointMapType, // nlinks + const int16_t *jointMap, // nlinks + const int16_t *linkMap, // nlinks + const int16_t *storeLinkMap, // store_n_links + const int16_t *linkSphereMap, // nspheres + const int16_t *linkChainMap, // nlinks*nlinks + const int batchSize, const int nspheres, const int nlinks, + const int njoints, const int store_n_links) { + extern __shared__ float cumul_mat[]; + + int t = blockDim.x * blockIdx.x + threadIdx.x; + const int batch = t / 16; + unsigned mask = __ballot_sync(0xffffffff, batch < batchSize); + + if (batch >= batchSize) + return; + + // Each thread computes one element of the cumul_mat. 
+ // first 4 threads compute a row of the output; + const int elem_idx = threadIdx.x % 16; + const int col_idx = elem_idx % 4; + const int local_batch = threadIdx.x / 16; + const int matAddrBase = local_batch * nlinks * M * M; + + if (use_global_cumul) { + for (int l = 0; l < nlinks; l++) { + int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; + cumul_mat[outAddrStart + elem_idx] = + global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx]; + } + } else { + + cumul_mat[matAddrBase + elem_idx] = fixedTransform[elem_idx]; + + for (int l = 1; l < nlinks; l++) // TODO: add base link transform + { + float JM[M]; // store one row locally for mat-mul + int ftAddrStart = l * M * M; //+ (t % M) * M; + int inAddrStart = matAddrBase + linkMap[l] * M * M; + int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M; + + float angle = q[batch * njoints + jointMap[l]]; + int j_type = jointMapType[l]; + if (j_type == FIXED) { + fixed_joint_fn(&fixedTransform[ftAddrStart + col_idx], &JM[0]); + } else if (j_type == Z_ROT) { + zrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else if (j_type <= Z_PRISM) { + prism_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0], j_type); + } else if (j_type == X_ROT) { + xrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else if (j_type == Y_ROT) { + yrot_fn(&fixedTransform[ftAddrStart], angle, col_idx, &JM[0]); + } else { + assert(jointMapType[l] > -2 && + jointMapType[l] < 6); // joint type not supported + } + + // fetch one row of cumul_mat, multiply with a column, which is in JM + cumul_mat[outAddrStart + elem_idx] = + dot(*(float4 *)&cumul_mat[inAddrStart + ((elem_idx / 4) * M)], + *(float4 *)JM); + } + } + // thread-local partial sum accumulators + // We would like to keep these partial sums in register file and avoid memory + // accesses + psum_t psum_grad[MAX_JOINTS]; // MAX_JOINTS +// we are allocating a lot larger array. So, we will be initilizing just the +// portion we need. 
+#pragma unroll + for (int i = 0; i < njoints; i++) { + psum_grad[i] = 0.0; + } + + // read cumul mat index to run for this thread: + int read_cumul_idx = -1; + + const int spheres_perthread = (nspheres + 15) / 16; + + for (int i = 0; i < spheres_perthread; i++) { + const int sph_idx = elem_idx * spheres_perthread + i; + if (sph_idx >= nspheres) { + break; + } + const int sphAddrs = sph_idx * 4; + const int batchAddrs = batch * nspheres * 4; + float4 loc_grad_sphere_t = *(float4 *)&grad_spheres[batchAddrs + sphAddrs]; + // Sparsity-based optimization: Skip zero computation + if (enable_sparsity_opt) { + if (loc_grad_sphere_t.x == 0 && loc_grad_sphere_t.y == 0 && + loc_grad_sphere_t.z == 0) { + continue; + } + } + float3 loc_grad_sphere = make_float3( + loc_grad_sphere_t.x, loc_grad_sphere_t.y, loc_grad_sphere_t.z); + + // read cumul idx: + read_cumul_idx = linkSphereMap[sph_idx]; + + float spheres_mem[4]; + transform_sphere(&cumul_mat[matAddrBase + read_cumul_idx * 16], + &robotSpheres[sphAddrs], &spheres_mem[0]); + for (int j = read_cumul_idx; j > -1; j--) { + if (linkChainMap[read_cumul_idx * nlinks + j] == 0.0) { + continue; + } + + int j_type = jointMapType[j]; + if (j_type == Z_ROT) { + float result = 0.0; + z_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result); + psum_grad[jointMap[j]] += (psum_t)result; + } else if (j_type >= X_PRISM && j_type <= Z_PRISM) { + float result = 0.0; + xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], + loc_grad_sphere, result, j_type); + psum_grad[jointMap[j]] += (psum_t)result; + } else if (j_type == X_ROT) { + float result = 0.0; + x_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result); + psum_grad[jointMap[j]] += (psum_t)result; + } else if (j_type == Y_ROT) { + float result = 0.0; + y_rot_backward_translation(&cumul_mat[matAddrBase + j * 16], + &spheres_mem[0], loc_grad_sphere, result); + psum_grad[jointMap[j]] += (psum_t)result; + } + } + } + + // Instead of accumulating the sphere_grad and link_grad separately, we will + // accumulate them together once below. + // + // // accumulate across 4 threads using shuffle operation + // for(int j=0; j= 0; k--) { + int16_t j = k * M + elem_idx; + if (j > l_map || j < 0) + continue; + // This can be spread across threads as they are not sequential? 
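+      // linkChainMap[l * nlinks + j] is nonzero only when joint j is an
+      // ancestor of link l in the kinematic chain, so joints that cannot
+      // move this link are skipped rather than accumulating a zero gradient.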
+ if (linkChainMap[l_map * nlinks + j] == 0.0) { + continue; + } + int16_t j_idx = jointMap[j]; + int8_t j_type = jointMapType[j]; + // get rotation vector: + if (j_type == Z_ROT) { + z_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx]); + } else if (j_type >= X_PRISM & j_type <= Z_PRISM) { + xyz_prism_backward_translation(&cumul_mat[matAddrBase + j * 16], + g_position, psum_grad[j_idx], j_type); + } else if (j_type == X_ROT) { + x_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx]); + } else if (j_type == Y_ROT) { + y_rot_backward(&cumul_mat[matAddrBase + (j)*M * M], &l_pos[0], + g_position, g_orientation, psum_grad[j_idx]); + } + } + } + + if (PARALLEL_WRITE) { + +// accumulate the partial sums across the 16 threads +#pragma unroll + for (int16_t j = 0; j < njoints; j++) { + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 1); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 2); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 4); + psum_grad[j] += __shfl_xor_sync(mask, psum_grad[j], 8); + // thread 0: psum_grad[j] will have the sum across 16 threads + // write out using only thread 0 + } + + const int16_t joints_per_thread = (njoints + 15) / 16; + +#pragma unroll + for (int16_t j = 0; j < joints_per_thread; j++) { + const int16_t j_idx = elem_idx * joints_per_thread + j; + if (j_idx >= njoints) { + break; + } + grad_out_link_q[batch * njoints + j_idx] = + psum_grad[j_idx]; // write the sum to memory + } + } else { +#pragma unroll + for (int16_t j = 0; j < njoints; j++) { + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 1); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 2); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 4); + psum_grad[j] += __shfl_down_sync(mask, psum_grad[j], 8); + // thread 0: psum_grad[j] will have the sum across 16 threads + // write out using only thread 0 + } + + if (elem_idx > 0) { + return; + } + +#pragma unroll + for (int16_t j = 0; j < njoints; j++) { + { + grad_out_link_q[batch * njoints + j] = + psum_grad[j]; // write the sum to memory + } + } + } + // accumulate the partial sums across the 16 threads +} + +template +__global__ void mat_to_quat_kernel(scalar_t *out_quat, + const scalar_t *in_rot_mat, + const int batch_size) { + // Only works for float32 + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (batch_idx >= batch_size) { + return; + } + float q[4] = {0.0}; // initialize array + + float rot[9]; + + // read rot + *(float3 *)&rot[0] = *(float3 *)&in_rot_mat[batch_idx * 9]; + *(float3 *)&rot[3] = *(float3 *)&in_rot_mat[batch_idx * 9 + 3]; + *(float3 *)&rot[6] = *(float3 *)&in_rot_mat[batch_idx * 9 + 6]; + + rot_to_quat(&rot[0], &q[0]); + + // write quaternion: + *(float4 *)&out_quat[batch_idx * 4] = *(float4 *)&q[0]; +} +} // namespace Kinematics +} // namespace Curobo + +std::vector kin_fused_forward( + torch::Tensor link_pos, torch::Tensor link_quat, + torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat, + const torch::Tensor joint_vec, const torch::Tensor fixed_transform, + const torch::Tensor robot_spheres, const torch::Tensor link_map, + const torch::Tensor joint_map, const torch::Tensor joint_map_type, + const torch::Tensor store_link_map, const torch::Tensor link_sphere_map, + const int batch_size, const int n_spheres, + const bool use_global_cumul = false) { + using namespace Curobo::Kinematics; + const int n_joints = joint_vec.size(0) / batch_size; + const int n_links 
= link_map.size(0); + const int store_n_links = link_pos.size(1); + assert(joint_map.dtype() == torch::kInt16); + assert(joint_map_type.dtype() == torch::kInt8); + assert(store_link_map.dtype() == torch::kInt16); + assert(link_sphere_map.dtype() == torch::kInt16); + assert(link_map.dtype() == torch::kInt16); + assert(batch_size > 0); + assert(n_links < MAX_TOTAL_LINKS); + + int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links)); + + if (batches_per_block == 0) { + batches_per_block = 1; + } + + if (batches_per_block > MAX_BATCH_PER_BLOCK) { + batches_per_block = MAX_BATCH_PER_BLOCK; + } + + if (batches_per_block * M > 1024) { + batches_per_block = 1024 / (M); + } + // batches_per_block = 1; + const int threadsPerBlock = batches_per_block * M; + const int blocksPerGrid = + (batch_size * M + threadsPerBlock - 1) / threadsPerBlock; + const int sharedMemSize = batches_per_block * n_links * M * M * sizeof(float); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (use_global_cumul) { + + AT_DISPATCH_FLOATING_TYPES( + link_pos.scalar_type(), "kin_fused_forward", ([&] { + kin_fused_warp_kernel2 + <<>>( + link_pos.data_ptr(), link_quat.data_ptr(), + batch_robot_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } else { + AT_DISPATCH_FLOATING_TYPES( + link_pos.scalar_type(), "kin_fused_forward", ([&] { + kin_fused_warp_kernel2 + <<>>( + link_pos.data_ptr(), link_quat.data_ptr(), + batch_robot_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {link_pos, link_quat, batch_robot_spheres, global_cumul_mat}; +} + +///////////////////////////////////////////// +// Backward kinematics +// Uses 16 threads per batch. +// This version is 30-100% faster compared to +// kin_fused_backward_4t. +///////////////////////////////////////////// +std::vector kin_fused_backward_16t( + torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos, + const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres, + const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec, + const torch::Tensor fixed_transform, const torch::Tensor robot_spheres, + const torch::Tensor link_map, const torch::Tensor joint_map, + const torch::Tensor joint_map_type, const torch::Tensor store_link_map, + const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map, + const int batch_size, const int n_spheres, const bool sparsity_opt = true, + const bool use_global_cumul = false) { + using namespace Curobo::Kinematics; + + const int n_joints = joint_vec.size(0) / batch_size; + const int n_links = link_map.size(0); + const int store_n_links = store_link_map.size(0); + + // assert(n_links < 128); + assert(n_joints < 128); // for larger num. of joints, change kernel3's + // MAX_JOINTS template value. 
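  // Worked example of the launch shape computed below (illustrative,
  // assuming M == 4, i.e. 4x4 homogeneous transforms and one 16-thread
  // tile per batch element, and a hypothetical 9-link serial chain):
  //   batches_per_block = min(MAX_TOTAL_LINKS / 9, MAX_BW_BATCH_PER_BLOCK)
  //   threadsPerBlock   = batches_per_block * M * M   (clamped to <= 1024)
  //   blocksPerGrid     = ceil(batch_size * M * M / threadsPerBlock)
  //   sharedMemSize     = batches_per_block * n_links * M * M * sizeof(float)
  // so shared memory holds one cumulative 4x4 matrix per link for every
  // batch element resident in the block.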
+ assert(n_links < MAX_TOTAL_LINKS); + // We need 16 threads per batch + // Find the maximum number of batches we can use per block: + // + int batches_per_block = (int)((MAX_TOTAL_LINKS / n_links)); + if (batches_per_block == 0) { + batches_per_block = 1; + } + + // To optimize for better occupancy, we might limit to MAX_BATCH_PER_BLOCK + if (batches_per_block > MAX_BW_BATCH_PER_BLOCK) { + batches_per_block = MAX_BW_BATCH_PER_BLOCK; + } + + // we cannot have more than 1024 threads: + if (batches_per_block * M * M > 1024) { + batches_per_block = 1024 / (M * M); + } + + const int threadsPerBlock = batches_per_block * M * M; + + const int blocksPerGrid = + (batch_size * M * M + threadsPerBlock - 1) / threadsPerBlock; + // assert to make sure n_joints, n_links < 128 to avoid overflow + // printf("threadsPerBlock: %d, blocksPerGRid: %d\n", threadsPerBlock, + // blocksPerGrid); + + const int sharedMemSize = batches_per_block * n_links * M * M * sizeof(float); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + assert(sparsity_opt); + if (use_global_cumul) { + if (n_joints < 16) { + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + <<>>( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } else if (n_joints < 64) { + + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + <<>>( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } else { + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + <<>>( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + // + + } else { + // + AT_DISPATCH_FLOATING_TYPES( + grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] { + kin_fused_backward_kernel3 + <<>>( + grad_out.data_ptr(), + grad_nlinks_pos.data_ptr(), + grad_nlinks_quat.data_ptr(), + grad_spheres.data_ptr(), + global_cumul_mat.data_ptr(), + joint_vec.data_ptr(), + fixed_transform.data_ptr(), + robot_spheres.data_ptr(), + joint_map_type.data_ptr(), + joint_map.data_ptr(), link_map.data_ptr(), + store_link_map.data_ptr(), + link_sphere_map.data_ptr(), + link_chain_map.data_ptr(), batch_size, n_spheres, + n_links, n_joints, store_n_links); + })); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); 
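  // A minimal sketch of the dispatch pattern used above, assuming the joint
  // capacity (16 / 64 / 128) is a compile-time template bound of
  // kin_fused_backward_kernel3 (see the MAX_JOINTS note near the asserts):
  //
  //   if (n_joints < 16)      { /* launch kernel3 with a 16-joint bound  */ }
  //   else if (n_joints < 64) { /* launch kernel3 with a 64-joint bound  */ }
  //   else                    { /* launch kernel3 with a 128-joint bound */ }
  //
  // keeping the bound small presumably lets the per-thread psum_grad
  // accumulator stay in registers instead of spilling to local memory.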
+
+  return {grad_out};
+}
+
+std::vector<torch::Tensor>
+matrix_to_quaternion(torch::Tensor out_quat,
+                     const torch::Tensor in_rot // batch_size, 3, 3
+) {
+
+  using namespace Curobo::Kinematics;
+
+  // one thread per rotation matrix; block size follows the batch size:
+
+  // TODO: verify this math
+  const int batch_size = in_rot.size(0);
+
+  int threadsPerBlock = batch_size;
+  if (batch_size > 512) {
+    threadsPerBlock = 512;
+  }
+
+  // we fit as many matrices as possible into one block of threadsPerBlock
+
+  int blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock;
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      in_rot.scalar_type(), "matrix_to_quaternion", ([&] {
+        mat_to_quat_kernel<scalar_t>
+            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                out_quat.data_ptr<scalar_t>(), in_rot.data_ptr<scalar_t>(),
+                batch_size);
+      }));
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+  return {out_quat};
+}
\ No newline at end of file
diff --git a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
new file mode 100644
index 00000000..c158d998
--- /dev/null
+++ b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+#include <torch/extension.h>
+
+#include <vector>
+
+#include <c10/cuda/CUDAGuard.h>
+
+// CUDA forward declarations
+std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
+                                       torch::Tensor rho_buffer,
+                                       torch::Tensor sum, const int batch_size,
+                                       const int v_dim, const int m);
+
+std::vector<torch::Tensor>
+lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
+                torch::Tensor y_buffer, torch::Tensor s_buffer,
+                torch::Tensor grad_q, const float epsilon, const int batch_size,
+                const int m, const int v_dim);
+
+std::vector<torch::Tensor>
+lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
+                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
+                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
+                  const int m, const int v_dim);
+
+std::vector<torch::Tensor>
+lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
+                torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
+                torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
+                const float epsilon, const int batch_size, const int m,
+                const int v_dim, const bool stable_mode);
+// C++ interface
+
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
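// Shape conventions shared by these entry points (taken from the kernel
// annotations in lbfgs_step_kernel.cu, with b = batch_size, m = history
// length, v_dim = optimization-variable dimension):
//   step_vec, grad_q, q, x_0, grad_0 : [b, v_dim]
//   y_buffer, s_buffer               : [m, b, v_dim]
//   rho_buffer                       : [m, b, 1]
//
// Example call from C++ (illustrative only; the tensor names are
// hypothetical, and every buffer must be a contiguous CUDA tensor per the
// CHECK_INPUT macros below):
//   auto out = lbfgs_call(step_vec, rho_buffer, y_buffer, s_buffer, q,
//                         grad_q, x_0, grad_0, /*epsilon=*/1e-8f,
//                         /*batch_size=*/b, /*m=*/15, /*v_dim=*/175);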
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<torch::Tensor>
+lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
+                torch::Tensor y_buffer, torch::Tensor s_buffer,
+                torch::Tensor grad_q, const float epsilon, const int batch_size,
+                const int m, const int v_dim) {
+
+  CHECK_INPUT(step_vec);
+  CHECK_INPUT(rho_buffer);
+  CHECK_INPUT(y_buffer);
+  CHECK_INPUT(s_buffer);
+  CHECK_INPUT(grad_q);
+  const at::cuda::OptionalCUDAGuard guard(grad_q.device());
+
+  return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q,
+                         epsilon, batch_size, m, v_dim);
+}
+
+std::vector<torch::Tensor>
+lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
+                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
+                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
+                  const int m, const int v_dim) {
+  CHECK_INPUT(rho_buffer);
+  CHECK_INPUT(y_buffer);
+  CHECK_INPUT(s_buffer);
+  CHECK_INPUT(grad_q);
+  CHECK_INPUT(x_0);
+  CHECK_INPUT(grad_0);
+  CHECK_INPUT(q);
+  const at::cuda::OptionalCUDAGuard guard(grad_q.device());
+
+  return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0,
+                           grad_0, batch_size, m, v_dim);
+}
+
+std::vector<torch::Tensor>
+reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
+                 torch::Tensor rho_buffer, torch::Tensor sum,
+                 const int batch_size, const int v_dim, const int m) {
+  const at::cuda::OptionalCUDAGuard guard(sum.device());
+
+  return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, v_dim, m);
+}
+
+std::vector<torch::Tensor>
+lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
+           torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
+           torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
+           const float epsilon, const int batch_size, const int m,
+           const int v_dim, const bool stable_mode) {
+  CHECK_INPUT(step_vec);
+  CHECK_INPUT(rho_buffer);
+  CHECK_INPUT(y_buffer);
+  CHECK_INPUT(s_buffer);
+  CHECK_INPUT(grad_q);
+  CHECK_INPUT(x_0);
+  CHECK_INPUT(grad_0);
+  CHECK_INPUT(q);
+  const at::cuda::OptionalCUDAGuard guard(grad_q.device());
+
+  return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q,
+                         x_0, grad_0, epsilon, batch_size, m, v_dim,
+                         stable_mode);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)");
+  m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)");
+  m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)");
+  m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug");
+}
diff --git a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
new file mode 100644
index 00000000..e3664655
--- /dev/null
+++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */ + +#include +#include +#include + +#include +#include + +// #include "helper_cuda.h" +#include "helper_math.h" + +#include +#include +#include +#include +#include + +// #include +// +// For the CUDA runtime routines (prefixed with "cuda_") +// #include + +// #include +// #include +#define M_MAX 512 +#define HALF_MAX 65504.0 +#define M 15 +#define VDIM 175 // 25 * 7, + +#define FULL_MASK 0xffffffff + +namespace Curobo { + +namespace Optimization { + +template +__device__ __forceinline__ void +scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out, + const int v_dim) { + + for (int i = 0; i < v_dim; i++) { + out[i] = a * b[i]; + } +} + +template +__device__ __forceinline__ void +m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out, + const int v_dim, const int m) { + for (int j = 0; j < m; j++) { + for (int i = 0; i < v_dim; i++) { + out[j * v_dim + i] = a[j] * b[j * v_dim + i]; + } + } +} + +template +__device__ __forceinline__ void vec_vec_dot(const scalar_t *a, + const scalar_t *b, scalar_t &out, + const int v_dim) { + + for (int i = 0; i < v_dim; i++) { + out += a[i] * b[i]; + } +} + +template +__device__ __forceinline__ void update_r(const scalar_t *rho_y, + const scalar_t *s_buffer, scalar_t *r, + scalar_t &alpha, const int v_dim) { + // dot product: and subtract with alpha + for (int i = 0; i < v_dim; i++) { + alpha -= rho_y[i] * r[i]; + } + // scalar vector product: + for (int i = 0; i < v_dim; i++) { + r[i] = r[i] + alpha * s_buffer[i]; + } +} +template +__device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq, + const scalar_t alpha, + const int v_dim) { + // + for (int i = 0; i < v_dim; i++) { + gq[i] = gq[i] - (alpha * y_buffer[i]); + } +} +template +__global__ void +lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer, + const scalar_t *y_buffer, const scalar_t *s_buffer, + const scalar_t *grad_q, const float epsilon, + const int batchSize, const int m, const int v_dim) { + // each thread writes one sphere of some link + const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch + const int b_idx = t; + if (t >= (batchSize)) { + return; + } + + // get thread start address: + const int b_start_scalar_adrs = b_idx * m; + const int b_start_vec_adrs = b_idx * m * v_dim; + const int b_step_start_adrs = b_idx * v_dim; + + scalar_t rho_s[M * VDIM]; + + // copy floats to local buffer? 
+  // y_buffer, s_buffer, rho_buffer
+  // compute rho_s
+  scalar_t loc_ybuf[M * VDIM];
+  scalar_t loc_sbuf[M * VDIM];
+  scalar_t loc_rho[M];
+  scalar_t gq[VDIM]; //, l_q[VDIM];
+  scalar_t alpha_buffer[M];
+  scalar_t t_1, t_2;
+
+  for (int i = 0; i < m * v_dim; i++) {
+    loc_ybuf[i] = y_buffer[b_start_vec_adrs + i];
+    loc_sbuf[i] = s_buffer[b_start_vec_adrs + i];
+  }
+  for (int i = 0; i < v_dim; i++) {
+    gq[i] = grad_q[b_step_start_adrs + i];
+  }
+  for (int i = 0; i < m; i++) {
+    loc_rho[i] = rho_buffer[b_start_scalar_adrs + i];
+  }
+
+  m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m);
+  // for loop over m
+  for (int i = m - 1; i > m - 2; i--) {
+    // l_start_vec_adrs = i * v_dim;
+    // scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim],
+    // v_dim);
+    vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim);
+    update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim);
+  }
+  // compute initial hessian:
+  vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1,
+              v_dim);
+  vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2,
+              v_dim);
+  t_1 = t_1 / t_2;
+  if (t_1 < 0) {
+    t_1 = 0;
+  }
+
+  t_1 += epsilon;
+  scalar_vec_product(t_1, &gq[0], &gq[0], v_dim);
+
+  m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m);
+
+  for (int i = 0; i < m; i++) {
+    // scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim],
+    // v_dim);
+    update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i],
+             v_dim);
+  }
+
+  // write gq to out grad:
+
+  for (int i = 0; i < v_dim; i++) {
+    step_vec[b_step_start_adrs + i] = -1.0 * gq[i];
+  }
+}
+
+template <typename psum_t>
+__forceinline__ __device__ psum_t warpReduce(psum_t v, int elems,
+                                             unsigned mask) {
+  psum_t val = v;
+  int shift = 1;
+  for (int i = elems; i > 1; i /= 2) {
+    val += __shfl_down_sync(mask, val, shift);
+    shift *= 2;
+  }
+  // val += __shfl_down_sync(mask, val, 1);  // i=32
+  // val += __shfl_down_sync(mask, val, 2);  // i=16
+  // val += __shfl_down_sync(mask, val, 4);  // i=8
+  // val += __shfl_down_sync(mask, val, 8);  // i=4
+  // val += __shfl_down_sync(mask, val, 16); // i=2
+  return val;
+}
+
+template <typename scalar_t, typename psum_t>
+__forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data,
+                                       scalar_t *result) {
+  psum_t val = v;
+  unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
+  val += __shfl_down_sync(mask, val, 1);
+  val += __shfl_down_sync(mask, val, 2);
+  val += __shfl_down_sync(mask, val, 4);
+  val += __shfl_down_sync(mask, val, 8);
+  val += __shfl_down_sync(mask, val, 16);
+  // int leader = __ffs(mask) - 1; // select a leader lane
+  int leader = 0;
+  if (threadIdx.x % 32 == leader) {
+    if (m < 32) {
+      result[0] = (scalar_t)val;
+    } else {
+      data[(threadIdx.x + 1) / 32] = val;
+    }
+  }
+  if (m >= 32) {
+
+    __syncthreads();
+
+    int elems = (m + 31) / 32;
+    unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
+    if (threadIdx.x / 32 == 0) { // only the first warp will do this work
+      psum_t val2 = data[threadIdx.x % 32];
+      int shift = 1;
+      for (int i = elems - 1; i > 0; i /= 2) {
+        val2 += __shfl_down_sync(mask2, val2, shift);
+        shift *= 2;
+      }
+      // int leader = __ffs(mask2) - 1; // select a leader lane
+      int leader = 0;
+      if (threadIdx.x % 32 == leader) {
+        result[0] = (scalar_t)val2;
+      }
+    }
+  }
+  __syncthreads();
+}
+
+// blockReduce
+template <typename scalar_t, typename psum_t>
+__forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data,
+                                          scalar_t *result) {
+  unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
+  psum_t val = warpReduce(v, 32, mask);
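  // (warpReduce folds each warp's lanes with __shfl_down_sync offsets
  // 1, 2, 4, 8, 16, so after it returns, lane 0 of every warp holds that
  // warp's partial sum; data[] below stages those per-warp partials so the
  // first warp can reduce them to the final block-wide result.)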
+
+  // int leader = __ffs(mask) - 1; // select a leader lane
+  int leader = 0;
+  if (threadIdx.x % 32 == leader) {
+    data[(threadIdx.x + 1) / 32] = val;
+  }
+
+  if (m >= 32) {
+
+    __syncthreads();
+
+    int elems = (m + 31) / 32;
+    unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
+    if (threadIdx.x / 32 == 0) { // only the first warp will do this work
+      psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2);
+
+      // // int leader = __ffs(mask2) - 1; // select a leader lane
+      if (threadIdx.x == leader) {
+        result[0] = (scalar_t)val2;
+      }
+    }
+  } else {
+    if (threadIdx.x == leader) {
+      result[0] = (scalar_t)val;
+    }
+  }
+  __syncthreads();
+}
+
+template <typename scalar_t, typename psum_t>
+__inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2,
+                               const int m, psum_t *data, scalar_t *result) {
+
+  scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x];
+  reduce(val, m, data, result);
+}
+
+template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
+  if (var < 0)
+    return 0;
+  else
+    return var;
+}
+
+//////////////////////////////////////////////////////////
+// one block per batch
+// v_dim threads per block
+//////////////////////////////////////////////////////////
+template <typename scalar_t, typename psum_t>
+__global__ void lbfgs_step_kernel(scalar_t *step_vec,       // b x 175
+                                  scalar_t *rho_buffer,     // m x b x 1
+                                  const scalar_t *y_buffer, // m x b x 175
+                                  const scalar_t *s_buffer, // m x b x 175
+                                  const scalar_t *grad_q,   // b x 175
+                                  const float epsilon, const int batchsize,
+                                  const int m, const int v_dim) {
+  __shared__ psum_t
+      data[32]; // temporary buffer needed for block-wide reduction
+  __shared__ scalar_t
+      result; // result of the reduction or vector-vector dot product
+  __shared__ scalar_t gq[175]; // shared copy of this batch's gradient row
+                               // (v_dim <= 175)
+  assert(v_dim < 176);
+  int batch = blockIdx.x; // one block per batch
+  if (threadIdx.x >= v_dim)
+    return;
+
+  gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
+
+  scalar_t alpha_buffer[16];
+  // assert(m<16); // allocating a buffer assuming m < 16
+
+  for (int i = m - 1; i > -1; i--) {
+    dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
+        &data[0], &result);
+    alpha_buffer[i] = result * rho_buffer[i * batchsize + batch];
+    gq[threadIdx.x] =
+        gq[threadIdx.x] -
+        alpha_buffer[i] *
+            y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
+  }
+
+  // compute var1
+  scalar_t val1 =
+      y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
+  scalar_t val2 =
+      s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
+  reduce(val1 * val1, v_dim, data, &result);
+  scalar_t denominator = result;
+  reduce(val1 * val2, v_dim, data, &result);
+  scalar_t numerator = result;
+  scalar_t var1 = numerator / denominator;
+
+  scalar_t gamma = relu(var1) + epsilon; // epsilon
+
+  gq[threadIdx.x] = gamma * gq[threadIdx.x];
+
+  for (int i = 0; i < m; i++) {
+    dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
+        &data[0], &result);
+    gq[threadIdx.x] =
+        gq[threadIdx.x] +
+        (alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) *
+            s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
+  }
+
+  step_vec[batch * v_dim + threadIdx.x] =
+      -1 * gq[threadIdx.x]; // copy from shared memory to global memory
+}
+
+template <typename scalar_t, typename psum_t>
+__global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1
+                                           scalar_t *y_buffer, // m x b x 175
+                                           scalar_t *s_buffer, // m x b x 175
+                                           scalar_t *q,        // b x 175
+                                           scalar_t *x_0,      // b x 175
+                                           scalar_t *grad_0,   // b x 175
+                                           const scalar_t *grad_q, // b x 175
+                                           const int batchsize,
const int m,
+                                           const int v_dim) {
+  __shared__ psum_t
+      data[32]; // temporary buffer needed for block-wide reduction
+  __shared__ scalar_t
+      result; // result of the reduction or vector-vector dot product
+  // __shared__ scalar_t y[175]; // temporary shared memory storage
+  // __shared__ scalar_t s[175]; // temporary shared memory storage
+  assert(v_dim <= VDIM);
+  int batch = blockIdx.x; // one block per batch
+  if (threadIdx.x >= v_dim)
+    return;
+
+  scalar_t y =
+      grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x];
+  scalar_t s =
+      q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
+  reduce(y * s, v_dim, &data[0], &result);
+
+  for (int i = 1; i < m; i++) {
+    s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
+        s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
+    y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
+        y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
+  }
+
+  // __syncthreads();
+
+  s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
+  y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
+  grad_0[batch * v_dim + threadIdx.x] = grad_q[batch * v_dim + threadIdx.x];
+  x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
+
+  if (threadIdx.x == 0) {
+    scalar_t rho = 1 / result;
+    for (int i = 1; i < m; i++) {
+      rho_buffer[(i - 1) * batchsize + batch] =
+          rho_buffer[i * batchsize + batch];
+    }
+    rho_buffer[(m - 1) * batchsize + batch] = rho;
+  }
+}
+
+template <typename scalar_t, typename psum_t>
+__global__ void reduce_kernel(
+    scalar_t *vec1,       // b x 175
+    scalar_t *vec2,       // b x 175
+    scalar_t *rho_buffer, // m x b x 1
+
+    scalar_t *sum_out, // m x b x 1
+
+    const int batchsize, const int m,
+    const int v_dim) // s_buffer and y_buffer are not rolled by default
+{
+  __shared__ psum_t
+      data[32]; // temporary buffer needed for block-wide reduction
+  __shared__ scalar_t
+      result; // result of the reduction or vector-vector dot product
+  int batch = blockIdx.x; // one block per batch
+  if (threadIdx.x >= v_dim)
+    return;
+
+  ////////////////////
+  // update_buffer
+  ////////////////////
+  scalar_t y = vec1[batch * v_dim + threadIdx.x];
+  scalar_t s = vec2[batch * v_dim + threadIdx.x];
+
+  reduce(y * s, v_dim, &data[0], &result);
+  scalar_t numerator = result;
+  if (threadIdx.x == 0) {
+    sum_out[batch] = 1 / numerator;
+  }
+  // return;
+  if (threadIdx.x < m - 1) {
+    // m threads participate to shift the values
+    // this is safe as m<32 and this happens in lockstep
+    rho_buffer[threadIdx.x * batchsize + batch] =
+        rho_buffer[(threadIdx.x + 1) * batchsize + batch];
+  } else if (threadIdx.x == m - 1) {
+    scalar_t rho = (1 / numerator);
+    rho_buffer[threadIdx.x * batchsize + batch] = rho;
+  }
+}
+template <typename scalar_t, typename psum_t>
+__global__ void lbfgs_update_buffer_and_step_v1(
+    scalar_t *step_vec,     // b x 175
+    scalar_t *rho_buffer,   // m x b x 1
+    scalar_t *y_buffer,     // m x b x 175
+    scalar_t *s_buffer,     // m x b x 175
+    scalar_t *q,            // b x 175
+    scalar_t *x_0,          // b x 175
+    scalar_t *grad_0,       // b x 175
+    const scalar_t *grad_q, // b x 175
+    const float epsilon, const int batchsize, const int m, const int v_dim,
+    const bool rolled_ys = false,
+    const bool stable_mode =
+        false) // s_buffer and y_buffer are not rolled by default
+{
+  // extern __shared__ scalar_t alpha_buffer_sh[];
+  extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
+  scalar_t *my_smem_rc = reinterpret_cast<scalar_t *>(my_smem);
+  scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x
+  scalar_t *rho_buffer_sh =
&my_smem_rc[m * blockDim.x]; // batchsize*m + scalar_t *s_buffer_sh = + &my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x + scalar_t *y_buffer_sh = + &my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x + + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + int batch = blockIdx.x; // one block per batch + if (threadIdx.x >= v_dim) + return; + + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce_v1(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; + // scalar_t rho = 1.0/numerator; + + if (!rolled_ys) { +#pragma unroll + for (int i = 1; i < m; i++) { + scalar_t st = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + scalar_t yt = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt; + s_buffer_sh[m * threadIdx.x + i - 1] = st; + y_buffer_sh[m * threadIdx.x + i - 1] = yt; + } + } + + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + s_buffer_sh[m * threadIdx.x + m - 1] = s; + y_buffer_sh[m * threadIdx.x + m - 1] = y; + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; + if (threadIdx.x < m - 1) { + // m thread participate to shif the values + // this is safe as m<32 and this happens in lockstep + scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + } + if (threadIdx.x == m - 1) { + scalar_t rho = 1.0 / numerator; + // if this is nan, make it zero: + if (stable_mode && numerator == 0.0) { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + rho_buffer_sh[threadIdx.x * batchsize + batch] = rho; + } + // return; + __syncthreads(); + //////////////////// + // step + //////////////////// + // scalar_t alpha_buffer[16]; + // assert(m<16); // allocating a buffer assuming m < 16 + +#pragma unroll + for (int i = m - 1; i > -1; i--) { + // reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); + reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + alpha_buffer_sh[threadIdx.x * m + i] = + result * rho_buffer_sh[i * batchsize + batch]; + // gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * + y_buffer_sh[m * threadIdx.x + i]; + } + + // compute var1 + reduce_v1(y * y, v_dim, data, &result); + scalar_t denominator = result; + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + // To improve stability, uncomment below line: [this however leads to poor + // convergence] + + if (stable_mode && denominator == 0.0) { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + gq = gamma * gq; + +#pragma unroll + for (int i = 0; i < m; 
i++) { + // reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x], + // v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] - + // result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim + + // batch*v_dim + threadIdx.x]; + reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result); + gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - + result * rho_buffer_sh[i * batchsize + batch]) * + s_buffer_sh[m * threadIdx.x + i]; + } + + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory +} + +// (32/M) rolls per warp +// Threads in a warp in a GPU execute in a lock-step. We leverage that to do +// the roll without using temporary storage or explicit synchronization. +template +__global__ void lbfgs_roll(scalar_t *a, // m x b x 175 + scalar_t *b, // m x b x 175 + const int m_t, const int batchsize, const int m, + const int v_dim) { + assert(m_t <= 32); + int t = blockDim.x * blockIdx.x + threadIdx.x; + if (t >= m_t * v_dim * batchsize) + return; + int _m = t % m_t; + int _v_dim = (t / m_t) % v_dim; + int batch = t / (m * v_dim); // this line could be wrong? + + if (_m < m - 1) { + a[_m * batchsize * v_dim + batch * v_dim + _v_dim] = + a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; + b[_m * batchsize * v_dim + batch * v_dim + _v_dim] = + b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim]; + } +} + +template +__global__ void lbfgs_update_buffer_and_step( + scalar_t *step_vec, // b x 175 + scalar_t *rho_buffer, // m x b x 1 + scalar_t *y_buffer, // m x b x 175 + scalar_t *s_buffer, // m x b x 175 + scalar_t *q, // b x 175 + scalar_t *x_0, // b x 175 + scalar_t *grad_0, // b x 175 + const scalar_t *grad_q, // b x 175 + const float epsilon, const int batchsize, const int m, const int v_dim, + const bool stable_mode = + false) // s_buffer and y_buffer are not rolled by default +{ + // extern __shared__ scalar_t alpha_buffer_sh[]; + extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[]; + scalar_t *alpha_buffer_sh = reinterpret_cast(my_smem); + + __shared__ psum_t + data[32]; // temporary buffer needed for block-wide reduction + __shared__ scalar_t + result; // result of the reduction or vector-vector dot product + int batch = blockIdx.x; // one block per batch + if (threadIdx.x >= v_dim) + return; + + scalar_t gq; + gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq + + //////////////////// + // update_buffer + //////////////////// + scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x]; + // if y is close to zero + scalar_t s = + q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x]; + reduce(y * s, v_dim, &data[0], &result); + scalar_t numerator = result; + // scalar_t rho = 1.0/numerator; + + if (!rolled_ys) { + for (int i = 1; i < m; i++) { + s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + } + + s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s; + y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y; + grad_0[batch * v_dim + threadIdx.x] = gq; + x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x]; + if (threadIdx.x < m - 1) { + // m thread participate to shif the values + // this is safe as m<32 and this happens in lockstep + rho_buffer[threadIdx.x * 
batchsize + batch] = + rho_buffer[(threadIdx.x + 1) * batchsize + batch]; + } + if (threadIdx.x == m - 1) { + scalar_t rho = 1.0 / numerator; + // if this is nan, make it zero: + if (stable_mode && numerator == 0.0) { + rho = 0.0; + } + rho_buffer[threadIdx.x * batchsize + batch] = rho; + } + // return; + //__syncthreads(); + //////////////////// + // step + //////////////////// + // scalar_t alpha_buffer[16]; + // assert(m<16); // allocating a buffer assuming m < 16 + +#pragma unroll + for (int i = m - 1; i > -1; i--) { + reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + alpha_buffer_sh[threadIdx.x * m + i] = + result * rho_buffer[i * batchsize + batch]; + gq = gq - alpha_buffer_sh[threadIdx.x * m + i] * + y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + // compute var1 + reduce(y * y, v_dim, data, &result); + scalar_t denominator = result; + // reduce(s*y, v_dim, data, &result); // redundant - already computed it above + // scalar_t numerator = result; + scalar_t var1 = numerator / denominator; + + // To improve stability, uncomment below line: [this however leads to poor + // convergence] + + if (stable_mode && denominator == 0.0) { + var1 = epsilon; + } + + scalar_t gamma = relu(var1); + + gq = gamma * gq; + +#pragma unroll + for (int i = 0; i < m; i++) { + reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x], + v_dim, &data[0], &result); + gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] - + result * rho_buffer[i * batchsize + batch]) * + s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x]; + } + + step_vec[batch * v_dim + threadIdx.x] = + -1.0 * gq; // copy from shared memory to global memory +} + +} // namespace Optimization +} // namespace Curobo +std::vector +lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer, + torch::Tensor y_buffer, torch::Tensor s_buffer, + torch::Tensor grad_q, const float epsilon, const int batch_size, + const int m, const int v_dim) { + using namespace Curobo::Optimization; + const int threadsPerBlock = 128; + const int blocksPerGrid = + ((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + + // launch threads per batch: + // int threadsPerBlock = pow(2,((int)log2(v_dim))+1); + + // const int blocksPerGrid = batch_size; //((batch_size) + threadsPerBlock - + // 1) / threadsPerBlock; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + step_vec.scalar_type(), "lbfgs_step_cu", ([&] { + lbfgs_step_kernel_old + <<>>( + step_vec.data_ptr(), rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + grad_q.data_ptr(), epsilon, batch_size, m, v_dim); + })); + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {step_vec}; +} + +std::vector +lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer, + torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q, + torch::Tensor x_0, torch::Tensor grad_0, const int batch_size, + const int m, const int v_dim) { + + using namespace Curobo::Optimization; + + // const int threadsPerBlock = 128; + // launch threads per batch: + // int threadsPerBlock = pow(2,((int)log2(v_dim))+1); + int threadsPerBlock = v_dim; + + const int blocksPerGrid = + batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + y_buffer.scalar_type(), "lbfgs_update_cu", ([&] { + lbfgs_update_buffer_kernel + <<>>( + rho_buffer.data_ptr(), y_buffer.data_ptr(), + 
s_buffer.data_ptr(), q.data_ptr(), + x_0.data_ptr(), grad_0.data_ptr(), + grad_q.data_ptr(), batch_size, m, v_dim); + })); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {rho_buffer, y_buffer, s_buffer, x_0, grad_0}; +} + +std::vector +lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer, + torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q, + torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0, + const float epsilon, const int batch_size, const int m, + const int v_dim, const bool stable_mode) { + using namespace Curobo::Optimization; + + // call first kernel: + + int threadsPerBlock = v_dim; + assert(threadsPerBlock < 1024); + assert(m < M_MAX); + int blocksPerGrid = + batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + int smemsize = 0; + if (true) { + AT_DISPATCH_FLOATING_TYPES( + y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] { + smemsize = m * threadsPerBlock * sizeof(scalar_t); + lbfgs_update_buffer_and_step + <<>>( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, m, v_dim, stable_mode); + }); + } else { + + // v1 does not work + AT_DISPATCH_FLOATING_TYPES( + y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] { + smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) + + m * batch_size * sizeof(scalar_t); + lbfgs_update_buffer_and_step_v1 + <<>>( + step_vec.data_ptr(), + rho_buffer.data_ptr(), + y_buffer.data_ptr(), s_buffer.data_ptr(), + q.data_ptr(), x_0.data_ptr(), + grad_0.data_ptr(), grad_q.data_ptr(), + epsilon, batch_size, m, v_dim, false, stable_mode); + }); + } + + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0}; +} + +std::vector reduce_cuda(torch::Tensor vec, torch::Tensor vec2, + torch::Tensor rho_buffer, + torch::Tensor sum, const int batch_size, + const int m, const int v_dim) { + + using namespace Curobo::Optimization; + + int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1); + int blocksPerGrid = + batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock; + // printf("threadsPerBlock:%d, blocksPerGrid: %d\n", + // threadsPerBlock, blocksPerGrid); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + vec.scalar_type(), "reduce_cu", ([&] { + reduce_kernel + <<>>( + vec.data_ptr(), vec2.data_ptr(), + rho_buffer.data_ptr(), sum.data_ptr(), + batch_size, m, v_dim); + })); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {sum, rho_buffer}; +} diff --git a/src/curobo/curobolib/cpp/line_search_cuda.cpp b/src/curobo/curobolib/cpp/line_search_cuda.cpp new file mode 100644 index 00000000..1a88678f --- /dev/null +++ b/src/curobo/curobolib/cpp/line_search_cuda.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#include + +#include + +#include + +// CUDA forward declarations + +std::vector +update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q, + torch::Tensor best_iteration, + torch::Tensor current_iteration, + const torch::Tensor cost, + const torch::Tensor q, const int d_opt, const int cost_s1, + const int cost_s2, const int iteration, + const float delta_threshold, + const float relative_threshold = 0.999); + +std::vector line_search_cuda( + // torch::Tensor m, + torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, + const torch::Tensor g_x, const torch::Tensor x_set, + const torch::Tensor step_vec, const torch::Tensor c_0, + const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, + const float c_2, const bool strong_wolfe, const bool approx_wolfe, + const int l1, const int l2, const int batchsize); +// C++ interface + +// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + + +std::vector line_search_call( + // torch::Tensor m, + torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad, + const torch::Tensor g_x, const torch::Tensor x_set, + const torch::Tensor step_vec, const torch::Tensor c_0, + const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1, + const float c_2, const bool strong_wolfe, const bool approx_wolfe, + const int l1, const int l2, const int batchsize) { + + CHECK_INPUT(g_x); + CHECK_INPUT(x_set); + CHECK_INPUT(step_vec); + CHECK_INPUT(c_0); + CHECK_INPUT(alpha_list); + CHECK_INPUT(c_idx); + // CHECK_INPUT(m); + CHECK_INPUT(best_x); + CHECK_INPUT(best_c); + CHECK_INPUT(best_grad); + const at::cuda::OptionalCUDAGuard guard(best_x.device()); + + // return line_search_cuda(m, g_x, step_vec, c_0, alpha_list, c_1, c_2, + // strong_wolfe, l1, l2, batchsize); + return line_search_cuda(best_x, best_c, best_grad, g_x, x_set, step_vec, c_0, + alpha_list, c_idx, c_1, c_2, strong_wolfe, + approx_wolfe, l1, l2, batchsize); +} + +std::vector +update_best_call(torch::Tensor best_cost, torch::Tensor best_q, + torch::Tensor best_iteration, + torch::Tensor current_iteration, + const torch::Tensor cost, + const torch::Tensor q, const int d_opt, const int cost_s1, + const int cost_s2, const int iteration, + const float delta_threshold, + const float relative_threshold=0.999) { + + CHECK_INPUT(best_cost); + CHECK_INPUT(best_q); + CHECK_INPUT(cost); + CHECK_INPUT(q); + CHECK_INPUT(current_iteration); + CHECK_INPUT(best_iteration); + const at::cuda::OptionalCUDAGuard guard(cost.device()); + + return update_best_cuda(best_cost, best_q, best_iteration, current_iteration, cost, q, d_opt, + cost_s1, cost_s2, iteration, delta_threshold, relative_threshold); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("update_best", &update_best_call, "Update Best (CUDA)"); + m.def("line_search", &line_search_call, "Line search (CUDA)"); + +} \ No newline at end of file diff --git a/src/curobo/curobolib/cpp/line_search_kernel.cu b/src/curobo/curobolib/cpp/line_search_kernel.cu new file mode 100644 index 00000000..0c42029f --- /dev/null +++ b/src/curobo/curobolib/cpp/line_search_kernel.cu @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once +#include +#include +#include + +#include +#include + +// #include "helper_cuda.h" +#include "helper_math.h" + +#include +#include +#include +#include +#include +#include + +// #include +// +// For the CUDA runtime routines (prefixed with "cuda_") +// #include + +// #include +// #include + +#define FULL_MASK 0xffffffff + +namespace Curobo { + +namespace Optimization { + +template +__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask, + psum_t *data, scalar_t *result) { + psum_t val = v; + val += __shfl_down_sync(mask, val, 1); + val += __shfl_down_sync(mask, val, 2); + val += __shfl_down_sync(mask, val, 4); + val += __shfl_down_sync(mask, val, 8); + val += __shfl_down_sync(mask, val, 16); + // int leader = __ffs(mask) – 1; // select a leader lane + int leader = 0; + if (threadIdx.x % 32 == leader) { + if (m <= 32) { + result[0] = (scalar_t)val; + } else { + data[(threadIdx.x + 1) / 32] = val; + } + } + if (m > 32) { + + __syncthreads(); + + int elems = (m + 31) / 32; + assert(elems <= 32); + unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems); + if (threadIdx.x < elems) { // only the first warp will do this work + psum_t val2 = data[threadIdx.x % 32]; + int shift = 1; + for (int i = elems - 1; i > 0; i /= 2) { + val2 += __shfl_down_sync(mask2, val2, shift); + shift *= 2; + } + // int leader = __ffs(mask2) – 1; // select a leader lane + int leader = 0; + if (threadIdx.x % 32 == leader) { + result[0] = (scalar_t)val2; + } + } + } + __syncthreads(); +} + +// Launched with l2 threads/block and batchsize blocks +template +__global__ void line_search_kernel( + // int64_t *m_idx, // 4x1x1 + scalar_t *best_x, // 4x280 + scalar_t *best_c, // 4x1 + scalar_t *best_grad, // 4x280 + const scalar_t *g_x, // 4x6x280 + const scalar_t *x_set, // 4x6x280 + const scalar_t *step_vec, // 4x280x1 + const scalar_t *c, // 4x6x1 + const scalar_t *alpha_list, // 4x6x1 + const int64_t *c_idx, // 4x1x1 + const float c_1, const float c_2, const bool strong_wolfe, + const bool approx_wolfe, + const int l1, // 6 + const int l2, // 280 + const int batchsize) // 4 +{ + + int batch = blockIdx.x; + __shared__ psum_t data[32]; + __shared__ scalar_t result[32]; + assert(l1 <= 32); + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); + + if (threadIdx.x >= l2) { + return; + } + + scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; + + // g_step = g0 @ step_vec_T + // g_x @ step_vec_T + for (int i = 0; i < l1; i++) { + reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, + &data[0], &result[i]); + } + + __shared__ scalar_t step_success[32]; + __shared__ scalar_t step_success_w1[32]; + assert(blockDim.x >= l1); + bool wolfe_1 = false; + bool wolfe = false; + bool condition = threadIdx.x < l1; + if (condition) { + // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; + scalar_t alpha_list_elem = alpha_list[threadIdx.x]; + + // condition 1: + wolfe_1 = c[batch * l1 + threadIdx.x] <= + (c[batch * l1] + c_1 * alpha_list_elem * result[0]); + + // condition 2: + bool wolfe_2; + if (strong_wolfe) { + wolfe_2 = abs(result[threadIdx.x]) <= c_2 * 
abs(result[0]); + } else { + wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; + } + + wolfe = wolfe_1 & wolfe_2; + + step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); + step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); + } + + __syncthreads(); + + __shared__ int idx_shared; + if (threadIdx.x == 0) { + int m_id = 0; + int m1_id = 0; + scalar_t max1 = step_success[0]; + scalar_t max2 = step_success_w1[0]; + for (int i = 1; i < l1; i++) { + if (max1 < step_success[i]) { + max1 = step_success[i]; + m_id = i; + } + if (max2 < step_success_w1[i]) { + max2 = step_success_w1[i]; + m1_id = i; + } + } + + if (!approx_wolfe) { + + // m_idx = torch.where(m_idx == 0, m1_idx, m_idx) + if (m_id == 0) { + m_id = m1_id; + } + // m_idx[m_idx == 0] = 1 + if (m_id == 0) { + m_id = 1; + } + } + idx_shared = m_id + c_idx[batch]; + } + + //////////////////////////////////// + // write outputs using the computed index. + // one index per batch is computed + //////////////////////////////////// + // l2 is d_opt, l1 is line_search n. + // idx_shared contains index in l1 + // + __syncthreads(); + if (threadIdx.x < l2) { + if (threadIdx.x == 0) { + // printf("block: %d, idx_shared: %d\n", batch, idx_shared); + } + best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x]; + best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x]; + } + if (threadIdx.x == 0) { + best_c[batch] = c[idx_shared]; + } +} + +// Launched with l2 threads/block and #blocks = batchsize +template +__global__ void line_search_kernel_mask( + // int64_t *m_idx, // 4x1x1 + scalar_t *best_x, // 4x280 + scalar_t *best_c, // 4x1 + scalar_t *best_grad, // 4x280 + const scalar_t *g_x, // 4x6x280 + const scalar_t *x_set, // 4x6x280 + const scalar_t *step_vec, // 4x280x1 + const scalar_t *c, // 4x6x1 + const scalar_t *alpha_list, // 4x6x1 + const int64_t *c_idx, // 4x1x1 + const float c_1, const float c_2, const bool strong_wolfe, + const bool approx_wolfe, + const int l1, // 6 + const int l2, // 280 + const int batchsize) // 4 +{ + + int batch = blockIdx.x; + __shared__ psum_t data[32]; + __shared__ scalar_t result[32]; + assert(l1 <= 32); + unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2); + + if (threadIdx.x >= l2) { + return; + } + + scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x]; + + // g_step = g0 @ step_vec_T + // g_x @ step_vec_T + for (int i = 0; i < l1; i++) { + reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask, + &data[0], &result[i]); + } + + // __shared__ scalar_t step_success[32]; + // __shared__ scalar_t step_success_w1[32]; + assert(blockDim.x >= l1); + bool wolfe_1 = false; + bool wolfe = false; + bool condition = threadIdx.x < l1; + if (condition) { + scalar_t alpha_list_elem = alpha_list[threadIdx.x]; + + // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x]; + + // condition 1: + wolfe_1 = c[batch * l1 + threadIdx.x] <= + (c[batch * l1] + c_1 * alpha_list_elem * result[0]); + + // condition 2: + bool wolfe_2; + if (strong_wolfe) { + wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]); + } else { + wolfe_2 = result[threadIdx.x] >= c_2 * result[0]; + } + + // wolfe = torch.logical_and(wolfe_1, wolfe_2) + wolfe = wolfe_1 & wolfe_2; + + // // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1) + // // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1) + // step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1); + // step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1); + } + unsigned msk1 = 
__ballot_sync(FULL_MASK, wolfe_1 & condition);
+  unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition);
+
+  // get the index of the last occurrence of true
+  unsigned msk1_brev = __brev(msk1);
+  unsigned msk_brev = __brev(msk);
+
+  int id1 = 32 - __ffs(msk1_brev); // position of least significant bit set to 1
+  int id = 32 - __ffs(msk_brev);   // position of least significant bit set to 1
+
+  __syncthreads();
+
+  __shared__ int idx_shared;
+  if (threadIdx.x == 0) {
+    if (!approx_wolfe) {
+      if (id == 32) { // msk is zero
+        id = id1;
+      }
+      if (id == 0) { // bit 0 is set
+        id = id1;
+      }
+      if (id == 32) { // msk is zero
+        id = 1;
+      }
+      if (id == 0) {
+        id = 1;
+      }
+    } else {
+      if (id == 32) { // msk is zero
+        id = 0;
+      }
+    }
+
+    // // _, m_idx = torch.max(step_success, dim=-2)
+    // // _, m1_idx = torch.max(step_success_w1, dim=-2)
+    // int m_id = 0;
+    // int m1_id = 0;
+    // scalar_t max1 = step_success[0];
+    // scalar_t max2 = step_success_w1[0];
+    // for (int i=1; i<l1; i++) { ... }
+
+    idx_shared = id + c_idx[batch];
+  }
+
+  ////////////////////////////////////
+  // write outputs using the computed index.
+  // one index per batch is computed
+  ////////////////////////////////////
+  __syncthreads();
+  if (threadIdx.x < l2) {
+    best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
+    best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
+  }
+  if (threadIdx.x == 0) {
+    best_c[batch] = c[idx_shared];
+  }
+}
+
+} // namespace Optimization
+} // namespace Curobo
+
+std::vector<torch::Tensor> line_search_cuda(
+    // torch::Tensor m_idx,
+    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
+    const torch::Tensor g_x, const torch::Tensor x_set,
+    const torch::Tensor step_vec, const torch::Tensor c_0,
+    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
+    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
+    const int l1, const int l2, const int batchsize) {
+  using namespace Curobo::Optimization;
+  assert(l2 <= 1024);
+
+  // multiple of 32
+  const int threadsPerBlock = 32 * ((l2 + 31) / 32); // l2;
+  const int blocksPerGrid = batchsize;
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      g_x.scalar_type(), "line_search_cu", ([&] {
+        line_search_kernel_mask<scalar_t, scalar_t>
+            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                // m_idx.data_ptr<int64_t>(),
+                best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
+                best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
+                x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
+                c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
+                c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
+                l1, l2, batchsize);
+      }));
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {best_x, best_c, best_grad};
+}
\ No newline at end of file
diff --git a/src/curobo/curobolib/cpp/pose_distance_kernel.cu b/src/curobo/curobolib/cpp/pose_distance_kernel.cu
new file mode 100644
index 00000000..8897e534
--- /dev/null
+++ b/src/curobo/curobolib/cpp/pose_distance_kernel.cu
@@ -0,0 +1,790 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+#include
+#include
+#include
+#include
+
+#include "helper_math.h"
+#include
+#include
+#include
+
+#define SINGLE_GOAL 0
+#define BATCH_GOAL 1
+#define GOALSET 2
+#define BATCH_GOALSET 3
+
+namespace Curobo
+{
+namespace Pose
+{
+__device__ __forceinline__ void
+transform_vec_quat(const float4 q, // x,y,z, qw, qx,qy,qz
+                   const float *d_vec_weight,
+                   float *C) {
+  // do dot product:
+  // new_p = q * p * q_inv + obs_p
+
+  if (q.x != 0 || q.y != 0 || q.z != 0) {
+
+    C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
+           2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
+           2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
+           q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
+    C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
+           2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
+           q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
+           2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
+    C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
+           q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
+           q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
+           q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
+
+    C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
+           2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
+           2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
+           q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
+    C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
+           2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
+           q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
+           2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
+    C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
+           q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
+           q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
+           q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
+  } else {
+    C[0] = d_vec_weight[0];
+    C[1] = d_vec_weight[1];
+    C[2] = d_vec_weight[2];
+    C[3] = d_vec_weight[3];
+    C[4] = d_vec_weight[4];
+    C[5] = d_vec_weight[5];
+  }
+}
+
+__device__ __forceinline__ void
+inv_transform_vec_quat(const float4 q_in, // x,y,z, qw, qx,qy,qz
+                       const float *d_vec_weight,
+                       float *C) {
+  // do dot product:
+  // new_p = q * p * q_inv + obs_p
+
+  if (q_in.x != 0 || q_in.y != 0 || q_in.z != 0) {
+    float4 q = make_float4(-q_in.x, -q_in.y, -q_in.z, q_in.w);
+
+    C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
+           2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
+           2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
+           q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
+    C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
+           2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
+           q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
+           2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
+    C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
+           q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
+           q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
+           q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
+
+    C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
+           2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
+           2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
+           q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
+    C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
+           2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
+           q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
+           2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
+    C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
+           q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
+           q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
+           q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
+  } else {
+    C[0] = d_vec_weight[0];
+    C[1] = d_vec_weight[1];
+    C[2] = d_vec_weight[2];
+    C[3] = d_vec_weight[3];
+    C[4] = d_vec_weight[4];
+    C[5] = d_vec_weight[5];
+  }
+}
+__device__ __forceinline__ void
+compute_quat_distance(float *result, const float4 a, const float4 b) {
+  // We compute distance with the conjugate of b
+  float r_w = (a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z);
+  if (r_w < 0.0) {
+    r_w = 1.0;
+  } else {
+    r_w = -1.0;
+  }
+
+  result[0] = r_w * (-1 * a.w * b.x + b.w * a.x - a.y * b.z + b.y * a.z);
+  result[1] = r_w * (-1 * a.w * b.y + b.w * a.y - a.z * b.x + b.z * a.x);
+  result[2] = r_w * (-1 * a.w * b.z + b.w * a.z - a.x * b.y + b.x * a.y);
+}
+
+__device__ __forceinline__ void
+compute_distance(float *distance_vec, float &distance, float &position_distance,
+                 float &rotation_distance, const float3 current_position,
+                 const float3 goal_position, const float4 current_quat,
+                 const float4 goal_quat, const float *vec_weight,
+                 const float *vec_convergence, const float position_weight,
+                 const float rotation_weight) {
+  compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
+  // compute position distance
+  *(float3 *)&distance_vec[0] = current_position - goal_position;
+  // distance_vec[0] = goal_position
+  position_distance = 0;
+  rotation_distance = 0;
+// scale by vec weight and position weight:
+#pragma unroll 3
+  for (int i = 0; i < 3; i++) {
+    distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
+    position_distance += distance_vec[i] * distance_vec[i];
+  }
+#pragma unroll 3
+  for (int i = 3; i < 6; i++) {
+    distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
+    rotation_distance += distance_vec[i] * distance_vec[i];
+  }
+
+  distance = 0;
+  if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
+    rotation_distance = sqrtf(rotation_distance);
+    distance += rotation_weight * rotation_distance;
+  }
+  if (position_distance > vec_convergence[1] * vec_convergence[1]) {
+    position_distance = sqrtf(position_distance);
+    distance += position_weight * position_distance;
+  }
+}
+
+__device__ __forceinline__ void compute_metric_distance(
+    float *distance_vec, float &distance, float &position_distance,
+    float &rotation_distance, const float3 current_position,
+    const float3 goal_position, const float4 current_quat,
+    const float4 goal_quat, const float *vec_weight,
+    const float *vec_convergence, const float position_weight,
+    const float p_alpha, const float rotation_weight, const float r_alpha) {
+  compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
+  // compute position distance
+  *(float3 *)&distance_vec[0] = current_position - goal_position;
+  // distance_vec[0] = goal_position
+  position_distance = 0;
+  rotation_distance = 0;
+// scale by vec weight and position weight:
+#pragma unroll 3
+  for (int i = 0; i < 3; i++) {
+    distance_vec[i] = vec_weight[i + 3] *
distance_vec[i]; + // position_distance += powf(distance_vec[i],2); + position_distance += distance_vec[i] * distance_vec[i]; + } +#pragma unroll 3 + for (int i = 3; i < 6; i++) { + distance_vec[i] = vec_weight[i - 3] * distance_vec[i]; + // rotation_distance += powf(distance_vec[i],2); + rotation_distance += distance_vec[i] * distance_vec[i]; + } + + distance = 0; + if (rotation_distance > vec_convergence[0] * vec_convergence[0]) { + rotation_distance = sqrtf(rotation_distance); + distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance)); + } + if (position_distance > vec_convergence[1] * vec_convergence[1]) { + position_distance = sqrtf(position_distance); + distance += position_weight * log2f(coshf(p_alpha * position_distance)); + } +} + +template +__global__ void +backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3] + scalar_t *out_grad_q, // [b,4] + const scalar_t *grad_distance, // [b,1] + const scalar_t *grad_p_distance, // [b,1] + const scalar_t *grad_q_distance, // [b,1] + const scalar_t *pose_weight, // [2] + const scalar_t *grad_p_vec, // [b,3] + const scalar_t *grad_q_vec, // [b,4] + const int batch_size) { + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (batch_idx >= batch_size) { + return; + } + // read data + const float g_distance = grad_distance[batch_idx]; + const float2 p_weight = *(float2 *)&pose_weight[0]; + const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; + const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; + const float g_p_distance = grad_p_distance[batch_idx]; + const float g_q_distance = grad_q_distance[batch_idx]; + + // compute position gradient + float3 g_p = + (g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3 + float3 g_q = + (g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3 + + // write out + *(float3 *)&out_grad_p[batch_idx * 3] = g_p; + *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; +} + +template +__global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3] + scalar_t *out_grad_q, // [b,4] + const scalar_t *grad_distance, // [b,1] + const scalar_t *pose_weight, // [2] + const scalar_t *grad_p_vec, // [b,3] + const scalar_t *grad_q_vec, // [b,4] + const int batch_size) { + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (batch_idx >= batch_size) { + return; + } + // read data + const float g_distance = grad_distance[batch_idx]; + const float2 p_weight = *(float2 *)&pose_weight[0]; + const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3]; + const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1]; + + // compute position gradient + float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3 + float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3 + + // write out + *(float3 *)&out_grad_p[batch_idx * 3] = g_p; + *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q; +} + +template +__global__ void goalset_pose_distance_kernel( + scalar_t *out_distance, scalar_t *out_position_distance, + scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec, + int32_t *out_gidx, const scalar_t *current_position, + const scalar_t *goal_position, const scalar_t *current_quat, + const scalar_t *goal_quat, const scalar_t *vec_weight, + const scalar_t *weight, const scalar_t *vec_convergence, + const scalar_t *run_weight, const scalar_t *run_vec_weight, + const int32_t *batch_pose_idx, const int mode, const int num_goals, + const int batch_size, const int horizon, const bool write_grad = false) 
{ + const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x); + const int batch_idx = t_idx / horizon; + const int h_idx = t_idx - (batch_idx * horizon); + if (batch_idx >= batch_size || h_idx >= horizon) { + return; + } + + // read current pose: + float3 position = + *(float3 *)¤t_position[batch_idx * horizon * 3 + h_idx * 3]; + float4 quat_4 = *(float4 *)¤t_quat[batch_idx * horizon * 4 + h_idx * 4]; + float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x); + + // read weights: + + float position_weight = weight[1]; + float rotation_weight = weight[0]; + if (!write_distance) { + position_weight *= run_weight[h_idx]; + rotation_weight *= run_weight[h_idx]; + if (position_weight == 0.0 && rotation_weight == 0.0) { + return; + } + } + + float3 l_goal_position; + float4 l_goal_quat; + float distance_vec[6]; // = {0.0}; + float pose_distance = 0.0; + float position_distance = 0.0; + float rotation_distance = 0.0; + float best_distance = INFINITY; + float best_position_distance = 0.0; + float best_rotation_distance = 0.0; + float best_distance_vec[6] = {0.0}; + float best_des_vec_weight[6] = {0.0}; + float d_vec_convergence[2]; + + *(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0]; + + int best_idx = -1; + float d_vec_weight[6]; + float des_vec_weight[6] = {0.0}; + *(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0]; + *(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3]; + if (h_idx < horizon - 1) { + *(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0]; + *(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3]; + } + + + // read offset + int offset = batch_pose_idx[batch_idx]; + if (mode == BATCH_GOALSET || mode == BATCH_GOAL) { + offset = (offset)*num_goals; + } + + for (int k = 0; k < num_goals; k++) { + + l_goal_position = *(float3 *)&goal_position[(offset + k) * 3]; + float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4]; + l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x); + transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]); + + compute_distance(&distance_vec[0], pose_distance, position_distance, + rotation_distance, position, l_goal_position, quat, + l_goal_quat, + &des_vec_weight[0], //&l_vec_weight[0], + &d_vec_convergence[0], //&l_vec_convergence[0], + position_weight, rotation_weight); + if (pose_distance <= best_distance) { + best_idx = k; + best_distance = pose_distance; + best_position_distance = position_distance; + best_rotation_distance = rotation_distance; + if (write_grad) { + //inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]); +#pragma unroll 6 + for (int i = 0; i < 6; i++) { + best_distance_vec[i] = distance_vec[i]; + best_des_vec_weight[i] = des_vec_weight[i]; + } + } + } + } + // write out: + + // write out pose distance: + out_distance[batch_idx * horizon + h_idx] = best_distance; + if (write_distance) { + out_position_distance[batch_idx * horizon + h_idx] = best_position_distance; + out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance; + } + out_gidx[batch_idx * horizon + h_idx] = best_idx; + + if (write_grad) { + if (write_distance) { + position_weight = 1; + rotation_weight = 1; + } + if (best_position_distance > 0) { + best_position_distance = (position_weight / best_position_distance); + + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = + best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = + best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance; + 
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = + best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance; + + } else { + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0; + } + + if (best_rotation_distance > 0) { + best_rotation_distance = rotation_weight / best_rotation_distance; + + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = + best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = + best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = + best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance; + } else { + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0; + } + } +} + +template +__global__ void goalset_pose_distance_metric_kernel( + scalar_t *out_distance, scalar_t *out_position_distance, + scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec, + int32_t *out_gidx, const scalar_t *current_position, + const scalar_t *goal_position, const scalar_t *current_quat, + const scalar_t *goal_quat, const scalar_t *vec_weight, + const scalar_t *weight, const scalar_t *vec_convergence, + const scalar_t *run_weight, const scalar_t *run_vec_weight, + const int32_t *batch_pose_idx, const int mode, const int num_goals, + const int batch_size, const int horizon, const bool write_grad = false) { + const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x); + const int batch_idx = t_idx / horizon; + const int h_idx = t_idx - (batch_idx * horizon); + if (batch_idx >= batch_size || h_idx >= horizon) { + return; + } + + // read current pose: + float3 position = + *(float3 *)¤t_position[batch_idx * horizon * 3 + h_idx * 3]; + float4 quat_4 = *(float4 *)¤t_quat[batch_idx * horizon * 4 + h_idx * 4]; + float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x); + + // read weights: + + float position_weight = weight[1]; + float p_w_alpha = weight[3]; + float r_w_alpha = weight[2]; + float rotation_weight = weight[0]; + if (!write_distance) { + position_weight *= run_weight[h_idx]; + rotation_weight *= run_weight[h_idx]; + p_w_alpha *= run_weight[h_idx]; + r_w_alpha *= run_weight[h_idx]; + if (position_weight == 0.0 && rotation_weight == 0.0) { + return; + } + } + + float d_vec_convergence[2]; + *(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0]; + + float d_vec_weight[6]; + *(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0]; + *(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3]; + if (h_idx < horizon - 1) { + *(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0]; + *(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3]; + } + + float des_vec_weight[6] = {0.0}; + float3 l_goal_position; + float4 l_goal_quat; + float distance_vec[6]; // = {0.0}; + float pose_distance = 0.0; + float position_distance = 0.0; + float rotation_distance = 0.0; + float best_distance = INFINITY; + float best_position_distance = 0.0; + float best_rotation_distance = 0.0; + float best_distance_vec[6] = {0.0}; + float best_des_vec_weight[6] = {0.0}; + int best_idx = -1.0; + int offset = batch_pose_idx[batch_idx]; + if (mode == BATCH_GOALSET || mode == BATCH_GOAL) { + offset = (offset)*num_goals; + } + + for (int k = 0; k < num_goals; k++) { + + 
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3]; + float4 gq4 = + *(float4 *)&goal_quat[(offset + k) * 4]; // reading qw, qx, qy, qz + l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x); + + transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]); + compute_metric_distance( + &distance_vec[0], pose_distance, position_distance, rotation_distance, + position, l_goal_position, quat, l_goal_quat, + &des_vec_weight[0], //&l_vec_weight[0], + &d_vec_convergence[0], //&l_vec_convergence[0], + position_weight, p_w_alpha, rotation_weight, r_w_alpha); + if (pose_distance <= best_distance) { + best_idx = k; + best_distance = pose_distance; + best_position_distance = position_distance; + best_rotation_distance = rotation_distance; + if (write_grad) { + // inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]); + +#pragma unroll 6 + for (int i = 0; i < 6; i++) { + best_distance_vec[i] = distance_vec[i]; + best_des_vec_weight[i] = des_vec_weight[i]; + } + } + } + } + + // add scaling metric: + // best_distance = log2f(acoshf(best_distance)); + // write out: + + // write out pose distance: + out_distance[batch_idx * horizon + h_idx] = best_distance; + if (write_distance) { + out_position_distance[batch_idx * horizon + h_idx] = best_position_distance; + out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance; + } + out_gidx[batch_idx * horizon + h_idx] = best_idx; + + if (write_grad) { + // write gradient + // compute scalar term: + // -w * (d_vec)/ (length * length(negative +1) * acos(length) + // -w * (d_vec) * sinh(length) / (length * cosh(length)) + // compute extra term: + + if (write_distance) { + position_weight = 1.0; + rotation_weight = 1.0; + } + + if (best_position_distance > 0) { + best_position_distance = + (p_w_alpha * position_weight * + sinhf(p_w_alpha * best_position_distance)) / + (best_position_distance * coshf(p_w_alpha * best_position_distance)); + + // best_position_distance = (position_weight/best_position_distance); + // comput scalar gradient + + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = + best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = + best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = + best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance; + + } else { + out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0; + out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0; + } + + if (best_rotation_distance > 0) { + best_rotation_distance = + (r_w_alpha * rotation_weight * + sinhf(r_w_alpha * best_rotation_distance)) / + (best_rotation_distance * coshf(r_w_alpha * best_rotation_distance)); + + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = + best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = + best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = + best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance; + } else { + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0; + out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0; + } + } +} + +} // namespace +} + +std::vector +pose_distance(torch::Tensor out_distance, torch::Tensor 
out_position_distance, + torch::Tensor out_rotation_distance, + torch::Tensor distance_p_vector, // batch size, 3 + torch::Tensor distance_q_vector, // batch size, 4 + torch::Tensor out_gidx, + const torch::Tensor current_position, // batch_size, 3 + const torch::Tensor goal_position, // n_boxes, 3 + const torch::Tensor current_quat, const torch::Tensor goal_quat, + const torch::Tensor vec_weight, // n_boxes, 4, 4 + const torch::Tensor weight, const torch::Tensor vec_convergence, + const torch::Tensor run_weight, + const torch::Tensor run_vec_weight, + const torch::Tensor batch_pose_idx, // batch_size, 1 + const int batch_size, const int horizon, const int mode, + const int num_goals = 1, const bool compute_grad = false, + const bool write_distance = true, const bool use_metric = false) { + using namespace Curobo::Pose; + // we compute the warp threads based on number of boxes: + assert(batch_pose_idx.size(0) == batch_size); + // TODO: verify this math + // const int batch_size = out_distance.size(0); + assert(run_weight.size(-1) == horizon); + const int bh = batch_size * horizon; + int threadsPerBlock = bh; + if (bh > 128) { + threadsPerBlock = 128; + } + + // we fit warp thread spheres in a threadsPerBlock + + int blocksPerGrid = (bh + threadsPerBlock - 1) / threadsPerBlock; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (use_metric) { + if (write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_metric_kernel + <<>>( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + + } + else + { + + + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + goalset_pose_distance_metric_kernel + // goalset_pose_distance_kernel + <<>>( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + } else { + if(write_distance) + { + AT_DISPATCH_FLOATING_TYPES( + current_position.scalar_type(), "batch_pose_distance", ([&] { + // goalset_pose_distance_metric_kernel + goalset_pose_distance_kernel + <<>>( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + else + { + AT_DISPATCH_FLOATING_TYPES( + 
current_position.scalar_type(), "batch_pose_distance", ([&] { + // goalset_pose_distance_metric_kernel + goalset_pose_distance_kernel + <<>>( + out_distance.data_ptr(), + out_position_distance.data_ptr(), + out_rotation_distance.data_ptr(), + distance_p_vector.data_ptr(), + distance_q_vector.data_ptr(), + out_gidx.data_ptr(), + current_position.data_ptr(), + goal_position.data_ptr(), + current_quat.data_ptr(), + goal_quat.data_ptr(), + vec_weight.data_ptr(), weight.data_ptr(), + vec_convergence.data_ptr(), + run_weight.data_ptr(), + run_vec_weight.data_ptr(), + batch_pose_idx.data_ptr(), mode, num_goals, + batch_size, horizon, compute_grad); + })); + } + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {out_distance, out_position_distance, out_rotation_distance, + distance_p_vector, distance_q_vector, out_gidx}; +} + +std::vector +backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q, + const torch::Tensor grad_distance, // batch_size, 3 + const torch::Tensor grad_p_distance, // n_boxes, 3 + const torch::Tensor grad_q_distance, + const torch::Tensor pose_weight, + const torch::Tensor grad_p_vec, // n_boxes, 4, 4 + const torch::Tensor grad_q_vec, const int batch_size, + const bool use_distance = false) { + + // we compute the warp threads based on number of boxes: + + // TODO: verify this math + // const int batch_size = grad_distance.size(0); + using namespace Curobo::Pose; + + int threadsPerBlock = batch_size; + if (batch_size > 128) { + threadsPerBlock = 128; + } + + // we fit warp thread spheres in a threadsPerBlock + + int blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (use_distance) { + + AT_DISPATCH_FLOATING_TYPES( + grad_distance.scalar_type(), "backward_pose_distance", ([&] { + backward_pose_distance_kernel + <<>>( + out_grad_p.data_ptr(), + out_grad_q.data_ptr(), + grad_distance.data_ptr(), + grad_p_distance.data_ptr(), + grad_q_distance.data_ptr(), + pose_weight.data_ptr(), + grad_p_vec.data_ptr(), + grad_q_vec.data_ptr(), batch_size); + })); + } else { + AT_DISPATCH_FLOATING_TYPES( + grad_distance.scalar_type(), "backward_pose", ([&] { + backward_pose_kernel + <<>>( + out_grad_p.data_ptr(), + out_grad_q.data_ptr(), + grad_distance.data_ptr(), + pose_weight.data_ptr(), + grad_p_vec.data_ptr(), + grad_q_vec.data_ptr(), batch_size); + })); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {out_grad_p, out_grad_q}; +} \ No newline at end of file diff --git a/src/curobo/curobolib/cpp/self_collision_kernel.cu b/src/curobo/curobolib/cpp/self_collision_kernel.cu new file mode 100644 index 00000000..439c0ec7 --- /dev/null +++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#include +#include +#include +#include + +#include "helper_math.h" +#include +#include +#include + +#define SINGLE_GOAL 0 +#define BATCH_GOAL 1 +#define GOALSET 2 +#define BATCH_GOALSET 3 + +namespace Curobo { + +namespace Geometry { + +template __inline__ __device__ scalar_t relu(scalar_t var) { + if (var < 0) + return 0; + else + return var; +} + +template +__global__ void self_collision_distance_kernel( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 4 + const scalar_t *robot_spheres, // batch x nspheres x 4 + const scalar_t *collision_threshold, const int batch_size, + const int nspheres, const scalar_t *weight, const bool write_grad = false) { + const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x; + if (batch_idx >= batch_size) { + return; + } + float r_diff, distance; + float max_penetration = 0; + float3 sph1, sph2, dist_vec; + int sph1_idx = -1; + int sph2_idx = -1; + // iterate over spheres: + for (int i = 0; i < nspheres; i++) { + sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4]; + for (int j = i + 1; j < nspheres; j++) { + r_diff = collision_threshold[i * nspheres + j]; + if (isinf(r_diff)) { + continue; + } + sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4]; + // compute sphere distance: + distance = relu(r_diff - length(sph1 - sph2)); + if (distance > max_penetration) { + max_penetration = distance; + sph1_idx = i; + sph2_idx = j; + if (write_grad) { + dist_vec = (sph1 - sph2) / distance; + } + } + } + } + + // write out pose distance: + if (max_penetration > 0) { + out_distance[batch_idx] = weight[0] * max_penetration; + if (write_grad) { + *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] = + weight[0] * dist_vec; + } + } +} + +typedef struct { + float d; + int16_t i; + int16_t j; +} dist_t; + + + +/////////////////////////////////////////////////////////// +// n warps per row +// ndpt rows per warp +/////////////////////////////////////////////////////////// +template +__global__ void self_collision_distance_kernel4( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 3 + const scalar_t *robot_spheres, // batch x nspheres x 3 + const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size, + const int nspheres, + const int ndpt, // number of distances to be computed per thread + const int nwpr, // number of warps per row + const scalar_t *weight, uint8_t *sparse_index, + const bool write_grad = false) { + int batch_idx = blockIdx.x; + int warp_idx = threadIdx.x / 32; + int i = ndpt * (warp_idx / nwpr); // starting row number for this warp + int j = (warp_idx % nwpr) * 32; // starting column number for this warp + + dist_t max_d = {.d = 0.0, .i = 0, .j = 0}; + __shared__ dist_t max_darr[32]; + + // Optimization: About 1/3 of the warps will have no work. 
+  // We compute distances only when i < j.
+  if (i > j + 31) { // this warp has no work
+    max_darr[warp_idx] = max_d;
+    return;
+  }
+
+  // load robot_spheres to shared memory
+  extern __shared__ float4 __rs_shared[];
+  if (threadIdx.x < nspheres) {
+    float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)];
+    //float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)],
+    //robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1],
+    //robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2],
+    //robot_spheres_radius[threadIdx.x]) ;
+    sph.w += offsets[threadIdx.x];
+    __rs_shared[threadIdx.x] = sph;
+  }
+  __syncthreads();
+
+  //////////////////////////////////////////////////////
+  // Compute distances and store the maximum per thread
+  // in registers (max_d).
+  // Each thread computes up to ndpt distances.
+  // two warps per row
+  //////////////////////////////////////////////////////
+  // int nspheres_2 = nspheres * nspheres;
+
+  j = j + threadIdx.x % 32; // column number for this thread
+
+  float4 sph2;
+  if (j < nspheres) {
+    sph2 = __rs_shared[j]; // we need not load sph2 in every iteration.
+
+#pragma unroll 16
+    for (int k = 0; k < ndpt; k++, i++) { // increment i also here
+      if (i < nspheres && j > i) {
+        // check if self collision is allowed here:
+        if (coll_matrix[i * nspheres + j] == 1) {
+
+          float4 sph1 = __rs_shared[i];
+          if (sph1.w <= 0.0 || sph2.w <= 0.0)
+          {
+            continue;
+          }
+          float r_diff = sph1.w + sph2.w;
+          float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
+                         (sph1.y - sph2.y) * (sph1.y - sph2.y) +
+                         (sph1.z - sph2.z) * (sph1.z - sph2.z));
+          //float distance = max((float)0.0, (float)(r_diff - d));
+          float distance = r_diff - d;
+          // printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j,
+          // distance);
+          if (distance > max_d.d) {
+            max_d.d = distance;
+            max_d.i = i;
+            max_d.j = j;
+          }
+        }
+      }
+    }
+  }
+  // max_d has the result max for this thread
+
+  //////////////////////////////////////////////////////
+  // Reduce blockDim.x values using blockDim.x threads
+  //////////////////////////////////////////////////////
+
+  // Perform warp-wide reductions
+  // Optimization: Skip the reduction if all the values are zero
+  unsigned zero_mask = __ballot_sync(
+      0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So,
+                                   // zero_mask should be 0 in the common case.
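+  // For illustration, the shuffle-based max-reduction below is equivalent to
+  // this standalone helper (a sketch, not part of the original diff; it
+  // assumes sizeof(dist_t) == 8 so the struct can travel through the 64-bit
+  // __shfl_down_sync overload, and that all 32 lanes of the warp are active):
+  //
+  //   __device__ dist_t warp_max_dist(dist_t v) {
+  //     for (int offset = 16; offset > 0; offset /= 2) {
+  //       unsigned long long nd =
+  //           __shfl_down_sync(0xffffffff, *(unsigned long long *)&v, offset);
+  //       dist_t other = *(dist_t *)&nd;
+  //       if (other.d > v.d) v = other; // keep the deeper penetration
+  //     }
+  //     return v; // lane 0 ends up holding the warp-wide maximum
+  //   }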
+ if (zero_mask != 0) { // some of the values are non-zero + unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); + if (threadIdx.x < blockDim.x) { + // dist_t max_d = dist_sh[threadIdx.x]; + for (int offset = 16; offset > 0; offset /= 2) { + uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset); + dist_t d_temp = *(dist_t *)&nd; + if (d_temp.d > max_d.d) { + max_d = d_temp; + } + } + } + } + + // thread0 in the warp has the max_d for the warp + if (threadIdx.x % 32 == 0) { + max_darr[warp_idx] = max_d; + // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, + // blockIdx.x, max_d); + } + + if (threadIdx.x < nspheres) { + if (write_grad && sparse_index[batch_idx * nspheres + threadIdx.x] != 0) { + + *(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] = + make_float4(0.0); + sparse_index[batch_idx * nspheres + threadIdx.x] = 0; + } + } + __syncthreads(); + + if (threadIdx.x == 0) { + dist_t max_d = max_darr[0]; + // TODO: This can be parallized + for (int i = 1; i < (blockDim.x + 31) / 32; i++) { + if (max_darr[i].d > max_d.d) { + max_d = max_darr[i]; + } + } + ////////////////////////////////////////////////////// + // Write out the final results + ////////////////////////////////////////////////////// + if (max_d.d != 0.0) { + out_distance[batch_idx] = weight[0] * max_d.d; + if (write_grad) { + float3 sph1 = + *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)]; + float3 sph2 = + *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)]; + float3 dist_vec = (sph1 - sph2) / max_d.d; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] = + weight[0] * dist_vec; + sparse_index[batch_idx * nspheres + max_d.i] = 1; + sparse_index[batch_idx * nspheres + max_d.j] = 1; + } + } else { + out_distance[batch_idx] = 0; + } + } +} +template +__global__ void self_collision_distance_kernel7( + scalar_t *out_distance, // batch x 1 + scalar_t *out_vec, // batch x nspheres x 3 + uint8_t *sparse_index, + const scalar_t *robot_spheres, // batch x nspheres x 3 + const scalar_t *offsets, // nspheres + const scalar_t *weight, const int16_t *locations_, const int batch_size, + const int nspheres, const bool write_grad = false) { + uint32_t batch_idx = blockIdx.x * NBPB; + uint8_t nbpb = min(NBPB, batch_size - batch_idx); + + if (nbpb == 0) + return; + + // Layout in shared memory: + // sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=4] + // sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=4] + // ... + extern __shared__ float4 __rs_shared[]; + if (threadIdx.x < nspheres) { // threadIdx.x is sphere index +#pragma unroll + for (int l = 0; l < nbpb; l++) { + float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)]; + + //float4 sph = make_float4( + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)], + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1], + // robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2], + // robot_spheres_radius[threadIdx.x] + //); + + sph.w += offsets[threadIdx.x]; + __rs_shared[NBPB * threadIdx.x + l] = sph; + } + } + __syncthreads(); + + ////////////////////////////////////////////////////// + // Compute distances and store the maximum per thread + // in registers (max_d). + // Each thread computes upto ndpt distances. 
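+  // Each thread reads its ndpt (i, j) sphere pairs from the precomputed
+  // locations_ table; pairs padded with (-1, -1) are skipped in the loop below.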
+ ////////////////////////////////////////////////////// + dist_t max_d[NBPB] = {{.d = 0.0, .i = 0, .j = 0}}; + int16_t indices[ndpt * 2]; + for (uint8_t i =0; i< ndpt * 2; i++) + { + indices[i] = locations_[(threadIdx.x) * 2 * ndpt + i]; + + } + +#pragma unroll + for (uint8_t k = 0; k < ndpt; k++) { + // We are iterating through ndpt pair of spheres across batch + // if we increase ndpt, then we can compute for more spheres? + int i = indices[k * 2]; + int j = indices[k * 2 + 1]; + if (i == -1 || j == -1) + continue; + +#pragma unroll + for (uint16_t l = 0; l < nbpb; l++) { // iterate through nbpb batches + float4 sph1 = __rs_shared[NBPB * i + l]; + float4 sph2 = __rs_shared[NBPB * j + l]; + if (sph1.w <= 0.0 || sph2.w <= 0.0) + { + continue; + } + float r_diff = + sph1.w + sph2.w; // sum of two radii, radii include respective offsets + float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) + + (sph1.y - sph2.y) * (sph1.y - sph2.y) + + (sph1.z - sph2.z) * (sph1.z - sph2.z)); + float f_diff = r_diff - d; + if (f_diff > max_d[l].d) { + max_d[l].d = f_diff; + max_d[l].i = i; + max_d[l].j = j; + } + } + } + // max_d has the result max for this thread + + ////////////////////////////////////////////////////// + // Reduce gridDim.x values using gridDim.x threads + ////////////////////////////////////////////////////// + // We find the sum across 32 threads. Hence, we are limited to running all our self collision + // distances for a batch_idx to 32 threads. + + __shared__ dist_t max_darr[32 * NBPB]; + +#pragma unroll + for (uint8_t l = 0; l < nbpb; l++) { + // Perform warp-wide reductions + // Optimization: Skip the reduction if all the values are zero + unsigned zero_mask = __ballot_sync( + 0xffffffff, + max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask + // should be 0 in the common case. + if (zero_mask != 0) { // some of the values are non-zero + unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x); + if (threadIdx.x < blockDim.x) { + // dist_t max_d = dist_sh[threadIdx.x]; + for (int offset = 16; offset > 0; offset /= 2) { + // the offset here is linked to ndpt? 
+ uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset); + dist_t d_temp = *(dist_t *)&nd; + if (d_temp.d > max_d[l].d) { + max_d[l] = d_temp; + } + } + } + } + // thread0 in the warp has the max_d for the warp + if (threadIdx.x % 32 == 0) { + max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l]; + // printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x, + // blockIdx.x, max_d); + } + } + + if (threadIdx.x < nspheres) { + for (int l = 0; l < nbpb; l++) { + + if (write_grad && + sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0) { + *(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] = + make_float4(0.0); + sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0; + } + } + } + __syncthreads(); + + if (threadIdx.x == 0) { +#pragma unroll + for (uint8_t l = 0; l < nbpb; l++) { + dist_t max_d = max_darr[l * 32]; + // TODO: This can be parallized + for (int i = 1; i < (blockDim.x + 31) / 32; i++) { + if (max_darr[l * 32 + i].d > max_d.d) { + max_d = max_darr[l * 32 + i]; + } + } + ////////////////////////////////////////////////////// + // Write out the final results + ////////////////////////////////////////////////////// + if (max_d.d != 0.0) { + out_distance[batch_idx + l] = weight[0] * max_d.d; + if (write_grad) { + float3 sph1 = + *(float3 *)&robot_spheres[4 * + ((batch_idx + l) * nspheres + max_d.i)]; + float3 sph2 = + *(float3 *)&robot_spheres[4 * + ((batch_idx + l) * nspheres + max_d.j)]; + float3 dist_vec = (sph1 - sph2) / max_d.d; + *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] = + weight[0] * -1 * dist_vec; + *(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] = + weight[0] * dist_vec; + sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1; + sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1; + } + } else { + out_distance[batch_idx + l] = 0; + } + } + } +} +} // namespace Geometry +} // namespace Curobo + +// This is the best version so far. +// It precomputes the start addresses per thread on the cpu. +// The rest is similar to the version above. +std::vector self_collision_distance( + torch::Tensor out_distance, torch::Tensor out_vec, + torch::Tensor sparse_index, + const torch::Tensor robot_spheres, // batch_size x n_spheres x 3 + const torch::Tensor collision_offset, // n_spheres x n_spheres + const torch::Tensor weight, const torch::Tensor collision_matrix, + const torch::Tensor thread_locations, const int thread_locations_size, + const int batch_size, const int nspheres, const bool compute_grad = false, + const int ndpt = 8, // Does this need to match template? + const bool experimental_kernel = false) { + using namespace Curobo::Geometry; + + // use efficient kernel based on number of threads: + const int nbpb = 1; + + assert(nspheres < 1024); + + + int threadsPerBlock = ((thread_locations_size / 2) + ndpt - 1) / + ndpt; // location_size must be an even number. We store + // i,j for each sphere pair. 
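  // For reference, a minimal host-side sketch (hypothetical helper, not part
  // of this diff) of how the thread_locations table consumed above can be
  // built from an n x n collision matrix, padded with (-1, -1) so that every
  // thread owns exactly ndpt pairs (assumes <vector> and <cstdint>):
  //
  //   std::vector<int16_t> build_thread_locations(
  //       const std::vector<uint8_t> &coll, int nspheres, int ndpt) {
  //     std::vector<int16_t> loc;
  //     for (int16_t i = 0; i < nspheres; i++)
  //       for (int16_t j = i + 1; j < nspheres; j++)
  //         if (coll[i * nspheres + j] == 1) { // pair must be checked
  //           loc.push_back(i);
  //           loc.push_back(j);
  //         }
  //     while ((loc.size() / 2) % ndpt != 0) // pad pair count to a multiple of ndpt
  //       loc.insert(loc.end(), {int16_t(-1), int16_t(-1)});
  //     return loc;
  //   }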
+ //assert(threadsPerBlock/nbpb <=32); + if (threadsPerBlock < 32*nbpb) + { + threadsPerBlock = 32 * nbpb; + } + if (threadsPerBlock < nspheres) + { + threadsPerBlock = nspheres; + } + int blocksPerGrid = (batch_size + nbpb - 1) / nbpb; // number of batches per block + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // ((threadsPerBlock >= nspheres))&& + if (experimental_kernel) { + + int smemSize = nbpb * nspheres * sizeof(float4); + + + + if (ndpt == 1) + { + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + + else if (ndpt == 2) + { + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + + else if (ndpt == 4) + { + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + else if (ndpt == 8) + { + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + else if (ndpt == 32) + { + + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + else if (ndpt == 64) + { + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + else if (ndpt == 128) + { + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + + } + else if (ndpt == 512) + { + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel7 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + sparse_index.data_ptr(), + robot_spheres.data_ptr(), + 
collision_offset.data_ptr(), + weight.data_ptr(), + thread_locations.data_ptr(), batch_size, nspheres, + compute_grad); + })); + } + else + { + assert(false); // only ndpt of 32 or 64 is currently supported. + } + } + + else { + int ndpt_n = 32; // number of distances to be computed per thread + int nwpr = (nspheres + 31) / 32; + int warpsPerBlock = nwpr * ((nspheres + ndpt_n - 1) / ndpt_n); + threadsPerBlock = warpsPerBlock * 32; + blocksPerGrid = batch_size; + // printf("Blocks: %d, threads/block:%d, ndpt=%d, nwpr=%d\n", blocksPerGrid, + // threadsPerBlock, ndpt, nwpr); + + int smemSize = nspheres * sizeof(float4); + + + AT_DISPATCH_FLOATING_TYPES( + robot_spheres.scalar_type(), "self_collision_distance", ([&] { + self_collision_distance_kernel4 + <<>>( + out_distance.data_ptr(), + out_vec.data_ptr(), + robot_spheres.data_ptr(), + collision_offset.data_ptr(), + collision_matrix.data_ptr(), batch_size, nspheres, + ndpt_n, nwpr, weight.data_ptr(), + sparse_index.data_ptr(), compute_grad); + })); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return {out_distance, out_vec, sparse_index}; +} + diff --git a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu new file mode 100644 index 00000000..4d172e5c --- /dev/null +++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu @@ -0,0 +1,1567 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include +#include + +#include "helper_math.h" +#include +#include +#define M 4 +// #define MAX_WARP_THREADS 512 // warp level batching. 8 x M = 32 +#define MAX_BOX_SHARED 256 // maximum number of boxes we can store distance and closest point +#define DEBUG false +namespace Curobo { +namespace Geometry { + +/** + * @brief Compute length of sphere + * + * @param v1 + * @param v2 + * @return float + */ +__device__ __forceinline__ float sphere_length(const float4 &v1, + const float4 &v2) { + return norm3df(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z); +} + +__device__ __forceinline__ float sphere_distance(const float4 &v1, + const float4 &v2) { + return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w); +} + +template +__device__ __forceinline__ void load_obb_pose(const scalar_t *obb_mat, +float3 &position, float4 &quat) +{ // obb_mat has x,y,z, qw, qx, qy, qz, 0 with an extra 0 padding for better use of memory + float4 temp = *(float4 *)&obb_mat[0]; + position.x = temp.x; + position.y = temp.y; + position.z = temp.z; + quat.w = temp.w; + temp = *(float4 *)&obb_mat[4]; + quat.x = temp.x; + quat.y = temp.y; + quat.z = temp.z; +} +template +__device__ __forceinline__ void load_obb_bounds(const scalar_t *obb_bounds, +float3 &bounds) +{ // obb_bounds has x,y,z, 0 with an extra 0 padding. 
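+ // e.g., stored dims {1.0, 0.4, 0.2, 0.0} become half-extents (0.5, 0.2, 0.1) below.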
+ float4 loc_bounds = *(float4 *)&obb_bounds[0]; + bounds.x = loc_bounds.x / 2; + bounds.y = loc_bounds.y / 2; + bounds.z = loc_bounds.z / 2; +} + + +template +__device__ __forceinline__ void +transform_sphere_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz + const float4 &sphere_pos, float4 &C) { + // do dot product: + // new_p = q * p * q_inv + obs_p + const float3 p_arr = *(float3 *)&transform_mat[0]; + const scalar_t w = transform_mat[3]; + const scalar_t x = transform_mat[4]; + const scalar_t y = transform_mat[5]; + const scalar_t z = transform_mat[6]; + + if(x!= 0 || y != 0 || z!=0) + { + + C.x = p_arr.x + w * w * sphere_pos.x + 2 * y * w * sphere_pos.z - + 2 * z * w * sphere_pos.y + x * x * sphere_pos.x + + 2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z - + z * z * sphere_pos.x - y * y * sphere_pos.x; + C.y = p_arr.y + 2 * x * y * sphere_pos.x + y * y * sphere_pos.y + + 2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x - + z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z - + x * x * sphere_pos.y; + C.z = p_arr.z + 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y + + z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z + + 2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z; + } + else + { + C.x = p_arr.x + sphere_pos.x; + C.y = p_arr.y + sphere_pos.y; + C.z = p_arr.z + sphere_pos.z; + } + C.w = sphere_pos.w; +} + + +__device__ __forceinline__ void transform_sphere_quat(const float3 p, + const float4 q, + const float4 &sphere_pos, + float4 &C) { + // do dot product: + // new_p = q * p * q_inv + obs_p + + if(q.x!= 0 || q.y != 0 || q.z!=0) + { + + C.x = p.x + q.w * q.w * sphere_pos.x + 2 * q.y * q.w * sphere_pos.z - + 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + + 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - + q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; + C.y = p.y + 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + + 2 * q.z * q.y * sphere_pos.z + 2 * q.w * q.z * sphere_pos.x - + q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y - 2 * q.x * q.w * sphere_pos.z - + q.x * q.x * sphere_pos.y; + C.z = p.z + 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + + q.z * q.z * sphere_pos.z - 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z + + 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; + } + else + { + C.x = p.x + sphere_pos.x; + C.y = p.y + sphere_pos.y; + C.z = p.z + sphere_pos.z; + } + C.w = sphere_pos.w; +} + + +__device__ __forceinline__ void +inv_transform_vec_quat( + const float3 p, + const float4 q, + const float4 &sphere_pos, float3 &C) { + // do dot product: + // new_p = q * p * q_inv + obs_p + if(q.x != 0 || q.y!= 0 || q.z!=0) + { + C.x = q.w * q.w * sphere_pos.x - 2 * q.y * q.w * sphere_pos.z + + 2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x + + 2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z - + q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x; + C.y = 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y + + 2 * q.z * q.y * sphere_pos.z - 2 * q.w * q.z * sphere_pos.x - + q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y + 2 * q.x * q.w * sphere_pos.z - + q.x * q.x * sphere_pos.y; + C.z = 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y + + q.z * q.z * sphere_pos.z + 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z - + 2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z; + } + else + { + C.x = sphere_pos.x; + C.y = 
sphere_pos.y;
+    C.z = sphere_pos.z;
+  }
+}
+
+__device__ __forceinline__ void
+inv_transform_vec_quat_add(const float3 p,
+                           const float4 q, // x,y,z, qw, qx,qy,qz
+                           const float4 &sphere_pos, float3 &C) {
+  // do dot product:
+  // new_p = q * p * q_inv + obs_p
+  float3 temp_C = make_float3(0.0);
+  inv_transform_vec_quat(p, q, sphere_pos, temp_C);
+  C = C + temp_C;
+}
+
+template <typename scalar_t>
+__device__ __forceinline__ void
+inv_transform_vec_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz
+                       const float4 &sphere_pos, float3 &C) {
+  // do dot product:
+  // new_p = q * p * q_inv + obs_p
+  const scalar_t w = transform_mat[3];
+  const scalar_t x = -1 * transform_mat[4];
+  const scalar_t y = -1 * transform_mat[5];
+  const scalar_t z = -1 * transform_mat[6];
+  if (x != 0 || y != 0 || z != 0)
+  {
+    C.x = w * w * sphere_pos.x + 2 * y * w * sphere_pos.z -
+          2 * z * w * sphere_pos.y + x * x * sphere_pos.x +
+          2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z -
+          z * z * sphere_pos.x - y * y * sphere_pos.x;
+    C.y = 2 * x * y * sphere_pos.x + y * y * sphere_pos.y +
+          2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x -
+          z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z -
+          x * x * sphere_pos.y;
+    C.z = 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y +
+          z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z +
+          2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z;
+  }
+  else
+  {
+    C.x = sphere_pos.x;
+    C.y = sphere_pos.y;
+    C.z = sphere_pos.z;
+  }
+}
+
+template <typename scalar_t>
+__device__ __forceinline__ void
+inv_transform_vec_quat_add(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz
+                           const float4 &sphere_pos, float3 &C) {
+  // do dot product:
+  // new_p = q * p * q_inv + obs_p
+  const scalar_t w = transform_mat[3];
+  const scalar_t x = -1 * transform_mat[4];
+  const scalar_t y = -1 * transform_mat[5];
+  const scalar_t z = -1 * transform_mat[6];
+  if (x != 0 || y != 0 || z != 0) {
+    C.x += w * w * sphere_pos.x + 2 * y * w * sphere_pos.z -
+           2 * z * w * sphere_pos.y + x * x * sphere_pos.x +
+           2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z -
+           z * z * sphere_pos.x - y * y * sphere_pos.x;
+    C.y += 2 * x * y * sphere_pos.x + y * y * sphere_pos.y +
+           2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x -
+           z * z * sphere_pos.y + w * w * sphere_pos.y -
+           2 * x * w * sphere_pos.z - x * x * sphere_pos.y;
+    C.z += 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y +
+           z * z * sphere_pos.z - 2 * w * y * sphere_pos.x -
+           y * y * sphere_pos.z + 2 * w * x * sphere_pos.y -
+           x * x * sphere_pos.z + w * w * sphere_pos.z;
+  } else {
+    // identity rotation: add the untransformed vector
+    C.x += sphere_pos.x;
+    C.y += sphere_pos.y;
+    C.z += sphere_pos.z;
+  }
+}
+
+/**
+ * @brief Scales the Collision across the trajectory by sphere velocity. This is
+ * implemented from CHOMP motion planner (ICRA 2009). We use central difference
+ * to compute the velocity and acceleration of the sphere.
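+ *
+ * In symbols, with p0, p1, p2 the sphere centers at t-1, t, t+1, this is a
+ * sketch of the math the code below implements:
+ *   v = (p2 - p0) / (2 dt),          a = (p0 + p2 - 2 p1) / dt^2
+ *   v_hat = v / |v|,   P = I - v_hat v_hat^T,   kappa = a / |v|^2
+ *   grad <- |v| * (P grad - dist * P kappa),    dist <- |v| * dist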
+ * + * @param sphere_0_cache + * @param sphere_1_cache + * @param sphere_2_cache + * @param dt + * @param transform_back + * @param max_dist + * @param max_grad + * @return void + */ +__device__ __forceinline__ void +scale_speed_metric(const float4 &sphere_0_cache, const float4 &sphere_1_cache, + const float4 &sphere_2_cache, const float &dt, + const bool &transform_back, float &max_dist, + float3 &max_grad) { + + float3 norm_vel_vec = make_float3(sphere_2_cache.x - sphere_0_cache.x, + sphere_2_cache.y - sphere_0_cache.y, + sphere_2_cache.z - sphere_0_cache.z); + + norm_vel_vec = (0.5 / dt) * norm_vel_vec; + const float sph_vel = length(norm_vel_vec); + + if (transform_back) { + + float3 sph_acc_vec = make_float3( + sphere_0_cache.x + sphere_2_cache.x - 2 * sphere_1_cache.x, + sphere_0_cache.y + sphere_2_cache.y - 2 * sphere_1_cache.y, + sphere_0_cache.z + sphere_2_cache.z - 2 * sphere_1_cache.z); + + sph_acc_vec = (1 / (dt * dt)) * sph_acc_vec; + norm_vel_vec = norm_vel_vec * (1 / sph_vel); + + const float3 curvature_vec = (sph_acc_vec) / (sph_vel * sph_vel); + + // compute orthogonal projection: + float orth_proj[9] = {0.0}; + + // load float3 into array for easier matmul later: + float vel_arr[3]; + vel_arr[0] = norm_vel_vec.x; + vel_arr[1] = norm_vel_vec.y; + vel_arr[2] = norm_vel_vec.z; + +// calculate projection ( I - (v * v^T)): +#pragma unroll 3 + for (int i = 0; i < 3; i++) { +#pragma unroll 3 + for (int j = 0; j < 3; j++) { + orth_proj[i * 3 + j] = -1 * vel_arr[i] * vel_arr[j]; + } + } + orth_proj[0] += 1; + orth_proj[4] += 1; + orth_proj[8] += 1; + + // curvature vec: + + // multiply by orth projection: + // two matmuls: + float orth_pt[3];; // orth_proj(3x3) * max_grad(3x1) + float orth_curve[3]; // max_dist(1) * orth_proj (3x3) * curvature_vec (3x1) + +#pragma unroll 3 + for (int i = 0; i < 3; i++) // matrix - vector product + { + orth_pt[i] = orth_proj[i * 3 + 0] * max_grad.x + + orth_proj[i * 3 + 1] * max_grad.y + + orth_proj[i * 3 + 2] * max_grad.z; + + orth_curve[i] = max_dist * (orth_proj[i * 3 + 0] * curvature_vec.x + + orth_proj[i * 3 + 1] * curvature_vec.y + + orth_proj[i * 3 + 2] * curvature_vec.z); + } + + // max_grad = sph_vel * ((orth_proj * max_grad) - max_dist * orth_proj * + // curvature) + + max_grad.x =sph_vel * (orth_pt[0] - orth_curve[0]); ;//orth_proj[0];// * (orth_pt[0] - orth_curve[0]); + max_grad.y = sph_vel * (orth_pt[1] - orth_curve[1]); + max_grad.z = sph_vel * (orth_pt[2] - orth_curve[2]); + } + max_dist = sph_vel * max_dist; +} + +// +/** + * @brief check if sphere is inside. For numerical stability, we assume that if + * sphere is exactly at bound of cuboid, we are not in collision. Note: this is + * not warp safe. 
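+ * Example: with half-extents bounds = (0.5, 0.5, 0.5) and sphere =
+ * (0.6, 0.0, 0.0, 0.2), max(|0.6| - 0.5, -0.5, -0.5) = 0.1 < 0.2, so the
+ * sphere overlaps the cuboid and this returns true.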
+ * + * @param bounds bounds of cuboid + * @param sphere sphere as float4 (in cuboid frame of reference) + * @return bool + */ +__device__ __forceinline__ bool +check_sphere_aabb(const float3 bounds, const float4 sphere) { + // if((fabs(sphere.x) - bounds.x) >= sphere.w || fabs(sphere.y) - bounds.y >= + // sphere.w || (fabs(sphere.z) - bounds.z) >= sphere.w) + if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y), + fabs(sphere.z) - bounds.z) >= (sphere.w)) { + return false; + } + return true; +} + +__device__ __forceinline__ void +compute_closest_point(const float3 &bounds, const float4 &sphere, float4 &pt) { + // assuming the cuboid is centered at origin + // Find the closest face to the sphere position: + // If sphere is within bounds of obstacle, + float min_val, curr_val; + int min_idx; + + // We check the distance to each face and store the closest face + // All we want is the index of the closest face + min_val = bounds.x - fabsf(sphere.x); + + if (min_val < 0) // it's outside limits, clamp: + { + + pt.x = copysignf(bounds.x, sphere.x); + } + // check if bounds.x - sphere.x > bounds.x + sphere.x + min_val = fabsf(min_val); + + if (sphere.x > 0) { + min_idx = 0; + } else { + min_idx = 1; + } + + curr_val = bounds.y - fabsf(sphere.y); // check if sphere-y is outside y-lim + if (curr_val < 0) { // outside obb-y, we clamp point to bounds + pt.y = copysignf(bounds.y, sphere.y); + } + curr_val = fabsf(curr_val); + if (curr_val < min_val) { + min_val = curr_val; + if (sphere.y > 0) { + min_idx = 2; + } else { + min_idx = 3; + } + } + curr_val = bounds.z - fabsf(sphere.z); // distance to -ve bound + if (curr_val < 0) { + pt.y = copysignf(bounds.z, sphere.z); + } + curr_val = fabsf(curr_val); + if (curr_val < min_val) { + min_val = curr_val; + if (sphere.z > 0) { + min_idx = 4; + } else { + min_idx = 5; + } + } + + // we move pt's value to bounds on the closest face dimension: + switch (min_idx) { + case 0: + pt.x = bounds.x; + break; + case 1: + pt.x = -1 * bounds.x; + break; + case 2: + pt.y = bounds.y; + break; + case 3: + pt.y = -1 * bounds.y; + break; + case 4: + pt.z = bounds.z; + break; + case 5: + pt.z = -1 * bounds.z; + break; + default: + break; + } +} + +__device__ __forceinline__ float +compute_distance(const float3 &bounds_w_radius, + const float4 &sphere) { // pass in cl_pt + // compute closest point: + float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0); + compute_closest_point(bounds_w_radius, sphere, cl_pt); + // clamp: + + // compute distance: + return sphere_length(sphere, cl_pt); // cl_pt includes radius, and sphere also has radius +} + +__device__ __forceinline__ void scale_eta_metric(const float4 &sphere, const float4 &cl_pt, +const float eta, +float4 &sum_pt) +{ + // compute distance: + float sph_dist = 0; + sph_dist = sphere_length(sphere, cl_pt); + + if (sph_dist == 0) + { + sum_pt.x = 0; + sum_pt.y = 0; + sum_pt.z = 0; + sum_pt.w = 0; + + return; + } + sum_pt.x = (sphere.x - cl_pt.x) / sph_dist; + sum_pt.y = (sphere.y - cl_pt.y) / sph_dist; + sum_pt.z = (sphere.z - cl_pt.z) / sph_dist; + if (eta > 0.0) + { + + //sph_dist = sph_dist - eta; + + if (sph_dist > eta) { + sum_pt.w = sph_dist - 0.5 * eta; + } else if (sph_dist <= eta) { + + sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist); + const float scale = (1 / eta) * (sph_dist); + sum_pt.x = scale * sum_pt.x; + sum_pt.y = scale * sum_pt.y; + sum_pt.z = scale * sum_pt.z; + } + + } + else + { + sum_pt.w = sph_dist; + } +} + +/** + * @brief Compute distance and gradient, with a smooth l2 loss when 
+
+/**
+ * @brief Compute distance and gradient, with a smooth L2 loss when the
+ * distance is > 0 and < eta.
+ *
+ * @param bounds_w_radius cuboid bounds inflated by the sphere radius
+ * @param sphere          query sphere (x, y, z, radius)
+ * @param sum_pt          output gradient (x, y, z) and cost (w)
+ * @param eta             activation distance used for smoothing
+ * @return void
+ */
+__device__ __forceinline__ void
+compute_sphere_gradient(const float3 &bounds_w_radius, const float4 &sphere,
+                        float4 &sum_pt, const float eta = 0.0) {
+  // compute closest point:
+  float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0);
+  compute_closest_point(bounds_w_radius, sphere, cl_pt);
+
+  scale_eta_metric(sphere, cl_pt, eta, sum_pt);
+}
+
+__device__ __forceinline__ void
+compute_sphere_gradient_add(const float3 &bounds_w_radius, const float4 &sphere,
+                            float4 &sum_pt, const float k0 = 1.0,
+                            const float eta = 0.0) {
+  // compute closest point, then accumulate the smoothed cost and gradient
+  // into sum_pt (k0 is currently unused and kept for interface compatibility):
+  float4 cl_pt = make_float4(sphere.x, sphere.y, sphere.z, 0.0);
+  compute_closest_point(bounds_w_radius, sphere, cl_pt);
+  float4 local_sum_pt = make_float4(0.0, 0.0, 0.0, 0.0);
+  scale_eta_metric(sphere, cl_pt, eta, local_sum_pt);
+  sum_pt += local_sum_pt;
+}
+
+__device__ __forceinline__ void check_jump_distance(
+    const float4 &loc_sphere_1, const float4 loc_sphere_0, const float k0,
+    const float eta, const float3 &loc_bounds, const float3 &grad_loc_bounds,
+    float4 &sum_pt,
+    float &curr_jump_distance) // we can pass in interpolated sphere here, also
+                               // need to pass cl_pt for use in
+                               // compute_sphere_gradient & compute_distance
+{
+  const float4 interpolated_sphere =
+      (k0)*loc_sphere_1 + (1 - k0) * loc_sphere_0;
+
+  if (check_sphere_aabb(loc_bounds, interpolated_sphere)) {
+    compute_sphere_gradient_add(grad_loc_bounds, interpolated_sphere, sum_pt,
+                                k0, eta); // true, loc_sphere_1 works better
+    curr_jump_distance += loc_sphere_1.w; // move ahead by the sphere radius
+  } else {
+    const float dist = compute_distance(grad_loc_bounds, interpolated_sphere);
+    curr_jump_distance += max(dist - 2 * loc_sphere_1.w, loc_sphere_1.w);
+  }
+}
+
+///////////////////////////////////////////////////////////////
+// We write out the distance and gradients for all the spheres.
+// So we do not need to initialize the output tensor to 0.
+// Each thread computes the max distance and gradients per sphere.
+// This version should be faster if we have enough spheres/threads
+// to fill the GPU, as it avoids inter-thread communication and the
+// use of shared memory.
+/////////////////////////////////////////////////////////////// + +template +__device__ __forceinline__ void sphere_obb_collision_fn( + const scalar_t *sphere_position, + const int env_idx, + const int bn_sph_idx, + const int sph_idx, + scalar_t *out_distance, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *obb_accel, + const scalar_t *obb_bounds, const scalar_t *obb_mat, + const uint8_t *obb_enable, const int max_nobs, const int nboxes) { + float max_dist = 0; + const int start_box_idx = max_nobs * env_idx; + const float eta = activation_distance[0]; + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + if (sphere_cache.w <= 0.0) { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + return; + } + sphere_cache.w += eta; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + for (int box_idx = 0; box_idx < nboxes; box_idx++) { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + + + // first check if point is inside or outside box: + + if (check_sphere_aabb(loc_bounds, loc_sphere)) { + max_dist = 1; + break; // we exit without checking other cuboids if we found a collision. + } + } + out_distance[bn_sph_idx] = weight[0] * max_dist; +} +template +__device__ __forceinline__ void sphere_obb_distance_fn( + const scalar_t *sphere_position, + const int32_t env_idx, + const int bn_sph_idx, + const int sph_idx, + scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs, + const int nboxes, const bool transform_back) { + float max_dist = 0.0; + const float eta = activation_distance[0]; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + // Load sphere_position input + float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4]; + if (sphere_cache.w <= 0.0) { + // write zeros for cost: + out_distance[bn_sph_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bn_sph_idx] !=0) { + sparsity_idx[bn_sph_idx] = 0; + *(float4 *)& closest_pt[bn_sph_idx * 4] = make_float4(0.0); + } + return; + } + sphere_cache.w += eta; + const int start_box_idx = max_nobs * env_idx; + + float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = make_float3(0.0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + + load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); + + transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere); + + // first check if point is inside or outside box: + if (check_sphere_aabb(loc_bounds, loc_sphere)) { + // compute closest point: + loc_bounds = loc_bounds + loc_sphere.w; + + // using same primitive functions: + float4 cl; + compute_sphere_gradient(loc_bounds, loc_sphere, cl, eta); + + max_dist += cl.w; + if (transform_back) { + 
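+        // The gradient was accumulated in the cuboid's local frame; rotate it
+        // back into the world frame (rotation only -- no translation, since
+        // cl is a direction) before it is written into max_grad: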
inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad); + + //inv_transform_vec_quat_add(&obb_mat[(start_box_idx + box_idx) * 7], cl, + // max_grad); + } + } + } + + // sparsity opt: + if (max_dist == 0) { + if (sparsity_idx[bn_sph_idx] == 0) { + return; + } + sparsity_idx[bn_sph_idx] = 0; + if (transform_back) { + *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros + } + out_distance[bn_sph_idx] = 0.0; + return; + } + // else max_dist != 0 + max_dist = weight[0] * max_dist; + + if (transform_back) { + *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad; + } + out_distance[bn_sph_idx] = max_dist; + sparsity_idx[bn_sph_idx] = 1; +} + +template +__device__ __forceinline__ void swept_sphere_obb_distance_fn( + const scalar_t *sphere_position, + const int env_idx, const int b_idx, + const int h_idx, const int sph_idx, + scalar_t *out_distance, + scalar_t *closest_pt, + uint8_t *sparsity_idx, + const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, + const uint8_t *obb_enable, + const int max_nobs, + const int nboxes, const int batch_size, const int horizon, + const int nspheres, const int sweep_steps, + const bool transform_back) { + // create shared memory to do warp wide reductions: + // warp wide reductions should only happen across nspheres in same batch and horizon + // + // extern __shared__ float psum[]; + + const int sw_steps = sweep_steps; + const float fl_sw_steps = sw_steps; + const int start_box_idx = max_nobs * env_idx; + const int b_addrs = + b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx; + + // We read the same obstacles across + + // Load sphere_position input + // if h_idx == horizon -1, we just read the same index + const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx; + + float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx*4]; + + + + if (sphere_1_cache.w <= 0.0) { + // write zeros for cost: + out_distance[bhs_idx] = 0; + + // write zeros for gradient if not zero: + if (sparsity_idx[bhs_idx] !=0) { + sparsity_idx[b_addrs + h_idx * nspheres + sph_idx] = 0; + *(float4 *)& closest_pt[bhs_idx * 4] = make_float4(0.0); + + } + + return; + } + bool sweep_back = false; + bool sweep_fwd = false; + float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len; + + float max_dist = 0.0; + float3 max_grad = make_float3(0.0, 0.0, 0.0); + + const float dt = speed_dt[0]; + const float eta = activation_distance[0]; + sphere_1_cache.w += eta; + float4 sphere_0_cache, sphere_2_cache; + if (h_idx > 0) { + sphere_0_cache = *(float4 *)&sphere_position[b_addrs*4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4]; + sphere_0_cache.w = sphere_1_cache.w; + sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache); + sphere_0_len = sphere_0_distance + sphere_0_cache.w * 2; + if (sphere_0_distance > 0.0) { + sweep_back = true; + } + } + + if (h_idx < horizon - 1) { + sphere_2_cache = *(float4 *)&sphere_position[b_addrs*4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4]; + sphere_2_cache.w = sphere_1_cache.w; + sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache); + sphere_2_len = sphere_2_distance + sphere_2_cache.w * 2; + if (sphere_2_distance > 0.0) { + sweep_fwd = true; + } + } + float4 loc_sphere_0, loc_sphere_1, loc_sphere_2; + float k0 = 0.0; + + + //float4 loc_sphere = make_float4(0.0); + float4 obb_quat = make_float4(0.0); + float3 obb_pos = make_float3(0.0); + float3 loc_bounds = 
make_float3(0.0); + + for (int box_idx = 0; box_idx < nboxes; box_idx++) { + if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle + { + continue; + } + + // read position and quaternion: + load_obb_pose(&obb_mat[(start_box_idx + box_idx)*8], obb_pos, + obb_quat); + load_obb_bounds(&obb_bounds[(start_box_idx + box_idx)*4], loc_bounds); + float curr_jump_distance = 0.0; + + const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius doesn't change + + transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere_1); + + + + // assuming sphere position is in box frame: + // read data: + float4 sum_pt = make_float4(0.0, 0.0, 0.0, 0.0); + + // check at exact timestep: + if (check_sphere_aabb(loc_bounds, loc_sphere_1)) { + compute_sphere_gradient(grad_loc_bounds, loc_sphere_1, sum_pt, eta); + curr_jump_distance = loc_sphere_1.w; // TODO: check if this is better + } else if (sweep_back || sweep_fwd) { + // there is no collision, compute the distance to obstacle: + curr_jump_distance = compute_distance(grad_loc_bounds, loc_sphere_1) - + loc_sphere_1.w;// - eta; + } + const float jump_mid_distance = curr_jump_distance; + // compute distance between sweep spheres: + if (sweep_back && jump_mid_distance < sphere_0_distance / 2) { + transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0); + + // get unit vector: + // loc_sphere_0 = (loc_sphere_0 - loc_sphere_1)/(sphere_0_len); + + // loop over sweep steps and accumulate distance: + for (int j = 0; j < sw_steps; j++) { + // jump by current jump distance: + + // when sweep_steps == 0, then we only check at loc_sphere_1. + // do interpolation from t=1 to t=0 (sweep backward) + + if (curr_jump_distance >= (sphere_0_len / 2)) { + break; + } + k0 = 1 - (curr_jump_distance / sphere_0_len); + check_jump_distance(loc_sphere_1, loc_sphere_0, k0, eta, loc_bounds, + grad_loc_bounds, sum_pt, curr_jump_distance); + } + } + if (sweep_fwd && jump_mid_distance < (sphere_2_len / 2)) { + curr_jump_distance = jump_mid_distance; + transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2); + + + for (int j = 0; j < sw_steps; j++) { + + if (curr_jump_distance >= (sphere_2_len / 2)) { + break; + } + k0 = 1 - curr_jump_distance / sphere_2_len; + check_jump_distance(loc_sphere_1, loc_sphere_2, k0, eta, loc_bounds, + grad_loc_bounds, sum_pt, curr_jump_distance); + } + } + if (sum_pt.w > 0) { + max_dist += sum_pt.w; + + // transform point back if required: + if (transform_back) { + inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad); + } + // break;// break after first obstacle collision + } + } + + + // sparsity opt: + if (max_dist == 0) { + if (sparsity_idx[bhs_idx] == 0) { + return; + } + sparsity_idx[bhs_idx] = 0; + if (transform_back) { + *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros + } + out_distance[bhs_idx] = 0.0; + return; + } + + // computer speed metric here: + if (enable_speed_metric) + { + if (sweep_back && sweep_fwd) { + scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt, + transform_back, max_dist, max_grad); + } + } + max_dist = weight[0] * max_dist; + + if (transform_back) { + *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad; + } + sparsity_idx[bhs_idx] = 1; + + out_distance[bhs_idx] = max_dist; +} + +/** + * @brief Swept Collision checking. Note: This function currently does not + * implement skipping computation based on distance (which is done in + * swept_sphere_obb_distance_fn). 
+ *
+ * @tparam scalar_t
+ * @param sphere_position
+ * @param env_idx
+ * @param b_idx
+ * @param h_idx
+ * @param sph_idx
+ * @param out_distance
+ * @param weight
+ * @param activation_distance
+ * @param obb_accel
+ * @param obb_bounds
+ * @param obb_mat
+ * @param obb_enable
+ * @param max_nobs
+ * @param nboxes
+ * @param batch_size
+ * @param horizon
+ * @param nspheres
+ * @param sweep_steps
+ * @return void
+ */
+template <typename scalar_t>
+__device__ __forceinline__ void swept_sphere_obb_collision_fn(
+    const scalar_t *sphere_position,
+    const int env_idx, const int b_idx,
+    const int h_idx, const int sph_idx, scalar_t *out_distance,
+    const scalar_t *weight, const scalar_t *activation_distance,
+    const scalar_t *obb_accel, const scalar_t *obb_bounds,
+    const scalar_t *obb_mat, const uint8_t *obb_enable, const int max_nobs,
+    const int nboxes, const int batch_size, const int horizon,
+    const int nspheres, const int sweep_steps) {
+
+  const int sw_steps = sweep_steps;
+  const float fl_sw_steps = 2 * sw_steps + 1;
+  float max_dist = 0.0;
+  const float eta = activation_distance[0];
+  const int b_addrs =
+      b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx;
+  const int start_box_idx = max_nobs * env_idx;
+  const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx;
+
+  // We read the same set of obstacles for every thread in this batch.
+
+  // Load sphere_position input
+  // if h_idx == horizon -1, we just read the same index
+  float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];
+  if (sphere_1_cache.w <= 0.0) {
+    out_distance[b_addrs + h_idx * nspheres + sph_idx] = 0.0;
+    return;
+  }
+  sphere_1_cache.w += eta;
+
+  float4 sphere_0_cache, sphere_2_cache;
+  if (h_idx > 0) {
+    sphere_0_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 +
+                                                 sph_idx * 4];
+    sphere_0_cache.w += eta;
+  }
+
+  if (h_idx < horizon - 1) {
+    sphere_2_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 +
+                                                 sph_idx * 4];
+    sphere_2_cache.w += eta;
+  }
+  float4 loc_sphere_0, loc_sphere_1, loc_sphere_2;
+  float4 interpolated_sphere;
+  float k0, k1;
+  float in_obb_mat[7];
+  for (int box_idx = 0; box_idx < nboxes; box_idx++) {
+    // read position and quaternion:
+    if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
+    {
+      continue;
+    }
+
+#pragma unroll
+    for (int i = 0; i < 7; i++) {
+      in_obb_mat[i] = obb_mat[(start_box_idx + box_idx) * 7 + i];
+    }
+
+    float3 loc_bounds =
+        *(float3 *)&obb_bounds[(start_box_idx + box_idx) * 3]; // /2
+    loc_bounds = loc_bounds / 2;
+
+    transform_sphere_quat(&in_obb_mat[0], sphere_1_cache, loc_sphere_1);
+
+    if (check_sphere_aabb(loc_bounds, loc_sphere_1)) {
+      max_dist = 1;
+    }
+    if (h_idx > 0) {
+
+      transform_sphere_quat(&in_obb_mat[0], sphere_0_cache, loc_sphere_0);
+
+      // loop over sweep steps and accumulate distance:
+      for (int j = 0; j < sw_steps; j++) {
+        // when sweep_steps == 0, then we only check at loc_sphere_1.
+ // do interpolation from t=1 to t=0 (sweep backward) + k0 = (j + 1) / (fl_sw_steps); + k1 = 1 - k0; + interpolated_sphere = k0 * loc_sphere_1 + (k1)*loc_sphere_0; + if (check_sphere_aabb(loc_bounds, interpolated_sphere)) { + max_dist = 1; + break; + } + } + } + + if (h_idx < horizon - 1) { + + transform_sphere_quat(&in_obb_mat[0], sphere_2_cache, loc_sphere_2); + + for (int j = 0; j < sw_steps; j++) { + // do interpolation from t=1 to t=2 (sweep forward): + + k0 = (j + 1) / (fl_sw_steps); + k1 = 1 - k0; + interpolated_sphere = k0 * loc_sphere_1 + (k1)*loc_sphere_2; + if (check_sphere_aabb(loc_bounds, interpolated_sphere)) { + max_dist = 1; + break; + } + } + } + if (max_dist > 0) { + break; + } + } + out_distance[b_addrs + h_idx * nspheres + sph_idx] = weight[0] * max_dist; +} + +template +__global__ void sphere_obb_distance_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *obb_accel, + const scalar_t *obb_bounds, const scalar_t *obb_mat, + const uint8_t *obb_enable, const int32_t *n_env_obb, + const int32_t *env_query_idx, + const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const bool transform_back) { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + const int bn_sph_idx = + b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; + + int env_idx=0; + int env_nboxes=n_env_obb[0]; + if(batch_env_t) + { + + env_idx = + env_query_idx[b_idx]; // read env idx from current batch idx + env_nboxes = + n_env_obb[env_idx]; // read nboxes in current environment + } + + sphere_obb_distance_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance, + closest_pt, sparsity_idx, weight, activation_distance, + obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs, + env_nboxes, transform_back); + // return the sphere distance here: + // sync threads and do block level reduction: + + +} + +template +__global__ void swept_sphere_obb_distance_jump_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres, const int sweep_steps, const bool transform_back) { + + // This kernel jumps by sdf to only get gradients at collision points. 
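+  //
+  // "Jumping by sdf" here is sphere marching along the swept segment: at a
+  // collision-free interpolated sphere, the distance to the cuboid is a lower
+  // bound on the free space around it, so the sweep can safely skip ahead by
+  // (roughly) that distance instead of sampling every interpolation step.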
+ + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + + const int env_idx = 0; + + + const int env_nboxes = n_env_obb[env_idx]; + swept_sphere_obb_distance_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, + sparsity_idx, weight, activation_distance, speed_dt, obb_accel, + obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, + horizon, nspheres, sweep_steps, transform_back); +} + +template +__global__ void swept_sphere_obb_distance_jump_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + scalar_t *closest_pt, uint8_t *sparsity_idx, const scalar_t *weight, + const scalar_t *activation_distance, const scalar_t *speed_dt, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const int sweep_steps, + const bool transform_back) { + // This kernel jumps by sdf to only get gradients at collision points. + + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + //const int sph_idx = (t_idx - b_idx * (horizon * nspheres)) / horizon; + //const int h_idx = (t_idx - b_idx * horizon * nspheres - sph_idx * horizon); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + + const int env_idx = env_query_idx[b_idx]; + const int env_nboxes = n_env_obb[env_idx]; + + swept_sphere_obb_distance_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt, + sparsity_idx, weight, activation_distance, speed_dt, obb_accel, + obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size, + horizon, nspheres, sweep_steps, transform_back); +} +template +__global__ void swept_sphere_obb_collision_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres, const int sweep_steps) { + + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + + const int env_idx = 0; + const int env_nboxes = n_env_obb[env_idx]; + + swept_sphere_obb_collision_fn( + sphere_position, 
env_idx, b_idx, h_idx, sph_idx, out_distance, weight, + activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); +} + +template +__global__ void swept_sphere_obb_collision_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_pose, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres, + const int sweep_steps) { + + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + + const int env_idx = env_query_idx[b_idx]; + const int env_nboxes = n_env_obb[env_idx]; + + swept_sphere_obb_collision_fn( + sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight, + activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable, + max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps); +} + +template +__global__ void sphere_obb_collision_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int max_nobs, const int batch_size, + const int horizon, const int nspheres) { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres || b_idx >= batch_size || h_idx >= horizon) { + return; + } + const int env_idx = 0; + const int env_nboxes = n_env_obb[env_idx]; + const int bn_sph_idx = + b_idx * horizon * nspheres + h_idx * nspheres + sph_idx; + sphere_obb_collision_fn(sphere_position, + env_idx, bn_sph_idx,sph_idx, out_distance, + weight, activation_distance, obb_accel, obb_bounds, + obb_mat, obb_enable, max_nobs, env_nboxes); +} + +template +__global__ void sphere_obb_collision_batch_env_kernel( + const scalar_t *sphere_position, + scalar_t *out_distance, + const scalar_t *weight, const scalar_t *activation_distance, + const scalar_t *obb_accel, const scalar_t *obb_bounds, + const scalar_t *obb_mat, const uint8_t *obb_enable, + const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs, + const int batch_size, const int horizon, const int nspheres) { + // spheres_per_block is number of spheres in a thread + // compute local sphere batch by first dividing threadidx/nboxes + const int t_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int b_idx = t_idx / (horizon * nspheres); + const int h_idx = (t_idx - b_idx * (horizon * nspheres)) / nspheres; + const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres); + if (sph_idx >= nspheres 
|| b_idx >= batch_size || h_idx >= horizon) {
+    return;
+  }
+  const int env_idx = env_query_idx[b_idx];
+  const int env_nboxes = n_env_obb[env_idx];
+  const int bn_sph_idx =
+      b_idx * horizon * nspheres + h_idx * nspheres + sph_idx;
+  sphere_obb_collision_fn(sphere_position, env_idx, bn_sph_idx, sph_idx,
+                          out_distance, weight, activation_distance, obb_accel,
+                          obb_bounds, obb_mat, obb_enable, max_nobs,
+                          env_nboxes);
+}
+} // namespace Geometry
+} // namespace Curobo
+
+std::vector<torch::Tensor>
+sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3
+                torch::Tensor distance,
+                torch::Tensor closest_point, // batch size, 3
+                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                const torch::Tensor activation_distance,
+                const torch::Tensor obb_accel,  // n_boxes, 4, 4
+                const torch::Tensor obb_bounds, // n_boxes, 3
+                const torch::Tensor obb_pose,   // n_boxes, 4, 4
+                const torch::Tensor obb_enable, // n_envs * max_nobs
+                const torch::Tensor n_env_obb,  // n_envs
+                const torch::Tensor env_query_idx, // batch_size
+                const int max_nobs, const int batch_size, const int horizon,
+                const int n_spheres, const bool transform_back,
+                const bool compute_distance, const bool use_batch_env) {
+
+  using namespace Curobo::Geometry;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int bnh_spheres = n_spheres * batch_size * horizon;
+
+  int threadsPerBlock = bnh_spheres;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
+  if (use_batch_env) {
+
+    if (compute_distance) {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_clpt", ([&] {
+            sphere_obb_distance_kernel<scalar_t, true>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(),
+                    closest_point.data_ptr<scalar_t>(),
+                    sparsity_idx.data_ptr<uint8_t>(),
+                    weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(),
+                    env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres, transform_back);
+          }));
+    } else {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_collision", ([&] {
+            sphere_obb_collision_batch_env_kernel<scalar_t>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(),
+                    env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres);
+          }));
+    }
+
+  } else {
+
+    if (compute_distance) {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_clpt", ([&] {
+            sphere_obb_distance_kernel<scalar_t, false>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(),
+                    closest_point.data_ptr<scalar_t>(),
+                    sparsity_idx.data_ptr<uint8_t>(),
+                    weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(),
+                    env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres, transform_back);
+          }));
+    } else {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_collision", ([&] {
+            sphere_obb_collision_kernel<scalar_t>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres);
+          }));
+    }
+  }
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {distance, closest_point, sparsity_idx};
+}
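+
+// Launch-shape example (illustrative numbers, not part of the API): for
+// batch_size = 2, horizon = 30 and n_spheres = 64, bnh_spheres = 3840, so the
+// grid is ceil(3840 / 128) = 30 blocks of 128 threads, and each thread
+// decodes its (b_idx, h_idx, sph_idx) triple from its flat thread id:
+//   b_idx   = t_idx / (horizon * nspheres); // 0 or 1
+//   h_idx   = (t_idx / nspheres) % horizon; // 0..29
+//   sph_idx = t_idx % nspheres;             // 0..63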
+
+std::vector<torch::Tensor> swept_sphere_obb_clpt(
+    const torch::Tensor sphere_position, // batch_size, 3
+
+    torch::Tensor distance, // batch_size, 1
+    torch::Tensor
+        closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
+    torch::Tensor sparsity_idx, const torch::Tensor weight,
+    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
+    const torch::Tensor obb_accel,  // n_boxes, 4, 4
+    const torch::Tensor obb_bounds, // n_boxes, 3
+    const torch::Tensor obb_pose,   // n_boxes, 4, 4
+    const torch::Tensor obb_enable, // n_envs * max_nobs
+    const torch::Tensor n_env_obb,  // n_envs
+    const torch::Tensor env_query_idx, // batch_size
+    const int max_nobs, const int batch_size, const int horizon,
+    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
+    const bool transform_back, const bool compute_distance,
+    const bool use_batch_env) {
+  using namespace Curobo::Geometry;
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  const int bnh_spheres = n_spheres * batch_size * horizon;
+  int threadsPerBlock = bnh_spheres;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;
+
+  if (use_batch_env) {
+    if (compute_distance) {
+      if (enable_speed_metric) {
+        AT_DISPATCH_FLOATING_TYPES(
+            distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
+              swept_sphere_obb_distance_jump_batch_env_kernel<scalar_t, true>
+                  <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                      sphere_position.data_ptr<scalar_t>(),
+                      distance.data_ptr<scalar_t>(),
+                      closest_point.data_ptr<scalar_t>(),
+                      sparsity_idx.data_ptr<uint8_t>(),
+                      weight.data_ptr<scalar_t>(),
+                      activation_distance.data_ptr<scalar_t>(),
+                      speed_dt.data_ptr<scalar_t>(),
+                      obb_accel.data_ptr<scalar_t>(),
+                      obb_bounds.data_ptr<scalar_t>(),
+                      obb_pose.data_ptr<scalar_t>(),
+                      obb_enable.data_ptr<uint8_t>(),
+                      n_env_obb.data_ptr<int32_t>(),
+                      env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                      horizon, n_spheres, sweep_steps, transform_back);
+            }));
+      } else {
+        AT_DISPATCH_FLOATING_TYPES(
+            distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
+              swept_sphere_obb_distance_jump_batch_env_kernel<scalar_t, false>
+                  <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                      sphere_position.data_ptr<scalar_t>(),
+                      distance.data_ptr<scalar_t>(),
+                      closest_point.data_ptr<scalar_t>(),
+                      sparsity_idx.data_ptr<uint8_t>(),
+                      weight.data_ptr<scalar_t>(),
+                      activation_distance.data_ptr<scalar_t>(),
+                      speed_dt.data_ptr<scalar_t>(),
+                      obb_accel.data_ptr<scalar_t>(),
+                      obb_bounds.data_ptr<scalar_t>(),
+                      obb_pose.data_ptr<scalar_t>(),
+                      obb_enable.data_ptr<uint8_t>(),
+                      n_env_obb.data_ptr<int32_t>(),
+                      env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                      horizon, n_spheres, sweep_steps, transform_back);
+            }));
+      }
+
+    } else {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_collision", ([&] {
+            swept_sphere_obb_collision_batch_env_kernel<scalar_t>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(),
+                    env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres, sweep_steps);
+          }));
+    }
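+
+    // Note: enable_speed_metric is resolved at compile time through the
+    // kernel's boolean template parameter; the two dispatch branches above
+    // launch different instantiations (<scalar_t, true> / <scalar_t, false>),
+    // so the per-thread inner loop carries no runtime branch for this flag.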
+
+  } else {
+    if (compute_distance) {
+      if (enable_speed_metric) {
+        AT_DISPATCH_FLOATING_TYPES(
+            distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
+              swept_sphere_obb_distance_jump_kernel<scalar_t, true>
+                  <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                      sphere_position.data_ptr<scalar_t>(),
+                      distance.data_ptr<scalar_t>(),
+                      closest_point.data_ptr<scalar_t>(),
+                      sparsity_idx.data_ptr<uint8_t>(),
+                      weight.data_ptr<scalar_t>(),
+                      activation_distance.data_ptr<scalar_t>(),
+                      speed_dt.data_ptr<scalar_t>(),
+                      obb_accel.data_ptr<scalar_t>(),
+                      obb_bounds.data_ptr<scalar_t>(),
+                      obb_pose.data_ptr<scalar_t>(),
+                      obb_enable.data_ptr<uint8_t>(),
+                      n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
+                      horizon, n_spheres, sweep_steps, transform_back);
+            }));
+      } else {
+        AT_DISPATCH_FLOATING_TYPES(
+            distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
+              swept_sphere_obb_distance_jump_kernel<scalar_t, false>
+                  <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                      sphere_position.data_ptr<scalar_t>(),
+                      distance.data_ptr<scalar_t>(),
+                      closest_point.data_ptr<scalar_t>(),
+                      sparsity_idx.data_ptr<uint8_t>(),
+                      weight.data_ptr<scalar_t>(),
+                      activation_distance.data_ptr<scalar_t>(),
+                      speed_dt.data_ptr<scalar_t>(),
+                      obb_accel.data_ptr<scalar_t>(),
+                      obb_bounds.data_ptr<scalar_t>(),
+                      obb_pose.data_ptr<scalar_t>(),
+                      obb_enable.data_ptr<uint8_t>(),
+                      n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
+                      horizon, n_spheres, sweep_steps, transform_back);
+            }));
+      }
+    } else {
+      AT_DISPATCH_FLOATING_TYPES(
+          distance.scalar_type(), "SphereObb_collision", ([&] {
+            swept_sphere_obb_collision_kernel<scalar_t>
+                <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                    sphere_position.data_ptr<scalar_t>(),
+                    distance.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
+                    activation_distance.data_ptr<scalar_t>(),
+                    obb_accel.data_ptr<scalar_t>(),
+                    obb_bounds.data_ptr<scalar_t>(),
+                    obb_pose.data_ptr<scalar_t>(),
+                    obb_enable.data_ptr<uint8_t>(),
+                    n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
+                    horizon, n_spheres, sweep_steps);
+          }));
+    }
+  }
+
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {distance, closest_point, sparsity_idx};
+}
diff --git a/src/curobo/curobolib/cpp/tensor_step_cuda.cpp b/src/curobo/curobolib/cpp/tensor_step_cuda.cpp
new file mode 100644
index 00000000..0209db22
--- /dev/null
+++ b/src/curobo/curobolib/cpp/tensor_step_cuda.cpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
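+
+// This translation unit provides the Python-facing wrappers for the tensor
+// step (integration) kernels: each wrapper validates that its tensors live on
+// the GPU and are contiguous, then forwards raw data pointers to the CUDA
+// implementations declared below.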
+#include <torch/extension.h>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <vector>
+
+std::vector<torch::Tensor> step_position_clique(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor traj_dt, const int batch_size, const int horizon,
+    const int dof);
+std::vector<torch::Tensor> step_position_clique2(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor traj_dt, const int batch_size, const int horizon,
+    const int dof, const int mode);
+std::vector<torch::Tensor> step_position_clique2_idx(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor start_idx, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof, const int mode);
+
+std::vector<torch::Tensor> backward_step_position_clique(
+    torch::Tensor out_grad_position, const torch::Tensor grad_position,
+    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof);
+std::vector<torch::Tensor> backward_step_position_clique2(
+    torch::Tensor out_grad_position, const torch::Tensor grad_position,
+    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof, const int mode);
+
+std::vector<torch::Tensor>
+step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
+                  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+                  const torch::Tensor u_acc, const torch::Tensor start_position,
+                  const torch::Tensor start_velocity,
+                  const torch::Tensor start_acceleration,
+                  const torch::Tensor traj_dt, const int batch_size,
+                  const int horizon, const int dof, const bool use_rk2 = true);
+
+std::vector<torch::Tensor> step_acceleration_idx(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_acc, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor start_idx, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof,
+    const bool use_rk2 = true);
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
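+
+// The macros below guard every wrapper: CHECK_INPUT(x) asserts that x is a
+// CUDA tensor and is contiguous, so the kernels can safely index raw data
+// pointers. For example, passing a CPU tensor or a non-contiguous slice
+// (such as a transposed view) trips the corresponding AT_ASSERTM at runtime.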
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +std::vector step_position_clique_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof) { + const at::cuda::OptionalCUDAGuard guard(u_position.device()); + + CHECK_INPUT(u_position); + CHECK_INPUT(out_position); + CHECK_INPUT(out_velocity); + CHECK_INPUT(out_acceleration); + CHECK_INPUT(out_jerk); + CHECK_INPUT(start_position); + CHECK_INPUT(start_velocity); + CHECK_INPUT(start_acceleration); + CHECK_INPUT(traj_dt); + + return step_position_clique(out_position, out_velocity, out_acceleration, + out_jerk, u_position, start_position, + start_velocity, start_acceleration, traj_dt, + batch_size, horizon, dof); +} + +std::vector step_position_clique2_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof, + const int mode) { + const at::cuda::OptionalCUDAGuard guard(u_position.device()); + + CHECK_INPUT(u_position); + CHECK_INPUT(out_position); + CHECK_INPUT(out_velocity); + CHECK_INPUT(out_acceleration); + CHECK_INPUT(out_jerk); + CHECK_INPUT(start_position); + CHECK_INPUT(start_velocity); + CHECK_INPUT(start_acceleration); + CHECK_INPUT(traj_dt); + + return step_position_clique2(out_position, out_velocity, out_acceleration, + out_jerk, u_position, start_position, + start_velocity, start_acceleration, traj_dt, + batch_size, horizon, dof, mode); +} + +std::vector step_position_clique2_idx_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_position, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const int mode) { + const at::cuda::OptionalCUDAGuard guard(u_position.device()); + + CHECK_INPUT(u_position); + CHECK_INPUT(out_position); + CHECK_INPUT(out_velocity); + CHECK_INPUT(out_acceleration); + CHECK_INPUT(out_jerk); + CHECK_INPUT(start_position); + CHECK_INPUT(start_velocity); + CHECK_INPUT(start_acceleration); + CHECK_INPUT(traj_dt); + CHECK_INPUT(start_idx); + + return step_position_clique2_idx( + out_position, out_velocity, out_acceleration, out_jerk, u_position, + start_position, start_velocity, start_acceleration, start_idx, traj_dt, + batch_size, horizon, dof, mode); +} + +std::vector backward_step_position_clique_wrapper( + torch::Tensor out_grad_position, const torch::Tensor grad_position, + const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, + const torch::Tensor grad_jerk, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof) { + const at::cuda::OptionalCUDAGuard guard(grad_position.device()); + + 
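+  // OptionalCUDAGuard switches the current CUDA device to this tensor's
+  // device for the duration of the call, which keeps the wrapper correct on
+  // multi-GPU systems where the tensors may not live on device 0.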
CHECK_INPUT(out_grad_position); + CHECK_INPUT(grad_position); + CHECK_INPUT(grad_velocity); + CHECK_INPUT(grad_acceleration); + CHECK_INPUT(grad_jerk); + CHECK_INPUT(traj_dt); + + return backward_step_position_clique( + out_grad_position, grad_position, grad_velocity, grad_acceleration, + grad_jerk, traj_dt, batch_size, horizon, dof); +} +std::vector backward_step_position_clique2_wrapper( + torch::Tensor out_grad_position, const torch::Tensor grad_position, + const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration, + const torch::Tensor grad_jerk, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const int mode) { + const at::cuda::OptionalCUDAGuard guard(grad_position.device()); + + CHECK_INPUT(out_grad_position); + CHECK_INPUT(grad_position); + CHECK_INPUT(grad_velocity); + CHECK_INPUT(grad_acceleration); + CHECK_INPUT(grad_jerk); + CHECK_INPUT(traj_dt); + + return backward_step_position_clique2( + out_grad_position, grad_position, grad_velocity, grad_acceleration, + grad_jerk, traj_dt, batch_size, horizon, dof, mode); +} + +std::vector step_acceleration_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_acc, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor traj_dt, const int batch_size, const int horizon, + const int dof, const bool use_rk2 = true) { + const at::cuda::OptionalCUDAGuard guard(u_acc.device()); + + CHECK_INPUT(u_acc); + CHECK_INPUT(out_position); + CHECK_INPUT(out_velocity); + CHECK_INPUT(out_acceleration); + CHECK_INPUT(out_jerk); + CHECK_INPUT(start_position); + CHECK_INPUT(start_velocity); + CHECK_INPUT(start_acceleration); + CHECK_INPUT(traj_dt); + + return step_acceleration(out_position, out_velocity, out_acceleration, + out_jerk, u_acc, start_position, start_velocity, + start_acceleration, traj_dt, batch_size, horizon, + dof, use_rk2); +} + +std::vector step_acceleration_idx_wrapper( + torch::Tensor out_position, torch::Tensor out_velocity, + torch::Tensor out_acceleration, torch::Tensor out_jerk, + const torch::Tensor u_acc, const torch::Tensor start_position, + const torch::Tensor start_velocity, const torch::Tensor start_acceleration, + const torch::Tensor start_idx, const torch::Tensor traj_dt, + const int batch_size, const int horizon, const int dof, + const bool use_rk2 = true) { + const at::cuda::OptionalCUDAGuard guard(u_acc.device()); + + CHECK_INPUT(u_acc); + CHECK_INPUT(out_position); + CHECK_INPUT(out_velocity); + CHECK_INPUT(out_acceleration); + CHECK_INPUT(out_jerk); + CHECK_INPUT(start_position); + CHECK_INPUT(start_velocity); + CHECK_INPUT(start_acceleration); + CHECK_INPUT(start_idx); + CHECK_INPUT(traj_dt); + + return step_acceleration_idx(out_position, out_velocity, out_acceleration, + out_jerk, u_acc, start_position, start_velocity, + start_acceleration, start_idx, traj_dt, + batch_size, horizon, dof, use_rk2); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("step_position", &step_position_clique_wrapper, + "Tensor Step Position (curobolib)"); + m.def("step_position2", &step_position_clique2_wrapper, + "Tensor Step Position (curobolib)"); + m.def("step_idx_position2", &step_position_clique2_idx_wrapper, + "Tensor Step Position (curobolib)"); + m.def("step_position_backward", &backward_step_position_clique_wrapper, + "Tensor Step Position (curobolib)"); + m.def("step_position_backward2", 
&backward_step_position_clique2_wrapper, + "Tensor Step Position (curobolib)"); + m.def("step_acceleration", &step_acceleration_wrapper, + "Tensor Step Acceleration (curobolib)"); + m.def("step_acceleration_idx", &step_acceleration_idx_wrapper, + "Tensor Step Acceleration (curobolib)"); +} \ No newline at end of file diff --git a/src/curobo/curobolib/cpp/tensor_step_kernel.cu b/src/curobo/curobolib/cpp/tensor_step_kernel.cu new file mode 100644 index 00000000..d3a9ae00 --- /dev/null +++ b/src/curobo/curobolib/cpp/tensor_step_kernel.cu @@ -0,0 +1,1692 @@ +/* + * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include +#include + +#include "helper_math.h" +#include +#include +#include +#define MAX_H 100 +#define BWD_DIFF -1 +#define CENTRAL_DIFF 0 + +namespace Curobo +{ +namespace TensorStep +{ + +template +__global__ void position_clique_loop_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + const float dt = + traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + + // read start state: + float u_arr[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_position[b_addrs + i * dof + d_idx]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + + // read actions: batch, horizon + out_pos[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 0 + out_acc[h_idx] = + (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } +} + +template +__device__ __forceinline__ void compute_backward_difference(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t 
*start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) { + + const float dt = + traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + + // read start state: + float in_pos[4]; + float st_pos=0.0, st_vel=0.0, st_acc = 0.0; + + #pragma unroll 4 + for (int i=0; i< 4; i++) + { + in_pos[i] = 0.0; + } + + float out_pos=0.0, out_vel=0.0, out_acc=0.0, out_jerk=0.0; + const int b_addrs = b_idx * horizon * dof; + if (h_idx < 4){ + st_pos = start_position[b_offset * dof + d_idx]; + st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + if (h_idx == 0) { + out_pos = st_pos; + out_vel = st_vel; + out_acc = st_acc; // + } + else { + if (h_idx == 1) { + + for(int i=3; i<4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; + } + + + in_pos[0] = st_pos - 2.0 * (st_vel - 0.5 * st_acc*dt) * dt; + + in_pos[1] = st_pos - (st_vel - 0.5 * st_acc*dt) * dt; + + in_pos[2] = st_pos; + + + } else if (h_idx == 2) { + + + for(int i=2; i<4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 - 3 + i)*dof + d_idx]; + } + + in_pos[0] = st_pos - (st_vel - 0.5 * st_acc*dt) * dt; + + in_pos[1] = st_pos; + + + + + } + else if (h_idx == 3) + { + + for(int i=1; i<4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; + } + in_pos[0] = st_pos; + + } + else { // h_idx >= 4 + + for(int i=0; i<4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; + } + } + + out_pos = in_pos[3]; + out_vel = (-in_pos[2] + in_pos[3]) * dt; + out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; + out_jerk = (-in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3]) * dt * dt * dt; + + } + + + // write out: + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; +} + + +template +__device__ __forceinline__ void compute_central_difference_v0(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) { + + + const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + // dt here is actually 1/dt; + + // read start state: + float out_pos=0.0, out_vel=0.0, out_acc=0.0, out_jerk=0.0; + float st_pos=0.0, st_vel=0.0, st_acc = 0.0; + + const int b_addrs = b_idx * horizon * dof; + float in_pos[5]; // create a 5 value scalar + + #pragma unroll 5 + for (int i=0; i< 5; i ++){ + in_pos[i] = 0.0; + } + if (h_idx < 4){ + st_pos = start_position[b_offset * dof + d_idx]; + st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + if (h_idx == 0) + { + out_pos = st_pos; + out_vel = st_vel; + out_acc = st_acc; + } + else if (h_idx < horizon - 2) + { + if (h_idx == 1) + { + + in_pos[0] = st_pos - dt * ( st_vel - (0.5 * st_acc * dt)); // 
start -1, start, u0, u1 + in_pos[1] = st_pos; + in_pos[2] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + + } + else if (h_idx == 2) + { + in_pos[0] = start_position[b_offset * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx];\ + + } + + + else if (h_idx > 2) + { + in_pos[0] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + + } + out_pos = in_pos[2]; + out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; + out_acc = (in_pos[3] + in_pos[1] - 2 * in_pos[2]) * dt * dt; + out_jerk = ((-0.5)*in_pos[0] + in_pos[1] - in_pos[3] + (0.5)*in_pos[4]) * (dt * dt * dt); + + } + else if (h_idx == horizon -2) + { + // use backward difference for jerk + + in_pos[0] = u_position[b_addrs + (h_idx - 1 - 3) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 1 - 2) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 1 - 1) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + + out_pos = in_pos[3]; + out_vel = (0.5 * in_pos[4] - 0.5 * in_pos[2]) * dt; + out_acc = (in_pos[4] + in_pos[2] - 2 * in_pos[3]) * dt * dt; + out_jerk= (-1 * in_pos[0] + 3 * in_pos[1] - 3 * in_pos[2] + in_pos[3] ) * dt * dt * dt; + } + else if (h_idx == horizon -1) + { // use backward difference for vel, acc, jerk + for(int i=0; i<4; i++) + { + in_pos[i] = u_position[b_addrs + (h_idx - 1 -3 + i)*dof + d_idx]; + } + out_pos = in_pos[3]; + out_vel = (-in_pos[2] + in_pos[3]) * dt; + out_acc = (in_pos[1] - 2 * in_pos[2] + in_pos[3]) * dt * dt; + out_jerk = (-in_pos[0] + 3 * in_pos[1] -3 * in_pos[2] + in_pos[3]) * dt * dt * dt; + } + // write out: + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk; +} + + +template +__device__ __forceinline__ void compute_central_difference(scalar_t *out_position_mem, + scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof, + const int b_idx, + const int h_idx, + const int d_idx, + const int b_offset) { + + + const float dt = traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + // dt here is actually 1/dt; + + // read start state: + float out_pos=0.0, out_vel=0.0, out_acc=0.0, out_jerk=0.0; + float st_pos=0.0, st_vel=0.0, st_acc = 0.0; + + const int b_addrs = b_idx * horizon * dof; + float in_pos[5]; // create a 5 value scalar + + #pragma unroll 5 + for (int i=0; i< 5; i ++){ + in_pos[i] = 0.0; + } + if (h_idx < 5){ + st_pos = start_position[b_offset * dof + d_idx]; + 
st_vel = start_velocity[b_offset * dof + d_idx]; + st_acc = start_acceleration[b_offset * dof + d_idx]; + } + + if (h_idx > 3 && h_idx < horizon - 4) + { + in_pos[0] = u_position[b_addrs + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 2 ) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx) * dof + d_idx]; + + } + + + else if (h_idx == 0) + { + in_pos[0] = st_pos - 3 * dt * ( st_vel + (0.5 * st_acc * dt)); // start -1, start, u0, u1 + in_pos[1] = st_pos - 2 * dt * ( st_vel + (0.5 * st_acc * dt)); + in_pos[2] = st_pos - dt * ( st_vel + (0.5 * st_acc * dt)); + in_pos[3] = st_pos; + in_pos[4] = u_position[b_addrs + (h_idx) * dof + d_idx]; + } + + else if (h_idx == 1) + { + in_pos[0] = st_pos - 2 * dt * ( st_vel + (0.5 * st_acc * dt)); // start -1, start, u0, u1 + in_pos[1] = st_pos - dt * ( st_vel + (0.5 * st_acc * dt)); + in_pos[2] = st_pos; + in_pos[3] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx) * dof + d_idx]; + } + + else if (h_idx == 2) + { + in_pos[0] = st_pos - dt * ( st_vel + (0.5 * st_acc * dt)); // start -1, start, u0, u1 + in_pos[1] = st_pos; + in_pos[2] = u_position[b_addrs + (h_idx - 2) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx) * dof + d_idx]; + + } + else if (h_idx == 3) + { + in_pos[0] = st_pos; + in_pos[1] = u_position[b_addrs + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 2 ) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; + in_pos[4] = u_position[b_addrs + (h_idx) * dof + d_idx]; + + } + + else if (h_idx == horizon - 4) + { + in_pos[0] = u_position[b_addrs + (h_idx -4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 2 ) * dof + d_idx]; + in_pos[3] = u_position[b_addrs + (h_idx - 1) * dof + d_idx]; + in_pos[4] = in_pos[3];//in_pos[3]; //u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + + } + + else if (h_idx == horizon - 3) + { + in_pos[0] = u_position[b_addrs + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 3) * dof + d_idx]; + in_pos[2] = u_position[b_addrs + (h_idx - 2) * dof + d_idx]; + in_pos[3] = in_pos[2];//u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[2];//in_pos[3]; //u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + + } + else if (h_idx == horizon - 2) + { + in_pos[0] = u_position[b_addrs + (h_idx - 4) * dof + d_idx]; + in_pos[1] = u_position[b_addrs + (h_idx - 3) * dof + d_idx]; + in_pos[2] = in_pos[1]; + in_pos[3] = in_pos[1];//u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[1];//in_pos[3]; //u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + + } + + else if (h_idx == horizon -1) + { + in_pos[0] = u_position[b_addrs + (h_idx - 4) * dof + d_idx]; + in_pos[1] = in_pos[0]; + in_pos[2] = in_pos[0];//u_position[b_addrs + (h_idx - 1 ) * dof + d_idx]; + in_pos[3] = in_pos[0];//u_position[b_addrs + (h_idx - 1 + 1) * dof + d_idx]; + in_pos[4] = in_pos[0];//in_pos[3]; //u_position[b_addrs + (h_idx - 1 + 2) * dof + d_idx]; + } + out_pos = in_pos[2]; + // out_vel = (0.5 * in_pos[3] - 0.5 * in_pos[1]) * dt; + out_vel = ((0.083333333f) * in_pos[0] - (0.666666667f) * in_pos[1] + (0.666666667f) * in_pos[3] + (-0.083333333f) * in_pos[4]) * dt; + + // out_acc = (in_pos[3] + 
in_pos[1] - 2.0 * in_pos[2]) * dt * dt;
+  out_acc = ((-0.083333333f) * in_pos[0] + (1.333333333f) * in_pos[1] + (-2.5f) * in_pos[2] + (1.333333333f) * in_pos[3] + (-0.083333333f) * in_pos[4]) * dt * dt;
+  out_jerk = ((-0.5f) * in_pos[0] + in_pos[1] - in_pos[3] + (0.5f) * in_pos[4]) * (dt * dt * dt);
+
+  // write out:
+  out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_pos;
+  out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_vel;
+  out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_acc;
+  out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk;
+}
+
+
+template <typename scalar_t, int mode>
+__global__ void position_clique_loop_kernel2(
+    scalar_t *out_position_mem, scalar_t *out_velocity_mem,
+    scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem,
+    const scalar_t *u_position, const scalar_t *start_position,
+    const scalar_t *start_velocity, const scalar_t *start_acceleration,
+    const scalar_t *traj_dt, const int batch_size, const int horizon,
+    const int dof) {
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  // number of threads = batch_size * dof * horizon;
+  const int h_idx = tid % horizon;
+  const int d_idx = (tid / horizon) % dof;
+  const int b_idx = tid / (dof * horizon);
+  if (tid >= batch_size * dof * horizon) {
+    return;
+  }
+
+
+  const int b_offset = b_idx;
+  if (mode == BWD_DIFF)
+  {
+    compute_backward_difference(out_position_mem, out_velocity_mem,
+      out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity,
+      start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset);
+  }
+  else if (mode == CENTRAL_DIFF)
+  {
+
+    compute_central_difference(out_position_mem, out_velocity_mem,
+      out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity,
+      start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset);
+
+  }
+  else
+  {
+    assert(false);
+  }
+
+}
+
+
+template <typename scalar_t, int mode>
+__global__ void position_clique_loop_idx_kernel2(
+    scalar_t *out_position_mem, scalar_t *out_velocity_mem,
+    scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem,
+    const scalar_t *u_position, const scalar_t *start_position,
+    const scalar_t *start_velocity, const scalar_t *start_acceleration,
+    const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size,
+    const int horizon, const int dof) {
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  // number of threads = batch_size * dof * horizon;
+  const int h_idx = tid % horizon;
+  const int d_idx = (tid / horizon) % dof;
+  const int b_idx = tid / (dof * horizon);
+  if (tid >= batch_size * dof * horizon) {
+    return;
+  }
+
+
+  const int b_offset = start_idx[b_idx];
+  if (mode == BWD_DIFF)
+  {
+    compute_backward_difference(out_position_mem, out_velocity_mem,
+      out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity,
+      start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset);
+  }
+  else if (mode == CENTRAL_DIFF)
+  {
+
+    compute_central_difference(out_position_mem, out_velocity_mem,
+      out_acceleration_mem, out_jerk_mem, u_position, start_position, start_velocity,
+      start_acceleration, traj_dt, batch_size, horizon, dof, b_idx, h_idx, d_idx, b_offset);
+
+  }
+  else
+  {
+    assert(false);
+  }
+}
+
+template <typename scalar_t>
+__global__ void backward_position_clique_loop_kernel(
+    scalar_t *out_grad_position, const scalar_t *grad_position,
+    const scalar_t *grad_velocity, const scalar_t *grad_acceleration,
+    const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size,
+    const int
horizon, const int dof) { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + const int b_addrs = b_idx * horizon * dof; + // read gradients: + float g_pos[MAX_H]; + float g_vel[MAX_H]; + float g_acc[MAX_H]; + float g_jerk[MAX_H]; + const float dt = traj_dt[0]; + const float dt_2 = dt * dt ;//dt * dt; + const float dt_3 = dt * dt * dt;//dt * dt * dt; + + // not used index == 0 + g_pos[0] = 0.0; + g_vel[0] = 0.0; + g_acc[0] = 0.0; + g_jerk[0] = 0.0; +#pragma unroll + for (int h_idx = 1; h_idx < horizon; h_idx++) { + g_pos[h_idx] = grad_position[b_addrs + (h_idx)*dof + d_idx]; + g_vel[h_idx] = grad_velocity[b_addrs + (h_idx)*dof + d_idx]; + g_acc[h_idx] = grad_acceleration[b_addrs + (h_idx)*dof + d_idx]; + g_jerk[h_idx] = grad_jerk[b_addrs + (h_idx)*dof + d_idx]; + } +#pragma unroll + for (int i = 0; i < 4; i++) { + g_vel[horizon + i] = 0.0; + g_acc[horizon + i] = 0.0; + g_jerk[horizon + i] = 0.0; + } + // compute gradient and sum + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { + g_pos[h_idx + 1] += + ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + + (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + + (g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + 3 * g_jerk[h_idx + 3] - + g_jerk[h_idx + 4]) * + dt_3); + } + + // write out: + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { + out_grad_position[b_addrs + (h_idx)*dof + d_idx] = g_pos[h_idx + 1]; + } + out_grad_position[b_addrs + (horizon - 1) * dof + d_idx] = 0.0; +} + + + +template +__global__ void backward_position_clique_loop_backward_difference_kernel2( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + // number of threads = batch_size * dof * horizon; + const int h_idx = tid % horizon; + const int d_idx = (tid / horizon) % dof; + const int b_idx = tid / (dof * horizon); + if (tid >= batch_size * dof * horizon) { + return; + } + const int b_addrs = b_idx * horizon * dof; + if (h_idx == 0) + { + return; + } + + + const float dt = traj_dt[0]; + + // read gradients: + float g_pos= 0.0; + float out_grad = 0.0; + float g_vel[4]; + float g_acc[4]; + float g_jerk[4]; + + #pragma unroll 4 + for (int i=0; i<4; i++) + { + g_vel[i] = 0.0; + g_acc[i] = 0.0; + g_jerk[i] = 0.0; + } + + int hid = h_idx;// + 1; + + g_pos = grad_position[b_addrs + (hid)*dof + d_idx]; + + + + if(hid < horizon - 3) + { + + for (int i=0; i< 4; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; + } + + + } + else if (hid == horizon -3) + { + for (int i=0; i< 3; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; + } + } + + + else if (hid == horizon -2) + { + for (int i=0; i< 2; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; + } + } + else if (hid == horizon -1) + { + for (int i=0; i< 1; i++) + { + g_vel[i] = grad_velocity[b_addrs + (hid + i)*dof 
+ d_idx]; + g_acc[i] = grad_acceleration[b_addrs + (hid + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + (hid + i)*dof + d_idx]; + } + + } + + + + out_grad = (g_pos + + (g_vel[0] - g_vel[1]) * dt + + (g_acc[0] -2 * g_acc[1] + g_acc[2]) * dt * dt + + (g_jerk[0] - 3 * g_jerk[1] + 3 * g_jerk[2] - g_jerk[3]) * dt * dt * dt); + + + + // write out: + out_grad_position[b_addrs + (h_idx-1)*dof + d_idx] = out_grad; + +} + +template +__global__ void backward_position_clique_loop_central_difference_kernel2( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + // number of threads = batch_size * dof * horizon; + const int h_idx = tid % horizon; + const int d_idx = (tid / horizon) % dof; + const int b_idx = tid / (dof * horizon); + if (tid >= batch_size * dof * horizon) { + return; + } + const int b_addrs = b_idx * horizon * dof; + + if (h_idx < 2 || h_idx > horizon - 2) + { + return; + } + + const float dt = traj_dt[0]; + + // read gradients: + //float g_pos= 0.0; + float out_grad = 0.0; + float g_pos[3]; + float g_vel[5]; + float g_acc[5]; + float g_jerk[5]; + + #pragma unroll 5 + for (int i=0; i<5; i++) + { + g_vel[i] = 0.0; + g_acc[i] = 0.0; + g_jerk[i] = 0.0; + } + + int hid = h_idx; + + g_pos[0] = grad_position[b_addrs + (hid)*dof + d_idx]; + g_pos[1] = 0.0; + g_pos[2] = 0.0; + + + if(hid > 1 && h_idx < horizon - 3) + { + #pragma unroll + for (int i=0; i< 5; i++) + { + g_vel[i] = grad_velocity[b_addrs + ((hid -2) + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + ((hid -2) + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + ((hid -2) + i)*dof + d_idx]; + } + out_grad = (g_pos[0] + + //((-0.5) * g_vel[3] + (0.5) * g_vel[1]) * dt + + + ((-0.083333333f) * g_vel[0] + (0.666666667f) * g_vel[1] + (-0.666666667f) * g_vel[3] + (0.083333333f) * g_vel[4]) * dt + + + + + ((-0.083333333f) * g_acc[0] + (1.333333333f) * g_acc[1] + (-2.5f) * g_acc[2] + (1.333333333f) * g_acc[3] + (-0.083333333f) * g_acc[4]) * dt * dt + + //( g_acc[3] + g_acc[1] - (2.0) * g_acc[2]) * dt * dt + + (0.5f * g_jerk[0] - g_jerk[1] + g_jerk[3] - 0.5f * g_jerk[4]) * dt * dt * dt); + + } + else if (hid == horizon -3) + { + #pragma unroll + for (int i=0; i< 4; i++) + { + g_vel[i] = grad_velocity[b_addrs + ((hid - 2) + i)*dof + d_idx]; + g_acc[i] = grad_acceleration[b_addrs + ((hid -2) + i)*dof + d_idx]; + g_jerk[i] = grad_jerk[b_addrs + ((hid -2) + i)*dof + d_idx]; + } + g_pos[1] = grad_position[b_addrs + (hid + 1)*dof + d_idx]; + g_pos[2] = grad_position[b_addrs + (hid + 2)*dof + d_idx]; + + out_grad = (g_pos[0] + g_pos[1] + g_pos[2] + + //((0.5) * g_vel[1] + (0.5) * g_vel[2]) * dt + + ((-0.083333333f) * g_vel[0] + (0.583333333f) * g_vel[1] + (0.583333333f) * g_vel[2] + (-0.083333333f) * g_vel[3]) * dt + + ((-0.083333333f) * g_acc[0] + (1.25f) * g_acc[1] + (-1.25f) * g_acc[2] + (0.083333333f) * g_acc[3]) * dt * dt + + //( g_acc[1] - g_acc[2]) * dt * dt + + (0.5f * g_jerk[0] - 0.5f * g_jerk[1] -0.5f * g_jerk[2] + 0.5f * g_jerk[3]) * dt * dt * dt); + } + + + + // write out: + out_grad_position[b_addrs + (h_idx-2)*dof + d_idx] = out_grad; +} + +// for MPPI: + +template +__global__ void +acceleration_loop_kernel(scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + 
const scalar_t *start_velocity, + const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + // do semi implicit euler integration: + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } +} +template +__global__ void acceleration_loop_rk2_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + // do rk2 integration: + + out_acc[h_idx] = u_arr[h_idx - 1]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx - 1] * dt[h_idx] + + 0.5 * dt[h_idx] * dt[h_idx] * out_acc[h_idx]; + out_vel[h_idx] = out_vel[h_idx - 1] + 0.5 * dt[h_idx] * out_acc[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + 
out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } +} + +template +__global__ void acceleration_loop_idx_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + const int b_offset = start_idx[b_idx]; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_offset * dof + d_idx]; + out_vel[0] = start_velocity[b_offset * dof + d_idx]; + out_acc[0] = start_acceleration[b_offset * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + // do semi implicit euler integration: + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } +} + +template +__global__ void acceleration_loop_idx_rk2_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_acc, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const int32_t *start_idx, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + // read start state: + float u_arr[MAX_H], dt[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + const int b_offset = start_idx[b_idx]; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_acc[b_addrs + i * dof + d_idx]; + dt[i] = traj_dt[i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_offset * dof + d_idx]; + out_vel[0] = start_velocity[b_offset * dof + d_idx]; + out_acc[0] = start_acceleration[b_offset * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + // do semi implicit euler 
integration: + + out_acc[h_idx] = u_arr[h_idx - 1]; + out_vel[h_idx] = out_vel[h_idx - 1] + out_acc[h_idx] * dt[h_idx]; + out_pos[h_idx] = out_pos[h_idx - 1] + out_vel[h_idx] * dt[h_idx]; + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) / dt[h_idx]; + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + h_idx * dof + d_idx] = out_jerk[h_idx]; + } +} + +// Not used + +template +__global__ void position_clique_kernel( + scalar_t *out_position, scalar_t *out_velocity, scalar_t *out_acceleration, + scalar_t *out_jerk, const scalar_t *u_position, + const scalar_t *start_position, const scalar_t *start_velocity, + const scalar_t *start_acceleration, const scalar_t *traj_dt, + const int batch_size, const int horizon, const int dof) { + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (horizon * dof); + const int h_idx = (tid - b_idx * (horizon * dof)) / dof; + const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); + if (b_idx >= batch_size || h_idx >= horizon || d_idx >= dof) { + return; + } + + float new_position, new_velocity, new_acceleration, new_jerk; + const float dt = traj_dt[h_idx]; + + // read actions: batch, horizon + if (h_idx == 0) { + new_position = start_position[b_idx * dof + d_idx]; + new_velocity = start_velocity[b_idx * dof + d_idx]; + new_acceleration = start_acceleration[b_idx * dof + d_idx]; + new_jerk = 0.0; + } else if (h_idx == 1) { + + float2 u_clique = make_float2( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + new_position = u_clique.y; + new_velocity = (u_clique.y - u_clique.x) * dt; // 1 - 0 + new_acceleration = + (u_clique.y - u_clique.x) * dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = (u_clique.y - u_clique.x) * dt * dt * dt; // -1 3 -3 1 + } else if (h_idx == 2) { + float3 u_clique = make_float3( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + + new_position = u_clique.z; + new_velocity = (u_clique.z - u_clique.y) * dt; // 1 - 0 + new_acceleration = (u_clique.x - 2 * u_clique.y + u_clique.z) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = (2 * u_clique.x - 3 * u_clique.y + u_clique.z) * + dt * dt * dt; // -1 3 -3 1 + + } else if (h_idx == 3) { + float4 u_clique = make_float4( + start_position[b_idx * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + d_idx]); + new_position = u_clique.w; + new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 + new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = + (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * + dt * dt * dt; + } else { + float4 u_clique = make_float4( + u_position[b_idx * horizon * dof + (h_idx - 4) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 3) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 2) * dof + d_idx], + u_position[b_idx * horizon * dof + (h_idx - 1) * dof + 
d_idx]); + new_position = u_clique.w; + new_velocity = (u_clique.w - u_clique.z) * dt; // 1 - 0 + new_acceleration = (u_clique.y - 2 * u_clique.z + u_clique.w) * + dt * dt; // 2 - 2.0 * 1 + 1 + new_jerk = + (-1.0 * u_clique.x + 3 * u_clique.y - 3 * u_clique.z + u_clique.w) * + dt * dt * dt; + } + + // h_idx = h_idx + 1; + out_position[b_idx * horizon * dof + h_idx * dof + d_idx] = + new_position; // new_position; + out_velocity[b_idx * horizon * dof + h_idx * dof + d_idx] = new_velocity; + out_acceleration[b_idx * horizon * dof + h_idx * dof + d_idx] = + new_acceleration; + out_jerk[b_idx * horizon * dof + h_idx * dof + d_idx] = new_jerk; +} + +// Not used +template +__global__ void position_clique_loop_coalesce_kernel( + scalar_t *out_position_mem, scalar_t *out_velocity_mem, + scalar_t *out_acceleration_mem, scalar_t *out_jerk_mem, + const scalar_t *u_position, const scalar_t *start_position, + const scalar_t *start_velocity, const scalar_t *start_acceleration, + const scalar_t *traj_dt, const int batch_size, const int horizon, + const int dof) { + // data is stored as batch, dof, horizon + // there are batch * horizon * dof threads: + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + + const float dt = + traj_dt[0]; // assume same dt across traj TODO: Implement variable dt + + // read start state: + float u_arr[MAX_H]; + float out_pos[MAX_H], out_vel[MAX_H], out_acc[MAX_H], out_jerk[MAX_H]; + const int b_addrs = b_idx * horizon * dof; + +#pragma unroll + for (int i = 0; i < horizon; i++) { + u_arr[i] = u_position[b_addrs + d_idx * horizon + i]; + out_pos[i] = 0; + out_vel[i] = 0; + out_acc[i] = 0; + out_jerk[i] = 0; + } + + out_pos[0] = start_position[b_idx * dof + d_idx]; + out_vel[0] = start_velocity[b_idx * dof + d_idx]; + out_acc[0] = start_acceleration[b_idx * dof + d_idx]; + out_jerk[0] = 0.0; + + for (int h_idx = 1; h_idx < horizon; h_idx++) { + + // read actions: batch, horizon + + out_pos[h_idx] = u_arr[h_idx - 1]; + + out_vel[h_idx] = (out_pos[h_idx] - out_pos[h_idx - 1]) * dt; // 1 - 0 + out_acc[h_idx] = + (out_vel[h_idx] - out_vel[h_idx - 1]) * dt; // 2 - 2.0 * 1 + 1 + out_jerk[h_idx] = (out_acc[h_idx] - out_acc[h_idx - 1]) * dt; // -1 3 -3 1 + } + + // write out: + for (int h_idx = 0; h_idx < horizon; h_idx++) { + out_position_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_pos[h_idx]; // new_position; + out_velocity_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_vel[h_idx]; + out_acceleration_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_acc[h_idx]; + out_jerk_mem[b_idx * horizon * dof + d_idx * horizon + h_idx] = + out_jerk[h_idx]; + } +} + +// Not used +template +__global__ void backward_position_clique_loop_coalesce_kernel( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (dof); + const int d_idx = (tid - b_idx * dof); + if (b_idx >= batch_size || d_idx >= dof) { + return; + } + const int b_addrs = b_idx * horizon * dof; + // read gradients: + float g_pos[MAX_H]; + float g_vel[MAX_H]; + float g_acc[MAX_H]; + float g_jerk[MAX_H]; + const float dt = traj_dt[0]; + const float dt_2 = dt * dt; + const float dt_3 = dt * dt * 
dt; +#pragma unroll + for (int h_idx = 0; h_idx < horizon; h_idx++) { + g_pos[h_idx] = grad_position[b_addrs + d_idx * horizon + h_idx]; + g_vel[h_idx] = grad_velocity[b_addrs + d_idx * horizon + h_idx]; + g_acc[h_idx] = grad_acceleration[b_addrs + d_idx * horizon + h_idx]; + g_jerk[h_idx] = grad_jerk[b_addrs + d_idx * horizon + h_idx]; + } +#pragma unroll + for (int i = 0; i < 4; i++) { + g_vel[horizon + i] = 0.0; + g_acc[horizon + i] = 0.0; + g_jerk[horizon + i] = 0.0; + } + // compute gradient and sum + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { + + g_pos[h_idx + 1] += + ((g_vel[h_idx + 1] - g_vel[h_idx + 2]) * dt + + (g_acc[h_idx + 1] - 2 * g_acc[h_idx + 2] + g_acc[h_idx + 3]) * dt_2 + + (1 * g_jerk[h_idx + 1] - 3 * g_jerk[h_idx + 2] + + 3 * g_jerk[h_idx + 3] - g_jerk[h_idx + 4]) * + dt_3); + } + + // write out: + for (int h_idx = 0; h_idx < horizon - 1; h_idx++) { + out_grad_position[b_addrs + d_idx * horizon + h_idx] = g_pos[h_idx + 1]; + } + out_grad_position[b_addrs + d_idx * horizon + horizon - 1] = 0.0; +} + +// Not used +template +__global__ void backward_position_clique_kernel( + scalar_t *out_grad_position, const scalar_t *grad_position, + const scalar_t *grad_velocity, const scalar_t *grad_acceleration, + const scalar_t *grad_jerk, const scalar_t *traj_dt, const int batch_size, + const int horizon, const int dof) { + // TODO: transpose h and dof to be able to directly read float2, float3, etc.. + const int tid = blockDim.x * blockIdx.x + threadIdx.x; + const int b_idx = tid / (horizon * dof); + const int h_idx = (tid - b_idx * (horizon * dof)) / dof; + const int d_idx = (tid - b_idx * horizon * dof - h_idx * dof); + if (b_idx >= batch_size || h_idx >= horizon || d_idx >= dof) { + return; + } + const int b_addrs = b_idx * horizon * dof; + if (h_idx == horizon - 1) { + out_grad_position[b_addrs + (h_idx)*dof + d_idx] = 0.0; + return; + } + // read gradients: + const float dt = traj_dt[0]; + float g_u = grad_position[b_addrs + (h_idx + 1) * dof + d_idx]; + + float2 g_vel; + float3 g_acc; + float4 g_jerk; + + if (h_idx < horizon - 4) // && h_idx > 0) + { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]); + + g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 4) * dof + d_idx]); + g_u += + ((g_vel.x - g_vel.y) * dt + + (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt + + (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt); + } else if (h_idx == 0) { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], + grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]); + + g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx], + grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], + 0.0); + + g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], + grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0); + g_u += ((g_vel.x - g_vel.y) * dt + + (-1.0 * g_acc.x + 1 * g_acc.y) * dt * dt + + (-1 * g_jerk.x + 2 * g_jerk.y - 1 * g_jerk.z) * dt * dt * dt); + } else if (h_idx == horizon - 4) { + g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + 
d_idx],
+                        grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]);
+    g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx],
+                        grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx],
+                        grad_acceleration[b_addrs + (h_idx + 3) * dof + d_idx]);
+    g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx],
+                         grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx],
+                         grad_jerk[b_addrs + (h_idx + 3) * dof + d_idx], 0.0);
+    g_u +=
+        ((g_vel.x - g_vel.y) * dt +
+         (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt +
+         (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt);
+  } else if (h_idx == horizon - 3) {
+
+    g_vel = make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx],
+                        grad_velocity[b_addrs + (h_idx + 2) * dof + d_idx]);
+    g_acc =
+        make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx],
+                    grad_acceleration[b_addrs + (h_idx + 2) * dof + d_idx], 0);
+    g_jerk =
+        make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx],
+                    grad_jerk[b_addrs + (h_idx + 2) * dof + d_idx], 0.0, 0.0);
+    g_u +=
+        ((g_vel.x - g_vel.y) * dt +
+         (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt +
+         (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt);
+  } else if (h_idx == horizon - 2) {
+
+    g_vel =
+        make_float2(grad_velocity[b_addrs + (h_idx + 1) * dof + d_idx], 0.0);
+    g_acc = make_float3(grad_acceleration[b_addrs + (h_idx + 1) * dof + d_idx],
+                        0, 0);
+    g_jerk = make_float4(grad_jerk[b_addrs + (h_idx + 1) * dof + d_idx], 0.0,
+                         0.0, 0.0);
+    g_u +=
+        ((g_vel.x - g_vel.y) * dt +
+         (g_acc.x - 2 * g_acc.y + g_acc.z) * dt * dt +
+         (1 * g_jerk.x - 3 * g_jerk.y + 3 * g_jerk.z - g_jerk.w) * dt * dt * dt);
+  }
+
+  out_grad_position[b_addrs + (h_idx)*dof + d_idx] = g_u;
+}
+
+} // namespace TensorStep
+} // namespace Curobo
+std::vector<torch::Tensor> step_position_clique(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor traj_dt, const int batch_size, const int horizon,
+    const int dof) {
+  using namespace Curobo::TensorStep;
+  assert(horizon < MAX_H);
+
+  const int k_size = batch_size * dof;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 512) {
+    threadsPerBlock = 512;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      out_position.scalar_type(), "step_position_clique", ([&] {
+        position_clique_loop_kernel<scalar_t>
+            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                out_position.data_ptr<scalar_t>(),
+                out_velocity.data_ptr<scalar_t>(),
+                out_acceleration.data_ptr<scalar_t>(),
+                out_jerk.data_ptr<scalar_t>(), u_position.data_ptr<scalar_t>(),
+                start_position.data_ptr<scalar_t>(),
+                start_velocity.data_ptr<scalar_t>(),
+                start_acceleration.data_ptr<scalar_t>(),
+                traj_dt.data_ptr<scalar_t>(), batch_size, horizon, dof);
+      }));
+
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_position, out_velocity, out_acceleration, out_jerk};
+}
+
+std::vector<torch::Tensor> step_position_clique2(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor traj_dt, const int batch_size, const int horizon,
+    const int dof,
+    const int mode = -1) {
+  using namespace Curobo::TensorStep;
+
+  assert(horizon > 5);
+  //assert(horizon < MAX_H);
+  const int k_size = batch_size * dof * horizon;
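+  // One thread per (batch, dof, horizon) element; cap the block size at 128
+  // threads and round the grid size up so every element is covered.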
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (mode == BWD_DIFF)
+  {
+
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_position_clique", ([&] {
+          position_clique_loop_kernel2<scalar_t, BWD_DIFF>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_position.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  traj_dt.data_ptr<scalar_t>(), batch_size, horizon, dof);
+        }));
+  }
+  else if (mode == CENTRAL_DIFF)
+  {
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_position_clique", ([&] {
+          position_clique_loop_kernel2<scalar_t, CENTRAL_DIFF>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_position.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  traj_dt.data_ptr<scalar_t>(), batch_size, horizon, dof);
+        }));
+  }
+  else
+  {
+    assert (false);
+  }
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_position, out_velocity, out_acceleration, out_jerk};
+}
+std::vector<torch::Tensor> step_position_clique2_idx(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_position, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor start_idx, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof,
+    const int mode = -1) {
+  using namespace Curobo::TensorStep;
+
+  //assert(horizon < MAX_H);
+  assert(horizon > 5);
+
+
+  const int k_size = batch_size * dof * horizon;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (mode == BWD_DIFF)
+  {
+
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_position_clique", ([&] {
+          position_clique_loop_idx_kernel2<scalar_t, BWD_DIFF>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_position.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  start_idx.data_ptr<int32_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+  }
+
+  else if (mode == CENTRAL_DIFF)
+  {
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_position_clique", ([&] {
+          position_clique_loop_idx_kernel2<scalar_t, CENTRAL_DIFF>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_position.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  start_idx.data_ptr<int32_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+  }
+  else
+  {
+    assert(false);
+  }
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_position, out_velocity, out_acceleration, out_jerk};
+}
+
+std::vector<torch::Tensor> backward_step_position_clique(
+    torch::Tensor out_grad_position, const torch::Tensor grad_position,
+    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof)
+{
+  using namespace Curobo::TensorStep;
+
+  assert(horizon < MAX_H - 4);
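+  // backward_position_clique_loop_kernel buffers per-thread gradient arrays of
+  // length MAX_H and zero-pads 4 entries past the horizon, hence the bound
+  // asserted above.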
+  const int k_size = batch_size * dof;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      out_grad_position.scalar_type(), "backward_step_position_clique", ([&] {
+        backward_position_clique_loop_kernel<scalar_t>
+            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                out_grad_position.data_ptr<scalar_t>(),
+                grad_position.data_ptr<scalar_t>(),
+                grad_velocity.data_ptr<scalar_t>(),
+                grad_acceleration.data_ptr<scalar_t>(),
+                grad_jerk.data_ptr<scalar_t>(), traj_dt.data_ptr<scalar_t>(),
+                batch_size, horizon, dof);
+      }));
+
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_grad_position};
+}
+
+std::vector<torch::Tensor> backward_step_position_clique2(
+    torch::Tensor out_grad_position, const torch::Tensor grad_position,
+    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof,
+    const int mode = -1) {
+  //assert(horizon < MAX_H - 4);
+  using namespace Curobo::TensorStep;
+
+  assert(horizon > 5);
+
+
+  // const int k_size = batch_size * dof;
+  const int k_size = batch_size * dof * horizon;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 128) {
+    threadsPerBlock = 128;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (mode == BWD_DIFF)
+  {
+
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_grad_position.scalar_type(), "backward_step_position_clique", ([&] {
+          backward_position_clique_loop_backward_difference_kernel2<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_grad_position.data_ptr<scalar_t>(),
+                  grad_position.data_ptr<scalar_t>(),
+                  grad_velocity.data_ptr<scalar_t>(),
+                  grad_acceleration.data_ptr<scalar_t>(),
+                  grad_jerk.data_ptr<scalar_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+
+  }
+  else if (mode == CENTRAL_DIFF)
+  {
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_grad_position.scalar_type(), "backward_step_position_clique", ([&] {
+          backward_position_clique_loop_central_difference_kernel2<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_grad_position.data_ptr<scalar_t>(),
+                  grad_position.data_ptr<scalar_t>(),
+                  grad_velocity.data_ptr<scalar_t>(),
+                  grad_acceleration.data_ptr<scalar_t>(),
+                  grad_jerk.data_ptr<scalar_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+  }
+  else
+  {
+    assert(false);
+  }
+
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_grad_position};
+}
+
+std::vector<torch::Tensor>
+step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
+                  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+                  const torch::Tensor u_acc, const torch::Tensor start_position,
+                  const torch::Tensor start_velocity,
+                  const torch::Tensor start_acceleration,
+                  const torch::Tensor traj_dt, const int batch_size,
+                  const int horizon, const int dof, const bool use_rk2 = true) {
+  assert(horizon < MAX_H);
+  using namespace Curobo::TensorStep;
+
+  const int k_size = batch_size * dof;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 512) {
+    threadsPerBlock = 512;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (use_rk2) {
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_acceleration", ([&] {
+          acceleration_loop_rk2_kernel<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_acc.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  traj_dt.data_ptr<scalar_t>(), batch_size, horizon, dof);
+        }));
+
+  }
+
+  else {
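+    // Fallback path: plain semi-implicit Euler integration
+    // (acceleration_loop_kernel) instead of the RK2 variant above.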
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_acceleration", ([&] {
+          acceleration_loop_kernel<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_acc.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  traj_dt.data_ptr<scalar_t>(), batch_size, horizon, dof);
+        }));
+  }
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_position, out_velocity, out_acceleration, out_jerk};
+}
+
+std::vector<torch::Tensor> step_acceleration_idx(
+    torch::Tensor out_position, torch::Tensor out_velocity,
+    torch::Tensor out_acceleration, torch::Tensor out_jerk,
+    const torch::Tensor u_acc, const torch::Tensor start_position,
+    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+    const torch::Tensor start_idx, const torch::Tensor traj_dt,
+    const int batch_size, const int horizon, const int dof,
+    const bool use_rk2 = true) {
+  assert(horizon < MAX_H);
+  using namespace Curobo::TensorStep;
+
+
+  const int k_size = batch_size * dof;
+  int threadsPerBlock = k_size;
+  if (threadsPerBlock > 512) {
+    threadsPerBlock = 512;
+  }
+
+  int blocksPerGrid = (k_size + threadsPerBlock - 1) / threadsPerBlock;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  if (use_rk2) {
+
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_acceleration", ([&] {
+          acceleration_loop_idx_rk2_kernel<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_acc.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  start_idx.data_ptr<int32_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(
+        out_position.scalar_type(), "step_acceleration", ([&] {
+          acceleration_loop_idx_kernel<scalar_t>
+              <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                  out_position.data_ptr<scalar_t>(),
+                  out_velocity.data_ptr<scalar_t>(),
+                  out_acceleration.data_ptr<scalar_t>(),
+                  out_jerk.data_ptr<scalar_t>(), u_acc.data_ptr<scalar_t>(),
+                  start_position.data_ptr<scalar_t>(),
+                  start_velocity.data_ptr<scalar_t>(),
+                  start_acceleration.data_ptr<scalar_t>(),
+                  start_idx.data_ptr<int32_t>(), traj_dt.data_ptr<scalar_t>(),
+                  batch_size, horizon, dof);
+        }));
+  }
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {out_position, out_velocity, out_acceleration, out_jerk};
+}
diff --git a/src/curobo/curobolib/cpp/update_best_kernel.cu b/src/curobo/curobolib/cpp/update_best_kernel.cu
new file mode 100644
index 00000000..c96eaf74
--- /dev/null
+++ b/src/curobo/curobolib/cpp/update_best_kernel.cu
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#include <torch/extension.h>
+#include <cmath>
+#include <vector>
+
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+
+// #include "helper_cuda.h"
+#include "helper_math.h"
+
+#include <assert.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <math.h>
+#include <stdio.h>
+
+namespace Curobo {
+
+namespace Optimization {
+
+// We launch with d_opt*cost_s1 threads.
+// We assume that cost_s2 is always 1.
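+// Worked example with the shapes annotated below (d_opt = 7, cost_s1 = 200):
+// thread tid maps to particle cost_idx = tid / d_opt, so threads 0..6 all
+// compare cost[0] against best_cost[0] and copy q[0..6] into best_q on an
+// improvement. A cost counts as an improvement only if it beats the best cost
+// by more than delta_threshold in absolute terms and is below
+// relative_threshold * best_cost. One thread per particle (tid % d_opt == 0)
+// resets best_iteration to 0 on improvement and decrements it otherwise, so
+// -best_iteration counts iterations since the last improvement.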
+template <typename scalar_t>
+__global__ void update_best_kernel(scalar_t *best_cost,        // 200x1
+                                   scalar_t *best_q,           // 200x7
+                                   int16_t *best_iteration,    // 200 x 1
+                                   int16_t *current_iteration, // 1
+                                   const scalar_t *cost,       // 200x1
+                                   const scalar_t *q,          // 200x7
+                                   const int d_opt,            // 7
+                                   const int cost_s1,          // 200
+                                   const int cost_s2,
+                                   const int iteration,
+                                   const float delta_threshold,
+                                   const float relative_threshold) // 1
+{
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int size = cost_s1 * d_opt; // size of best_q
+  if (tid >= size) {
+    return;
+  }
+
+  const int cost_idx = tid / d_opt;
+  const float cost_new = cost[cost_idx];
+  const float best_cost_in = best_cost[cost_idx];
+  const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
+  if (change) {
+    best_q[tid] = q[tid]; // update best_q
+
+    if (tid % d_opt == 0) {
+      best_cost[cost_idx] = cost_new; // update best_cost
+      //best_iteration[cost_idx] = curr_iter + iteration; //
+      // this tensor keeps track of whether the cost reduced by at least
+      // delta_threshold.
+      // here iteration is the last_best parameter.
+    }
+  }
+
+  if (tid % d_opt == 0)
+  {
+    if (change)
+    {
+      best_iteration[cost_idx] = 0;
+    }
+    else
+    {
+      best_iteration[cost_idx] -= 1;
+    }
+  }
+
+  // if (tid == 0)
+  //{
+  //  curr_iter += 1;
+  //  current_iteration[0] = curr_iter;
+  //}
+
+}
+} // namespace Optimization
+} // namespace Curobo
+
+std::vector<torch::Tensor>
+update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
+                 torch::Tensor best_iteration,
+                 torch::Tensor current_iteration,
+                 const torch::Tensor cost,
+                 const torch::Tensor q, const int d_opt, const int cost_s1,
+                 const int cost_s2, const int iteration,
+                 const float delta_threshold,
+                 const float relative_threshold = 0.999) {
+  using namespace Curobo::Optimization;
+  const int threadsPerBlock = 128;
+  const int cost_size = cost_s1 * d_opt;
+  assert(cost_s2 == 1); // assumption
+  const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
+  // printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
+  // blocksPerGrid);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES(
+      cost.scalar_type(), "update_best_cu", ([&] {
+        update_best_kernel<scalar_t>
+            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+                best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
+                best_iteration.data_ptr<int16_t>(),
+                current_iteration.data_ptr<int16_t>(),
+                cost.data_ptr<scalar_t>(),
+                q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
+                delta_threshold, relative_threshold);
+      }));
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return {best_cost, best_q, best_iteration};
+}
\ No newline at end of file
diff --git a/src/curobo/curobolib/geom.py b/src/curobo/curobolib/geom.py
new file mode 100644
index 00000000..1d5a9fe8
--- /dev/null
+++ b/src/curobo/curobolib/geom.py
@@ -0,0 +1,405 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# +# Third Party +import torch + +# CuRobo +from curobo.util.logger import log_warn + +try: + # CuRobo + from curobo.curobolib import geom_cu + +except ImportError: + log_warn("geom_cu binary not found, jit compiling...") + # Third Party + from torch.utils.cpp_extension import load + + # CuRobo + from curobo.util_file import add_cpp_path + + geom_cu = load( + name="geom_cu", + sources=add_cpp_path( + [ + "geom_cuda.cpp", + "sphere_obb_kernel.cu", + "pose_distance_kernel.cu", + "self_collision_kernel.cu", + ] + ), + ) + + +def get_self_collision_distance( + out_distance, + out_vec, + sparse_index, + robot_spheres, + collision_offset, + weight, + coll_matrix, + thread_locations, + thread_size, + b_size, + nspheres, + compute_grad, + checks_per_thread=32, + experimental_kernel=True, +): + r = geom_cu.self_collision_distance( + out_distance, + out_vec, + sparse_index, + robot_spheres, + collision_offset, + weight, + coll_matrix, + thread_locations, + thread_size, + b_size, + nspheres, + compute_grad, + checks_per_thread, + experimental_kernel, + ) + + out_distance = r[0] + out_vec = r[1] + return out_distance, out_vec + + +class SelfCollisionDistance(torch.autograd.Function): + @staticmethod + def forward( + ctx, + out_distance, + out_vec, + sparse_idx, + robot_spheres, + sphere_offset, + weight, + coll_matrix, + thread_locations, + max_thread, + checks_per_thread: int, + experimental_kernel: bool, + return_loss: bool = False, + ): + # get batch size + b, h, n_spheres, _ = robot_spheres.shape + out_distance, out_vec = get_self_collision_distance( + out_distance, + out_vec, + sparse_idx, + robot_spheres, # .view(-1, 4), + sphere_offset, + weight, + coll_matrix, + thread_locations, + max_thread, + b * h, + n_spheres, + robot_spheres.requires_grad, + checks_per_thread, + experimental_kernel, + ) + ctx.return_loss = return_loss + ctx.save_for_backward(out_vec) + return out_distance + + @staticmethod + def backward(ctx, grad_out_distance): + sphere_grad = None + if ctx.needs_input_grad[3]: + (g_vec,) = ctx.saved_tensors + if ctx.return_loss: + g_vec = g_vec * grad_out_distance.unsqueeze(1) + sphere_grad = g_vec + return None, None, None, sphere_grad, None, None, None, None, None, None, None, None + + +class SelfCollisionDistanceLoss(SelfCollisionDistance): + @staticmethod + def backward(ctx, grad_out_distance): + sphere_grad = None + if ctx.needs_input_grad[3]: + (g_vec,) = ctx.saved_tensors + sphere_grad = g_vec * grad_out_distance.unsqueeze(1) + return None, None, None, sphere_grad, None, None, None, None, None, None, None + + +def get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_q_vec, + out_idx, + current_position, + goal_position, + current_quat, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + batch_size, + horizon, + mode=1, + num_goals=1, + write_grad=False, + write_distance=False, + use_metric=False, +): + if batch_pose_idx.shape[0] != batch_size: + raise ValueError("Index buffer size is different from batch size") + + r = geom_cu.pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_q_vec, + out_idx, + current_position, + goal_position.view(-1), + current_quat, + goal_quat.view(-1), + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + write_grad, + write_distance, + use_metric, + ) + + out_distance = r[0] + out_position_distance = r[1] + 
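    # geom_cu.pose_distance packs its results as [distance, position distance,
+    # rotation distance, the position and quaternion gradient buffers (filled
+    # when write_grad is set), and the goal index]; unpack them in that order:
+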
out_rotation_distance = r[2] + + out_p_vec = r[3] + out_q_vec = r[4] + + out_idx = r[5] + return out_distance, out_position_distance, out_rotation_distance, out_p_vec, out_q_vec, out_idx + + +def get_pose_distance_backward( + out_grad_p, + out_grad_q, + grad_distance, + grad_p_distance, + grad_q_distance, + pose_weight, + grad_p_vec, + grad_q_vec, + batch_size, + use_distance=False, +): + r = geom_cu.pose_distance_backward( + out_grad_p, + out_grad_q, + grad_distance, + grad_p_distance, + grad_q_distance, + pose_weight, + grad_p_vec, + grad_q_vec, + batch_size, + use_distance, + ) + return r[0], r[1] + + +class SdfSphereOBB(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + box_accel, + box_dims, + box_pose, + box_enable, + n_env_obb, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + transform_back, + compute_distance, + use_batch_env, + return_loss: bool = False, + ): + r = geom_cu.closest_point( + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + box_accel, + box_dims, + box_pose, + box_enable, + n_env_obb, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + transform_back, + compute_distance, + use_batch_env, + ) + # r[1][r[1]!=r[1]] = 0.0 + ctx.return_loss = return_loss + ctx.save_for_backward(r[1]) + return r[0] + + @staticmethod + def backward(ctx, grad_output): + grad_pt = None + if ctx.needs_input_grad[0]: + (r,) = ctx.saved_tensors + if ctx.return_loss: + r = r * grad_output.unsqueeze(-1) + grad_pt = r + return ( + grad_pt, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class SdfSweptSphereOBB(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + speed_dt, + box_accel, + box_dims, + box_pose, + box_enable, + n_env_obb, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + sweep_steps, + enable_speed_metric, + transform_back, + compute_distance, + use_batch_env, + return_loss: bool = False, + ): + r = geom_cu.swept_closest_point( + query_sphere, + out_buffer, + grad_out_buffer, + sparsity_idx, + weight, + activation_distance, + speed_dt, + box_accel, + box_dims, + box_pose, + box_enable, + n_env_obb, + env_query_idx, + max_nobs, + batch_size, + horizon, + n_spheres, + sweep_steps, + enable_speed_metric, + transform_back, + compute_distance, + use_batch_env, + ) + ctx.return_loss = return_loss + ctx.save_for_backward( + r[1], + ) + return r[0] + + @staticmethod + def backward(ctx, grad_output): + grad_pt = None + if ctx.needs_input_grad[0]: + (r,) = ctx.saved_tensors + if ctx.return_loss: + r = r * grad_output.unsqueeze(-1) + grad_pt = r + return ( + grad_pt, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/curobo/curobolib/kinematics.py b/src/curobo/curobolib/kinematics.py new file mode 100644 index 00000000..1fcb696d --- /dev/null +++ b/src/curobo/curobolib/kinematics.py @@ -0,0 +1,498 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import torch +from torch.autograd import Function + +# CuRobo +from curobo.util.logger import log_warn + +try: + # CuRobo + from curobo.curobolib import kinematics_fused_cu +except ImportError: + log_warn("kinematics_fused_cu not found, JIT compiling...") + # Third Party + from torch.utils.cpp_extension import load + + # CuRobo + from curobo.curobolib.util_file import add_cpp_path + + kinematics_fused_cu = load( + name="kinematics_fused_cu", + sources=add_cpp_path( + [ + "kinematics_fused_cuda.cpp", + "kinematics_fused_kernel.cu", + ] + ), + ) + + +def rotation_matrix_to_quaternion(in_mat, out_quat): + r = kinematics_fused_cu.matrix_to_quaternion(out_quat, in_mat.reshape(-1, 9)) + return r[0] + + +class KinematicsFusedFunction(Function): + @staticmethod + def _call_cuda( + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + ): + b_shape = link_pos_out.shape + b_size = b_shape[0] + n_spheres = robot_sphere_out.shape[1] + r = kinematics_fused_cu.forward( + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + b_size, + n_spheres, + False, + ) + out_link_pos = r[0] + out_link_quat = r[1] + out_spheres = r[2] + global_cumul_mat = r[3] + return out_link_pos, out_link_quat, out_spheres, global_cumul_mat + + @staticmethod + def _call_backward_cuda( + grad_out, + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + sparsity_opt=True, + ): + b_shape = link_pos_out.shape + b_size = b_shape[0] + n_spheres = robot_sphere_out.shape[1] + if grad_out.is_contiguous(): + grad_out = grad_out.view(-1) + else: + grad_out = grad_out.reshape(-1) + r = kinematics_fused_cu.backward( + grad_out, + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + b_size, + n_spheres, + sparsity_opt, + False, + ) + out_q = r[0].view(b_size, -1) + + return out_q + + @staticmethod + def forward( + ctx, + # link_mats: torch.Tensor, + link_pos: torch.Tensor, + link_quat: torch.Tensor, + b_robot_spheres: torch.tensor, + global_cumul_mat: torch.Tensor, + joint_seq: torch.Tensor, + fixed_transform: torch.tensor, + robot_spheres: torch.tensor, + link_map: torch.tensor, + joint_map: torch.Tensor, + joint_map_type: torch.Tensor, + store_link_map: torch.Tensor, + link_sphere_map: torch.Tensor, + link_chain_map: torch.Tensor, + grad_out: torch.Tensor, + ): + link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda( + link_pos, + link_quat, + b_robot_spheres, + global_cumul_mat, + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + 
link_sphere_map, + ) + ctx.save_for_backward( + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + grad_out, + global_cumul_mat, + ) + + return link_pos, link_quat, b_robot_spheres + + @staticmethod + def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres): + grad_joint = None + + if ctx.needs_input_grad[4]: + ( + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + grad_out, + global_cumul_mat, + ) = ctx.saved_tensors + grad_joint = KinematicsFusedFunction._call_backward_cuda( + grad_out, + grad_out_link_pos, + grad_out_link_quat, + grad_out_spheres, + global_cumul_mat, + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + True, + ) + + return ( + None, + None, + None, + None, + grad_joint, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class KinematicsFusedGlobalCumulFunction(Function): + @staticmethod + def _call_cuda( + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + ): + b_shape = link_pos_out.shape + b_size = b_shape[0] + n_spheres = robot_sphere_out.shape[1] + # print(n_spheres) + # print(angle) + r = kinematics_fused_cu.forward( + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + b_size, + n_spheres, + True, + ) + out_link_pos = r[0] + out_link_quat = r[1] + out_spheres = r[2] + global_cumul_mat = r[3] + return out_link_pos, out_link_quat, out_spheres, global_cumul_mat + + @staticmethod + def _call_backward_cuda( + grad_out, + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + sparsity_opt=True, + ): + b_shape = link_pos_out.shape + b_size = b_shape[0] + # b_size = link_mat_out.shape[0] + n_spheres = robot_sphere_out.shape[1] + if grad_out.is_contiguous(): + grad_out = grad_out.view(-1) + else: + grad_out = grad_out.reshape(-1) + # create backward tensors? 
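+        # Same call as KinematicsFusedFunction._call_backward_cuda, except the
+        # final argument is True to select the global-cumul variant; the result
+        # is the gradient with respect to the joint angles, reshaped below to
+        # (b_size, -1).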
+ + r = kinematics_fused_cu.backward( + grad_out, + link_pos_out, + link_quat_out, + robot_sphere_out, + global_cumul_mat, + angle, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + b_size, + n_spheres, + sparsity_opt, + True, + ) + out_q = r[0] # .view(angle.shape) + out_q = out_q.view(b_size, -1) + return out_q + + @staticmethod + def forward( + ctx, + # link_mats: torch.Tensor, + link_pos: torch.Tensor, + link_quat: torch.Tensor, + b_robot_spheres: torch.tensor, + global_cumul_mat: torch.Tensor, + joint_seq: torch.Tensor, + fixed_transform: torch.tensor, + robot_spheres: torch.tensor, + link_map: torch.tensor, + joint_map: torch.Tensor, + joint_map_type: torch.Tensor, + store_link_map: torch.Tensor, + link_sphere_map: torch.Tensor, + link_chain_map: torch.Tensor, + grad_out: torch.Tensor, + ): + if joint_seq.is_contiguous(): + joint_seq = joint_seq.view(-1) + else: + joint_seq = joint_seq.reshape(-1) + ( + link_pos, + link_quat, + b_robot_spheres, + global_cumul_mat, + ) = KinematicsFusedGlobalCumulFunction._call_cuda( + link_pos, + link_quat, + b_robot_spheres, + global_cumul_mat, + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + ) + ctx.save_for_backward( + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + grad_out, + global_cumul_mat, + ) + + return link_pos, link_quat, b_robot_spheres + + @staticmethod + def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres): + grad_joint = None + + if ctx.needs_input_grad[4]: + ( + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + grad_out, + global_cumul_mat, + ) = ctx.saved_tensors + grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda( + grad_out, + grad_out_link_pos, + grad_out_link_quat, + grad_out_spheres, + global_cumul_mat, + joint_seq, + fixed_transform, + robot_spheres, + link_map, + joint_map, + joint_map_type, + store_link_map, + link_sphere_map, + link_chain_map, + True, + ) + + return ( + None, + None, + None, + None, + grad_joint, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +def get_cuda_kinematics( + link_pos_seq, + link_quat_seq, + batch_robot_spheres, + global_cumul_mat, + q_in, + fixed_transform, + link_spheres_tensor, + link_map, # tells which link is attached to which link i + joint_map, # tells which joint is attached to a link i + joint_map_type, # joint type + store_link_map, + link_sphere_idx_map, # sphere idx map + link_chain_map, + grad_out_q, + use_global_cumul: bool = True, +): + if use_global_cumul: + link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply( + link_pos_seq, + link_quat_seq, + batch_robot_spheres, + global_cumul_mat, + q_in, + fixed_transform, + link_spheres_tensor, + link_map, # tells which link is attached to which link i + joint_map, # tells which joint is attached to a link i + joint_map_type, # joint type + store_link_map, + link_sphere_idx_map, # sphere idx map + link_chain_map, + grad_out_q, + ) + else: + link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply( + link_pos_seq, + link_quat_seq, + batch_robot_spheres, + global_cumul_mat, + q_in, + fixed_transform, + link_spheres_tensor, + link_map, # tells which link is attached to which link 
i + joint_map, # tells which joint is attached to a link i + joint_map_type, # joint type + store_link_map, + link_sphere_idx_map, # sphere idx map + link_chain_map, + grad_out_q, + ) + return link_pos, link_quat, robot_spheres diff --git a/src/curobo/curobolib/ls.py b/src/curobo/curobolib/ls.py new file mode 100644 index 00000000..5a9b678e --- /dev/null +++ b/src/curobo/curobolib/ls.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import torch + +# CuRobo +from curobo.util.logger import log_warn + +try: + # CuRobo + from curobo.curobolib import line_search_cu +except ImportError: + log_warn("line_search_cu not found, JIT compiling...") + + # Third Party + from torch.utils.cpp_extension import load + + # CuRobo + from curobo.curobolib.util_file import add_cpp_path + + line_search_cu = load( + name="line_search_cu", + sources=add_cpp_path( + [ + "line_search_cuda.cpp", + "line_search_kernel.cu", + "update_best_kernel.cu", + ] + ), + ) + + +def wolfe_line_search( + # m_idx, + best_x, + best_c, + best_grad, + g_x, + x_set, + sv, + c, + c_idx, + c_1: float, + c_2: float, + al, + sw: bool, + aw: bool, +): + batchsize = g_x.shape[0] + l1 = g_x.shape[1] + l2 = g_x.shape[2] + r = line_search_cu.line_search( + # m_idx, + best_x, + best_c, + best_grad, + g_x, + x_set, + sv, + c, + al, + c_idx, + c_1, + c_2, + sw, + aw, + l1, + l2, + batchsize, + ) + # print("batchsize:" + str(batchsize)) + return (r[0], r[1], r[2]) + + +def update_best( + best_cost, + best_q, + best_iteration, + current_iteration, + cost, + q, + d_opt: int, + iteration: int, + delta_threshold: float = 1e-5, + relative_threshold: float = 0.999, +): + cost_s1 = cost.shape[0] + cost_s2 = cost.shape[1] + r = line_search_cu.update_best( + best_cost, + best_q, + best_iteration, + current_iteration, + cost, + q, + d_opt, + cost_s1, + cost_s2, + iteration, + delta_threshold, + relative_threshold, + ) + # print("batchsize:" + str(batchsize)) + return (r[0], r[1], r[2]) # output: best_cost, best_q, best_iteration diff --git a/src/curobo/curobolib/opt.py b/src/curobo/curobolib/opt.py new file mode 100644 index 00000000..26b47596 --- /dev/null +++ b/src/curobo/curobolib/opt.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+# Third Party
+import torch
+from torch.autograd import Function
+
+# CuRobo
+from curobo.util.logger import log_warn
+
+try:
+    # CuRobo
+    from curobo.curobolib import lbfgs_step_cu
+except ImportError:
+    log_warn("lbfgs_step_cu not found, JIT compiling...")
+    # Third Party
+    from torch.utils.cpp_extension import load
+
+    # CuRobo
+    from curobo.curobolib.util_file import add_cpp_path
+
+    lbfgs_step_cu = load(
+        name="lbfgs_step_cu",
+        sources=add_cpp_path(
+            [
+                "lbfgs_step_cuda.cpp",
+                "lbfgs_step_kernel.cu",
+            ]
+        ),
+    )
+
+
+class LBFGScu(Function):
+    @staticmethod
+    def _call_cuda(
+        step_vec,
+        rho_buffer,
+        y_buffer,
+        s_buffer,
+        q,
+        grad_q,
+        x_0,
+        grad_0,
+        epsilon=0.1,
+        stable_mode=False,
+    ):
+        m, b, v_dim, _ = y_buffer.shape
+
+        R = lbfgs_step_cu.forward(
+            step_vec,
+            rho_buffer,
+            y_buffer,
+            s_buffer,
+            q,
+            grad_q,
+            x_0,
+            grad_0,
+            epsilon,
+            b,
+            m,
+            v_dim,
+            stable_mode,
+        )
+        step_v = R[0].view(step_vec.shape)
+
+        return step_v
+
+    @staticmethod
+    def forward(
+        ctx,
+        step_vec,
+        rho_buffer,
+        y_buffer,
+        s_buffer,
+        q,
+        grad_q,
+        x_0,
+        grad_0,
+        epsilon=0.1,
+        stable_mode=False,
+    ):
+        R = LBFGScu._call_cuda(
+            step_vec,
+            rho_buffer,
+            y_buffer,
+            s_buffer,
+            q,
+            grad_q,
+            x_0,
+            grad_0,
+            epsilon=epsilon,
+            stable_mode=stable_mode,
+        )
+        return R
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # All inputs are treated as non-differentiable buffers; return one None
+        # per forward input (10 inputs after ctx), as autograd requires.
+        return (
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
diff --git a/src/curobo/curobolib/tensor_step.py b/src/curobo/curobolib/tensor_step.py
new file mode 100644
index 00000000..f97624a4
--- /dev/null
+++ b/src/curobo/curobolib/tensor_step.py
@@ -0,0 +1,195 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# +# Third Party +import torch + +# CuRobo +from curobo.util.logger import log_warn + +try: + # CuRobo + from curobo.curobolib import tensor_step_cu + +except ImportError: + # Third Party + from torch.utils.cpp_extension import load + + # CuRobo + from curobo.curobolib.util_file import add_cpp_path + + log_warn("tensor_step_cu not found, jit compiling...") + tensor_step_cu = load( + name="tensor_step_cu", + sources=add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"]), + ) + + +def tensor_step_pos_clique_idx_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_position, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + batch_size, + horizon, + dof, + mode=-1, +): + r = tensor_step_cu.step_idx_position2( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_position, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + batch_size, + horizon, + dof, + mode, + ) + return (r[0], r[1], r[2], r[3]) + + +def tensor_step_pos_clique_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + mode=-1, +): + r = tensor_step_cu.step_position2( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_position, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + mode, + ) + return (r[0], r[1], r[2], r[3]) + + +def tensor_step_acc_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_acc, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + use_rk2=True, +): + r = tensor_step_cu.step_acceleration( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_acc, + start_position, + start_velocity, + start_acceleration, + traj_dt, + batch_size, + horizon, + dof, + use_rk2, + ) + return (r[0], r[1], r[2], r[3]) # output: best_cost, best_q, best_iteration + + +def tensor_step_acc_idx_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_acc, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + batch_size, + horizon, + dof, + use_rk2=True, +): + r = tensor_step_cu.step_acceleration_idx( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_acc, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + batch_size, + horizon, + dof, + use_rk2, + ) + return (r[0], r[1], r[2], r[3]) # output: best_cost, best_q, best_iteration + + +def tensor_step_pos_clique_bwd( + out_grad_position, + grad_position, + grad_velocity, + grad_acceleration, + grad_jerk, + traj_dt, + batch_size, + horizon, + dof, + mode=-1, +): + r = tensor_step_cu.step_position_backward2( + out_grad_position, + grad_position, + grad_velocity, + grad_acceleration, + grad_jerk, + traj_dt, + batch_size, + horizon, + dof, + mode, + ) + return r[0] diff --git a/src/curobo/curobolib/util_file.py b/src/curobo/curobolib/util_file.py new file mode 100644 index 00000000..9187d986 --- /dev/null +++ b/src/curobo/curobolib/util_file.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +import os + + +def get_cpp_path(): + path = os.path.dirname(__file__) + return os.path.join(path, "cpp") + + +def join_path(path1, path2): + if isinstance(path2, str): + return os.path.join(path1, path2) + else: + return path2 + + +def add_cpp_path(sources): + cpp_path = get_cpp_path() + new_list = [] + for s in sources: + s = join_path(cpp_path, s) + new_list.append(s) + return new_list diff --git a/src/curobo/geom/__init__.py b/src/curobo/geom/__init__.py new file mode 100644 index 00000000..e5253f1f --- /dev/null +++ b/src/curobo/geom/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +This module contains code for geometric processing such as pointcloud processing, analytic signed +distance computation for the environment, and also signed distance computation between robot and the +environment. These functions can be used for robot self collision checking and also for robot +environment collision checking. +""" diff --git a/src/curobo/geom/sdf/__init__.py b/src/curobo/geom/sdf/__init__.py new file mode 100644 index 00000000..c991ef42 --- /dev/null +++ b/src/curobo/geom/sdf/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +This module contains code for geometric processing such as pointcloud processing, analytic signed +distance computation for the environment, and also signed distance computation between robot and +the environment. These functions can be used for robot self collision checking and also for robot +environment collision checking. The module also contains neural networks for learned signed distance +fields. +""" diff --git a/src/curobo/geom/sdf/sdf_grid.py b/src/curobo/geom/sdf/sdf_grid.py new file mode 100644 index 00000000..1463f12b --- /dev/null +++ b/src/curobo/geom/sdf/sdf_grid.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Third Party +import torch + + +# @torch.jit.script +def lookup_distance(pt, dist_matrix_flat, num_voxels): + # flatten: + ind_pt = ( + (pt[..., 0]) * (num_voxels[1] * num_voxels[2]) + pt[..., 1] * num_voxels[2] + pt[..., 2] + ) + dist = dist_matrix_flat[ind_pt] + return dist + + +# @torch.jit.script +def compute_sdf_gradient(pt, dist_matrix_flat, num_voxels, dist): + grad_l = [] + for i in range(3): # x,y,z + pt_n = pt.clone() + pt_p = pt.clone() + pt_n[..., i] -= 1 + pt_p[..., i] += 1 + # get distance from minus 1 and plus 1 idx: + pt_n[pt_n < 0] = 0 + # pt_n[pt_n>nu] + # pt_p[pt_p > num_voxels] = num_voxels[i] + d_n = lookup_distance(pt_n, dist_matrix_flat, num_voxels) + d_p = lookup_distance(pt_p, dist_matrix_flat, num_voxels) + mask = d_n < d_p + dx = torch.where(mask, -1, 1) + ds = torch.where(mask, d_n - dist, d_p - dist) + g_d = ds / dx + grad_l.append(g_d) + # print(i, dist, pt) + g_pt = torch.stack(grad_l, dim=-1) + # g_pt = g_pt/torch.linalg.norm(g_pt, dim=-1, keepdim=True) + return g_pt + + +class SDFGrid(torch.autograd.Function): + @staticmethod + def forward(ctx, pt, dist_matrix_flat, num_voxels): + # input = x_id,y_id,z_id + pt = (pt).to(dtype=torch.int64) + + dist = lookup_distance(pt, dist_matrix_flat, num_voxels) + ctx.save_for_backward(pt, dist_matrix_flat, num_voxels, dist) + return dist.unsqueeze(-1) + + @staticmethod + def backward(ctx, grad_output): + pt, dist_matrix_flat, num_voxels, dist = ctx.saved_tensors + grad_pt = grad_voxels = grad_matrix_flat = None + if ctx.needs_input_grad[0]: + pt = pt.to(dtype=torch.int64) + g_pt = compute_sdf_gradient(pt, dist_matrix_flat, num_voxels, dist) + # print(g_pt) + grad_pt = grad_output * g_pt + if ctx.needs_input_grad[1]: + raise NotImplementedError("SDFGrid: Can't get gradient w.r.t. dist_matrix") + if ctx.needs_input_grad[2]: + raise NotImplementedError("SDFGrid: Can't get gradient w.r.t. num_voxels") + return grad_pt, grad_matrix_flat, grad_voxels diff --git a/src/curobo/geom/sdf/utils.py b/src/curobo/geom/sdf/utils.py new file mode 100644 index 00000000..9d482c1b --- /dev/null +++ b/src/curobo/geom/sdf/utils.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# CuRobo +from curobo.geom.sdf.world import CollisionCheckerType, WorldCollisionConfig +from curobo.util.logger import log_error + + +def create_collision_checker(config: WorldCollisionConfig): + if config.checker_type == CollisionCheckerType.PRIMITIVE: + # CuRobo + from curobo.geom.sdf.world import WorldPrimitiveCollision + + return WorldPrimitiveCollision(config) + elif config.checker_type == CollisionCheckerType.BLOX: + # CuRobo + from curobo.geom.sdf.world_blox import WorldBloxCollision + + return WorldBloxCollision(config) + elif config.checker_type == CollisionCheckerType.MESH: + # CuRobo + from curobo.geom.sdf.world_mesh import WorldMeshCollision + + return WorldMeshCollision(config) + else: + log_error("Not implemented", exc_info=True) + raise NotImplementedError diff --git a/src/curobo/geom/sdf/warp_primitives.py b/src/curobo/geom/sdf/warp_primitives.py new file mode 100644 index 00000000..93bfbee4 --- /dev/null +++ b/src/curobo/geom/sdf/warp_primitives.py @@ -0,0 +1,1052 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch +import warp as wp + +wp.set_module_options({"fast_math": False}) + + +# create warp kernels: +@wp.kernel +def get_swept_closest_pt( + pt: wp.array(dtype=wp.vec4), + distance: wp.array(dtype=wp.float32), # this stores the output cost + closest_pt: wp.array(dtype=wp.float32), # this stores the gradient + sparsity_idx: wp.array(dtype=wp.uint8), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), # eta threshold + speed_dt: wp.array(dtype=wp.float32), + mesh: wp.array(dtype=wp.uint64), + mesh_pose: wp.array(dtype=wp.float32), + mesh_enable: wp.array(dtype=wp.uint8), + n_env_mesh: wp.array(dtype=wp.int32), + max_dist: wp.float32, + write_grad: wp.uint8, + batch_size: wp.int32, + horizon: wp.int32, + nspheres: wp.int32, + max_nmesh: wp.int32, + sweep_steps: wp.uint8, + enable_speed_metric: wp.uint8, +): + # we launch nspheres kernels + # compute gradient here and return + # distance is negative outside and positive inside + tid = int(0) + tid = wp.tid() + + b_idx = int(0) + h_idx = int(0) + sph_idx = int(0) + # read horizon + eta = float(0.0) # 5cm buffer + dt = float(1.0) + + b_idx = tid / (horizon * nspheres) + + h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres + sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) + if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: + return + + n_mesh = int(0) + # $wp.printf("%d, %d, %d, %d \n", tid, b_idx, h_idx, sph_idx) + # read sphere + sphere_0_distance = float(0.0) + sphere_2_distance = float(0.0) + + sphere_0 = wp.vec3(0.0) + sphere_2 = wp.vec3(0.0) + sphere_int = wp.vec3(0.0) + sphere_temp = wp.vec3(0.0) + k0 = float(0.0) + face_index = int(0) + face_u = float(0.0) + face_v = float(0.0) + sign = float(0.0) + dist = float(0.0) + + uint_zero = wp.uint8(0) + uint_one = wp.uint8(1) + cl_pt = wp.vec3(0.0) + local_pt = wp.vec3(0.0) + in_sphere = pt[b_idx * horizon * nspheres + (h_idx * nspheres) + sph_idx] + in_rad = in_sphere[3] + if in_rad < 0.0: + distance[tid] = 0.0 + if write_grad == 1 
and sparsity_idx[tid] == uint_one: + sparsity_idx[tid] = uint_zero + closest_pt[tid * 4] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + + return + eta = activation_distance[0] + dt = speed_dt[0] + in_rad += eta + + max_dist_buffer = float(0.0) + max_dist_buffer = max_dist + if in_rad > max_dist_buffer: + max_dist_buffer += in_rad + in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + + # read in sphere 0: + # read in sphere 0: + if h_idx > 0: + in_sphere = pt[b_idx * horizon * nspheres + ((h_idx - 1) * nspheres) + sph_idx] + sphere_0 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + sphere_0_distance = wp.length(sphere_0 - in_pt) / 2.0 + + if h_idx < horizon - 1: + in_sphere = pt[b_idx * horizon * nspheres + ((h_idx + 1) * nspheres) + sph_idx] + sphere_2 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + sphere_2_distance = wp.length(sphere_2 - in_pt) / 2.0 + + # read in sphere 2: + closest_distance = float(0.0) + closest_point = wp.vec3(0.0) + i = int(0) + dis_length = float(0.0) + jump_distance = float(0.0) + mid_distance = float(0.0) + n_mesh = n_env_mesh[0] + obj_position = wp.vec3() + + while i < n_mesh: + if mesh_enable[i] == uint_one: + obj_position[0] = mesh_pose[i * 8 + 0] + obj_position[1] = mesh_pose[i * 8 + 1] + obj_position[2] = mesh_pose[i * 8 + 2] + obj_quat = wp.quaternion( + mesh_pose[i * 8 + 4], + mesh_pose[i * 8 + 5], + mesh_pose[i * 8 + 6], + mesh_pose[i * 8 + 3], + ) + + obj_w_pose = wp.transform(obj_position, obj_quat) + obj_w_pose_t = wp.transform_inverse(obj_w_pose) + # transform point to mesh frame: + # mesh_pt = T_inverse @ w_pt + local_pt = wp.transform_point(obj_w_pose, in_pt) + + if wp.mesh_query_point( + mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - local_pt) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - local_pt) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + + closest_distance += dist_metric + closest_point += grad_vec + else: + dist = -1.0 * dist + else: + dist = in_rad + dist = max(dist - in_rad, in_rad) + + mid_distance = dist + # transform sphere -1 + if h_idx > 0 and mid_distance < sphere_0_distance: + jump_distance = mid_distance + j = int(0) + sphere_temp = wp.transform_point(obj_w_pose, sphere_0) + while j < sweep_steps: + k0 = ( + 1.0 - 0.5 * jump_distance / sphere_0_distance + ) # dist could be greater than sphere_0_distance here? 
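+                    # Added note: k0 interpolates from the current sphere center
+                    # toward the previous waypoint (sphere_0); the loop exits once
+                    # jump_distance reaches the half-segment distance, and each
+                    # iteration advances jump_distance by the collision-free
+                    # distance found so far (sphere marching), so fewer mesh
+                    # queries are needed when far from obstacles.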
+ sphere_int = k0 * local_pt + ((1.0 - k0) * sphere_temp) + if wp.mesh_query_point( + mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - sphere_int) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - sphere_int) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + + closest_distance += dist_metric + closest_point += grad_vec + dist = max(dist - in_rad, in_rad) + jump_distance += dist + else: + dist = max(-dist - in_rad, in_rad) + + jump_distance += dist + else: + jump_distance += max_dist_buffer + j += 1 + if jump_distance >= sphere_0_distance: + j = int(sweep_steps) + # transform sphere -1 + if h_idx < horizon - 1 and mid_distance < sphere_2_distance: + jump_distance = mid_distance + j = int(0) + sphere_temp = wp.transform_point(obj_w_pose, sphere_2) + while j < sweep_steps: + k0 = ( + 1.0 - 0.5 * jump_distance / sphere_2_distance + ) # dist could be greater than sphere_0_distance here? + sphere_int = k0 * local_pt + (1.0 - k0) * sphere_temp + + if wp.mesh_query_point( + mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - sphere_int) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - sphere_int) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + + closest_distance += dist_metric + closest_point += grad_vec + dist = max(dist - in_rad, in_rad) + + jump_distance += dist + else: + dist = max(-dist - in_rad, in_rad) + + jump_distance += dist + else: + jump_distance += max_dist_buffer + + j += 1 + if jump_distance >= sphere_2_distance: + j = int(sweep_steps) + + i += 1 + # return + if closest_distance == 0: + if sparsity_idx[tid] == uint_zero: + return + sparsity_idx[tid] = uint_zero + distance[tid] = 0.0 + if write_grad == 1: + closest_pt[tid * 4 + 0] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + return + + if enable_speed_metric == 1 and (h_idx > 0 and h_idx < horizon - 1): + # calculate sphere velocity and acceleration: + norm_vel_vec = wp.vec3(0.0) + sph_acc_vec = wp.vec3(0.0) + sph_vel = wp.float(0.0) + + # use central difference + norm_vel_vec = (0.5 / dt) * (sphere_2 - sphere_0) + sph_acc_vec = (1.0 / (dt * dt)) * (sphere_0 + sphere_2 - 2.0 * in_pt) + # norm_vel_vec = -1.0 * norm_vel_vec + # sph_acc_vec = -1.0 * sph_acc_vec + sph_vel = wp.length(norm_vel_vec) + + norm_vel_vec = norm_vel_vec / sph_vel + + orth_proj = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) - wp.outer( + norm_vel_vec, norm_vel_vec + ) + + curvature_vec = orth_proj * (sph_acc_vec / (sph_vel * sph_vel)) + + closest_point = sph_vel * ((orth_proj * closest_point) - closest_distance * curvature_vec) + + closest_distance = sph_vel * closest_distance + + distance[tid] = weight[0] * closest_distance + sparsity_idx[tid] = uint_one + if write_grad == 1: + # compute gradient: + if closest_distance > 0.0: + closest_distance = weight[0] + closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] + closest_pt[tid * 4 + 1] = 
closest_distance * closest_point[1] + closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] + + +@wp.kernel +def get_swept_closest_pt_batch_env( + pt: wp.array(dtype=wp.vec4), + distance: wp.array(dtype=wp.float32), # this stores the output cost + closest_pt: wp.array(dtype=wp.float32), # this stores the gradient + sparsity_idx: wp.array(dtype=wp.uint8), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), # eta threshold + speed_dt: wp.array(dtype=wp.float32), + mesh: wp.array(dtype=wp.uint64), + mesh_pose: wp.array(dtype=wp.float32), + mesh_enable: wp.array(dtype=wp.uint8), + n_env_mesh: wp.array(dtype=wp.int32), + max_dist: wp.float32, + write_grad: wp.uint8, + batch_size: wp.int32, + horizon: wp.int32, + nspheres: wp.int32, + max_nmesh: wp.int32, + sweep_steps: wp.uint8, + enable_speed_metric: wp.uint8, + env_query_idx: wp.array(dtype=wp.int32), +): + # we launch nspheres kernels + # compute gradient here and return + # distance is negative outside and positive inside + tid = int(0) + tid = wp.tid() + + b_idx = int(0) + h_idx = int(0) + sph_idx = int(0) + # read horizon + + b_idx = tid / (horizon * nspheres) + + h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres + sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) + if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: + return + uint_zero = wp.uint8(0) + uint_one = wp.uint8(1) + env_idx = int(0) + n_mesh = int(0) + # $wp.printf("%d, %d, %d, %d \n", tid, b_idx, h_idx, sph_idx) + # read sphere + sphere_0_distance = float(0.0) + sphere_2_distance = float(0.0) + + sphere_0 = wp.vec3(0.0) + sphere_2 = wp.vec3(0.0) + sphere_int = wp.vec3(0.0) + sphere_temp = wp.vec3(0.0) + grad_vec = wp.vec3(0.0) + eta = float(0.0) + dt = float(0.0) + k0 = float(0.0) + face_index = int(0) + face_u = float(0.0) + face_v = float(0.0) + sign = float(0.0) + dist = float(0.0) + dist_metric = float(0.0) + cl_pt = wp.vec3(0.0) + local_pt = wp.vec3(0.0) + in_sphere = pt[b_idx * horizon * nspheres + (h_idx * nspheres) + sph_idx] + in_rad = in_sphere[3] + if in_rad < 0.0: + distance[tid] = 0.0 + if write_grad == 1 and sparsity_idx[tid] == uint_one: + sparsity_idx[tid] = uint_zero + closest_pt[tid * 4] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + + return + dt = speed_dt[0] + eta = activation_distance[0] + in_rad += eta + max_dist_buffer = float(0.0) + max_dist_buffer = max_dist + if (in_rad) > max_dist_buffer: + max_dist_buffer += in_rad + + in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + # read in sphere 0: + if h_idx > 0: + in_sphere = pt[b_idx * horizon * nspheres + ((h_idx - 1) * nspheres) + sph_idx] + sphere_0 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + sphere_0_distance = wp.length(sphere_0 - in_pt) / 2.0 + if h_idx < horizon - 1: + in_sphere = pt[b_idx * horizon * nspheres + ((h_idx + 1) * nspheres) + sph_idx] + sphere_2 += wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + sphere_2_distance = wp.length(sphere_2 - in_pt) / 2.0 + + # read in sphere 2: + closest_distance = float(0.0) + closest_point = wp.vec3(0.0) + i = int(0) + dis_length = float(0.0) + jump_distance = float(0.0) + mid_distance = float(0.0) + env_idx = env_query_idx[b_idx] + i = max_nmesh * env_idx + n_mesh = i + n_env_mesh[env_idx] + obj_position = wp.vec3() + + while i < n_mesh: + if mesh_enable[i] == uint_one: + # transform point to mesh frame: + # mesh_pt = T_inverse @ w_pt + obj_position[0] = mesh_pose[i * 8 + 0] + obj_position[1] = mesh_pose[i * 8 + 1] + 
obj_position[2] = mesh_pose[i * 8 + 2] + obj_quat = wp.quaternion( + mesh_pose[i * 8 + 4], + mesh_pose[i * 8 + 5], + mesh_pose[i * 8 + 6], + mesh_pose[i * 8 + 3], + ) + + obj_w_pose = wp.transform(obj_position, obj_quat) + obj_w_pose_t = wp.transform_inverse(obj_w_pose) + local_pt = wp.transform_point(obj_w_pose, in_pt) + + if wp.mesh_query_point( + mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - local_pt) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - local_pt) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + + closest_distance += dist_metric + closest_point += grad_vec + else: + dist = -1.0 * dist + else: + dist = max_dist_buffer + dist = max(dist - in_rad, in_rad) + + mid_distance = dist + # transform sphere -1 + if h_idx > 0 and mid_distance < sphere_0_distance: + jump_distance = mid_distance + j = int(0) + sphere_temp = wp.transform_point(obj_w_pose, sphere_0) + while j < sweep_steps: + k0 = ( + 1.0 - 0.5 * jump_distance / sphere_0_distance + ) # dist could be greater than sphere_0_distance here? + sphere_int = k0 * local_pt + ((1.0 - k0) * sphere_temp) + if wp.mesh_query_point( + mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - sphere_int) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - sphere_int) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + + closest_distance += dist_metric + closest_point += grad_vec + dist = max(dist - in_rad, in_rad) + jump_distance += dist + else: + dist = max(-dist - in_rad, in_rad) + jump_distance += dist + else: + jump_distance += max_dist_buffer + j += 1 + if jump_distance >= sphere_0_distance: + j = int(sweep_steps) + # transform sphere -1 + if h_idx < horizon - 1 and mid_distance < sphere_2_distance: + jump_distance = mid_distance + j = int(0) + sphere_temp = wp.transform_point(obj_w_pose, sphere_2) + while j < sweep_steps: + k0 = ( + 1.0 - 0.5 * jump_distance / sphere_2_distance + ) # dist could be greater than sphere_0_distance here? 
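+                    # Added note: symmetric forward sweep toward the next waypoint
+                    # (sphere_2); the sphere_0_distance in the comment above should
+                    # read sphere_2_distance in this branch.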
+ sphere_int = k0 * local_pt + (1.0 - k0) * sphere_temp + + if wp.mesh_query_point( + mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - sphere_int) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - sphere_int) / dis_length + grad_vec = wp.transform_vector(obj_w_pose_t, cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + closest_distance += dist_metric + closest_point += grad_vec + dist = max(dist - in_rad, in_rad) + jump_distance += dist + + else: + dist = max(-dist - in_rad, in_rad) + + jump_distance += dist + else: + jump_distance += max_dist_buffer + + j += 1 + if jump_distance >= sphere_2_distance: + j = int(sweep_steps) + i += 1 + + # return + if closest_distance <= 0.0: + if sparsity_idx[tid] == uint_zero: + return + sparsity_idx[tid] = uint_zero + distance[tid] = 0.0 + if write_grad == 1: + closest_pt[tid * 4 + 0] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + + return + if enable_speed_metric == 1 and (h_idx > 0 and h_idx < horizon - 1): + # calculate sphere velocity and acceleration: + norm_vel_vec = wp.vec3(0.0) + sph_acc_vec = wp.vec3(0.0) + sph_vel = wp.float(0.0) + + # use central difference + norm_vel_vec = (0.5 / dt) * (sphere_2 - sphere_0) + sph_acc_vec = (1.0 / (dt * dt)) * (sphere_0 + sphere_2 - 2.0 * in_pt) + # norm_vel_vec = -1.0 * norm_vel_vec + # sph_acc_vec = -1.0 * sph_acc_vec + sph_vel = wp.length(norm_vel_vec) + + norm_vel_vec = norm_vel_vec / sph_vel + + orth_proj = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) - wp.outer( + norm_vel_vec, norm_vel_vec + ) + + curvature_vec = orth_proj * (sph_acc_vec / (sph_vel * sph_vel)) + + closest_point = sph_vel * ((orth_proj * closest_point) - closest_distance * curvature_vec) + + closest_distance = sph_vel * closest_distance + + distance[tid] = weight[0] * closest_distance + sparsity_idx[tid] = uint_one + if write_grad == 1: + # compute gradient: + if closest_distance > 0.0: + closest_distance = weight[0] + closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] + closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] + closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] + + +@wp.kernel +def get_closest_pt( + pt: wp.array(dtype=wp.vec4), + distance: wp.array(dtype=wp.float32), # this stores the output cost + closest_pt: wp.array(dtype=wp.float32), # this stores the gradient + sparsity_idx: wp.array(dtype=wp.uint8), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), # eta threshold + mesh: wp.array(dtype=wp.uint64), + mesh_pose: wp.array(dtype=wp.float32), + mesh_enable: wp.array(dtype=wp.uint8), + n_env_mesh: wp.array(dtype=wp.int32), + max_dist: wp.float32, + write_grad: wp.uint8, + batch_size: wp.int32, + horizon: wp.int32, + nspheres: wp.int32, + max_nmesh: wp.int32, +): + # we launch nspheres kernels + # compute gradient here and return + # distance is negative outside and positive inside + tid = wp.tid() + n_mesh = int(0) + b_idx = int(0) + h_idx = int(0) + sph_idx = int(0) + + # env_idx = int(0) + + b_idx = tid / (horizon * nspheres) + + h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres + sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) + if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: + return + + face_index 
= int(0) + face_u = float(0.0) + face_v = float(0.0) + sign = float(0.0) + dist = float(0.0) + grad_vec = wp.vec3(0.0) + eta = float(0.05) + dist_metric = float(0.0) + + cl_pt = wp.vec3(0.0) + local_pt = wp.vec3(0.0) + in_sphere = pt[tid] + in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + in_rad = in_sphere[3] + uint_zero = wp.uint8(0) + uint_one = wp.uint8(1) + if in_rad < 0.0: + distance[tid] = 0.0 + if write_grad == 1 and sparsity_idx[tid] == uint_one: + sparsity_idx[tid] = uint_zero + closest_pt[tid * 4] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + return + + eta = activation_distance[0] + in_rad += eta + max_dist_buffer = float(0.0) + max_dist_buffer = max_dist + if in_rad > max_dist_buffer: + max_dist_buffer += in_rad + + # TODO: read vec4 and use first 3 for sphere position and last one for radius + # in_pt = pt[tid] + closest_distance = float(0.0) + closest_point = wp.vec3(0.0) + i = int(0) + dis_length = float(0.0) + # read env index: + # env_idx = env_query_idx[b_idx] + # read number of boxes in current environment: + + # get start index + i = int(0) + n_mesh = n_env_mesh[0] + obj_position = wp.vec3() + + # mesh_idx = wp.uint64(0) + while i < n_mesh: + if mesh_enable[i] == uint_one: + # transform point to mesh frame: + # mesh_pt = T_inverse @ w_pt + + # read object pose: + obj_position[0] = mesh_pose[i * 8 + 0] + obj_position[1] = mesh_pose[i * 8 + 1] + obj_position[2] = mesh_pose[i * 8 + 2] + obj_quat = wp.quaternion( + mesh_pose[i * 8 + 4], + mesh_pose[i * 8 + 5], + mesh_pose[i * 8 + 6], + mesh_pose[i * 8 + 3], + ) + + obj_w_pose = wp.transform(obj_position, obj_quat) + + local_pt = wp.transform_point(obj_w_pose, in_pt) + # mesh_idx = mesh[i] + if wp.mesh_query_point( + mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - local_pt) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - local_pt) / dis_length + grad_vec = wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + closest_distance += dist_metric + closest_point += grad_vec + + i += 1 + + if closest_distance == 0: + if sparsity_idx[tid] == uint_zero: + return + sparsity_idx[tid] = uint_zero + distance[tid] = 0.0 + if write_grad == 1: + closest_pt[tid * 4 + 0] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + else: + distance[tid] = weight[0] * closest_distance + sparsity_idx[tid] = uint_one + if write_grad == 1: + # compute gradient: + if closest_distance > 0.0: + closest_distance = weight[0] + closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] + closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] + closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] + + +@wp.kernel +def get_closest_pt_batch_env( + pt: wp.array(dtype=wp.vec4), + distance: wp.array(dtype=wp.float32), # this stores the output cost + closest_pt: wp.array(dtype=wp.float32), # this stores the gradient + sparsity_idx: wp.array(dtype=wp.uint8), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), # eta threshold + mesh: wp.array(dtype=wp.uint64), + mesh_pose: wp.array(dtype=wp.float32), + mesh_enable: wp.array(dtype=wp.uint8), + n_env_mesh: wp.array(dtype=wp.int32), + max_dist: wp.float32, + write_grad: wp.uint8, + 
batch_size: wp.int32, + horizon: wp.int32, + nspheres: wp.int32, + max_nmesh: wp.int32, + env_query_idx: wp.array(dtype=wp.int32), +): + # we launch nspheres kernels + # compute gradient here and return + # distance is negative outside and positive inside + tid = wp.tid() + n_mesh = int(0) + b_idx = int(0) + h_idx = int(0) + sph_idx = int(0) + + env_idx = int(0) + + b_idx = tid / (horizon * nspheres) + + h_idx = (tid - (b_idx * (horizon * nspheres))) / nspheres + sph_idx = tid - (b_idx * horizon * nspheres) - (h_idx * nspheres) + if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres: + return + + face_index = int(0) + face_u = float(0.0) + face_v = float(0.0) + sign = float(0.0) + dist = float(0.0) + grad_vec = wp.vec3(0.0) + eta = float(0.0) + dist_metric = float(0.0) + max_dist_buffer = float(0.0) + uint_zero = wp.uint8(0) + uint_one = wp.uint8(1) + cl_pt = wp.vec3(0.0) + local_pt = wp.vec3(0.0) + in_sphere = pt[tid] + in_pt = wp.vec3(in_sphere[0], in_sphere[1], in_sphere[2]) + in_rad = in_sphere[3] + if in_rad < 0.0: + distance[tid] = 0.0 + if write_grad == 1 and sparsity_idx[tid] == uint_one: + sparsity_idx[tid] = uint_zero + closest_pt[tid * 4] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + + return + eta = activation_distance[0] + in_rad += eta + max_dist_buffer = max_dist + if (in_rad) > max_dist_buffer: + max_dist_buffer += in_rad + + # TODO: read vec4 and use first 3 for sphere position and last one for radius + # in_pt = pt[tid] + closest_distance = float(0.0) + closest_point = wp.vec3(0.0) + dis_length = float(0.0) + + # read env index: + env_idx = env_query_idx[b_idx] + # read number of boxes in current environment: + + # get start index + i = int(0) + i = max_nmesh * env_idx + n_mesh = i + n_env_mesh[env_idx] + obj_position = wp.vec3() + + while i < n_mesh: + if mesh_enable[i] == uint_one: + # transform point to mesh frame: + # mesh_pt = T_inverse @ w_pt + obj_position[0] = mesh_pose[i * 8 + 0] + obj_position[1] = mesh_pose[i * 8 + 1] + obj_position[2] = mesh_pose[i * 8 + 2] + obj_quat = wp.quaternion( + mesh_pose[i * 8 + 4], + mesh_pose[i * 8 + 5], + mesh_pose[i * 8 + 6], + mesh_pose[i * 8 + 3], + ) + + obj_w_pose = wp.transform(obj_position, obj_quat) + + local_pt = wp.transform_point(obj_w_pose, in_pt) + + if wp.mesh_query_point( + mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v + ): + cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v) + dis_length = wp.length(cl_pt - local_pt) + dist = (-1.0 * dis_length * sign) + in_rad + if dist > 0: + cl_pt = sign * (cl_pt - local_pt) / dis_length + grad_vec = wp.transform_vector(wp.transform_inverse(obj_w_pose), cl_pt) + if dist > eta: + dist_metric = dist - 0.5 * eta + elif dist <= eta: + dist_metric = (0.5 / eta) * (dist) * dist + grad_vec = (1.0 / eta) * dist * grad_vec + closest_distance += dist_metric + closest_point += grad_vec + i += 1 + + if closest_distance == 0: + if sparsity_idx[tid] == uint_zero: + return + sparsity_idx[tid] = uint_zero + distance[tid] = 0.0 + if write_grad == 1: + closest_pt[tid * 4 + 0] = 0.0 + closest_pt[tid * 4 + 1] = 0.0 + closest_pt[tid * 4 + 2] = 0.0 + else: + distance[tid] = weight[0] * closest_distance + sparsity_idx[tid] = uint_one + if write_grad == 1: + # compute gradient: + if closest_distance > 0.0: + closest_distance = weight[0] + closest_pt[tid * 4 + 0] = closest_distance * closest_point[0] + closest_pt[tid * 4 + 1] = closest_distance * closest_point[1] + closest_pt[tid * 4 + 2] = closest_distance * closest_point[2] + 
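+
+# Added for illustration (not part of the original file): a minimal pure-Python
+# reference of the smoothed penetration cost computed by the kernels above. With
+# activation distance eta folded into the sphere radius, the signed distance d is
+# positive once a sphere is within eta of a surface; the cost is quadratic near
+# the activation boundary and linear inside it, which keeps the gradient
+# continuous at d == eta. The helper name is hypothetical.
+def _smoothed_sphere_cost(d: float, eta: float) -> float:
+    # mirrors the dist_metric branches in get_closest_pt / get_swept_closest_pt
+    if d <= 0.0:
+        return 0.0
+    if d > eta:
+        return d - 0.5 * eta
+    return (0.5 / eta) * d * d
+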
+ +class SdfMeshWarpPy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_spheres, + out_cost, + out_grad, + sparsity_idx, + weight, + activation_distance, + mesh_idx, + mesh_pose_inverse, + mesh_enable, + n_env_mesh, + max_dist=0.05, + env_query_idx=None, + return_loss=False, + ): + b, h, n, _ = query_spheres.shape + + if env_query_idx is None: + # launch + wp.launch( + kernel=get_closest_pt, + dim=b * h * n, + inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + max_dist, + query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + ], + stream=wp.stream_from_torch(query_spheres.device), + ) + else: + wp.launch( + kernel=get_closest_pt_batch_env, + dim=b * h * n, + inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + max_dist, + query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), + ], + stream=wp.stream_from_torch(query_spheres.device), + ) + ctx.return_loss = return_loss + ctx.save_for_backward(out_grad) + return out_cost + + @staticmethod + def backward(ctx, grad_output): + grad_sph = None + if ctx.needs_input_grad[0]: + (r,) = ctx.saved_tensors + grad_sph = r + if ctx.return_loss: + grad_sph = r * grad_output.unsqueeze(-1) + return grad_sph, None, None, None, None, None, None, None, None, None, None, None, None + + +class SweptSdfMeshWarpPy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + query_spheres, + out_cost, + out_grad, + sparsity_idx, + weight, + activation_distance, + speed_dt, + mesh_idx, + mesh_pose_inverse, + mesh_enable, + n_env_mesh, + sweep_steps=1, + enable_speed_metric=False, + max_dist=0.05, + env_query_idx=None, + return_loss=False, + ): + b, h, n, _ = query_spheres.shape + + if env_query_idx is None: + wp.launch( + kernel=get_swept_closest_pt, + dim=b * h * n, + inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(speed_dt), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + max_dist, + query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + sweep_steps, + enable_speed_metric, + ], + stream=wp.stream_from_torch(query_spheres.device), + ) + else: + wp.launch( + kernel=get_swept_closest_pt_batch_env, + dim=b * h * n, + 
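+                # added note: one Warp thread per (batch, horizon, sphere) entry;
+                # the kernel decomposes tid into b_idx, h_idx, and sph_idx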
inputs=[ + wp.from_torch(query_spheres.detach().view(-1, 4), dtype=wp.vec4), + wp.from_torch(out_cost.view(-1)), + wp.from_torch(out_grad.view(-1), dtype=wp.float32), + wp.from_torch(sparsity_idx.view(-1), dtype=wp.uint8), + wp.from_torch(weight), + wp.from_torch(activation_distance), + wp.from_torch(speed_dt), + wp.from_torch(mesh_idx.view(-1), dtype=wp.uint64), + wp.from_torch(mesh_pose_inverse.view(-1), dtype=wp.float32), + wp.from_torch(mesh_enable.view(-1), dtype=wp.uint8), + wp.from_torch(n_env_mesh.view(-1), dtype=wp.int32), + max_dist, + query_spheres.requires_grad, + b, + h, + n, + mesh_idx.shape[1], + sweep_steps, + enable_speed_metric, + wp.from_torch(env_query_idx.view(-1), dtype=wp.int32), + ], + stream=wp.stream_from_torch(query_spheres.device), + ) + ctx.return_loss = return_loss + ctx.save_for_backward(out_grad) + return out_cost + + @staticmethod + def backward(ctx, grad_output): + grad_sph = None + if ctx.needs_input_grad[0]: + (r,) = ctx.saved_tensors + grad_sph = r + if ctx.return_loss: + grad_sph = grad_sph * grad_output.unsqueeze(-1) + return ( + grad_sph, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/curobo/geom/sdf/world.py b/src/curobo/geom/sdf/world.py new file mode 100644 index 00000000..09bd074d --- /dev/null +++ b/src/curobo/geom/sdf/world.py @@ -0,0 +1,900 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+
+# Standard Library
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.curobolib.geom import SdfSphereOBB, SdfSweptSphereOBB
+from curobo.geom.types import Cuboid, Mesh, Obstacle, WorldConfig, batch_tensor_cube
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.util.logger import log_error, log_info, log_warn
+
+
+@dataclass
+class CollisionBuffer:
+    distance_buffer: torch.Tensor
+    grad_distance_buffer: torch.Tensor
+    sparsity_index_buffer: torch.Tensor
+    shape: Optional[torch.Size] = None
+
+    def __post_init__(self):
+        self.shape = self.distance_buffer.shape
+
+    @classmethod
+    def initialize_from_shape(cls, shape: torch.Size, tensor_args: TensorDeviceType):
+        batch, horizon, n_spheres, _ = shape
+        distance_buffer = torch.zeros(
+            (batch, horizon, n_spheres), device=tensor_args.device, dtype=tensor_args.dtype
+        )
+        grad_distance_buffer = torch.zeros(
+            (batch, horizon, n_spheres, 4), device=tensor_args.device, dtype=tensor_args.dtype
+        )
+        sparsity_idx = torch.zeros(
+            (batch, horizon, n_spheres),
+            device=tensor_args.device,
+            dtype=torch.uint8,
+        )
+        return CollisionBuffer(distance_buffer, grad_distance_buffer, sparsity_idx)
+
+    def _update_from_shape(self, shape: torch.Size, tensor_args: TensorDeviceType):
+        batch, horizon, n_spheres, _ = shape
+        self.distance_buffer = torch.zeros(
+            (batch, horizon, n_spheres), device=tensor_args.device, dtype=tensor_args.dtype
+        )
+        self.grad_distance_buffer = torch.zeros(
+            (batch, horizon, n_spheres, 4), device=tensor_args.device, dtype=tensor_args.dtype
+        )
+        self.sparsity_index_buffer = torch.zeros(
+            (batch, horizon, n_spheres),
+            device=tensor_args.device,
+            dtype=torch.uint8,
+        )
+        self.shape = shape[:3]
+
+    def update_buffer_shape(self, shape: torch.Size, tensor_args: TensorDeviceType):
+        # reallocate buffers only when the query shape changes:
+        if self.shape != shape[:3]:
+            self._update_from_shape(shape, tensor_args)
+
+    def clone(self):
+        dist_buffer = self.distance_buffer.clone()
+        grad_buffer = self.grad_distance_buffer.clone()
+        sparse_buffer = self.sparsity_index_buffer.clone()
+        return CollisionBuffer(dist_buffer, grad_buffer, sparse_buffer)
+
+    def __mul__(self, scalar: float):
+        self.distance_buffer *= scalar
+        self.grad_distance_buffer *= scalar
+        self.sparsity_index_buffer *= int(scalar)
+        return self
+
+
+@dataclass
+class CollisionQueryBuffer:
+    """Stores all buffers required to query collisions.
+
+    This class currently has three main buffers, one each for the primitive, mesh,
+    and blox collision checkers. We initialize only the buffers required for the
+    collision types enabled in the checker.
+ """ + + primitive_collision_buffer: Optional[CollisionBuffer] = None + mesh_collision_buffer: Optional[CollisionBuffer] = None + blox_collision_buffer: Optional[CollisionBuffer] = None + shape: Optional[torch.Size] = None + + def __post_init__(self): + if self.shape is None: + if self.primitive_collision_buffer is not None: + self.shape = self.primitive_collision_buffer.shape + elif self.mesh_collision_buffer is not None: + self.shape = self.mesh_collision_buffer.shape + elif self.blox_collision_buffer is not None: + self.shape = self.blox_collision_buffer.shape + + def __mul__(self, scalar: float): + if self.primitive_collision_buffer is not None: + self.primitive_collision_buffer = self.primitive_collision_buffer * scalar + if self.mesh_collision_buffer is not None: + self.mesh_collision_buffer = self.mesh_collision_buffer * scalar + if self.blox_collision_buffer is not None: + self.blox_collision_buffer = self.blox_collision_buffer * scalar + return self + + def clone(self): + prim_buffer = mesh_buffer = blox_buffer = None + if self.primitive_collision_buffer is not None: + prim_buffer = self.primitive_collision_buffer.clone() + if self.mesh_collision_buffer is not None: + mesh_buffer = self.mesh_collision_buffer.clone() + if self.blox_collision_buffer is not None: + blox_buffer = self.blox_collision_buffer.clone() + return CollisionQueryBuffer(prim_buffer, mesh_buffer, blox_buffer, self.shape) + + @classmethod + def initialize_from_shape( + cls, + shape: torch.Size, + tensor_args: TensorDeviceType, + collision_types: Dict[str, bool], + ): + primitive_buffer = mesh_buffer = blox_buffer = None + if "primitive" in collision_types and collision_types["primitive"]: + primitive_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + if "mesh" in collision_types and collision_types["mesh"]: + mesh_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + if "blox" in collision_types and collision_types["blox"]: + blox_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + return CollisionQueryBuffer(primitive_buffer, mesh_buffer, blox_buffer) + + def create_from_shape( + self, + shape: torch.Size, + tensor_args: TensorDeviceType, + collision_types: Dict[str, bool], + ): + if "primitive" in collision_types and collision_types["primitive"]: + self.primitive_collision_buffer = CollisionBuffer.initialize_from_shape( + shape, tensor_args + ) + if "mesh" in collision_types and collision_types["mesh"]: + self.mesh_collision_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + if "blox" in collision_types and collision_types["blox"]: + self.blox_collision_buffer = CollisionBuffer.initialize_from_shape(shape, tensor_args) + self.shape = shape + # return self + + def update_buffer_shape( + self, + shape: torch.Size, + tensor_args: TensorDeviceType, + collision_types: Optional[Dict[str, bool]], + ): + # print(shape, self.shape) + # update buffers: + assert len(shape) == 4 # shape is: batch, horizon, n_spheres, 4 + if self.shape is None: # buffers not initialized: + self.create_from_shape(shape, tensor_args, collision_types) + # print("Creating new memory", self.shape) + else: + # update buffers if shape doesn't match: + # TODO: allow for dynamic change of collision_types + if self.primitive_collision_buffer is not None: + # print(self.primitive_collision_buffer.shape, shape) + self.primitive_collision_buffer.update_buffer_shape(shape, tensor_args) + if self.mesh_collision_buffer is not None: + self.mesh_collision_buffer.update_buffer_shape(shape, 
tensor_args) + if self.blox_collision_buffer is not None: + self.blox_collision_buffer.update_buffer_shape(shape, tensor_args) + self.shape = shape + + def get_gradient_buffer( + self, + ): + prim_buffer = mesh_buffer = blox_buffer = None + current_buffer = None + if self.primitive_collision_buffer is not None: + prim_buffer = self.primitive_collision_buffer.grad_distance_buffer + current_buffer = prim_buffer.clone() + + if self.mesh_collision_buffer is not None: + mesh_buffer = self.mesh_collision_buffer.grad_distance_buffer + if current_buffer is None: + current_buffer = mesh_buffer.clone() + else: + current_buffer += mesh_buffer + if self.blox_collision_buffer is not None: + blox_buffer = self.blox_collision_buffer.grad_distance_buffer + if current_buffer is None: + current_buffer = blox_buffer.clone() + else: + current_buffer += blox_buffer + + return current_buffer + + +class CollisionCheckerType(Enum): + """Type of collision checker to use. + Args: + Enum (_type_): _description_ + """ + + PRIMITIVE = "PRIMITIVE" + BLOX = "BLOX" + MESH = "MESH" + + +@dataclass +class WorldCollisionConfig: + tensor_args: TensorDeviceType + world_model: Optional[Union[List[WorldConfig], WorldConfig]] = None + cache: Optional[Dict[Obstacle, int]] = None + n_envs: int = 1 + checker_type: CollisionCheckerType = CollisionCheckerType.PRIMITIVE + max_distance: float = 0.01 + + def __post_init__(self): + if self.world_model is not None and isinstance(self.world_model, list): + self.n_envs = len(self.world_model) + + @staticmethod + def load_from_dict( + world_coll_checker_dict: Dict, + world_model_dict: Union[WorldConfig, Dict, List[WorldConfig]] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + world_cfg = world_model_dict + if world_model_dict is not None: + if isinstance(world_model_dict, list) and isinstance(world_model_dict[0], dict): + world_cfg = [WorldConfig.from_dict(x) for x in world_model_dict] + elif isinstance(world_model_dict, dict): + world_cfg = WorldConfig.from_dict(world_model_dict) + world_coll_checker_dict["checker_type"] = CollisionCheckerType( + world_coll_checker_dict["checker_type"] + ) + return WorldCollisionConfig( + tensor_args=tensor_args, world_model=world_cfg, **world_coll_checker_dict + ) + + +class WorldCollision(WorldCollisionConfig): + def __init__(self, config: Optional[WorldCollisionConfig] = None): + if config is not None: + WorldCollisionConfig.__init__(self, **vars(config)) + self.collision_types = {} # Use this dictionary to store collision types + + def load_collision_model(self, world_model: WorldConfig): + raise NotImplementedError + + def get_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + """ + raise NotImplementedError + + def get_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + we assume we don't need gradient for this function. 
If you need gradient, use get_sphere_distance + """ + + raise NotImplementedError + + def get_swept_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + raise NotImplementedError + + def get_swept_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + raise NotImplementedError + + def get_sphere_trace( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + sweep_steps: int, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + raise NotImplementedError + + +class WorldPrimitiveCollision(WorldCollision): + """World Oriented Bounding Box representation object + + We represent the world with oriented bounding boxes. For speed, we assume there is a + maximum number of obbs that can be instantiated. This number is read from the WorldCollisionConfig. + If no cache is setup, we use the number from the first call of load_collision_model. + """ + + def __init__(self, config: WorldCollisionConfig): + super().__init__(config) + self._world_cubes = None + self._cube_tensor_list = None + self._env_n_obbs = None + self._env_obbs_names = None + self._init_cache() + if self.world_model is not None: + if isinstance(self.world_model, list): + self.load_batch_collision_model(self.world_model) + else: + self.load_collision_model(self.world_model) + + def _init_cache(self): + if self.cache is not None and "obb" in self.cache and self.cache["obb"] not in [None, 0]: + self._create_obb_cache(self.cache["obb"]) + + def load_collision_model(self, world_config: WorldConfig, env_idx=0): + self._load_collision_model_in_cache(world_config, env_idx) + + def load_batch_collision_model(self, world_config_list: List[WorldConfig]): + """Load a batch of collision environments from a list of world configs. + + Args: + world_config_list: list of world configs to load from. 
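
+        Example (illustrative sketch, not part of this file; assumes the helper
+        functions ``get_world_configs_path``, ``join_path``, and ``load_yaml`` from
+        ``curobo.util_file`` and the packaged world description files exist)::
+
+            world_files = ["collision_table.yml", "collision_cubby.yml"]
+            world_cfgs = [
+                WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), f)))
+                for f in world_files
+            ]
+            config = WorldCollisionConfig(tensor_args=TensorDeviceType(), n_envs=len(world_cfgs))
+            coll_checker = WorldPrimitiveCollision(config)
+            coll_checker.load_batch_collision_model(world_cfgs)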
+ """ + # First find largest number of cuboid: + c_len = [] + pose_batch = [] + dims_batch = [] + names_batch = [] + for i in world_config_list: + c = i.cuboid + if c is not None: + c_len.append(len(c)) + pose_batch.extend([i.pose for i in c]) + dims_batch.extend([i.dims for i in c]) + names_batch.extend([i.name for i in c]) + else: + c_len.append(0) + + max_obb = max(c_len) + if max_obb < 1: + log_warn("No obbs found") + return + # check if number of environments is same as config: + reset_buffers = False + if self._env_n_obbs is not None and len(world_config_list) != len(self._env_n_obbs): + log_warn( + "env_n_obbs is not same as world_config_list, reloading collision buffers (breaks CG)" + ) + reset_buffers = True + self.n_envs = len(world_config_list) + self._env_n_obbs = torch.zeros( + (self.n_envs), device=self.tensor_args.device, dtype=torch.int32 + ) + + if self._cube_tensor_list is not None and self._cube_tensor_list[0].shape[1] < max_obb: + log_warn( + "number of obbs is greater than buffer, reloading collision buffers (breaks CG)" + ) + reset_buffers = True + # create cache if does not exist: + if self._cube_tensor_list is None or reset_buffers: + log_info("Creating Obb cache" + str(max_obb)) + self._create_obb_cache(max_obb) + + # load obstacles: + ## load data into gpu: + cube_batch = batch_tensor_cube(pose_batch, dims_batch, self.tensor_args) + c_start = 0 + for i in range(len(self._env_n_obbs)): + if c_len[i] > 0: + # load obb: + self._cube_tensor_list[0][i, : c_len[i], :3] = cube_batch[0][ + c_start : c_start + c_len[i] + ] + self._cube_tensor_list[1][i, : c_len[i], :7] = cube_batch[1][ + c_start : c_start + c_len[i] + ] + self._cube_tensor_list[2][i, : c_len[i]] = 1 + self._env_obbs_names[i][: c_len[i]] = names_batch[c_start : c_start + c_len[i]] + self._cube_tensor_list[2][i, c_len[i] :] = 0 + c_start += c_len[i] + self._env_n_obbs[:] = torch.as_tensor( + c_len, dtype=torch.int32, device=self.tensor_args.device + ) + self.collision_types["primitive"] = True + + def _load_collision_model_in_cache(self, world_config: WorldConfig, env_idx: int = 0): + cube_objs = world_config.cuboid + max_obb = len(cube_objs) + self.world_model = world_config + if max_obb < 1: + log_info("No OBB objs") + return + if self._cube_tensor_list is None or self._cube_tensor_list[0].shape[1] < max_obb: + log_info("Creating Obb cache" + str(max_obb)) + self._create_obb_cache(max_obb) + + # load as a batch: + pose_batch = [c.pose for c in cube_objs] + dims_batch = [c.dims for c in cube_objs] + names_batch = [c.name for c in cube_objs] + cube_batch = batch_tensor_cube(pose_batch, dims_batch, self.tensor_args) + + self._cube_tensor_list[0][env_idx, :max_obb, :3] = cube_batch[0] + self._cube_tensor_list[1][env_idx, :max_obb, :7] = cube_batch[1] + + self._cube_tensor_list[2][env_idx, :max_obb] = 1 # enabling obstacle + + self._cube_tensor_list[2][env_idx, max_obb:] = 0 # disabling obstacle + # self._cube_tensor_list[1][env_idx, max_obb:, 0] = 1000.0 # Not needed. 
TODO: remove
+
+        self._env_n_obbs[env_idx] = max_obb
+        self._env_obbs_names[env_idx][:max_obb] = names_batch
+        self.collision_types["primitive"] = True
+
+    def _create_obb_cache(self, obb_cache):
+        box_dims = (
+            torch.zeros(
+                (self.n_envs, obb_cache, 4),
+                dtype=self.tensor_args.dtype,
+                device=self.tensor_args.device,
+            )
+            + 0.01
+        )
+        box_pose = torch.zeros(
+            (self.n_envs, obb_cache, 8),
+            dtype=self.tensor_args.dtype,
+            device=self.tensor_args.device,
+        )
+        box_pose[..., 3] = 1.0
+        obs_enable = torch.zeros(
+            (self.n_envs, obb_cache), dtype=torch.uint8, device=self.tensor_args.device
+        )
+        self._env_n_obbs = torch.zeros(
+            (self.n_envs), device=self.tensor_args.device, dtype=torch.int32
+        )
+        self._cube_tensor_list = [box_dims, box_pose, obs_enable]
+        self.collision_types["primitive"] = True
+        self._env_obbs_names = [[None for _ in range(obb_cache)] for _ in range(self.n_envs)]
+
+    def add_obb_from_raw(
+        self,
+        name: str,
+        dims: torch.Tensor,
+        env_idx: int,
+        w_obj_pose: Optional[Pose] = None,
+        obj_w_pose: Optional[Pose] = None,
+    ):
+        """Add an oriented bounding box obstacle from raw data.
+
+        Args:
+            dims: length, width, height
+            w_obj_pose: pose of the object in the world frame.
+            obj_w_pose: inverse pose of the object; one of the two poses must be given.
+        """
+        assert w_obj_pose is not None or obj_w_pose is not None
+        if name in self._env_obbs_names[env_idx]:
+            log_error("Obstacle already exists with name: " + name, exc_info=True)
+        if w_obj_pose is not None:
+            obj_w_pose = w_obj_pose.inverse()
+
+        self._cube_tensor_list[0][env_idx, self._env_n_obbs[env_idx], :3] = dims
+        self._cube_tensor_list[1][
+            env_idx, self._env_n_obbs[env_idx], :7
+        ] = obj_w_pose.get_pose_vector()
+        self._cube_tensor_list[2][env_idx, self._env_n_obbs[env_idx]] = 1
+        self._env_obbs_names[env_idx][self._env_n_obbs[env_idx]] = name
+        self._env_n_obbs[env_idx] += 1
+        return self._env_n_obbs[env_idx] - 1
+
+    def add_obb(
+        self,
+        cuboid: Cuboid,
+        env_idx: int = 0,
+    ):
+        return self.add_obb_from_raw(
+            cuboid.name,
+            self.tensor_args.to_device(cuboid.dims),
+            env_idx,
+            Pose.from_list(cuboid.pose, self.tensor_args),
+        )
+
+    def update_obb_dims(
+        self,
+        obj_dims: torch.Tensor,
+        name: Optional[str] = None,
+        env_obj_idx: Optional[torch.Tensor] = None,
+        env_idx: int = 0,
+    ):
+        """Update obstacle dimensions.
+
+        Args:
+            obj_dims (torch.Tensor): [dim.x, dim.y, dim.z], given as [b, 3].
+            name: name of the obstacle to update.
+            env_obj_idx: index of the obstacle; used instead of name when given.
+        """
+        if env_obj_idx is not None:
+            self._cube_tensor_list[0][env_obj_idx, :3] = obj_dims
+        else:
+            # find index of given name:
+            obs_idx = self.get_obb_idx(name, env_idx)
+
+            self._cube_tensor_list[0][env_idx, obs_idx, :3] = obj_dims
+
+    def enable_obstacle(
+        self,
+        name: str,
+        enable: bool = True,
+        env_idx: int = 0,
+    ):
+        return self.enable_obb(enable, name, None, env_idx)
+
+    def enable_obb(
+        self,
+        enable: bool = True,
+        name: Optional[str] = None,
+        env_obj_idx: Optional[torch.Tensor] = None,
+        env_idx: int = 0,
+    ):
+        """Enable or disable an obstacle.
+
+        Args:
+            enable: True to enable, False to disable the obstacle.
+            name: name of the obstacle to update.
+            env_obj_idx: index of the obstacle; used instead of name when given.
+        """
+        if env_obj_idx is not None:
+            self._cube_tensor_list[2][env_obj_idx] = int(enable)  # enable == 1
+        else:
+            # find index of given name:
+            obs_idx = self.get_obb_idx(name, env_idx)
+
+            self._cube_tensor_list[2][env_idx, obs_idx] = int(enable)
+
+    def update_obstacle_pose(
+        self,
+        name: str,
+        w_obj_pose: Pose,
+        env_idx: int = 0,
+    ):
+        if self._env_obbs_names is not None and name in self._env_obbs_names[env_idx]:
+            self.update_obb_pose(name=name, w_obj_pose=w_obj_pose, env_idx=env_idx)
+        else:
+            log_error("obstacle not found in OBB world model: " + name)
+
+    def update_obb_pose(
+        self,
+        w_obj_pose: Optional[Pose] = None,
+        obj_w_pose: Optional[Pose] = None,
+        name: Optional[str] = None,
+        env_obj_idx: Optional[torch.Tensor] = None,
+        env_idx: int = 0,
+    ):
+        """Update the pose of a specific object.
+
+        Args:
+            w_obj_pose: pose of the object in the world frame.
+            obj_w_pose: inverse pose of the object; one of the two poses must be given.
+            name: name of the obstacle to update.
+            env_obj_idx: index of the obstacle; used instead of name when given.
+        """
+        obj_w_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose)
+        if env_obj_idx is not None:
+            self._cube_tensor_list[1][env_obj_idx, :7] = obj_w_pose.get_pose_vector()
+        else:
+            obs_idx = self.get_obb_idx(name, env_idx)
+            self._cube_tensor_list[1][env_idx, obs_idx, :7] = obj_w_pose.get_pose_vector()
+
+    @classmethod
+    def _get_obstacle_poses(
+        cls,
+        w_obj_pose: Optional[Pose] = None,
+        obj_w_pose: Optional[Pose] = None,
+    ):
+        if w_obj_pose is not None:
+            w_inv_pose = w_obj_pose.inverse()
+        elif obj_w_pose is not None:
+            w_inv_pose = obj_w_pose
+        else:
+            raise ValueError("Object pose is not given")
+        return w_inv_pose
+
+    def get_obb_idx(
+        self,
+        name: str,
+        env_idx: int = 0,
+    ) -> int:
+        if name not in self._env_obbs_names[env_idx]:
+            log_error("Obstacle with name: " + name + " not found in current world", exc_info=True)
+        return self._env_obbs_names[env_idx].index(name)
+
+    def get_sphere_distance(
+        self,
+        query_sphere,
+        collision_query_buffer: CollisionQueryBuffer,
+        weight: torch.Tensor,
+        activation_distance: torch.Tensor,
+        env_query_idx: Optional[torch.Tensor] = None,
+        return_loss=False,
+    ):
+        if "primitive" not in self.collision_types or not self.collision_types["primitive"]:
+            raise ValueError("Primitive Collision has no obstacles")
+
+        b, h, n, _ = query_sphere.shape  # This can be read from collision query buffer
+        use_batch_env = True
+        if env_query_idx is None:
+            use_batch_env = False
+            env_query_idx = self._env_n_obbs
+
+        dist = SdfSphereOBB.apply(
+            query_sphere,
+            collision_query_buffer.primitive_collision_buffer.distance_buffer,
+            collision_query_buffer.primitive_collision_buffer.grad_distance_buffer,
+            collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer,
+            weight,
+            activation_distance,
+            self._cube_tensor_list[0],
+            self._cube_tensor_list[0],
+            self._cube_tensor_list[1],
+            self._cube_tensor_list[2],
+            self._env_n_obbs,
+            env_query_idx,
+            self._cube_tensor_list[0].shape[1],
+            b,
+            h,
+            n,
+            query_sphere.requires_grad,
+            True,
+            use_batch_env,
+            return_loss,
+        )
+
+        return dist
+
+    def get_sphere_collision(
+        self,
+        query_sphere,
+        collision_query_buffer: CollisionQueryBuffer,
+        weight: torch.Tensor,
+        activation_distance: torch.Tensor,
+        env_query_idx: Optional[torch.Tensor] = None,
+        return_loss=False,
+    ):
+        if "primitive" not in self.collision_types or not self.collision_types["primitive"]:
+            raise ValueError("Primitive Collision has no obstacles")
+        if return_loss:
+            raise ValueError("cannot return loss for classification, use get_sphere_distance")
+        b, h, n, _ = query_sphere.shape
+        use_batch_env = True
+        if env_query_idx is None:
+            use_batch_env = False
+            env_query_idx = self._env_n_obbs
+
+        dist = SdfSphereOBB.apply(
+            query_sphere,
+            collision_query_buffer.primitive_collision_buffer.distance_buffer,
+            collision_query_buffer.primitive_collision_buffer.grad_distance_buffer,
+            collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer,
+            weight,
+            activation_distance,
+            self._cube_tensor_list[0],
+            self._cube_tensor_list[0],
+            self._cube_tensor_list[1],
+            self._cube_tensor_list[2],
self._env_n_obbs, + env_query_idx, + self._cube_tensor_list[0].shape[1], + b, + h, + n, + query_sphere.requires_grad, + False, + use_batch_env, + return_loss, + ) + return dist + + def get_swept_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + """ + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + raise ValueError("Primitive Collision has no obstacles") + + b, h, n, _ = query_sphere.shape + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_obbs + + dist = SdfSweptSphereOBB.apply( + query_sphere, + collision_query_buffer.primitive_collision_buffer.distance_buffer, + collision_query_buffer.primitive_collision_buffer.grad_distance_buffer, + collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + speed_dt, + self._cube_tensor_list[0], + self._cube_tensor_list[0], + self._cube_tensor_list[1], + self._cube_tensor_list[2], + self._env_n_obbs, + env_query_idx, + self._cube_tensor_list[0].shape[1], + b, + h, + n, + sweep_steps, + enable_speed_metric, + query_sphere.requires_grad, + True, + use_batch_env, + return_loss, + ) + + return dist + + def get_swept_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss=False, + ): + """ + Computes the signed distance via analytic function + Args: + tensor_sphere: b, n, 4 + """ + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + raise ValueError("Primitive Collision has no obstacles") + if return_loss: + raise ValueError("cannot return loss for classify, use get_swept_sphere_distance") + b, h, n, _ = query_sphere.shape + + use_batch_env = True + if env_query_idx is None: + use_batch_env = False + env_query_idx = self._env_n_obbs + dist = SdfSweptSphereOBB.apply( + query_sphere, + collision_query_buffer.primitive_collision_buffer.distance_buffer, + collision_query_buffer.primitive_collision_buffer.grad_distance_buffer, + collision_query_buffer.primitive_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + speed_dt, + self._cube_tensor_list[0], + self._cube_tensor_list[0], + self._cube_tensor_list[1], + self._cube_tensor_list[2], + self._env_n_obbs, + env_query_idx, + self._cube_tensor_list[0].shape[1], + b, + h, + n, + sweep_steps, + enable_speed_metric, + query_sphere.requires_grad, + False, + use_batch_env, + return_loss, + ) + + return dist + + def clear_cache(self): + pass + + def get_voxels_in_bounding_box( + self, + cuboid: Cuboid, + voxel_size: float = 0.02, + ) -> Union[List[Cuboid], torch.Tensor]: + bounds = cuboid.dims + low = [-bounds[0], -bounds[1], -bounds[2]] + high = [bounds[0], bounds[1], bounds[2]] + trange = [h - l for l, h in zip(low, high)] + + x = torch.linspace( + -bounds[0], bounds[0], int(trange[0] // voxel_size), device=self.tensor_args.device + ) + y = torch.linspace( + -bounds[1], bounds[1], int(trange[1] // voxel_size), device=self.tensor_args.device + ) + z = 
torch.linspace(
+            -bounds[2], bounds[2], int(trange[2] // voxel_size), device=self.tensor_args.device
+        )
+        w, l, h = x.shape[0], y.shape[0], z.shape[0]
+        xyz = (
+            torch.stack(torch.meshgrid(x, y, z, indexing="ij")).permute((1, 2, 3, 0)).reshape(-1, 3)
+        )
+
+        pose = Pose.from_list(cuboid.pose, tensor_args=self.tensor_args)
+        xyz = pose.transform_points(xyz.contiguous())
+        r = torch.zeros_like(xyz[:, 0:1]) + voxel_size
+        xyzr = torch.cat([xyz, r], dim=1)
+        xyzr = xyzr.reshape(-1, 1, 1, 4)
+        collision_buffer = CollisionQueryBuffer()
+        collision_buffer.update_buffer_shape(
+            xyzr.shape,
+            self.tensor_args,
+            self.collision_types,
+        )
+        weight = self.tensor_args.to_device([1.0])
+        act_distance = self.tensor_args.to_device([0.0])
+
+        d_sph = self.get_sphere_collision(
+            xyzr,
+            collision_buffer,
+            weight,
+            act_distance,
+        )
+        d_sph = d_sph.reshape(-1)
+        xyzr = xyzr.reshape(-1, 4)
+        # get occupied voxels:
+        occupied = xyzr[d_sph > 0.0]
+        return occupied
+
+    def get_mesh_in_bounding_box(
+        self,
+        cuboid: Cuboid,
+        voxel_size: float = 0.02,
+    ) -> Mesh:
+        voxels = self.get_voxels_in_bounding_box(cuboid, voxel_size)
+        mesh = Mesh.from_pointcloud(
+            voxels[:, :3].detach().cpu().numpy(),
+            pitch=voxel_size,
+        )
+        return mesh
diff --git a/src/curobo/geom/sdf/world_blox.py b/src/curobo/geom/sdf/world_blox.py
new file mode 100644
index 00000000..460b0217
--- /dev/null
+++ b/src/curobo/geom/sdf/world_blox.py
@@ -0,0 +1,478 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+from typing import List, Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollisionConfig
+from curobo.geom.sdf.world_mesh import WorldMeshCollision
+from curobo.geom.types import Cuboid, Mesh, Sphere, SphereFitType, WorldConfig
+from curobo.types.camera import CameraObservation
+from curobo.types.math import Pose
+from curobo.util.logger import log_error, log_info, log_warn
+
+try:
+    # Third Party
+    from nvblox_torch.mapper import Mapper
+except ImportError:
+    log_warn("nvblox torch wrapper is not installed, loading abstract class")
+    # Standard Library
+    from abc import ABC as Mapper
+
+
+class WorldBloxCollision(WorldMeshCollision):
+    """World collision representation using NVIDIA's nvblox library.
+
+    This class depends on the pytorch wrapper for nvblox. Additionally, this
+    representation does not support batched environments, as we only store one world
+    via nvblox.
+
+    There are two ways to use nvblox: loading maps from disk, or creating maps online.
+    In both cases, we might load more than one map and need to check collisions against
+    all maps. To facilitate online map creation and updates, this class provides APIs
+    for adding camera frames to the map and for querying distances against the
+    resulting layers.
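+
+    Example (illustrative sketch; assumes ``nvblox_torch`` is installed, that a map was
+    previously saved to the hypothetical path "map.nvblx", and that
+    ``WorldConfig.from_dict`` accepts a ``blox`` section as shown)::
+
+        world_model = WorldConfig.from_dict(
+            {
+                "blox": {
+                    "world": {
+                        "pose": [0, 0, 0, 1, 0, 0, 0],
+                        "voxel_size": 0.02,
+                        "integrator_type": "tsdf",
+                        "map_path": "map.nvblx",
+                    }
+                }
+            }
+        )
+        config = WorldCollisionConfig(tensor_args=TensorDeviceType(), world_model=world_model)
+        coll_checker = WorldBloxCollision(config)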
+ """ + + def __init__(self, config: WorldCollisionConfig): + self._pose_offset = None + self._blox_mapper = None + self._blox_tensor_list = None + self._blox_voxel_sizes = [0.02] + super().__init__(config) + + def load_collision_model(self, world_model: WorldConfig): + # load nvblox mesh + if len(world_model.blox) > 0: + # check if there is a mapper instance: + for k in world_model.blox: + if k.mapper_instance is not None: + if self._blox_mapper is None: + self._blox_mapper = k.mapper_instance + else: + log_error("Only one blox mapper instance is supported") + if self._blox_mapper is None: + voxel_sizes = [] + integrator_types = [] + # get voxel sizes and integration types: + for k in world_model.blox: + voxel_sizes.append(k.voxel_size) + integrator_types.append(k.integrator_type) + # create a mapper instance: + self._blox_mapper = Mapper( + voxel_sizes=voxel_sizes, + integrator_types=integrator_types, + cuda_device_id=self.tensor_args.device.index, + free_on_destruction=False, + ) + self._blox_voxel_sizes = voxel_sizes + # load map from file if it exists: + + names = [] + pose = [] + + for i, k in enumerate(world_model.blox): + names.append(k.name) + pose.append(k.pose) + if k.map_path is not None: + log_info("loading nvblox map for layer: " + str(i) + " from " + k.map_path) + self._blox_mapper.load_from_file(k.map_path, i) + + # load buffers: + pose_tensor = torch.zeros( + (len(names), 8), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + enable_tensor = torch.ones( + (len(names)), device=self.tensor_args.device, dtype=torch.uint8 + ) + + pose_tensor[:, 3] = 1.0 + + pose_tensor[:, :7] = ( + Pose.from_batch_list(pose, tensor_args=self.tensor_args).inverse().get_pose_vector() + ) + self._blox_tensor_list = [pose_tensor, enable_tensor] + self._blox_names = names + self.collision_types["blox"] = True + + return super().load_collision_model(world_model) + + def clear_cache(self): + self._blox_mapper.clear() + super().clear_cache() + + def clear_blox_layer(self, layer_name: str): + index = self._blox_names.index(layer_name) + self._blox_mapper.clear(index) + + def _get_blox_sdf( + self, + query_spheres, + collision_query_buffer: CollisionQueryBuffer, + weight, + activation_distance, + ): + d = self._blox_mapper.query_sphere_sdf_cost( + query_spheres, + collision_query_buffer.blox_collision_buffer.distance_buffer, + collision_query_buffer.blox_collision_buffer.grad_distance_buffer, + collision_query_buffer.blox_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self._blox_tensor_list[0], + self._blox_tensor_list[1], + ) + return d + + def _get_blox_swept_sdf( + self, + query_spheres, + collision_query_buffer, + weight, + activation_distance, + speed_dt, + sweep_steps, + enable_speed_metric, + ): + d = self._blox_mapper.query_sphere_trajectory_sdf_cost( + query_spheres, + collision_query_buffer.blox_collision_buffer.distance_buffer, + collision_query_buffer.blox_collision_buffer.grad_distance_buffer, + collision_query_buffer.blox_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + speed_dt, + self._blox_tensor_list[0], + self._blox_tensor_list[1], + sweep_steps, + enable_speed_metric, + ) + return d + + def get_sphere_distance( + self, + query_sphere: torch.Tensor, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + if "blox" not in self.collision_types or not self.collision_types["blox"]: + 
return super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx, + return_loss, + ) + + d = self._get_blox_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + ) + + if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( + "mesh" not in self.collision_types or not self.collision_types["mesh"] + ): + return d + d_base = super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx, + return_loss, + ) + d = d + d_base + + return d + + def get_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx=None, + return_loss: bool = False, + ): + if "blox" not in self.collision_types or not self.collision_types["blox"]: + return super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx, + return_loss, + ) + + d = self._get_blox_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + ) + if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( + "mesh" not in self.collision_types or not self.collision_types["mesh"] + ): + return d + d_base = super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + env_query_idx, + return_loss, + ) + d = d + d_base + + return d + + def get_swept_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + if "blox" not in self.collision_types or not self.collision_types["blox"]: + return super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + speed_dt, + sweep_steps, + enable_speed_metric, + env_query_idx, + return_loss=return_loss, + ) + + d = self._get_blox_swept_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + speed_dt=speed_dt, + sweep_steps=sweep_steps, + enable_speed_metric=enable_speed_metric, + ) + + if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( + "mesh" not in self.collision_types or not self.collision_types["mesh"] + ): + return d + d_base = super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + speed_dt, + sweep_steps, + enable_speed_metric, + env_query_idx, + return_loss=return_loss, + ) + d = d + d_base + + return d + + def get_swept_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + sweep_steps, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + if "blox" not in self.collision_types or not self.collision_types["blox"]: + return super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + sweep_steps, + activation_distance, + speed_dt, + enable_speed_metric, + env_query_idx, + return_loss=return_loss, + ) + d = self._get_blox_swept_sdf( + query_sphere, + 
collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + speed_dt=speed_dt, + sweep_steps=sweep_steps, + enable_speed_metric=enable_speed_metric, + ) + + if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and ( + "mesh" not in self.collision_types or not self.collision_types["mesh"] + ): + return d + d_base = super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance, + speed_dt, + sweep_steps, + enable_speed_metric, + env_query_idx, + return_loss=return_loss, + ) + d = d + d_base + return d + + def enable_obstacle( + self, + name: str, + enable: bool = True, + env_idx: int = 0, + ): + if self._blox_names is not None and name in self._blox_names: + self.enable_blox(enable, name) + else: + super().enable_obstacle(name, enable, env_idx) + + def enable_blox(self, enable: bool = True, name: Optional[str] = None): + index = self._blox_names.index(name) + self._blox_tensor_list[1][index] = int(enable) + + def update_blox_pose( + self, + w_obj_pose: Optional[Pose] = None, + obj_w_pose: Optional[Pose] = None, + name: Optional[str] = None, + ): + obj_w_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose) + index = self._blox_names.index(name) + self._blox_tensor_list[0][index][:7] = obj_w_pose.get_pose_vector() + + def clear_bounding_box( + self, + cuboid: Cuboid, + layer_name: Optional[str] = None, + ): + raise NotImplementedError + + def get_bounding_spheres( + self, + bounding_box: Cuboid, + obstacle_name: Optional[str] = None, + n_spheres: int = 1, + surface_sphere_radius: float = 0.002, + fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE, + voxelize_method: str = "ray", + pre_transform_pose: Optional[Pose] = None, + clear_region: bool = False, + ) -> List[Sphere]: + mesh = self.get_mesh_in_bounding_box(bounding_box, obstacle_name, clear_region=clear_region) + + spheres = mesh.get_bounding_spheres( + n_spheres=n_spheres, + surface_sphere_radius=surface_sphere_radius, + fit_type=fit_type, + voxelize_method=voxelize_method, + pre_transform_pose=pre_transform_pose, + tensor_args=self.tensor_args, + ) + return spheres + + @profiler.record_function("world_blox/add_camera_frame") + def add_camera_frame(self, camera_observation: CameraObservation, layer_name: str): + index = self._blox_names.index(layer_name) + pose_mat = camera_observation.pose.get_matrix().view(4, 4) + if camera_observation.rgb_image is not None: + with profiler.record_function("world_blox/add_color_frame"): + self._blox_mapper.add_color_frame( + camera_observation.rgb_image, + pose_mat, + camera_observation.intrinsics, + mapper_id=index, + ) + + if camera_observation.depth_image is not None: + with profiler.record_function("world_blox/add_depth_frame"): + self._blox_mapper.add_depth_frame( + camera_observation.depth_image, + pose_mat, + camera_observation.intrinsics, + mapper_id=index, + ) + + def process_camera_frames(self, layer_name: Optional[str] = None, process_aux: bool = False): + self.update_blox_esdf(layer_name) + if process_aux: + self.update_blox_mesh(layer_name) + + @profiler.record_function("world_blox/update_hashes") + def update_blox_hashes(self): + self._blox_mapper.update_hashmaps() + + @profiler.record_function("world_blox/update_esdf") + def update_blox_esdf(self, layer_name: Optional[str] = None): + index = -1 + if layer_name is not None: + index = self._blox_names.index(layer_name) + self._blox_mapper.update_esdf(index) + + @profiler.record_function("world_blox/update_mesh") + 
def update_blox_mesh(self, layer_name: Optional[str] = None): + index = -1 + if layer_name is not None: + index = self._blox_names.index(layer_name) + self._blox_mapper.update_mesh(index) + + @profiler.record_function("world_blox/get_mesh") + def get_mesh_from_blox_layer(self, layer_name: str, mode: str = "nvblox") -> Mesh: + index = self._blox_names.index(layer_name) + if mode == "nvblox": + mesh_data = self._blox_mapper.get_mesh(index) + mesh = Mesh( + name=self._blox_names[index], + pose=self._blox_tensor_list[0][index, :7].squeeze().cpu().tolist(), + vertices=mesh_data["vertices"].tolist(), + faces=mesh_data["triangles"].tolist(), + vertex_colors=mesh_data["colors"], + vertex_normals=mesh_data["normals"], + ) + elif mode == "voxel": + # query points using collision checker and use trimesh to create a mesh: + mesh = self.get_mesh_in_bounding_box( + Cuboid("test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.5, 1.5, 1]), + voxel_size=0.03, + ) + return mesh + + def decay_layer(self, layer_name: str): + index = self._blox_names.index(layer_name) + self._blox_mapper.decay_occupancy(mapper_id=index) diff --git a/src/curobo/geom/sdf/world_mesh.py b/src/curobo/geom/sdf/world_mesh.py new file mode 100644 index 00000000..5d8035e1 --- /dev/null +++ b/src/curobo/geom/sdf/world_mesh.py @@ -0,0 +1,599 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from dataclasses import dataclass +from typing import List, Optional + +# Third Party +import numpy as np +import torch +import warp as wp + +# CuRobo +from curobo.geom.sdf.warp_primitives import SdfMeshWarpPy, SweptSdfMeshWarpPy +from curobo.geom.sdf.world import ( + CollisionQueryBuffer, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.types import Mesh, WorldConfig +from curobo.types.math import Pose +from curobo.util.logger import log_error, log_info, log_warn +from curobo.util.warp import init_warp + + +@dataclass(frozen=True) +class WarpMeshData: + name: str + m_id: int + vertices: wp.array + faces: wp.array + mesh: wp.Mesh + + +class WorldMeshCollision(WorldPrimitiveCollision): + """World Mesh Collision using Nvidia's warp library + + This currently requires passing int64 array from torch to warp which is only + available when compiled from source. 
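
+    Example (illustrative sketch; assumes a mesh file exists at the hypothetical path
+    "table.obj" and that ``TensorDeviceType`` is imported from ``curobo.types.base``)::
+
+        world_model = WorldConfig(
+            mesh=[Mesh(name="table", file_path="table.obj", pose=[0, 0, 0, 1, 0, 0, 0])]
+        )
+        config = WorldCollisionConfig(tensor_args=TensorDeviceType(), world_model=world_model)
+        coll_checker = WorldMeshCollision(config)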
+ """ + + def __init__(self, config: WorldCollisionConfig): + # WorldCollision.(self) + init_warp() + + self.tensor_args = config.tensor_args + + self._env_n_mesh = None + self._mesh_tensor_list = None + self._env_mesh_names = None + self._wp_device = wp.torch.device_from_torch(self.tensor_args.device) + self._wp_mesh_cache = {} # stores warp meshes across environments + + super().__init__(config) + + def _init_cache(self): + if ( + self.cache is not None + and "mesh" in self.cache + and (not self.cache["mesh"] in [None, 0]) + ): + self._create_mesh_cache(self.cache["mesh"]) + + return super()._init_cache() + + def load_collision_model( + self, world_model: WorldConfig, env_idx: int = 0, load_obb_obs: bool = True + ): + max_nmesh = len(world_model.mesh) + if max_nmesh > 0: + if self._mesh_tensor_list is None or self._mesh_tensor_list[0].shape[1] < max_nmesh: + log_warn("Creating new Mesh cache: " + str(max_nmesh)) + self._create_mesh_cache(max_nmesh) + + # load all meshes as a batch: + name_list, w_mid, w_inv_pose = self._load_batch_mesh_to_warp(world_model.mesh) + self._mesh_tensor_list[0][env_idx, :max_nmesh] = w_mid + self._mesh_tensor_list[1][env_idx, :max_nmesh, :7] = w_inv_pose + self._mesh_tensor_list[2][env_idx, :max_nmesh] = 1 + + self._env_mesh_names[env_idx][:max_nmesh] = name_list + self._env_n_mesh[env_idx] = max_nmesh + + self.collision_types["mesh"] = True + if load_obb_obs: + super().load_collision_model(world_model, env_idx) + else: + self.world_model = world_model + + def load_batch_collision_model(self, world_config_list: List[WorldConfig]): + max_nmesh = max([len(x.mesh) for x in world_config_list]) + if self._mesh_tensor_list is None or self._mesh_tensor_list[0].shape[1] < max_nmesh: + log_info("Creating new Mesh cache: " + str(max_nmesh)) + self._create_mesh_cache(max_nmesh) + + for env_idx, world_model in enumerate(world_config_list): + self.load_collision_model(world_model, env_idx=env_idx, load_obb_obs=False) + super().load_batch_collision_model(world_config_list) + + def _load_mesh_to_warp(self, mesh: Mesh): + verts, faces = mesh.get_mesh_data() + v = wp.array(verts, dtype=wp.vec3, device=self._wp_device) + f = wp.array(np.ravel(faces), dtype=int, device=self._wp_device) + new_mesh = wp.Mesh(points=v, indices=f) + return WarpMeshData(mesh.name, new_mesh.id, v, f, new_mesh) + + def _load_mesh_into_cache(self, mesh: Mesh) -> WarpMeshData: + # + if mesh.name not in self._wp_mesh_cache: + # load mesh into cache: + self._wp_mesh_cache[mesh.name] = self._load_mesh_to_warp(mesh) + # return self._wp_mesh_cache[mesh.name] + else: + log_info("Object already in warp cache, using existing instance: " + mesh.name) + return self._wp_mesh_cache[mesh.name] + + def _load_batch_mesh_to_warp(self, mesh_list: List[Mesh]): + # First load all verts and faces: + name_list = [] + pose_list = [] + id_list = torch.zeros((len(mesh_list)), device=self.tensor_args.device, dtype=torch.int64) + for i, m_idx in enumerate(mesh_list): + m_data = self._load_mesh_into_cache(m_idx) + pose_list.append(m_idx.pose) + + id_list[i] = m_data.m_id + name_list.append(m_data.name) + pose_buffer = Pose.from_batch_list(pose_list, self.tensor_args) + inv_pose_buffer = pose_buffer.inverse() + return name_list, id_list, inv_pose_buffer.get_pose_vector() + + def add_mesh(self, new_mesh: Mesh, env_idx: int = 0): + if self._env_n_mesh[env_idx] >= self._mesh_tensor_list[0].shape[1]: + log_error( + "Cannot add new mesh as we are at mesh cache limit, increase cache limit in WorldMeshCollision" + ) + return + + 
wp_mesh_data = self._load_mesh_into_cache(new_mesh)
+
+        # get mesh pose:
+        w_obj_pose = Pose.from_list(new_mesh.pose, self.tensor_args)
+        # add loaded mesh into scene:
+
+        curr_idx = self._env_n_mesh[env_idx]
+        self._mesh_tensor_list[0][env_idx, curr_idx] = wp_mesh_data.m_id
+        self._mesh_tensor_list[1][env_idx, curr_idx, :7] = w_obj_pose.inverse().get_pose_vector()
+        self._mesh_tensor_list[2][env_idx, curr_idx] = 1
+        self._env_mesh_names[env_idx][curr_idx] = wp_mesh_data.name
+        self._env_n_mesh[env_idx] = curr_idx + 1
+
+    def get_mesh_idx(
+        self,
+        name: str,
+        env_idx: int = 0,
+    ) -> int:
+        if name not in self._env_mesh_names[env_idx]:
+            log_error("Obstacle with name: " + name + " not found in current world", exc_info=True)
+        return self._env_mesh_names[env_idx].index(name)
+
+    def create_collision_cache(self, mesh_cache=None, obb_cache=None, n_envs=None):
+        if n_envs is not None:
+            self.n_envs = n_envs
+        if mesh_cache is not None:
+            self._create_mesh_cache(mesh_cache)
+        if obb_cache is not None:
+            self._create_obb_cache(obb_cache)
+
+    def _create_mesh_cache(self, mesh_cache):
+        # create cache to store meshes, mesh poses and inverse poses
+
+        self._env_n_mesh = torch.zeros(
+            (self.n_envs), device=self.tensor_args.device, dtype=torch.int32
+        )
+
+        obs_enable = torch.zeros(
+            (self.n_envs, mesh_cache), dtype=torch.uint8, device=self.tensor_args.device
+        )
+        obs_inverse_pose = torch.zeros(
+            (self.n_envs, mesh_cache, 8),
+            dtype=self.tensor_args.dtype,
+            device=self.tensor_args.device,
+        )
+        obs_ids = torch.zeros(
+            (self.n_envs, mesh_cache), device=self.tensor_args.device, dtype=torch.int64
+        )
+        # warp requires uint64 for mesh indices, supports conversion from int64 to uint64
+        self._mesh_tensor_list = [
+            obs_ids,
+            obs_inverse_pose,
+            obs_enable,
+        ]  # 0=mesh idx, 1=pose, 2=mesh enable
+        self.collision_types["mesh"] = True  # TODO: enable this after loading first mesh
+        self._env_mesh_names = [[None for _ in range(mesh_cache)] for _ in range(self.n_envs)]
+
+        self._wp_mesh_cache = {}
+
+    def update_mesh_pose(
+        self,
+        w_obj_pose: Optional[Pose] = None,
+        obj_w_pose: Optional[Pose] = None,
+        name: Optional[str] = None,
+        env_obj_idx: Optional[torch.Tensor] = None,
+        env_idx: int = 0,
+    ):
+        w_inv_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose)
+
+        if name is not None:
+            obs_idx = self.get_mesh_idx(name, env_idx)
+            self._mesh_tensor_list[1][env_idx, obs_idx, :7] = w_inv_pose.get_pose_vector()
+        elif env_obj_idx is not None:
+            self._mesh_tensor_list[1][env_idx, env_obj_idx, :7] = w_inv_pose.get_pose_vector()
+        else:
+            raise ValueError("name or env_obj_idx needs to be given to update mesh pose")
+
+    def update_all_mesh_pose(
+        self,
+        w_obj_pose: Optional[Pose] = None,
+        obj_w_pose: Optional[Pose] = None,
+        name: Optional[List[str]] = None,
+        env_obj_idx: Optional[torch.Tensor] = None,
+        env_idx: int = 0,
+    ):
+        """Update poses for a list of meshes in the same environment.
+
+        Args:
+            w_obj_pose (Optional[Pose], optional): Poses of the objects in the world frame. Defaults to None.
+            obj_w_pose (Optional[Pose], optional): Inverse poses of the objects. Defaults to None.
+            name (Optional[List[str]], optional): Names of the meshes to update. Defaults to None.
+            env_obj_idx (Optional[torch.Tensor], optional): Indices of the meshes; used instead of names when given. Defaults to None.
+            env_idx (int, optional): Environment index. Defaults to 0.
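+
+        Note:
+            This method currently raises ``NotImplementedError``. A workaround sketch
+            (an assumption, not part of this API) is to update one mesh at a time,
+            where ``mesh_names`` and ``mesh_poses`` are hypothetical user variables::
+
+                for mesh_name, mesh_pose in zip(mesh_names, mesh_poses):
+                    self.update_mesh_pose(w_obj_pose=mesh_pose, name=mesh_name)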
+ """ + w_inv_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose) + raise NotImplementedError + + def update_mesh_pose_env( + self, + w_obj_pose: Optional[Pose] = None, + obj_w_pose: Optional[Pose] = None, + name: Optional[str] = None, + env_obj_idx: Optional[torch.Tensor] = None, + env_idx: List[int] = [0], + ): + """Update pose of a single object in a list of environments + + Args: + w_obj_pose (Optional[Pose], optional): _description_. Defaults to None. + obj_w_pose (Optional[Pose], optional): _description_. Defaults to None. + name (Optional[List[str]], optional): _description_. Defaults to None. + env_obj_idx (Optional[torch.Tensor], optional): _description_. Defaults to None. + env_idx (List[int], optional): _description_. Defaults to [0]. + """ + w_inv_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose) + # collect index of mesh across environments: + # index_tensor = torch.zeros((1, len(env_idx)), dtype=torch.long, device=self.tensor_args.device) + + # for i, e in enumerate[env_idx]: + # index_tensor[0,i] = self.get_mesh_idx(name, e) + raise NotImplementedError + # self._mesh_tensor_list[1][env_idx, obj_idx] + + def update_mesh_from_warp( + self, + warp_mesh_idx: int, + w_obj_pose: Optional[Pose] = None, + obj_w_pose: Optional[Pose] = None, + obj_idx: int = 0, + env_idx: int = 0, + name: Optional[str] = None, + ): + if name is not None: + obj_idx = self.get_mesh_idx(name, env_idx) + + if obj_idx >= self._mesh_tensor_list[0][env_idx].shape[0]: + log_error("Out of cache memory") + w_inv_pose = self._get_obstacle_poses(w_obj_pose, obj_w_pose) + + self._mesh_tensor_list[0][env_idx, obj_idx] = warp_mesh_idx + self._mesh_tensor_list[1][env_idx, obj_idx] = w_inv_pose + self._mesh_tensor_list[2][env_idx, obj_idx] = 1 + self._env_mesh_names[env_idx][obj_idx] = name + if self._env_n_mesh[env_idx] <= obj_idx: + self._env_n_mesh[env_idx] = obj_idx + 1 + + def update_obstacle_pose( + self, + name: str, + w_obj_pose: Pose, + env_idx: int = 0, + ): + if self._env_mesh_names is not None and name in self._env_mesh_names[env_idx]: + self.update_mesh_pose(name=name, w_obj_pose=w_obj_pose, env_idx=env_idx) + elif self._env_obbs_names is not None and name in self._env_obbs_names[env_idx]: + self.update_obb_pose(name=name, w_obj_pose=w_obj_pose, env_idx=env_idx) + else: + log_error("obstacle not found in OBB world model: " + name) + + def enable_obstacle( + self, + name: str, + enable: bool = True, + env_idx: int = 0, + ): + if self._env_mesh_names is not None and name in self._env_mesh_names[env_idx]: + self.enable_mesh(enable, name, None, env_idx) + elif self._env_obbs_names is not None and name in self._env_obbs_names[env_idx]: + self.enable_obb(enable, name, None, env_idx) + else: + log_error("Obstacle not found in world model: " + name) + self.world_model.objects + + def enable_mesh( + self, + enable: bool = True, + name: Optional[str] = None, + env_mesh_idx: Optional[torch.Tensor] = None, + env_idx: int = 0, + ): + """Update obstacle dimensions + + Args: + obj_dims (torch.Tensor): [dim.x,dim.y, dim.z], give as [b,3] + obj_idx (torch.Tensor or int): + + """ + if env_mesh_idx is not None: + self._mesh_tensor_list[2][env_mesh_idx] = int(enable) # enable == 1 + else: + # find index of given name: + obs_idx = self.get_mesh_idx(name, env_idx) + self._mesh_tensor_list[2][env_idx, obs_idx] = int(enable) + + def _get_sdf( + self, + query_spheres, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx=None, + return_loss=False, + 
): + d = SdfMeshWarpPy.apply( + query_spheres, + collision_query_buffer.mesh_collision_buffer.distance_buffer, + collision_query_buffer.mesh_collision_buffer.grad_distance_buffer, + collision_query_buffer.mesh_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + self._mesh_tensor_list[0], + self._mesh_tensor_list[1], + self._mesh_tensor_list[2], + self._env_n_mesh, + self.max_distance, + env_query_idx, + return_loss, + ) + return d + + def _get_swept_sdf( + self, + query_spheres, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx=None, + return_loss: bool = False, + ): + d = SweptSdfMeshWarpPy.apply( + query_spheres, + collision_query_buffer.mesh_collision_buffer.distance_buffer, + collision_query_buffer.mesh_collision_buffer.grad_distance_buffer, + collision_query_buffer.mesh_collision_buffer.sparsity_index_buffer, + weight, + activation_distance, + speed_dt, + self._mesh_tensor_list[0], + self._mesh_tensor_list[1], + self._mesh_tensor_list[2], + self._env_n_mesh, + sweep_steps, + enable_speed_metric, + self.max_distance, + env_query_idx, + return_loss, + ) + return d + + def get_sphere_distance( + self, + query_sphere: torch.Tensor, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + # TODO: if no mesh object exist, call primitive + if "mesh" not in self.collision_types or not self.collision_types["mesh"]: + return super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + + d = self._get_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + return d + d_prim = super().get_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + d_val = d.view(d_prim.shape) + d_prim + return d_val + + def get_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + activation_distance: torch.Tensor, + env_query_idx=None, + return_loss=False, + ): + if "mesh" not in self.collision_types or not self.collision_types["mesh"]: + return super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + + d = self._get_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + return d + + d_prim = super().get_sphere_collision( + query_sphere, + collision_query_buffer, + weight, + activation_distance=activation_distance, + env_query_idx=env_query_idx, + return_loss=return_loss, + ) + d_val = d.view(d_prim.shape) + d_prim + + return d_val + + def get_swept_sphere_distance( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: 
torch.Tensor, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + sweep_steps: int, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + # log_warn("Swept: Mesh + Primitive Collision Checking is experimental") + if "mesh" not in self.collision_types or not self.collision_types["mesh"]: + return super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + + d = self._get_swept_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + return d + + d_prim = super().get_swept_sphere_distance( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + d_val = d.view(d_prim.shape) + d_prim + + return d_val + + def get_swept_sphere_collision( + self, + query_sphere, + collision_query_buffer: CollisionQueryBuffer, + weight: torch.Tensor, + sweep_steps, + activation_distance: torch.Tensor, + speed_dt: torch.Tensor, + enable_speed_metric=False, + env_query_idx: Optional[torch.Tensor] = None, + return_loss: bool = False, + ): + if "mesh" not in self.collision_types or not self.collision_types["mesh"]: + return super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + d = self._get_swept_sdf( + query_sphere, + collision_query_buffer, + weight=weight, + activation_distance=activation_distance, + speed_dt=speed_dt, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + if "primitive" not in self.collision_types or not self.collision_types["primitive"]: + return d + + d_prim = super().get_swept_sphere_collision( + query_sphere, + collision_query_buffer, + weight=weight, + env_query_idx=env_query_idx, + sweep_steps=sweep_steps, + activation_distance=activation_distance, + speed_dt=speed_dt, + enable_speed_metric=enable_speed_metric, + return_loss=return_loss, + ) + d_val = d.view(d_prim.shape) + d_prim + + return d_val + + def clear_cache(self): + self._wp_mesh_cache = {} + if self._mesh_tensor_list is not None: + self._mesh_tensor_list[2][:] = 0 + super().clear_cache() diff --git a/src/curobo/geom/sphere_fit.py b/src/curobo/geom/sphere_fit.py new file mode 100644 index 00000000..d8aa65df --- /dev/null +++ b/src/curobo/geom/sphere_fit.py @@ -0,0 +1,239 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+from enum import Enum
+from typing import List, Tuple
+
+# Third Party
+import numpy as np
+import torch
+import trimesh
+from trimesh.voxel.creation import voxelize
+
+# CuRobo
+from curobo.util.logger import log_warn
+
+
+class SphereFitType(Enum):
+    """Supported sphere fit types are listed here. VOXEL_VOLUME_SAMPLE_SURFACE works best.
+
+    See :ref:`attach_object_note` for more details.
+    """
+
+    #: samples the surface of the mesh to approximate geometry
+    SAMPLE_SURFACE = "sample_surface"
+    #: voxelizes the volume and returns voxel positions that are intersecting with surface.
+    VOXEL_SURFACE = "voxel_surface"
+    #: voxelizes the volume and returns all occupied voxel positions.
+    VOXEL_VOLUME = "voxel_volume"
+    #: voxelizes the volume and returns voxel positions that are inside the surface of the geometry
+    VOXEL_VOLUME_INSIDE = "voxel_volume_inside"
+    #: voxelizes the volume and returns voxel positions that are inside the surface,
+    #: along with surface sampled positions
+    VOXEL_VOLUME_SAMPLE_SURFACE = "voxel_volume_sample_surface"
+
+
+def sample_even_fit_mesh(mesh: trimesh.Trimesh, n_spheres: int, sphere_radius: float):
+    n_pts = trimesh.sample.sample_surface_even(mesh, n_spheres)[0]
+    n_radius = [sphere_radius for _ in range(len(n_pts))]
+    return n_pts, n_radius
+
+
+def get_voxel_pitch(mesh: trimesh.Trimesh, n_cubes: int):
+    d = mesh.extents
+    cube_volume = d[0] * d[1] * d[2]
+    v = mesh.volume
+    if v > 0:
+        occupancy = 1.0 - ((cube_volume - v) / cube_volume)
+    else:
+        log_warn("sphere_fit: occupancy test failed, assuming cuboid volume")
+        occupancy = 1.0
+
+    # given the occupied volume, find the pitch that yields the requested number of voxels:
+    pitch = (occupancy * cube_volume / n_cubes) ** (1 / 3)
+    return pitch
+
+
+def voxel_fit_surface_mesh(
+    mesh: trimesh.Trimesh,
+    n_spheres: int,
+    sphere_radius: float,
+    voxelize_method: str = "ray",
+):
+    pts, rad = get_voxelgrid_from_mesh(mesh, n_spheres, voxelize_method)
+    if pts is None:
+        return pts, rad
+    pr = trimesh.proximity.ProximityQuery(mesh)
+    if len(pts) <= n_spheres:
+        surface_pts, _, _ = pr.on_surface(pts)
+        n_radius = [sphere_radius for _ in range(len(surface_pts))]
+        return surface_pts, n_radius
+
+    # compute signed distance:
+    dist = pr.signed_distance(pts)
+
+    # calculate distance to boundary:
+    dist = np.abs(dist - rad)
+    # get the first n points closest to boundary:
+    _, idx = torch.topk(torch.as_tensor(dist), k=n_spheres, largest=False)
+
+    n_pts = pts[idx]
+    n_radius = [sphere_radius for _ in range(len(n_pts))]
+    return n_pts, n_radius
+
+
+def get_voxelgrid_from_mesh(mesh: trimesh.Trimesh, n_spheres: int, voxelize_method: str = "ray"):
+    pitch = get_voxel_pitch(mesh, n_spheres)
+    radius = pitch / 2.0
+    try:
+        voxel = voxelize(mesh, pitch, voxelize_method)
+        voxel = voxel.fill("base")
+        pts = voxel.points
+        rad = np.ravel([radius for _ in range(len(pts))])
+    except Exception:
+        log_warn("voxelization failed")
+        pts = rad = None
+    return pts, rad
+
+
+def voxel_fit_mesh(
+    mesh: trimesh.Trimesh,
+    n_spheres: int,
+    surface_sphere_radius: float,
+    voxelize_method: str = "ray",
+):
+    pts, rad = get_voxelgrid_from_mesh(mesh, n_spheres, voxelize_method)
+    if pts is None:
+        return pts, rad
+    # compute signed distance:
+    pr = trimesh.proximity.ProximityQuery(mesh)
+    dist = pr.signed_distance(pts)
+
+    # calculate distance to boundary:
+    dist = dist - rad
+
+    # all negative values are outside the mesh:
+    idx = dist < 0.0
+    surface_pts, _, _ = pr.on_surface(pts[idx])
+    pts[idx] = surface_pts
+    rad[idx] = surface_sphere_radius
+    if len(pts) > n_spheres:
+        # remove some surface pts:
+        inside_idx = dist >= 0.0
+        inside_pts = pts[inside_idx]
+        if len(inside_pts) < n_spheres:
+            new_pts = np.zeros((n_spheres, 3))
+            new_pts[: len(inside_pts)] = inside_pts
+            new_radius = np.zeros(n_spheres)
+            new_radius[: len(inside_pts)] = rad[inside_idx]
+
+            new_pts[len(inside_pts) :] = surface_pts[: n_spheres - len(inside_pts)]
+            new_radius[len(inside_pts) :] = surface_sphere_radius
+            pts = new_pts
+            rad = new_radius
+
+    n_pts = pts
+    n_radius = rad.tolist()
+
+    return n_pts, n_radius
+
+
+def voxel_fit_volume_sample_surface_mesh(
+    mesh: trimesh.Trimesh,
+    n_spheres: int,
+    surface_sphere_radius: float,
+    voxelize_method: str = "ray",
+):
+    pts, rad = voxel_fit_volume_inside_mesh(mesh, 0.75 * n_spheres, voxelize_method)
+    if pts is None:
+        return pts, rad
+    # compute surface points:
+    if len(pts) >= n_spheres:
+        return pts, rad
+
+    sample_count = n_spheres - (len(pts))
+
+    surface_sample_pts, sample_radius = sample_even_fit_mesh(
+        mesh, sample_count, surface_sphere_radius
+    )
+    pts = np.concatenate([pts, surface_sample_pts])
+    rad = np.concatenate([rad, sample_radius])
+    return pts, rad
+
+
+def voxel_fit_volume_inside_mesh(
+    mesh: trimesh.Trimesh,
+    n_spheres: int,
+    voxelize_method: str = "ray",
+):
+    pts, rad = get_voxelgrid_from_mesh(mesh, 2 * n_spheres, voxelize_method)
+    if pts is None:
+        return pts, rad
+    # compute signed distance:
+    pr = trimesh.proximity.ProximityQuery(mesh)
+    dist = pr.signed_distance(pts)
+
+    # calculate distance to boundary:
+    dist = dist - rad
+    # all positive values are inside the mesh:
+    idx = dist > 0.0
+    n_pts = pts[idx]
+    n_radius = rad[idx].tolist()
+    return n_pts, n_radius
+
+
+def fit_spheres_to_mesh(
+    mesh: trimesh.Trimesh,
+    n_spheres: int,
+    surface_sphere_radius: float = 0.01,
+    fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+    voxelize_method: str = "ray",
+) -> Tuple[np.ndarray, List[float]]:
+    """Approximate a mesh with spheres. See :ref:`attach_object_note` for more details.
+
+    Args:
+        mesh: Input trimesh to approximate.
+        n_spheres: Number of spheres to fit.
+        surface_sphere_radius: Radius for spheres sampled on the mesh surface. Defaults to 0.01.
+        fit_type: Sphere fit algorithm to use. Defaults to SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE.
+        voxelize_method: Voxelization method to pass to trimesh. Defaults to "ray".
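+
+    Example (illustrative sketch; uses a ``trimesh`` box primitive as a stand-in for a
+    real mesh)::
+
+        import trimesh
+
+        mesh = trimesh.creation.box(extents=[0.1, 0.1, 0.1])
+        pts, radii = fit_spheres_to_mesh(mesh, n_spheres=50)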
+
+    Returns:
+        Tuple of sphere positions as an (n, 3) array and a list of sphere radii.
+    """
+    n_pts = n_radius = None
+    if fit_type == SphereFitType.SAMPLE_SURFACE:
+        n_pts, n_radius = sample_even_fit_mesh(mesh, n_spheres, surface_sphere_radius)
+    elif fit_type == SphereFitType.VOXEL_SURFACE:
+        n_pts, n_radius = voxel_fit_surface_mesh(
+            mesh, n_spheres, surface_sphere_radius, voxelize_method
+        )
+    elif fit_type == SphereFitType.VOXEL_VOLUME_INSIDE:
+        n_pts, n_radius = voxel_fit_volume_inside_mesh(mesh, n_spheres, voxelize_method)
+    elif fit_type == SphereFitType.VOXEL_VOLUME:
+        n_pts, n_radius = voxel_fit_mesh(mesh, n_spheres, surface_sphere_radius, voxelize_method)
+    elif fit_type == SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE:
+        n_pts, n_radius = voxel_fit_volume_sample_surface_mesh(
+            mesh, n_spheres, surface_sphere_radius, voxelize_method
+        )
+    if n_pts is None or len(n_pts) < 1:
+        log_warn("sphere_fit: falling back to sampling surface")
+        n_pts, n_radius = sample_even_fit_mesh(mesh, n_spheres, surface_sphere_radius)
+
+    if n_pts is not None and len(n_pts) > n_spheres:
+        samples = torch.as_tensor(n_pts)
+        dist = torch.linalg.norm(samples - torch.mean(samples, dim=-1).unsqueeze(1), dim=-1)
+        _, knn_i = dist.topk(n_spheres, largest=True)
+        n_pts = samples[knn_i].cpu().numpy()
+        n_radius = np.ravel(n_radius)[knn_i.cpu().flatten().tolist()].tolist()
+    return n_pts, n_radius
diff --git a/src/curobo/geom/transform.py b/src/curobo/geom/transform.py
new file mode 100644
index 00000000..606358c7
--- /dev/null
+++ b/src/curobo/geom/transform.py
@@ -0,0 +1,1209 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# Standard Library
+from typing import Optional
+
+# Third Party
+import torch
+import warp as wp
+
+# CuRobo
+from curobo.curobolib.kinematics import rotation_matrix_to_quaternion
+from curobo.util.logger import log_error
+from curobo.util.warp import init_warp
+
+
+def transform_points(
+    position, quaternion, points, out_points=None, out_gp=None, out_gq=None, out_gpt=None
+):
+    if out_points is None:
+        out_points = torch.zeros((points.shape[0], 3), device=points.device, dtype=points.dtype)
+    if out_gp is None:
+        out_gp = torch.zeros((position.shape[0], 3), device=position.device)
+    if out_gq is None:
+        out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device)
+    if out_gpt is None:
+        out_gpt = torch.zeros((points.shape[0], 3), device=position.device)
+    out_points = TransformPoint.apply(
+        position, quaternion, points, out_points, out_gp, out_gq, out_gpt
+    )
+    return out_points
+
+
+def batch_transform_points(
+    position, quaternion, points, out_points=None, out_gp=None, out_gq=None, out_gpt=None
+):
+    if out_points is None:
+        out_points = torch.zeros(
+            (points.shape[0], points.shape[1], 3), device=points.device, dtype=points.dtype
+        )
+    if out_gp is None:
+        out_gp = torch.zeros((position.shape[0], 3), device=position.device)
+    if out_gq is None:
+        out_gq = torch.zeros((quaternion.shape[0], 4), device=quaternion.device)
+    if out_gpt is None:
+        out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=position.device)
+    out_points = BatchTransformPoint.apply(
+        position, quaternion, points, out_points, out_gp, out_gq, out_gpt
+    )
+    return out_points
+
+
+@torch.jit.script
+def get_inv_transform(w_rot_c, w_trans_c):
+    # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor]
+    c_rot_w = w_rot_c.transpose(-1, -2)
+    c_trans_w = -1.0 * (c_rot_w @ w_trans_c.unsqueeze(-1)).squeeze(-1)
+    return c_rot_w, c_trans_w
+
+
+@torch.jit.script
+def transform_point_inverse(point, rot, trans):
+    # type: (Tensor, Tensor, Tensor) -> Tensor
+
+    # new_point = (rot @ (point).unsqueeze(-1)).squeeze(-1) + trans
+    n_rot, n_trans = get_inv_transform(rot, trans)
+    new_point = (point @ n_rot.transpose(-1, -2)) + n_trans
+    return new_point
+
+
+def matrix_to_quaternion(matrix, out_quat=None, adj_matrix=None):
+    matrix = matrix.view(-1, 3, 3)
+    out_quat = MatrixToQuaternion.apply(matrix, out_quat, adj_matrix)
+    # out_quat = cuda_matrix_to_quaternion(matrix)
+    return out_quat
+
+
+def cuda_matrix_to_quaternion(matrix):
+    """
+    Convert rotations given as rotation matrices to quaternions.
+
+    Args:
+        matrix: Rotation matrices as tensor of shape (..., 3, 3).
+
+    Returns:
+        quaternions with real part first, as tensor of shape (..., 4): [qw, qx, qy, qz].
+    """
+    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+
+    # account for different batch shapes here:
+    in_shape = matrix.shape
+    mat_in = matrix.view(-1, 3, 3)
+
+    out_quat = torch.zeros((mat_in.shape[0], 4), device=matrix.device, dtype=matrix.dtype)
+    out_quat = rotation_matrix_to_quaternion(mat_in, out_quat)
+    out_shape = list(in_shape[:-2]) + [4]
+    out_quat = out_quat.view(out_shape)
+    return out_quat
+
+
+def quaternion_to_matrix(quaternions, out_mat=None, adj_quaternion=None):
+    # return torch_quaternion_to_matrix(quaternions)
+    out_mat = QuatToMatrix.apply(quaternions, out_mat, adj_quaternion)
+    return out_mat
+
+
+def torch_quaternion_to_matrix(quaternions):
+    """
+    Convert rotations given as quaternions to rotation matrices.
+ + Args: + quaternions: quaternions with real part first, + as tensor of shape (..., 4). + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + + quaternions = torch.as_tensor(quaternions) + r, i, j, k = torch.unbind(quaternions, -1) + two_s = 2.0 / (quaternions * quaternions).sum(-1) + + o = torch.stack( + ( + 1 - two_s * (j * j + k * k), + two_s * (i * j - k * r), + two_s * (i * k + j * r), + two_s * (i * j + k * r), + 1 - two_s * (i * i + k * k), + two_s * (j * k - i * r), + two_s * (i * k - j * r), + two_s * (j * k + i * r), + 1 - two_s * (i * i + j * j), + ), + -1, + ) + return o.reshape(quaternions.shape[:-1] + (3, 3)) + + +def pose_to_matrix( + position: torch.Tensor, quaternion: torch.Tensor, out_matrix: Optional[torch.Tensor] = None +): + if out_matrix is None: + if len(position.shape) == 2: + out_matrix = torch.zeros( + (position.shape[0], 4, 4), device=position.device, dtype=position.dtype + ) + else: + out_matrix = torch.zeros( + (position.shape[0], position.shape[1], 4, 4), + device=position.device, + dtype=position.dtype, + ) + out_matrix[..., 3, 3] = 1.0 + out_matrix[..., :3, 3] = position + out_matrix[..., :3, :3] = quaternion_to_matrix(quaternion) + return out_matrix + + +def pose_multiply( + position, + quaternion, + position2, + quaternion2, + out_position=None, + out_quaternion=None, + adj_pos=None, + adj_quat=None, + adj_pos2=None, + adj_quat2=None, +): + if position.shape == position2.shape: + out_position, out_quaternion = BatchTransformPose.apply( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_pos, + adj_quat, + adj_pos2, + adj_quat2, + ) + elif position.shape[0] == 1 and position2.shape[0] > 1: + out_position, out_quaternion = TransformPose.apply( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_pos, + adj_quat, + adj_pos2, + adj_quat2, + ) + else: + log_error("shapes not supported") + + return out_position, out_quaternion + + +def pose_inverse( + position, + quaternion, + out_position=None, + out_quaternion=None, + adj_pos=None, + adj_quat=None, +): + out_position, out_quaternion = PoseInverse.apply( + position, + quaternion, + out_position, + out_quaternion, + adj_pos, + adj_quat, + ) + + return out_position, out_quaternion + + +@wp.kernel +def compute_transform_point( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + pt: wp.array(dtype=wp.vec3), + n_pts: wp.int32, + n_poses: wp.int32, + out_pt: wp.array(dtype=wp.vec3), +): # given n,3 points and b poses, get b,n,3 transformed points + # we tile as + tid = wp.tid() + b_idx = tid / (n_pts) + p_idx = tid - (b_idx * n_pts) + + # read data: + + in_position = position[b_idx] + in_quat = quat[b_idx] + in_pt = pt[p_idx] + + # read point + # create a transform from a vector/quaternion: + t = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + + # transform a point + p = wp.transform_point(t, in_pt) + + # write pt: + out_pt[b_idx * n_pts + p_idx] = p + + +@wp.kernel +def compute_batch_transform_point( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + pt: wp.array(dtype=wp.vec3), + n_pts: wp.int32, + n_poses: wp.int32, + out_pt: wp.array(dtype=wp.vec3), +): # given n,3 points and b poses, get b,n,3 transformed points + # we tile as + tid = wp.tid() + b_idx = tid / (n_pts) + p_idx = tid - (b_idx * n_pts) + + # read data: + + in_position = position[b_idx] + in_quat = quat[b_idx] + in_pt = pt[b_idx * n_pts + p_idx] + + # read point + # 
create a transform from a vector/quaternion: + t = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + + # transform a point + p = wp.transform_point(t, in_pt) + + # write pt: + out_pt[b_idx * n_pts + p_idx] = p + + +@wp.kernel +def compute_batch_pose_multipy( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + position2: wp.array(dtype=wp.vec3), + quat2: wp.array(dtype=wp.vec4), + out_position: wp.array(dtype=wp.vec3), + out_quat: wp.array(dtype=wp.vec4), +): # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_position = position[b_idx] + in_quat = quat[b_idx] + + in_position2 = position2[b_idx] + in_quat2 = quat2[b_idx] + + # read point + # create a transform from a vector/quaternion: + t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + t_2 = wp.transform( + in_position2, wp.quaternion(in_quat2[1], in_quat2[2], in_quat2[3], in_quat2[0]) + ) + + # transform a point + t_3 = wp.transform_multiply(t_1, t_2) + + # write pt: + out_q = wp.transform_get_rotation(t_3) + + out_v = wp.vec4() + out_v[0] = out_q[3] + out_v[1] = out_q[0] + out_v[2] = out_q[1] + out_v[3] = out_q[2] + + out_position[b_idx] = wp.transform_get_translation(t_3) + out_quat[b_idx] = out_v + + +@wp.kernel +def compute_pose_inverse( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + out_position: wp.array(dtype=wp.vec3), + out_quat: wp.array(dtype=wp.vec4), +): # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_position = position[b_idx] + in_quat = quat[b_idx] + + # read point + # create a transform from a vector/quaternion: + t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + t_3 = wp.transform_inverse(t_1) + + # write pt: + out_q = wp.transform_get_rotation(t_3) + + out_v = wp.vec4() + out_v[0] = wp.index(out_q, 3) + out_v[1] = wp.index(out_q, 0) + out_v[2] = wp.index(out_q, 1) + out_v[3] = wp.index(out_q, 2) + + out_position[b_idx] = wp.transform_get_translation(t_3) + out_quat[b_idx] = out_v + + +@wp.kernel +def compute_quat_to_matrix( + quat: wp.array(dtype=wp.vec4), + out_mat: wp.array(dtype=wp.mat33), +): + # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_quat = quat[b_idx] + + # read point + # create a transform from a vector/quaternion: + q_1 = wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0]) + m_1 = wp.quat_to_matrix(q_1) + + # write pt: + out_mat[b_idx] = m_1 + + +@wp.kernel +def compute_matrix_to_quat( + in_mat: wp.array(dtype=wp.mat33), + out_quat: wp.array(dtype=wp.vec4), +): + # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_m = in_mat[b_idx] + + # read point + # create a transform from a vector/quaternion: + out_q = wp.quat_from_matrix(in_m) + + out_v = wp.vec4() + out_v[0] = wp.index(out_q, 3) + out_v[1] = wp.index(out_q, 0) + out_v[2] = wp.index(out_q, 1) + out_v[3] = wp.index(out_q, 2) + # write pt: + out_quat[b_idx] = out_v + + +@wp.kernel +def compute_pose_multipy( + position: wp.array(dtype=wp.vec3), + quat: wp.array(dtype=wp.vec4), + position2: wp.array(dtype=wp.vec3), + quat2: wp.array(dtype=wp.vec4), + out_position: wp.array(dtype=wp.vec3), + out_quat: wp.array(dtype=wp.vec4), +): # b pose_1 and b pose_2, compute pose_1 * pose_2 + b_idx = wp.tid() + # read data: + + in_position = position[0] + in_quat = quat[0] + + in_position2 = position2[b_idx] + in_quat2 = 
quat2[b_idx] + + # read point + # create a transform from a vector/quaternion: + t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0])) + t_2 = wp.transform( + in_position2, wp.quaternion(in_quat2[1], in_quat2[2], in_quat2[3], in_quat2[0]) + ) + + # transform a point + t_3 = wp.transform_multiply(t_1, t_2) + + # write pt: + out_q = wp.transform_get_rotation(t_3) + + out_v = wp.vec4() + out_v[0] = out_q[3] + out_v[1] = out_q[0] + out_v[2] = out_q[1] + out_v[3] = out_q[2] + + out_position[b_idx] = wp.transform_get_translation(t_3) + out_quat[b_idx] = out_v + + +class TransformPoint(torch.autograd.Function): + @staticmethod + def forward( + ctx, + position: torch.Tensor, + quaternion: torch.Tensor, + points: torch.Tensor, + out_points: torch.Tensor, + adj_position: torch.Tensor, + adj_quaternion: torch.Tensor, + adj_points: torch.Tensor, + ): + n, _ = out_points.shape + init_warp() + ctx.save_for_backward( + position, quaternion, points, out_points, adj_position, adj_quaternion, adj_points + ) + b = 1 + ctx.b = b + ctx.n = n + + wp.launch( + kernel=compute_transform_point, + dim=b * n, + inputs=[ + wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + wp.from_torch(points.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + n, + b, + ], + outputs=[wp.from_torch(out_points.view(-1, 3), dtype=wp.vec3)], + stream=wp.stream_from_torch(position.device), + ) + + return out_points + + @staticmethod + def backward(ctx, grad_output): + ( + position, + quaternion, + points, + out_points, + adj_position, + adj_quaternion, + adj_points, + ) = ctx.saved_tensors + adj_position = 0.0 * adj_position + adj_quaternion = 0.0 * adj_quaternion + adj_points = 0.0 * adj_points + + wp_adj_out_points = wp.from_torch(grad_output.view(-1, 3).contiguous(), dtype=wp.vec3) + wp_adj_points = wp.from_torch(adj_points, dtype=wp.vec3) + + wp_adj_position = wp.from_torch(adj_position, dtype=wp.vec3) + wp_adj_quat = wp.from_torch(adj_quaternion, dtype=wp.vec4) + + wp.launch( + kernel=compute_transform_point, + dim=ctx.b * ctx.n, + inputs=[ + wp.from_torch( + position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position + ), + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + wp.from_torch(points.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_points), + ctx.n, + ctx.b, + ], + outputs=[ + wp.from_torch( + out_points.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_out_points + ), + ], + adj_inputs=[ + None, + None, + None, + ctx.n, + ctx.b, + ], + adj_outputs=[ + None, + ], + stream=wp.stream_from_torch(grad_output.device), + adjoint=True, + ) + g_p = g_q = g_pt = None + if ctx.needs_input_grad[0]: + g_p = adj_position + if ctx.needs_input_grad[1]: + g_q = adj_quaternion + if ctx.needs_input_grad[2]: + g_pt = adj_points + return g_p, g_q, g_pt, None, None, None, None + + +class BatchTransformPoint(torch.autograd.Function): + @staticmethod + def forward( + ctx, + position: torch.Tensor, + quaternion: torch.Tensor, + points: torch.Tensor, + out_points: torch.Tensor, + adj_position: torch.Tensor, + adj_quaternion: torch.Tensor, + adj_points: torch.Tensor, + ): + b, n, _ = out_points.shape + init_warp() + points = points.detach() + ctx.save_for_backward( + position, quaternion, points, out_points, adj_position, adj_quaternion, adj_points + ) + ctx.b = b + ctx.n = n + wp.launch( + kernel=compute_batch_transform_point, + dim=b * n, + inputs=[ 
+ wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + wp.from_torch(points.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + n, + b, + ], + outputs=[wp.from_torch(out_points.view(-1, 3).contiguous(), dtype=wp.vec3)], + stream=wp.stream_from_torch(position.device), + ) + + return out_points + + @staticmethod + def backward(ctx, grad_output): + ( + position, + quaternion, + points, + out_points, + adj_position, + adj_quaternion, + adj_points, + ) = ctx.saved_tensors + init_warp() + # print(adj_quaternion.shape) + wp_adj_out_points = wp.from_torch(grad_output.view(-1, 3).contiguous(), dtype=wp.vec3) + + adj_position = 0.0 * adj_position + adj_quaternion = 0.0 * adj_quaternion + adj_points = 0.0 * adj_points + + wp_adj_points = wp.from_torch(adj_points.view(-1, 3), dtype=wp.vec3) + wp_adj_position = wp.from_torch(adj_position.view(-1, 3), dtype=wp.vec3) + wp_adj_quat = wp.from_torch(adj_quaternion.view(-1, 4), dtype=wp.vec4) + wp.launch( + kernel=compute_batch_transform_point, + dim=ctx.b * ctx.n, + inputs=[ + wp.from_torch( + position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position + ), + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + wp.from_torch(points.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_points), + ctx.n, + ctx.b, + ], + outputs=[ + wp.from_torch(out_points.view(-1, 3), dtype=wp.vec3, grad=wp_adj_out_points), + ], + adj_inputs=[ + None, + None, + None, + ctx.n, + ctx.b, + ], + adj_outputs=[ + None, + ], + stream=wp.stream_from_torch(grad_output.device), + adjoint=True, + ) + g_p = g_q = g_pt = None + if ctx.needs_input_grad[0]: + g_p = adj_position + if ctx.needs_input_grad[1]: + g_q = adj_quaternion + if ctx.needs_input_grad[2]: + g_pt = adj_points + return g_p, g_q, g_pt, None, None, None, None + + +class BatchTransformPose(torch.autograd.Function): + @staticmethod + def forward( + ctx, + position: torch.Tensor, + quaternion: torch.Tensor, + position2: torch.Tensor, + quaternion2: torch.Tensor, + out_position: torch.Tensor, + out_quaternion: torch.Tensor, + adj_position: torch.Tensor, + adj_quaternion: torch.Tensor, + adj_position2: torch.Tensor, + adj_quaternion2: torch.Tensor, + ): + b, _ = position.shape + + if out_position is None: + out_position = torch.zeros_like(position2) + if out_quaternion is None: + out_quaternion = torch.zeros_like(quaternion2) + if adj_position is None: + adj_position = torch.zeros_like(position) + if adj_quaternion is None: + adj_quaternion = torch.zeros_like(quaternion) + if adj_position2 is None: + adj_position2 = torch.zeros_like(position2) + if adj_quaternion2 is None: + adj_quaternion2 = torch.zeros_like(quaternion2) + + init_warp() + ctx.save_for_backward( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + adj_position2, + adj_quaternion2, + ) + ctx.b = b + wp.launch( + kernel=compute_batch_pose_multipy, + dim=b, + inputs=[ + wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + wp.from_torch(position2.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion2.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + outputs=[ + wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + 
], + stream=wp.stream_from_torch(position.device), + ) + + return out_position, out_quaternion + + @staticmethod + def backward(ctx, grad_out_position, grad_out_quaternion): + ( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + adj_position2, + adj_quaternion2, + ) = ctx.saved_tensors + init_warp() + + wp_adj_out_position = wp.from_torch( + grad_out_position.view(-1, 3).contiguous(), dtype=wp.vec3 + ) + wp_adj_out_quaternion = wp.from_torch( + grad_out_quaternion.view(-1, 4).contiguous(), dtype=wp.vec4 + ) + + adj_position = 0.0 * adj_position + adj_quaternion = 0.0 * adj_quaternion + adj_position2 = 0.0 * adj_position2 + adj_quaternion2 = 0.0 * adj_quaternion2 + + wp_adj_position = wp.from_torch(adj_position.view(-1, 3), dtype=wp.vec3) + wp_adj_quat = wp.from_torch(adj_quaternion.view(-1, 4), dtype=wp.vec4) + wp_adj_position2 = wp.from_torch(adj_position2.view(-1, 3), dtype=wp.vec3) + wp_adj_quat2 = wp.from_torch(adj_quaternion2.view(-1, 4), dtype=wp.vec4) + + wp.launch( + kernel=compute_batch_pose_multipy, + dim=ctx.b, + inputs=[ + wp.from_torch( + position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position + ), + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + wp.from_torch( + position2.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position2 + ), + wp.from_torch( + quaternion2.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat2 + ), + ], + outputs=[ + wp.from_torch( + out_position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_out_position + ), + wp.from_torch( + out_quaternion.view(-1, 4).contiguous(), + dtype=wp.vec4, + grad=wp_adj_out_quaternion, + ), + ], + adj_inputs=[ + None, + None, + None, + None, + ], + adj_outputs=[ + None, + None, + ], + stream=wp.stream_from_torch(grad_out_position.device), + adjoint=True, + ) + g_p1 = g_q1 = g_p2 = g_q2 = None + if ctx.needs_input_grad[0]: + g_p1 = adj_position + if ctx.needs_input_grad[1]: + g_q1 = adj_quaternion + if ctx.needs_input_grad[2]: + g_p2 = adj_position2 + if ctx.needs_input_grad[3]: + g_q2 = adj_quaternion2 + return g_p1, g_q1, g_p2, g_q2, None, None, None, None + + +class TransformPose(torch.autograd.Function): + @staticmethod + def forward( + ctx, + position: torch.Tensor, + quaternion: torch.Tensor, + position2: torch.Tensor, + quaternion2: torch.Tensor, + out_position: torch.Tensor, + out_quaternion: torch.Tensor, + adj_position: torch.Tensor, + adj_quaternion: torch.Tensor, + adj_position2: torch.Tensor, + adj_quaternion2: torch.Tensor, + ): + b, _ = position2.shape + init_warp() + if out_position is None: + out_position = torch.zeros_like(position2) + if out_quaternion is None: + out_quaternion = torch.zeros_like(quaternion2) + if adj_position is None: + adj_position = torch.zeros_like(position) + if adj_quaternion is None: + adj_quaternion = torch.zeros_like(quaternion) + if adj_position2 is None: + adj_position2 = torch.zeros_like(position2) + if adj_quaternion2 is None: + adj_quaternion2 = torch.zeros_like(quaternion2) + + ctx.save_for_backward( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + adj_position2, + adj_quaternion2, + ) + ctx.b = b + wp.launch( + kernel=compute_batch_pose_multipy, + dim=b, + inputs=[ + wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + 
wp.from_torch(position2.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion2.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + outputs=[ + wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + stream=wp.stream_from_torch(position.device), + ) + + return out_position, out_quaternion + + @staticmethod + def backward(ctx, grad_out_position, grad_out_quaternion): + ( + position, + quaternion, + position2, + quaternion2, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + adj_position2, + adj_quaternion2, + ) = ctx.saved_tensors + init_warp() + + wp_adj_out_position = wp.from_torch( + grad_out_position.view(-1, 3).contiguous(), dtype=wp.vec3 + ) + wp_adj_out_quaternion = wp.from_torch( + grad_out_quaternion.view(-1, 4).contiguous(), dtype=wp.vec4 + ) + + adj_position = 0.0 * adj_position + adj_quaternion = 0.0 * adj_quaternion + adj_position2 = 0.0 * adj_position2 + adj_quaternion2 = 0.0 * adj_quaternion2 + + wp_adj_position = wp.from_torch(adj_position.view(-1, 3), dtype=wp.vec3) + wp_adj_quat = wp.from_torch(adj_quaternion.view(-1, 4), dtype=wp.vec4) + wp_adj_position2 = wp.from_torch(adj_position2.view(-1, 3), dtype=wp.vec3) + wp_adj_quat2 = wp.from_torch(adj_quaternion2.view(-1, 4), dtype=wp.vec4) + + wp.launch( + kernel=compute_batch_pose_multipy, + dim=ctx.b, + inputs=[ + wp.from_torch( + position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position + ), + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + wp.from_torch( + position2.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position2 + ), + wp.from_torch( + quaternion2.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat2 + ), + ], + outputs=[ + wp.from_torch( + out_position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_out_position + ), + wp.from_torch( + out_quaternion.view(-1, 4).contiguous(), + dtype=wp.vec4, + grad=wp_adj_out_quaternion, + ), + ], + adj_inputs=[ + None, + None, + None, + None, + ], + adj_outputs=[ + None, + None, + ], + stream=wp.stream_from_torch(grad_out_position.device), + adjoint=True, + ) + g_p1 = g_q1 = g_p2 = g_q2 = None + if ctx.needs_input_grad[0]: + g_p1 = adj_position + if ctx.needs_input_grad[1]: + g_q1 = adj_quaternion + if ctx.needs_input_grad[2]: + g_p2 = adj_position2 + if ctx.needs_input_grad[3]: + g_q2 = adj_quaternion2 + return g_p1, g_q1, g_p2, g_q2, None, None, None, None + + +class PoseInverse(torch.autograd.Function): + @staticmethod + def forward( + ctx, + position: torch.Tensor, + quaternion: torch.Tensor, + out_position: torch.Tensor, + out_quaternion: torch.Tensor, + adj_position: torch.Tensor, + adj_quaternion: torch.Tensor, + ): + b, _ = position.shape + + if out_position is None: + out_position = torch.zeros_like(position) + if out_quaternion is None: + out_quaternion = torch.zeros_like(quaternion) + if adj_position is None: + adj_position = torch.zeros_like(position) + if adj_quaternion is None: + adj_quaternion = torch.zeros_like(quaternion) + + init_warp() + ctx.save_for_backward( + position, + quaternion, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + ) + ctx.b = b + wp.launch( + kernel=compute_pose_inverse, + dim=b, + inputs=[ + wp.from_torch(position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + outputs=[ + 
wp.from_torch(out_position.detach().view(-1, 3).contiguous(), dtype=wp.vec3), + wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + stream=wp.stream_from_torch(position.device), + ) + # remove close to zero values: + # out_position[torch.abs(out_position)<1e-8] = 0.0 + # out_quaternion[torch.abs(out_quaternion)<1e-8] = 0.0 + + return out_position, out_quaternion + + @staticmethod + def backward(ctx, grad_out_position, grad_out_quaternion): + ( + position, + quaternion, + out_position, + out_quaternion, + adj_position, + adj_quaternion, + ) = ctx.saved_tensors + init_warp() + + wp_adj_out_position = wp.from_torch( + grad_out_position.view(-1, 3).contiguous(), dtype=wp.vec3 + ) + wp_adj_out_quaternion = wp.from_torch( + grad_out_quaternion.view(-1, 4).contiguous(), dtype=wp.vec4 + ) + + adj_position = 0.0 * adj_position + adj_quaternion = 0.0 * adj_quaternion + + wp_adj_position = wp.from_torch(adj_position.view(-1, 3), dtype=wp.vec3) + wp_adj_quat = wp.from_torch(adj_quaternion.view(-1, 4), dtype=wp.vec4) + + wp.launch( + kernel=compute_pose_inverse, + dim=ctx.b, + inputs=[ + wp.from_torch( + position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_position + ), + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + ], + outputs=[ + wp.from_torch( + out_position.view(-1, 3).contiguous(), dtype=wp.vec3, grad=wp_adj_out_position + ), + wp.from_torch( + out_quaternion.view(-1, 4).contiguous(), + dtype=wp.vec4, + grad=wp_adj_out_quaternion, + ), + ], + adj_inputs=[ + None, + None, + None, + None, + ], + adj_outputs=[ + None, + None, + ], + stream=wp.stream_from_torch(grad_out_position.device), + adjoint=True, + ) + g_p1 = g_q1 = None + if ctx.needs_input_grad[0]: + g_p1 = adj_position + if ctx.needs_input_grad[1]: + g_q1 = adj_quaternion + + return g_p1, g_q1, None, None + + +class QuatToMatrix(torch.autograd.Function): + @staticmethod + def forward( + ctx, + quaternion: torch.Tensor, + out_mat: torch.Tensor, + adj_quaternion: torch.Tensor, + ): + b, _ = quaternion.shape + + if out_mat is None: + out_mat = torch.zeros( + (quaternion.shape[0], 3, 3), device=quaternion.device, dtype=quaternion.dtype + ) + if adj_quaternion is None: + adj_quaternion = torch.zeros_like(quaternion) + + init_warp() + ctx.save_for_backward( + quaternion, + out_mat, + adj_quaternion, + ) + ctx.b = b + wp.launch( + kernel=compute_quat_to_matrix, + dim=b, + inputs=[ + wp.from_torch(quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4), + ], + outputs=[ + wp.from_torch(out_mat.detach().view(-1, 3, 3).contiguous(), dtype=wp.mat33), + ], + stream=wp.stream_from_torch(quaternion.device), + ) + + return out_mat + + @staticmethod + def backward(ctx, grad_out_mat): + ( + quaternion, + out_mat, + adj_quaternion, + ) = ctx.saved_tensors + init_warp() + + wp_adj_out_mat = wp.from_torch(grad_out_mat.view(-1, 3, 3).contiguous(), dtype=wp.mat33) + + adj_quaternion = 0.0 * adj_quaternion + + wp_adj_quat = wp.from_torch(adj_quaternion.view(-1, 4), dtype=wp.vec4) + + wp.launch( + kernel=compute_quat_to_matrix, + dim=ctx.b, + inputs=[ + wp.from_torch(quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_quat), + ], + outputs=[ + wp.from_torch( + out_mat.view(-1, 3, 3).contiguous(), + dtype=wp.mat33, + grad=wp_adj_out_mat, + ), + ], + adj_inputs=[ + None, + ], + adj_outputs=[ + None, + ], + stream=wp.stream_from_torch(grad_out_mat.device), + adjoint=True, + ) + g_q1 = None + if ctx.needs_input_grad[0]: + g_q1 = adj_quaternion + + return g_q1, None, 
None
+
+
+class MatrixToQuaternion(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        in_mat: torch.Tensor,
+        out_quaternion: torch.Tensor,
+        adj_mat: torch.Tensor,
+    ):
+        b, _, _ = in_mat.shape
+
+        if out_quaternion is None:
+            out_quaternion = torch.zeros(
+                (in_mat.shape[0], 4), device=in_mat.device, dtype=in_mat.dtype
+            )
+        if adj_mat is None:
+            adj_mat = torch.zeros_like(in_mat)
+
+        init_warp()
+        ctx.save_for_backward(
+            in_mat,
+            out_quaternion,
+            adj_mat,
+        )
+        ctx.b = b
+        wp.launch(
+            kernel=compute_matrix_to_quat,
+            dim=b,
+            inputs=[
+                wp.from_torch(in_mat.detach().view(-1, 3, 3).contiguous(), dtype=wp.mat33),
+            ],
+            outputs=[
+                wp.from_torch(out_quaternion.detach().view(-1, 4).contiguous(), dtype=wp.vec4),
+            ],
+            stream=wp.stream_from_torch(in_mat.device),
+        )
+
+        return out_quaternion
+
+    @staticmethod
+    def backward(ctx, grad_out_q):
+        (
+            in_mat,
+            out_quaternion,
+            adj_mat,
+        ) = ctx.saved_tensors
+        init_warp()
+
+        wp_adj_out_q = wp.from_torch(grad_out_q.view(-1, 4).contiguous(), dtype=wp.vec4)
+
+        adj_mat = 0.0 * adj_mat
+
+        wp_adj_mat = wp.from_torch(adj_mat.view(-1, 3, 3), dtype=wp.mat33)
+
+        wp.launch(
+            kernel=compute_matrix_to_quat,
+            dim=ctx.b,
+            inputs=[
+                wp.from_torch(in_mat.view(-1, 3, 3).contiguous(), dtype=wp.mat33, grad=wp_adj_mat),
+            ],
+            outputs=[
+                wp.from_torch(
+                    out_quaternion.view(-1, 4).contiguous(), dtype=wp.vec4, grad=wp_adj_out_q
+                ),
+            ],
+            adj_inputs=[
+                None,
+            ],
+            adj_outputs=[
+                None,
+            ],
+            stream=wp.stream_from_torch(grad_out_q.device),
+            adjoint=True,
+        )
+        g_q1 = None
+        if ctx.needs_input_grad[0]:
+            g_q1 = adj_mat
+
+        return g_q1, None, None
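A short sketch of the pose helpers defined in this file (editorial addition, not part of the diff); it assumes a CUDA device is available, and the batch size and values are illustrative:

import torch

from curobo.geom.transform import pose_inverse, pose_multiply

pos1 = torch.zeros((4, 3), device="cuda")
quat1 = torch.zeros((4, 4), device="cuda")
quat1[:, 0] = 1.0  # identity rotations, quaternions stored as [qw, qx, qy, qz]
pos2 = torch.rand((4, 3), device="cuda")
quat2 = quat1.clone()

# compose pose_1 * pose_2, then invert the result:
pos12, quat12 = pose_multiply(pos1, quat1, pos2, quat2)
inv_pos, inv_quat = pose_inverse(pos12, quat12)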
diff --git a/src/curobo/geom/types.py b/src/curobo/geom/types.py
new file mode 100644
index 00000000..59554182
--- /dev/null
+++ b/src/curobo/geom/types.py
@@ -0,0 +1,813 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+from __future__ import annotations
+
+# Standard Library
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+# Third Party
+import numpy as np
+import torch
+import trimesh
+
+# CuRobo
+from curobo.geom.sphere_fit import SphereFitType, fit_spheres_to_mesh
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.util.logger import log_error, log_warn
+from curobo.util_file import get_assets_path, join_path
+
+
+@dataclass
+class Material:
+    metallic: float = 0.0
+    roughness: float = 0.4
+
+
+@dataclass
+class Obstacle:
+    """Base class for all obstacles."""
+
+    #: Unique name of obstacle.
+    name: str
+
+    #: Pose of obstacle as a list with format [x y z qw qx qy qz]
+    pose: Optional[List[float]] = None
+
+    #: NOTE: implement scaling for all obstacle types.
+    scale: Optional[List[float]] = None
+
+    #: Color of obstacle to use in visualization.
+    color: Optional[List[float]] = None
+
+    #: texture to apply to obstacle in visualization.
+    texture_id: Optional[str] = None
+
+    #: texture to apply to obstacle in visualization.
+    texture: Optional[str] = None
+
+    #: material properties to apply in visualization.
+    material: Material = field(default_factory=Material)
+
+    tensor_args: TensorDeviceType = field(default_factory=TensorDeviceType)
+
+    def get_trimesh_mesh(self, process: bool = True) -> trimesh.Trimesh:
+        """Create a trimesh instance from the obstacle representation.
+
+        Args:
+            process (bool, optional): process when loading from file. Defaults to True.
+
+        Raises:
+            NotImplementedError: requires implementation in derived class.
+
+        Returns:
+            trimesh.Trimesh: instance of obstacle as a trimesh.
+        """
+        raise NotImplementedError
+
+    def save_as_mesh(self, file_path: str):
+        mesh_scene = self.get_trimesh_mesh()
+        mesh_scene.export(file_path)
+
+    def get_cuboid(self) -> Cuboid:
+        """Get oriented bounding box of obstacle (OBB).
+
+        Returns:
+            Cuboid: returns obstacle as a cuboid.
+        """
+        # create a trimesh object:
+        m = self.get_trimesh_mesh()
+        # compute bounding box:
+        dims = m.bounding_box_oriented.primitive.extents
+        offset = m.bounding_box_oriented.primitive.transform
+
+        base_pose = Pose.from_list(self.pose)
+        offset_pose = Pose.from_matrix(self.tensor_args.to_device(offset))
+        new_base_pose = base_pose.multiply(offset_pose)
+        new_pose = new_base_pose.tolist()
+
+        if self.color is None:
+            # get average color:
+            if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+                m.visual = m.visual.to_color()
+            if isinstance(m.visual, trimesh.visual.color.ColorVisuals):
+                self.color = (np.ravel(m.visual.main_color) / 255.0).tolist()
+        return Cuboid(
+            name=self.name,
+            pose=new_pose,
+            dims=dims.tolist(),
+            color=self.color,
+            texture=self.texture,
+            material=self.material,
+            tensor_args=self.tensor_args,
+        )
+
+    def get_mesh(self, process: bool = True) -> Mesh:
+        """Get obstacle as a mesh.
+
+        Args:
+            process (bool, optional): process mesh from file. Defaults to True.
+
+        Returns:
+            Mesh: obstacle as a mesh.
+        """
+        # load sphere as a mesh in trimesh:
+        m = self.get_trimesh_mesh(process=process)
+        if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+            m.visual = m.visual.to_color()
+
+        return Mesh(
+            name=self.name,
+            vertices=m.vertices,
+            faces=m.faces,
+            pose=self.pose,
+            color=self.color,
+            texture_id=self.texture_id,
+            texture=self.texture,
+            material=self.material,
+            vertex_colors=m.visual.vertex_colors,
+            face_colors=m.visual.face_colors,
+            tensor_args=self.tensor_args,
+        )
+
+    def get_transform_matrix(self) -> np.ndarray:
+        """Get homogeneous transformation matrix from pose.
+
+        Returns:
+            np.ndarray: transformation matrix.
+        """
+        # convert pose to matrix:
+        mat = trimesh.scene.transforms.kwargs_to_matrix(
+            translation=self.pose[:3], quaternion=self.pose[3:]
+        )
+        return mat
+
+    def get_sphere(self, n: int = 1) -> Sphere:
+        """Compute a sphere that fits in the volume of the object.
+
+        Args:
+            n: number of spheres.
+
+        Returns:
+            Sphere that fits inside the obstacle.
+        """
+        obb = self.get_cuboid()
+
+        # from obb, compute the number of spheres
+
+        # fit a sphere of radius equal to the smallest dim
+        r = min(obb.dims)
+        sph = Sphere(
+            name="m_sphere",
+            pose=obb.pose,
+            position=obb.pose[:3],
+            radius=r,
+            tensor_args=self.tensor_args,
+        )
+
+        return sph
+
+    def get_bounding_spheres(
+        self,
+        n_spheres: int = 1,
+        surface_sphere_radius: float = 0.002,
+        fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        pre_transform_pose: Optional[Pose] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> List[Sphere]:
+        """Compute n spheres that fit in the volume of the object.
+ + Args: + n: number of spheres + Returns: + spheres + """ + mesh = self.get_trimesh_mesh() + pts, n_radius = fit_spheres_to_mesh( + mesh, n_spheres, surface_sphere_radius, fit_type, voxelize_method=voxelize_method + ) + + obj_pose = Pose.from_list(self.pose, tensor_args) + # transform object: + + # transform points: + if pre_transform_pose is not None: + obj_pose = pre_transform_pose.multiply(obj_pose) # convert object pose to another frame + + if pts is None or len(pts) == 0: + log_warn("spheres could not be fit!, adding one sphere at origin") + pts = np.zeros((1, 3)) + pts[0, :] = mesh.centroid + n_radius = [0.02] + obj_pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0], tensor_args) + + points_cuda = tensor_args.to_device(pts) + pts = obj_pose.transform_points(points_cuda).cpu().view(-1, 3).numpy() + + new_spheres = [ + Sphere( + name="sph_" + str(i), + pose=[pts[i, 0], pts[i, 1], pts[i, 2], 1, 0, 0, 0], + radius=n_radius[i], + ) + for i in range(pts.shape[0]) + ] + + return new_spheres + + +@dataclass +class Cuboid(Obstacle): + """Represent obstacle as a cuboid.""" + + #: Dimensions of cuboid in meters [x_length, y_length, z_length]. + dims: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) + + def __post_init__(self): + if self.pose is None: + log_error("Cuboid Obstacle requires Pose") + + def get_trimesh_mesh(self, process: bool = True): + m = trimesh.creation.box(extents=self.dims) + if self.color is not None: + color_visual = trimesh.visual.color.ColorVisuals( + mesh=m, face_colors=self.color, vertex_colors=self.color + ) + m.visual = color_visual + return m + + +@dataclass +class Capsule(Obstacle): + radius: float = 0.0 + base: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) + tip: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) + + def get_trimesh_mesh(self, process: bool = True): + height = self.tip[2] - self.base[2] + if ( + height < 0 + or self.tip[0] != 0 + or self.tip[1] != 0 + or self.base[0] != 0 + or self.base[1] != 0 + ): + log_error( + "Capsule to Mesh is only supported when base and tip are at xy origin with a positive height" + ) + m = trimesh.creation.capsule(radius=self.radius, height=self.tip[2] - self.base[2]) + if self.color is not None: + color_visual = trimesh.visual.color.ColorVisuals( + mesh=m, face_colors=self.color, vertex_colors=self.color + ) + m.visual = color_visual + return m + + +@dataclass +class Cylinder(Obstacle): + radius: float = 0.0 + height: float = 0.0 + + def get_trimesh_mesh(self, process: bool = True): + m = trimesh.creation.cylinder(radius=self.radius, height=self.height) + if self.color is not None: + color_visual = trimesh.visual.color.ColorVisuals( + mesh=m, face_colors=self.color, vertex_colors=self.color + ) + m.visual = color_visual + return m + + +@dataclass +class Sphere(Obstacle): + radius: float = 0.0 + + #: position is deprecated, use pose instead + position: Optional[List[float]] = None + + def __post_init__(self): + if self.position is not None: + self.pose = self.position + [1, 0, 0, 0] + log_warn("Sphere.position is deprecated, use Sphere.pose instead") + if self.pose is not None: + self.position = self.pose[:3] + + def get_trimesh_mesh(self, process: bool = True): + m = trimesh.creation.icosphere(radius=self.radius) + if self.color is not None: + color_visual = trimesh.visual.color.ColorVisuals( + mesh=m, face_colors=self.color, vertex_colors=self.color + ) + m.visual = color_visual + return m + + def get_cuboid(self) -> Cuboid: + """Get oriented bounding box of obstacle (OBB). 
+
+        Returns:
+            Cuboid: returns obstacle as a cuboid.
+        """
+        # create a trimesh object:
+        m = self.get_trimesh_mesh()
+        # compute bounding box:
+        dims = m.bounding_box_oriented.primitive.extents
+        offset = m.bounding_box_oriented.primitive.transform
+
+        base_pose = Pose.from_list(self.pose)
+        offset_pose = Pose.from_matrix(self.tensor_args.to_device(offset))
+        new_base_pose = base_pose.multiply(offset_pose)
+        new_pose = new_base_pose.tolist()
+
+        # since a sphere is symmetrical, we set the orientation to identity
+        new_pose[3:] = [1, 0, 0, 0]
+
+        if self.color is None:
+            # get average color:
+            if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+                m.visual = m.visual.to_color()
+            if isinstance(m.visual, trimesh.visual.color.ColorVisuals):
+                self.color = (np.ravel(m.visual.main_color) / 255.0).tolist()
+        return Cuboid(
+            name=self.name,
+            pose=new_pose,
+            dims=dims.tolist(),
+            color=self.color,
+            texture=self.texture,
+            material=self.material,
+            tensor_args=self.tensor_args,
+        )
+
+
+@dataclass
+class Mesh(Obstacle):
+    file_path: Optional[str] = None
+
+    #: full mesh stored as a string; loading from this is not implemented yet.
+    file_string: Optional[str] = None
+
+    #: useful for visualization in isaac gym.
+    urdf_path: Optional[str] = None
+
+    vertices: Optional[List[List[float]]] = None
+    faces: Optional[List[int]] = None
+    vertex_colors: Optional[List[List[float]]] = None
+    vertex_normals: Optional[List[List[float]]] = None
+    face_colors: Optional[List[List[float]]] = None
+
+    def __post_init__(self):
+        if self.file_path is not None:
+            self.file_path = join_path(get_assets_path(), self.file_path)
+        if self.urdf_path is not None:
+            self.urdf_path = join_path(get_assets_path(), self.urdf_path)
+        if self.scale is not None and self.vertices is not None:
+            self.vertices = np.ravel(self.scale) * self.vertices
+            self.scale = None
+
+    def get_trimesh_mesh(self, process: bool = True):
+        # load mesh from filepath or verts and faces:
+        if self.file_path is not None:
+            m = trimesh.load(self.file_path, process=process, force="mesh")
+            if isinstance(m, trimesh.Scene):
+                m = m.dump(concatenate=True)
+
+            if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+                m.visual = m.visual.to_color()
+        else:
+            m = trimesh.Trimesh(
+                self.vertices,
+                self.faces,
+                vertex_colors=self.vertex_colors,
+                vertex_normals=self.vertex_normals,
+                face_colors=self.face_colors,
+            )
+            # if self.scale is not None:
+            #     m.vertices = np.ravel(self.scale) * m.vertices
+
+        return m
+
+    def get_mesh_data(self, process: bool = True):
+        verts = faces = None
+        if self.file_path is not None:
+            m = self.get_trimesh_mesh(process=process)
+            verts = m.vertices.view(np.ndarray)
+            faces = m.faces
+        elif self.vertices is not None and self.faces is not None:
+            verts = self.vertices
+            faces = self.faces
+        else:
+            raise ValueError("No Mesh object found")
+
+        return verts, faces
+
+    @staticmethod
+    def from_pointcloud(
+        pointcloud: np.ndarray,
+        pitch: float = 0.02,
+        name="world_pc",
+        pose: List[float] = [0, 0, 0, 1, 0, 0, 0],
+        filter_close_points: float = 0.0,
+    ):
+        if filter_close_points > 0.0:
+            # remove points that are closer than given threshold
+            dist = np.linalg.norm(pointcloud, axis=-1)
+            pointcloud = pointcloud[dist > filter_close_points]
+
+        pc = trimesh.PointCloud(pointcloud)
+        scene_mesh = trimesh.voxel.ops.points_to_marching_cubes(pc.vertices, pitch=pitch)
+        return Mesh(name, pose=pose, vertices=scene_mesh.vertices, faces=scene_mesh.faces)
+
+
+@dataclass
+class BloxMap(Obstacle):
map_path: Optional[str] = None + scale: List[float] = field(default_factory=lambda: [1.0, 1.0, 1.0]) + voxel_size: float = 0.02 + #: integrator type to use in nvblox. Options: ["tsdf", "occupancy"] + integrator_type: str = "tsdf" + mesh_file_path: Optional[str] = None + mapper_instance: Any = None + mesh: Optional[Mesh] = None + + def __post_init__(self): + if self.map_path is not None: + self.map_path = join_path(get_assets_path(), self.map_path) + if self.mesh_file_path is not None: + self.mesh = Mesh( + name=self.name + "_mesh", file_path=self.mesh_file_path, pose=self.pose + ) + + def get_trimesh_mesh(self, process: bool = True): + if self.mesh is not None: + return self.mesh.get_trimesh_mesh(process) + else: + log_warn("no mesh found for obstacle: " + self.name) + return None + + +@dataclass +class WorldConfig(Sequence): + """Representation of World for use in CuRobo.""" + + #: List of Sphere obstacles. + sphere: Optional[List[Sphere]] = None + + #: List of Cuboid obstacles. + cuboid: Optional[List[Cuboid]] = None + + #: List of Capsule obstacles. + capsule: Optional[List[Capsule]] = None + + #: List of Cylinder obstacles. + cylinder: Optional[List[Cylinder]] = None + + #: List of Mesh obstacles. + mesh: Optional[List[Mesh]] = None + + #: BloxMap obstacle. + blox: Optional[List[BloxMap]] = None + + #: List of all obstacles in world. + objects: Optional[List[Obstacle]] = None + + def __post_init__(self): + # create objects list: + if self.objects is None: + self.objects = [] + if self.sphere is not None: + self.objects += self.sphere + if self.cuboid is not None: + self.objects += self.cuboid + if self.capsule is not None: + self.objects += self.capsule + if self.mesh is not None: + self.objects += self.mesh + if self.blox is not None: + self.objects += self.blox + if self.cylinder is not None: + self.objects += self.cylinder + if self.sphere is None: + self.sphere = [] + if self.cuboid is None: + self.cuboid = [] + if self.capsule is None: + self.capsule = [] + if self.mesh is None: + self.mesh = [] + if self.cylinder is None: + self.cylinder = [] + if self.blox is None: + self.blox = [] + + def __len__(self): + return len(self.objects) + + def __getitem__(self, idx): + return self.objects[idx] + + def clone(self): + return WorldConfig( + cuboid=self.cuboid.copy() if self.cuboid is not None else None, + sphere=self.sphere.copy() if self.sphere is not None else None, + mesh=self.mesh.copy() if self.mesh is not None else None, + capsule=self.capsule.copy() if self.capsule is not None else None, + cylinder=self.cylinder.copy() if self.cylinder is not None else None, + blox=self.blox.copy() if self.blox is not None else None, + ) + + @staticmethod + def from_dict(data_dict: Dict[str, Any]) -> WorldConfig: + cuboid = None + sphere = None + capsule = None + mesh = None + blox = None + cylinder = None + # load yaml: + if "cuboid" in data_dict.keys(): + cuboid = [Cuboid(name=x, **data_dict["cuboid"][x]) for x in data_dict["cuboid"]] + if "sphere" in data_dict.keys(): + sphere = [Sphere(name=x, **data_dict["sphere"][x]) for x in data_dict["sphere"]] + if "mesh" in data_dict.keys(): + mesh = [Mesh(name=x, **data_dict["mesh"][x]) for x in data_dict["mesh"]] + if "capsule" in data_dict.keys(): + capsule = [Capsule(name=x, **data_dict["capsule"][x]) for x in data_dict["capsule"]] + if "cylinder" in data_dict.keys(): + cylinder = [Cylinder(name=x, **data_dict["cylinder"][x]) for x in data_dict["cylinder"]] + if "blox" in data_dict.keys(): + blox = [BloxMap(name=x, **data_dict["blox"][x]) for x in 
data_dict["blox"]] + + return WorldConfig( + cuboid=cuboid, + sphere=sphere, + capsule=capsule, + cylinder=cylinder, + mesh=mesh, + blox=blox, + ) + + # load world config as obbs: convert all types to obbs + @staticmethod + def create_obb_world(current_world: WorldConfig): + sphere_obb = [] + capsule_obb = [] + cylinder_obb = [] + mesh_obb = [] + blox_obb = [] + cuboid_obb = current_world.cuboid + + if current_world.capsule is not None and len(current_world.capsule) > 0: + capsule_obb = [x.get_cuboid() for x in current_world.capsule] + if current_world.sphere is not None and len(current_world.sphere) > 0: + sphere_obb = [x.get_cuboid() for x in current_world.sphere] + if current_world.cylinder is not None and len(current_world.cylinder) > 0: + cylinder_obb = [x.get_cuboid() for x in current_world.cylinder] + if current_world.blox is not None and len(current_world.blox) > 0: + for i in range(len(current_world.blox)): + if current_world.blox[i].mesh is not None: + blox_obb.append(current_world.blox[i].get_cuboid()) + + if current_world.mesh is not None and len(current_world.mesh) > 0: + mesh_obb = [x.get_cuboid() for x in current_world.mesh] + return WorldConfig( + cuboid=cuboid_obb + sphere_obb + capsule_obb + cylinder_obb + mesh_obb + blox_obb + ) + + @staticmethod + def create_mesh_world(current_world: WorldConfig, process: bool = False): + sphere_obb = [] + capsule_obb = [] + cuboid_obb = [] + cylinder_obb = [] + blox_obb = [] + if current_world.capsule is not None and len(current_world.capsule) > 0: + capsule_obb = [x.get_mesh(process=process) for x in current_world.capsule] + + if current_world.sphere is not None and len(current_world.sphere) > 0: + sphere_obb = [x.get_mesh(process=process) for x in current_world.sphere] + + if current_world.cuboid is not None and len(current_world.cuboid) > 0: + cuboid_obb = [x.get_mesh(process=process) for x in current_world.cuboid] + + if current_world.cylinder is not None and len(current_world.cylinder) > 0: + cylinder_obb = [x.get_mesh(process=process) for x in current_world.cylinder] + if current_world.blox is not None and len(current_world.blox) > 0: + for i in range(len(current_world.blox)): + if current_world.blox[i].mesh is not None: + blox_obb.append(current_world.blox[i].get_mesh(process=process)) + + return WorldConfig( + mesh=current_world.mesh + + sphere_obb + + capsule_obb + + cuboid_obb + + cylinder_obb + + blox_obb + ) + + @staticmethod + def create_collision_support_world(current_world: WorldConfig, process: bool = True): + sphere_obb = [] + capsule_obb = [] + cuboid_obb = [] + cylinder_obb = [] + blox_obb = [] + if current_world.capsule is not None and len(current_world.capsule) > 0: + capsule_obb = [x.get_mesh(process=process) for x in current_world.capsule] + + if current_world.sphere is not None and len(current_world.sphere) > 0: + sphere_obb = [x.get_mesh(process=process) for x in current_world.sphere] + + if current_world.cuboid is not None and len(current_world.cuboid) > 0: + cuboid_obb = current_world.cuboid + + if current_world.cylinder is not None and len(current_world.cylinder) > 0: + cylinder_obb = [x.get_mesh(process=process) for x in current_world.cylinder] + if current_world.blox is not None and len(current_world.blox) > 0: + for i in range(len(current_world.blox)): + if current_world.blox[i].mesh is not None: + blox_obb.append(current_world.blox[i].get_mesh(process=process)) + + return WorldConfig( + mesh=current_world.mesh + sphere_obb + capsule_obb + cylinder_obb + blox_obb, + cuboid=cuboid_obb, + ) + + 
@staticmethod
+    def get_scene_graph(current_world: WorldConfig):
+        m_world = WorldConfig.create_mesh_world(current_world)
+        mesh_scene = trimesh.scene.scene.Scene(base_frame="world")
+        mesh_list = m_world
+        for m in mesh_list:
+            mesh_scene.add_geometry(
+                m.get_trimesh_mesh(),
+                geom_name=m.name,
+                parent_node_name="world",
+                transform=m.get_transform_matrix(),
+            )
+
+        return mesh_scene
+
+    @staticmethod
+    def create_merged_mesh_world(current_world: WorldConfig, process: bool = True):
+        mesh_scene = WorldConfig.get_scene_graph(current_world)
+        mesh_scene = mesh_scene.dump(concatenate=True)
+        new_mesh = Mesh(
+            vertices=mesh_scene.vertices.view(np.ndarray),
+            faces=mesh_scene.faces,
+            name="merged_mesh",
+            pose=[0, 0, 0, 1, 0, 0, 0],
+        )
+        return WorldConfig(mesh=[new_mesh])
+
+    def get_obb_world(self):
+        return WorldConfig.create_obb_world(self)
+
+    def get_mesh_world(self, merge_meshes: bool = False, process: bool = False):
+        if merge_meshes:
+            return WorldConfig.create_merged_mesh_world(self, process=process)
+        else:
+            return WorldConfig.create_mesh_world(self, process=process)
+
+    def get_collision_check_world(self, mesh_process: bool = False):
+        return WorldConfig.create_collision_support_world(self, process=mesh_process)
+
+    def save_world_as_mesh(self, file_path: str, save_as_scene_graph=False):
+        mesh_scene = WorldConfig.get_scene_graph(self)
+        if save_as_scene_graph:
+            mesh_scene = mesh_scene.dump(concatenate=True)
+
+        mesh_scene.export(file_path)
+
+    def get_cache_dict(self) -> Dict[str, int]:
+        """Compute the number of obstacles of each collision type.
+
+        Returns:
+            Dictionary with the number of cuboid ("obb") and mesh obstacles.
+        """
+        cache = {"obb": len(self.cuboid), "mesh": len(self.mesh)}
+        return cache
+
+    def add_obstacle(self, obstacle: Obstacle):
+        if isinstance(obstacle, Mesh):
+            self.mesh.append(obstacle)
+        elif isinstance(obstacle, Cuboid):
+            self.cuboid.append(obstacle)
+        elif isinstance(obstacle, Sphere):
+            self.sphere.append(obstacle)
+        elif isinstance(obstacle, Cylinder):
+            self.cylinder.append(obstacle)
+        elif isinstance(obstacle, Capsule):
+            self.capsule.append(obstacle)
+        else:
+            raise ValueError("Obstacle type not supported")
+        self.objects.append(obstacle)
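A hypothetical end-to-end sketch (editorial addition, not part of the diff): building a WorldConfig from primitives, adding an obstacle, and converting everything to other representations. Names, poses, and dimensions are illustrative.

from curobo.geom.types import Cuboid, Sphere, WorldConfig

world = WorldConfig(
    cuboid=[Cuboid(name="table", pose=[0.5, 0.0, 0.2, 1, 0, 0, 0], dims=[0.6, 1.0, 0.4])],
    sphere=[Sphere(name="ball", pose=[0.2, 0.1, 0.6, 1, 0, 0, 0], radius=0.05)],
)
world.add_obstacle(Cuboid(name="wall", pose=[1.0, 0.0, 0.5, 1, 0, 0, 0], dims=[0.05, 2.0, 1.0]))
mesh_world = world.get_mesh_world()  # every obstacle converted to a Mesh
obb_world = world.get_obb_world()    # every obstacle converted to an oriented bounding box
cache = world.get_cache_dict()       # {"obb": 2, "mesh": 0} for this world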
+    def randomize_color(self, r=[0, 1], g=[0, 1], b=[0, 1]):
+        """Randomize color of objects within the given range.
+
+        Args:
+            r: sampling range for the red channel as [min, max]. Defaults to [0, 1].
+            g: sampling range for the green channel as [min, max]. Defaults to [0, 1].
+            b: sampling range for the blue channel as [min, max]. Defaults to [0, 1].
+        """
+        n = len(self.objects)
+        r_l = np.random.uniform(r[0], r[1], n)
+        g_l = np.random.uniform(g[0], g[1], n)
+        b_l = np.random.uniform(b[0], b[1], n)
+        for i, i_val in enumerate(self.objects):
+            i_val.color = [r_l[i], g_l[i], b_l[i], 1.0]
+
+    def add_color(self, rgba=[0.0, 0.0, 0.0, 1.0]):
+        for i, i_val in enumerate(self.objects):
+            i_val.color = rgba
+
+    def add_material(self, material=Material()):
+        for i, i_val in enumerate(self.objects):
+            i_val.material = material
+
+    def get_obstacle(self, name: str) -> Union[None, Obstacle]:
+        for i in self.objects:
+            if i.name == name:
+                return i
+        return None
+
+    def remove_obstacle(self, name: str):
+        for i in range(len(self.objects)):
+            if self.objects[i].name == name:
+                del self.objects[i]
+                return
+
+    def remove_absolute_paths(self):
+        for obj in self.objects:
+            if obj.name.startswith("/"):
+                obj.name = obj.name[1:]
+
+
+def tensor_sphere(pt, radius, tensor=None, tensor_args=TensorDeviceType()):
+    if tensor is None:
+        tensor = torch.empty(4, device=tensor_args.device, dtype=tensor_args.dtype)
+    tensor[:3] = torch.as_tensor(pt, device=tensor_args.device, dtype=tensor_args.dtype)
+    tensor[3] = radius
+    return tensor
+
+
+def tensor_capsule(base, tip, radius, tensor=None, tensor_args=TensorDeviceType()):
+    if tensor is None:
+        tensor = torch.empty(7, device=tensor_args.device, dtype=tensor_args.dtype)
+    tensor[:3] = torch.as_tensor(base, device=tensor_args.device, dtype=tensor_args.dtype)
+    tensor[3:6] = torch.as_tensor(tip, device=tensor_args.device, dtype=tensor_args.dtype)
+    tensor[6] = radius
+    return tensor
+
+
+def tensor_cube(pose, dims, tensor_args=TensorDeviceType()):
+    """Create a cuboid collision representation from a pose and dimensions.
+
+    Args:
+        pose: pose of cuboid as [x, y, z, qw, qx, qy, qz].
+        dims: dimensions of cuboid as [x_length, y_length, z_length].
+        tensor_args: device and dtype configuration. Defaults to TensorDeviceType().
+
+    Returns:
+        list containing the dimension tensor and the inverse pose vector.
+    """
+    w_T_b = Pose.from_list(pose, tensor_args=tensor_args)
+    b_T_w = w_T_b.inverse()
+    dims_t = torch.tensor(
+        [dims[0], dims[1], dims[2]], device=tensor_args.device, dtype=tensor_args.dtype
+    )
+    cube = [dims_t, b_T_w.get_pose_vector()]
+    return cube
+
+
+def batch_tensor_cube(pose, dims, tensor_args=TensorDeviceType()):
+    """Create a batch of cuboid collision representations.
+
+    Args:
+        pose: batch of cuboid poses, each as [x, y, z, qw, qx, qy, qz].
+        dims: batch of cuboid dimensions, each as [x_length, y_length, z_length].
+        tensor_args: device and dtype configuration. Defaults to TensorDeviceType().
+
+    Returns:
+        list containing the dimension tensor and the batched inverse pose vector.
+    """
+    w_T_b = Pose.from_batch_list(pose, tensor_args=tensor_args)
+    b_T_w = w_T_b.inverse()
+    dims_t = torch.as_tensor(np.array(dims), device=tensor_args.device, dtype=tensor_args.dtype)
+    cube = [dims_t, b_T_w.get_pose_vector()]
+    return cube
diff --git a/src/curobo/graph/__init__.py b/src/curobo/graph/__init__.py
new file mode 100644
index 00000000..a08745f9
--- /dev/null
+++ b/src/curobo/graph/__init__.py
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# diff --git a/src/curobo/graph/graph_base.py b/src/curobo/graph/graph_base.py new file mode 100644 index 00000000..7e899ce4 --- /dev/null +++ b/src/curobo/graph/graph_base.py @@ -0,0 +1,1149 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + + +# Standard Library +import math +import time +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +# Third Party +import numpy as np +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.geom.sdf.world import WorldCollision +from curobo.geom.types import WorldConfig +from curobo.graph.graph_nx import NetworkxGraph +from curobo.rollout.arm_base import ArmBase, ArmBaseConfig +from curobo.rollout.rollout_base import RolloutBase, RolloutMetrics +from curobo.types import tensor +from curobo.types.base import TensorDeviceType +from curobo.types.robot import JointState, RobotConfig, State +from curobo.util.logger import log_info, log_warn +from curobo.util.sample_lib import HaltonGenerator +from curobo.util.trajectory import InterpolateType, get_interpolated_trajectory +from curobo.util_file import ( + get_robot_configs_path, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, +) + + +@dataclass +class GraphResult: + success: tensor.T_BValue_bool + start_q: tensor.T_BDOF + goal_q: tensor.T_BDOF + path_length: Optional[tensor.T_BValue_float] = None + solve_time: float = 0.0 + plan: Optional[List[List[tensor.T_DOF]]] = None + interpolated_plan: Optional[JointState] = None + metrics: Optional[RolloutMetrics] = None + valid_query: bool = True + debug_info: Optional[Any] = None + optimized_dt: Optional[torch.Tensor] = None + path_buffer_last_tstep: Optional[List[int]] = None + + +@dataclass +class Graph: + nodes: tensor.T_BDOF + edges: tensor.T_BHDOF_float + connectivity: tensor.T_BValue_int + shortest_path_lengths: Optional[tensor.T_BValue_float] = None + + def set_shortest_path_lengths(self, shortest_path_lengths: tensor.T_BValue_float): + self.shortest_path_lengths = shortest_path_lengths + + def get_node_distance(self): + if self.shortest_path_lengths is not None: + min_l = min(self.nodes.shape[0], self.shortest_path_lengths.shape[0]) + return torch.cat( + (self.nodes[:min_l], self.shortest_path_lengths[:min_l].unsqueeze(1)), dim=-1 + ) + else: + return None + + +@dataclass +class GraphConfig: + max_nodes: int + steer_delta_buffer: int + sample_pts: int + node_similarity_distance: float + rejection_ratio: int + k_nn: int + c_max: float + vertex_n: int + graph_max_attempts: int + graph_min_attempts: int + init_nodes: int + use_bias_node: bool + dof: int + bounds: torch.Tensor + tensor_args: TensorDeviceType + rollout_fn: RolloutBase + safety_rollout_fn: RolloutBase + max_buffer: int + max_cg_buffer: int + compute_metrics: bool + interpolation_type: InterpolateType + interpolation_steps: int + seed: int + use_cuda_graph_mask_samples: bool + distance_weight: torch.Tensor + bias_node: tensor.T_DOF + interpolation_dt: float = 0.02 + interpolation_deviation: float = 0.05 + interpolation_acceleration_scale: 
float = 0.5 + + @staticmethod + def from_dict( + graph_dict: Dict, + tensor_args: TensorDeviceType, + rollout_fn: RolloutBase, + safety_rollout_fn: RolloutBase, + use_cuda_graph: bool = True, + ): + graph_dict["dof"] = rollout_fn.d_action + graph_dict["bounds"] = rollout_fn.action_bounds + graph_dict["distance_weight"] = rollout_fn.cspace_config.cspace_distance_weight + graph_dict["bias_node"] = rollout_fn.cspace_config.retract_config.view(1, -1) + graph_dict["interpolation_type"] = InterpolateType(graph_dict["interpolation_type"]) + return GraphConfig( + **graph_dict, + tensor_args=tensor_args, + rollout_fn=rollout_fn, + safety_rollout_fn=safety_rollout_fn, + use_cuda_graph_mask_samples=use_cuda_graph, + ) + + @staticmethod + @profiler.record_function("graph_plan_config/load_from_robot_config") + def load_from_robot_config( + robot_cfg: Union[Union[str, Dict], RobotConfig], + world_model: Optional[Union[Union[str, Dict], WorldConfig]] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + world_coll_checker: Optional[WorldCollision] = None, + base_cfg_file: str = "base_cfg.yml", + graph_file: str = "graph.yml", + self_collision_check: bool = True, + use_cuda_graph: bool = True, + ): + graph_data = load_yaml(join_path(get_task_configs_path(), graph_file)) + base_config_data = load_yaml(join_path(get_task_configs_path(), base_cfg_file)) + if isinstance(robot_cfg, str): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] + if isinstance(world_model, str): + world_model = load_yaml(join_path(get_world_configs_path(), world_model)) + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + if not self_collision_check: + base_config_data["constraint"]["self_collision_cfg"]["weight"] = 0.0 + + cfg = ArmBaseConfig.from_dict( + robot_cfg, + graph_data["model"], + base_config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + ) + arm_base = ArmBase(cfg) + + if use_cuda_graph: + cfg_cg = ArmBaseConfig.from_dict( + robot_cfg, + graph_data["model"], + base_config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + ) + arm_base_cg_rollout = ArmBase(cfg_cg) + else: + arm_base_cg_rollout = arm_base + graph_cfg = GraphConfig.from_dict( + graph_data["graph"], + tensor_args, + arm_base_cg_rollout, + arm_base, + use_cuda_graph, + ) + return graph_cfg + + +class GraphPlanBase(GraphConfig): + @profiler.record_function("graph_plan_base/init") + def __init__(self, config: Optional[GraphConfig] = None): + if config is not None: + super().__init__(**vars(config)) + self._rollout_list = None + self._cu_act_buffer = None + if self.use_cuda_graph_mask_samples: + self._cu_act_buffer = torch.zeros( + (self.max_cg_buffer, 1, self.dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._valid_bias_node = False + self._check_bias_node = self.use_bias_node + self.steer_radius = self.node_similarity_distance + self.xc_search = None + self.i = 0 + self._valid_bias_node = False + self._out_traj_state = None + # validated graph is stored here: + self.graph = NetworkxGraph() + + self.path = None + + self.cat_buffer = torch.as_tensor( + [0.0, 0.0, 0.0], device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self.delta_vec = 
torch.as_tensor( + range(0, self.steer_delta_buffer), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.path = torch.zeros( + (self.max_nodes + 100, self.dof + 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.sample_gen = HaltonGenerator( + self.dof, + self.tensor_args, + up_bounds=self.rollout_fn.action_bound_highs, + low_bounds=self.rollout_fn.action_bound_lows, + seed=self.seed, + ) + + self.halton_samples, self.gauss_halton_samples = self._sample_pts() + self.batch_mode = True + + self._rot_frame_col = torch.as_tensor( + torch.eye(self.dof)[:, 0:1], + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ).T + + self._max_joint_vel = ( + self.rollout_fn.state_bounds.velocity.view(2, self.dof)[1, :].reshape(1, 1, self.dof) + ) - 0.02 + self._max_joint_acc = self.rollout_fn.state_bounds.acceleration[1, :] - 0.02 + self._max_joint_jerk = self.rollout_fn.state_bounds.jerk[1, :] - 0.02 + + self._rollout_list = None + + def check_feasibility(self, x_set): + mask = self.mask_samples(x_set) + return mask.all(), mask + + def get_feasible_sample_set(self, x_samples): + mask = self.mask_samples(x_samples) + x_samples = x_samples[mask] + return x_samples + + def mask_samples(self, x_samples): # call feasibility here: + if self.use_cuda_graph_mask_samples and x_samples.shape[0] <= self.max_cg_buffer: + return self._mask_samples_cuda_graph(x_samples) + else: + return self._mask_samples(x_samples) + + @profiler.record_function("geometric_planner/cg_mask_samples") + def _mask_samples_cuda_graph(self, x_samples): + d = [] + if self.max_cg_buffer < x_samples.shape[0]: + for i in range(math.ceil(x_samples.shape[0] / self.max_cg_buffer)): + start = i * self.max_cg_buffer + end = (i + 1) * self.max_cg_buffer + feasible = self._cuda_graph_rollout_constraint( + x_samples[start:end, :].unsqueeze(1), use_batch_env=False + ) + d.append(feasible) + else: + feasible = self._cuda_graph_rollout_constraint( + x_samples.unsqueeze(1), use_batch_env=False + ) + d.append(feasible) + mask = torch.cat(d).squeeze() + return mask + + @profiler.record_function("geometric_planner/mask_samples") + def _mask_samples(self, x_samples): + d = [] + if self.max_buffer < x_samples.shape[0]: + # c_samples = x_samples[:, 0:1] * 0.0 + for i in range(math.ceil(x_samples.shape[0] / self.max_buffer)): + start = i * self.max_buffer + end = (i + 1) * self.max_buffer + metrics = self.safety_rollout_fn.rollout_constraint( + x_samples[start:end, :].unsqueeze(1), use_batch_env=False + ) + d.append(metrics.feasible) + else: + metrics = self.safety_rollout_fn.rollout_constraint( + x_samples.unsqueeze(1), use_batch_env=False + ) + d.append(metrics.feasible) + mask = torch.cat(d).squeeze() + return mask + + def _cuda_graph_rollout_constraint(self, x_samples, use_batch_env=False): + self._cu_act_buffer[: x_samples.shape[0]] = x_samples + metrics = self.rollout_fn.rollout_constraint_cuda_graph( + self._cu_act_buffer, use_batch_env=False + ) + return metrics.feasible[: x_samples.shape[0]] + + def get_samples(self, n_samples: int, bounded: bool = True): + return self._sample_pts(n_samples, bounded)[0] + + @profiler.record_function("geometric_planner/halton_samples") + def _sample_pts(self, n_samples=None, bounded=False, unit_ball=False, seed=123): + # sample in state space: + if n_samples is None: + n_samples = self.sample_pts + if unit_ball: + halton_samples = self.sample_gen.get_gaussian_samples(n_samples, variance=1.0) + halton_samples = halton_samples / torch.norm(halton_samples, 
dim=-1, keepdim=True) + if self.dof < 3: + radius_samples = self.sample_gen.get_samples(n_samples, bounded=False) + radius_samples = torch.clamp(radius_samples[:, 0:1], 0.0, 1.0) + halton_samples = radius_samples * halton_samples + else: + halton_samples = self.sample_gen.get_samples(n_samples, bounded=bounded) + return halton_samples, halton_samples + + def reset_buffer(self): + # add a random node to graph: + self.path = torch.zeros( + (self.max_nodes, self.dof + 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.reset_graph() + self.path *= 0.0 + self.i = 0 + self._valid_bias_node = False + self._check_bias_node = self.use_bias_node + + @profiler.record_function("geometric_planner/sample_biased_nodes") + def get_biased_vertex_set(self, x_start, x_goal, c_max=10.0, c_min=1, n=None, lazy=False): + if n is None: + n = self.vertex_n + # get biased samples that are around x_start and x_goal + # print(c_min.item(), c_max) + + unit_ball, _ = self._sample_pts(n_samples=n + int(n * self.rejection_ratio), unit_ball=True) + # compute cost_to_go: + x_samples = biased_vertex_projection_jit( + x_start, + x_goal, + self.distance_weight, + c_max, + c_min, + self.dof, + self._rot_frame_col, + unit_ball, + self.bounds, + ) + + if False: # non jit version: + # rotate frame: + C = self._compute_rotation_frame( + x_start * self.distance_weight, x_goal * self.distance_weight + ) + + r = x_start * 0.0 + r[0] = c_max / 2.0 + r[1:] = (c_max**2 - c_min**2) / 2.0 + L = torch.diag(r) + x_center = (x_start[..., : self.dof] + x_goal[..., : self.dof]) / 2.0 + x_samples = ((C @ L @ unit_ball.T).T) / self.distance_weight + x_center + # clamp at joint angles: + x_samples = torch.clamp(x_samples, self.bounds[0, :], self.bounds[1, :]) + + if not lazy: + x_search = self.get_feasible_sample_set(x_samples) + else: + x_search = x_samples + xc_search = cat_xc_jit(x_search, n) + # c_search = x_search[:, 0:1] * 0.0 + # xc_search = torch.cat((x_search, c_search), dim=1)[:n, :] + return xc_search + + @profiler.record_function("geometric_planner/compute_rotation_frame") + def _compute_rotation_frame(self, x_start, x_goal): + return compute_rotation_frame_jit(x_start, x_goal, self._rot_frame_col) + #: non jit version below + a = ((x_goal - x_start) / torch.norm(x_start - x_goal)).unsqueeze(1) + + M = a @ self._rot_frame_col # .T + + # with torch.cuda.amp.autocast(enabled=False): + U, _, V = torch.svd(M, compute_uv=True, some=False) + vec = a.flatten() * 0.0 + 1.0 + vec[-1] = torch.det(U) * torch.det(V) + + C = U @ torch.diag(vec) @ V.T + return C + + @profiler.record_function("geometric_planner/sample_nodes") + def get_new_vertex_set(self, n=None, lazy=False): + if n is None: + n = self.vertex_n + # get a new seed value: + # seed = random.randint(1, 1000) + # generate new samples: + x_samples, _ = self._sample_pts( + n_samples=n + int(n * self.rejection_ratio), + bounded=True, + ) + if not lazy: + x_search = self.get_feasible_sample_set(x_samples) + else: + x_search = x_samples + xc_search = cat_xc_jit(x_search, n) + # c_search = x_search[:, 0:1] * 0.0 + + # xc_search = torch.cat((x_search, c_search), dim=1)[:n, :] + return xc_search + + @torch.no_grad() + def validate_graph(self): + self._validate_graph() + + def get_graph_edges(self): + """Return edges in the graph with start node and end node locations + + Returns: + tensor + """ + self.graph.update_graph() + edge_list = self.graph.get_edges() + edges = torch.as_tensor( + edge_list, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + 
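+        # each row of `edges` is (start_node_index, end_node_index, weight), as
+        # returned by NetworkxGraph.get_edges(); the two index columns are used
+        # below to look up the joint configurations stored in self.path.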
+        # find start and end points for these edges:
+        start_pts = self.path[edges[:, 0].long(), : self.dof].unsqueeze(1)
+        end_pts = self.path[edges[:, 1].long(), : self.dof].unsqueeze(1)
+
+        node_edges = torch.cat((start_pts, end_pts), dim=1)
+        return node_edges, edges
+
+    def get_graph(self):
+        node_edges, edge_connect = self.get_graph_edges()
+        nodes = self.path[: self.i, : self.dof]
+        return Graph(nodes=nodes, edges=node_edges, connectivity=edge_connect)
+
+    def _validate_graph(self):
+        self.graph.update_graph()
+        edge_list = self.graph.get_edges()
+        edges = torch.as_tensor(
+            edge_list, device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+
+        # find start and end points for these edges:
+        start_pts = self.path[edges[:, 0].long(), : self.dof]
+        end_pts = self.path[edges[:, 1].long(), : self.dof]
+
+        # discretize the largest edge to get the number of checks per edge:
+        dist = self._distance(start_pts, end_pts, norm=False)
+        n = torch.ceil(torch.max(torch.abs(dist) / self.steer_radius)).item() + 1
+        if n + 1 > self.delta_vec.shape[0]:
+            log_warn(f"edge discretization ({n}) exceeds steer buffer ({self.delta_vec.shape[0]})")
+        delta_vec = self.delta_vec[: int(n + 1)] / n
+
+        line_vec = (
+            start_pts.unsqueeze(1)
+            + delta_vec.unsqueeze(1) @ dist.unsqueeze(1) / self.distance_weight
+        )
+        b, h, _ = line_vec.shape
+        log_info(f"Number of points to check: {b * h}")
+
+        mask = self.mask_samples(line_vec.view(b * h, self.dof))
+        mask = ~mask.view(b, h)
+        # edge mask contains all edges that are valid for current world:
+        edge_mask = ~torch.any(mask, dim=1)
+
+        # add these to graph:
+        new_edges = edges[edge_mask]
+
+        node_mask = ~mask[:, 0]
+        node_list = (
+            torch.unique(
+                torch.cat((edges[node_mask][:, 0].long(), edges[~mask[:, -1]][:, 1].long()))
+            )
+            .cpu()
+            .tolist()
+        )
+
+        new_path = self.path[node_list]
+        new_path[:, self.dof + 1] = torch.as_tensor(
+            [x for x in range(new_path.shape[0])],
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+        self.i = new_path.shape[0]
+        self.path[: self.i] = new_path
+
+        reindex_edges = []
+        if len(new_edges) > 0:
+            # reindex edges:
+            for e in range(len(new_edges)):
+                st_idx = node_list.index(int(new_edges[e][0]))
+                end_idx = node_list.index(int(new_edges[e][1]))
+                reindex_edges.append([st_idx, end_idx])
+
+            new_edges[:, 0:2] = torch.as_tensor(
+                reindex_edges, device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            )
+        else:
+            log_warn("no edges remained valid after validation, keeping previous edge set")
+            new_edges = edges
+        d = self._distance(
+            self.path[new_edges[:, 0].long(), : self.dof],
+            self.path[new_edges[:, 1].long(), : self.dof],
+        )
+        new_edges[:, 2] = d
+        new_edges = new_edges.detach().cpu().numpy().tolist()
+
+        # cast node indices to int for networkx:
+        new_edges = [[int(x[0]), int(x[1]), x[2]] for x in new_edges]
+        self.path[self.i :] *= 0.0
+
+        self.graph.reset_graph()
+        self.graph.add_edges(new_edges)
+        self.graph.add_nodes(list(range(self.i)))
+        self.graph.update_graph()
+        log_info(f"Validated graph: kept {len(new_edges)} of {edges.shape[0]} edges")
+
+    def _get_graph_shortest_path(self, start_node_idx, goal_node_idx, return_length=False):
+        path = self.graph.get_shortest_path(
+            start_node_idx, goal_node_idx, return_length=return_length
+        )
+        return path
+
+    def batch_get_graph_shortest_path(self, start_idx_list, goal_idx_list, return_length=False):
+        if len(start_idx_list) != len(goal_idx_list):
+            raise ValueError("Start and Goal idx length are not equal")
+        path_list = []
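+        # each (start, goal) pair is resolved independently below: the search
+        # runs on the CPU through networkx (bidirectional Dijkstra in
+        # NetworkxGraph.get_shortest_path), so this loop is serial rather than
+        # batched on the GPU.
+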
cmax_list = [] + for i in range(len(start_idx_list)): + path = self._get_graph_shortest_path( + start_idx_list[i], goal_idx_list[i], return_length=return_length + ) + if return_length: + path_list.append(path[0]) + cmax_list.append(path[1]) + else: + path_list.append(path) + if return_length: + return path_list, cmax_list + return path_list + + @torch.no_grad() + def batch_shortcut_path(self, g_path, start_idx, goal_idx): + edge_set = [] + for k in range(len(g_path)): + path = self.path[g_path[k]] + for i in range(path.shape[0]): + for j in range(i, path.shape[0]): + edge_set.append( + torch.cat((path[i : i + 1], path[j : j + 1]), dim=0).unsqueeze(0) + ) + edge_set = torch.cat(edge_set, dim=0) + self.connect_nodes(edge_set=edge_set) + s_path, c_max = self.batch_get_graph_shortest_path(start_idx, goal_idx, return_length=True) + return s_path, c_max + + def get_node_idx(self, goal_state, exact=False) -> Optional[int]: + goal_state = torch.as_tensor( + goal_state, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + dist = torch.norm( + self._distance(goal_state, self.path[: self.i, : self.dof], norm=False), dim=-1 + ) + c_idx = torch.argmin(dist) + if exact: + if dist[c_idx] != 0.0: + return None + else: + return c_idx.item() + if dist[c_idx] <= self.node_similarity_distance: + return c_idx.item() + + def get_path_lengths(self, goal_idx): + path_lengths = self.graph.get_path_lengths(goal_idx) + path_length = { + "position": self.path[: self.i, : self.dof], + "value": torch.as_tensor( + path_lengths, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ), + } + return path_length + + def get_graph_shortest_path_lengths(self, goal_idx: int): + graph = self.get_graph() + shortest_paths = self.get_path_lengths(goal_idx) + graph.set_shortest_path_lengths(shortest_paths["value"]) + return graph + + def path_exists(self, start_node_idx, goal_node_idx): + return self.graph.path_exists(start_node_idx, goal_node_idx) + + def batch_path_exists(self, start_idx_list, goal_idx_list, all_paths=False): + if len(start_idx_list) != len(goal_idx_list): + raise ValueError("Start and Goal idx length are not equal") + path_label = [] + for i in range(len(start_idx_list)): + path_label.append(self.path_exists(start_idx_list[i], goal_idx_list[i])) + if all_paths: + label = all(path_label) + else: + label = any(path_label) + return label, path_label + + @torch.no_grad() + def find_paths(self, x_init, x_goal, interpolation_steps: Optional[int] = None) -> GraphResult: + start_time = time.time() + + try: + path = self._find_paths(x_init, x_goal) + path.success = torch.as_tensor( + path.success, device=self.tensor_args.device, dtype=torch.bool + ) + path.solve_time = time.time() - start_time + if self.interpolation_type is not None and torch.count_nonzero(path.success): + ( + path.interpolated_plan, + path.path_buffer_last_tstep, + path.optimized_dt, + ) = self.get_interpolated_trajectory(path.plan, interpolation_steps) + # path.js_interpolated_plan = self.rollout_fn.get_full_dof_from_solution( + # path.interpolated_plan + # ) + if self.compute_metrics: + # compute metrics on interpolated plan: + path.metrics = self.get_metrics(path.interpolated_plan) + + path.success = torch.logical_and( + path.success, torch.all(path.metrics.feasible, 1) + ) + + except ValueError as e: + log_info(e) + self.reset_buffer() + torch.cuda.empty_cache() + success = torch.zeros(x_init.shape[0], device=self.tensor_args.device, dtype=torch.bool) + path = GraphResult(success, x_init, x_goal) + except RuntimeError as e: + 
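+            # a RuntimeError at this point usually comes from the GPU side (for
+            # example, running out of memory during collision checking), so the
+            # planner clears its buffers and the CUDA cache and reports failure
+            # for the whole batch instead of raising.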
+            log_warn(e)
+            self.reset_buffer()
+            torch.cuda.empty_cache()
+            success = torch.zeros(x_init.shape[0], device=self.tensor_args.device, dtype=torch.bool)
+            path = GraphResult(success, x_init, x_goal)
+        return path
+
+    @abstractmethod
+    def _find_paths(self, x_init, x_goal) -> GraphResult:
+        raise NotImplementedError
+
+    def compute_path_length(self, path):
+        # compute cost to go to next timestep:
+        next_pt_path = path.roll(-1, dims=0)
+        dist_vec = self._distance(next_pt_path, path)[:-1]
+        path_length = torch.sum(dist_vec)
+        return path_length
+
+    def reset_graph(self):
+        self.graph.reset_graph()
+
+    @profiler.record_function("geometric_planner/compute_distance")
+    def _distance(self, pt, batch_pts, norm=True):
+        if norm:
+            return compute_distance_norm_jit(pt, batch_pts, self.distance_weight)
+        else:
+            return compute_distance_jit(pt, batch_pts, self.distance_weight)
+
+    def distance(self, pt, batch_pts, norm=True):
+        return self._distance(pt, batch_pts, norm=norm)
+
+    def _hybrid_nearest(self, sample_node, path, radius, k_n=10):
+        # compute distance:
+        dist = self._distance(sample_node[..., : self.dof], path[:, : self.dof])
+        nodes = path[dist < radius]
+        if nodes.shape[0] < k_n:
+            _, idx = torch.topk(dist, k_n, largest=False)
+            nodes = path[idx]
+        return nodes
+
+    def _nearest(self, sample_point, current_graph):
+        dist = self._distance(sample_point[..., : self.dof], current_graph[:, : self.dof])
+        _, idx = torch.min(dist, 0)
+        return current_graph[idx], idx
+
+    def _k_nearest(self, sample_point, current_graph, k=10):
+        dist = self._distance(sample_point[..., : self.dof], current_graph[:, : self.dof])
+        # give the k nearest:
+        _, idx = torch.topk(dist, k, largest=False)
+        return current_graph[idx]
+
+    @profiler.record_function("geometric_planner/k_nearest")
+    def _batch_k_nearest(self, sample_point, current_graph, k=10):
+        dist = self._distance(
+            sample_point[:, : self.dof].unsqueeze(1), current_graph[:, : self.dof]
+        )
+        # give the k nearest:
+        _, idx = torch.topk(dist, k, largest=False, dim=-1)
+        return current_graph[idx]
+
+    def _near(self, sample_point, current_graph, radius):
+        dist = self._distance(sample_point[..., : self.dof], current_graph[:, : self.dof])
+        nodes = current_graph[dist < radius]
+        return nodes
+
+    @profiler.record_function("geometric_planner/batch_steer_and_connect")
+    def _batch_steer_and_connect(
+        self,
+        start_nodes,
+        goal_nodes,
+        add_steer_pts=-1,
+        lazy=False,
+        add_exact_node=False,
+    ):
+        """
+        Connect nodes from start to goal where both are batched.
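+        Steering is truncated at the first configuration along a segment that
+        fails the feasibility mask, so only the collision-free prefix of each
+        start-to-goal segment is added to the graph.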
+
+        Args:
+            start_nodes: batch of start configurations, one row per edge to add.
+            goal_nodes: batch of goal configurations, one row per edge to add.
+        """
+
+        steer_nodes, _ = self._batch_steer(
+            start_nodes,
+            goal_nodes,
+            add_steer_pts=add_steer_pts,
+            lazy=lazy,
+        )
+        self._add_batch_edges_to_graph(
+            steer_nodes, start_nodes, lazy=lazy, add_exact_node=add_exact_node
+        )
+
+    @profiler.record_function("geometric_planner/batch_steer")
+    def _batch_steer(
+        self,
+        start_nodes,
+        desired_nodes,
+        steer_radius=None,
+        add_steer_pts=-1,
+        lazy=False,
+    ):
+        if lazy:
+            extra_data = self.cat_buffer.unsqueeze(0).repeat(desired_nodes.shape[0], 1)
+            current_node = torch.cat((desired_nodes, extra_data), dim=1)
+            return current_node, True
+
+        steer_radius = self.steer_radius if steer_radius is None else steer_radius
+        dof = self.dof
+
+        current_node = start_nodes
+
+        g_vec = self._distance(
+            start_nodes[..., :dof], desired_nodes[..., :dof], norm=False
+        )
+
+        n = torch.ceil(torch.max(torch.abs(g_vec) / steer_radius)).item() + 1
+
+        delta_vec = self.delta_vec[: int(n + 1)] / n
+
+        line_vec = (
+            start_nodes[..., :dof].unsqueeze(1)
+            + delta_vec.unsqueeze(1) @ g_vec.unsqueeze(1) / self.distance_weight
+        )
+        b, h, dof = line_vec.shape
+        line_vec = line_vec.view(b * h, dof)
+        # check along line vec:
+        mask = self.mask_samples(line_vec)
+
+        line_vec = line_vec.view(b, h, dof)
+        # TODO: Make this cleaner.
+        mask = mask.view(b, h).to(dtype=torch.int8)
+        mask[mask == 0.0] = -1.0
+        mask = mask * (delta_vec + 1.0)
+        mask[mask < 0.0] = 1 / (mask[mask < 0.0])
+
+        _, idx = torch.min(mask, dim=1)
+        # idx now points at the first colliding sample, or at 0 when the whole
+        # segment is collision-free; shift back by one to get the last free
+        # sample and map the collision-free case to the full segment:
+        idx -= 1
+        idx[idx == -1] = h - 1
+        # if idx is zero, no progress was possible and the start node itself is
+        # effectively returned.
+        if add_steer_pts > 0:
+            raise NotImplementedError("Steer point addition is not implemented for batch mode")
+        new_nodes = torch.diagonal(line_vec[:, idx], dim1=0, dim2=1).transpose(0, 1)
+        edge_cost = self._distance(new_nodes[:, :dof], start_nodes[:, :dof])
+        extra_data = self.cat_buffer.unsqueeze(0).repeat(new_nodes.shape[0], 1)
+        extra_data[:, 2] = edge_cost
+        current_node = torch.cat((new_nodes, extra_data), dim=1)
+        return current_node, True
+
+    @profiler.record_function("geometric_planner/add_edges_to_graph")
+    def _add_batch_edges_to_graph(self, new_nodes, start_nodes, lazy=False, add_exact_node=False):
+        # add new nodes to graph:
+        node_set = self.add_nodes_to_graph(new_nodes[:, : self.dof], add_exact_node=add_exact_node)
+        # now connect start nodes to new nodes:
+        edge_distance = (
+            self.distance(start_nodes[:, : self.dof], node_set[:, : self.dof])
+            .to(device="cpu")
+            .tolist()
+        )
+        start_idx_list = start_nodes[:, self.dof + 1].to(device="cpu", dtype=torch.int64).tolist()
+        goal_idx_list = node_set[:, self.dof + 1].to(device="cpu", dtype=torch.int64).tolist()
+        edge_list = [
+            [start_idx_list[x], goal_idx_list[x], edge_distance[x]]
+            for x in range(node_set.shape[0])
+        ]
+        self.graph.add_edges(edge_list)
+        return True
+
+    @profiler.record_function("geometric_planner/add_nodes")
+    def add_nodes_to_graph(self, nodes, add_exact_node=False):
+        # TODO: check if this and the unique-nodes function can be merged.
+        # check for duplicates in new nodes:
+        dist_node = self.distance(nodes[:, : self.dof].unsqueeze(1), nodes[:, : self.dof])
+        node_distance = self.node_similarity_distance
+        if add_exact_node:
+            node_distance = 0.0
+
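+        # deduplicate within the incoming batch first: nodes that lie within
+        # node_distance of each other collapse onto a single representative, and
+        # the inverse index from get_unique_nodes maps every input node back to
+        # its representative so callers still get one row per input.
+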
unique_nodes, n_inv = get_unique_nodes(dist_node, nodes, node_distance) + + node_set = self._add_unique_nodes_to_graph(unique_nodes, add_exact_node=add_exact_node) + node_set = node_set[n_inv] + return node_set + + @profiler.record_function("geometric_planner/add_unique_nodes") + def _add_unique_nodes_to_graph(self, nodes, add_exact_node=False, skip_unique_check=False): + if self.i > 0: # and not skip_unique_check: + dist, idx = torch.min( + self.distance(nodes[:, : self.dof].unsqueeze(1), self.path[: self.i, : self.dof]), + dim=-1, + ) + node_distance = self.node_similarity_distance + if add_exact_node: + node_distance = 0.0 + flag = dist <= node_distance + new_nodes = nodes[~flag] + + if self.path.shape[0] <= self.i + new_nodes.shape[0]: + raise ValueError( + "reached max_nodes in graph, reduce graph attempts or increase max_nodes", + self.path.shape, + self.i, + new_nodes.shape, + ) + self.path, node_set, i_new = add_new_nodes_jit( + nodes, new_nodes, flag, self.cat_buffer, self.path, idx, self.i, self.dof + ) + + else: + self.path, node_set, i_new = add_all_nodes_jit( + nodes, self.cat_buffer, self.path, self.i, self.dof + ) + + self.i += i_new + + return node_set + + @profiler.record_function("geometric_planner/connect_nodes") + def connect_nodes( + self, + x_set=None, + connect_mode="knn", + debug=False, + lazy=False, + add_exact_node=False, + k_nn=10, + edge_set=None, + ): + # connect the batch to the existing graph + path = self.path + dof = self.dof + + i = self.i + + if x_set is not None: + if x_set.shape[0] == 0: + return + if connect_mode == "radius": + raise NotImplementedError + scale_radius = self.neighbour_radius * (np.log(i) / i) ** (1 / dof) + nodes = self._near(sample_node, path[:i, :], radius=scale_radius) + if nodes.shape[0] == 0: + nodes = self._k_nearest(sample_node, path[:i, :], k=k_n) + elif connect_mode == "nearest": + nodes = self._batch_k_nearest(x_set, path[:i, :], k=k_nn)[1:] + elif connect_mode == "knn": + # k_n = min(max(int(1 * 2.71828 * np.log(i)), k_nn), i) + # print(k_n, self.i, k_nn) + k_n = min(k_nn, i) + + nodes = self._batch_k_nearest(x_set, path[:i, :], k=k_n) + elif connect_mode == "hybrid": + k_n = min(max(int(1 * 2.71828 * np.log(i)), k_nn), i) + nodes = self._batch_k_nearest(x_set, path[:i, :], k=k_n) + print("Hybrid will default to knn") + # you would end up with: + # for each node in x_set, you would have n nodes to connect + start_nodes = ( + x_set.unsqueeze(1) + .repeat(1, nodes.shape[1], 1) + .reshape(x_set.shape[0] * nodes.shape[1], -1) + ) + goal_nodes = nodes.reshape( + x_set.shape[0] * nodes.shape[1], -1 + ) # batch x k_n or batch x 1 + + if edge_set is not None: + # add 0th index to goal_node and 1st index to start + goal_nodes = torch.cat((edge_set[:, 0], goal_nodes), dim=0) + start_nodes = torch.cat((edge_set[:, 1, : self.dof], start_nodes), dim=0) + elif edge_set is not None: + goal_nodes = edge_set[:, 0] + start_nodes = edge_set[:, 1, : self.dof] + self._batch_steer_and_connect( + goal_nodes, start_nodes, add_steer_pts=-1, lazy=lazy, add_exact_node=add_exact_node + ) + + def get_paths(self, path_list): + paths = [] + for i in range(len(path_list)): + paths.append(self.path[path_list[i], : self.dof]) + return paths + + # get interpolated trajectory + def get_interpolated_trajectory( + self, trajectory: List[tensor.T_HDOF_float], interpolation_steps: Optional[int] = None + ): + buffer = self.interpolation_steps + if interpolation_steps is not None: + buffer = interpolation_steps + interpolation_type = self.interpolation_type + if 
interpolation_type == InterpolateType.LINEAR_CUDA: + log_warn( + "LINEAR_CUDA interpolation not supported for GraphPlanner, switching to LINEAR" + ) + interpolation_type = InterpolateType.LINEAR + if ( + self._out_traj_state is None + or self._out_traj_state.shape[0] != len(trajectory) + or self._out_traj_state.shape[1] != buffer + ): + self._out_traj_state = JointState.from_position( + torch.zeros( + (len(trajectory), buffer, trajectory[0].shape[-1]), + device=self.tensor_args.device, + ), + joint_names=self.rollout_fn.joint_names, + ) + + out_traj_state, last_tstep, opt_dt = get_interpolated_trajectory( + trajectory, + self._out_traj_state, + interpolation_steps, + self.interpolation_dt, + self._max_joint_vel, + self._max_joint_acc, # * self.interpolation_acceleration_scale, + self._max_joint_jerk, + kind=self.interpolation_type, + tensor_args=self.tensor_args, + max_deviation=self.interpolation_deviation, + ) + out_traj_state.joint_names = self.rollout_fn.joint_names + + return out_traj_state, last_tstep, opt_dt + + # validate plan + def get_metrics(self, state: State): + # compute metrics + metrics = self.safety_rollout_fn.get_metrics(state) + return metrics + + def reset_seed(self): + self.safety_rollout_fn.reset_seed() + self.sample_gen = HaltonGenerator( + self.dof, + self.tensor_args, + up_bounds=self.safety_rollout_fn.action_bound_highs, + low_bounds=self.safety_rollout_fn.action_bound_lows, + seed=self.seed, + ) + + def reset_cuda_graph(self): + self.rollout_fn.reset_cuda_graph() + + def get_all_rollout_instances(self) -> List[RolloutBase]: + if self._rollout_list is None: + self._rollout_list = [self.safety_rollout_fn, self.rollout_fn] + return self._rollout_list + + def warmup(self, x_start: Optional[torch.Tensor] = None, x_goal: Optional[torch.Tensor] = None): + pass + + +@torch.jit.script +def get_unique_nodes(dist_node: torch.Tensor, nodes: torch.Tensor, node_distance: float): + node_flag = dist_node <= node_distance + dist_node[node_flag] = 0.0 + dist_node[~node_flag] = 1.0 + _, idx = torch.min(dist_node, dim=-1) + n_idx, n_inv = torch.unique(idx, return_inverse=True) + + # + unique_nodes = nodes[n_idx] + return unique_nodes, n_inv + + +@torch.jit.script +def add_new_nodes_jit( + nodes, new_nodes, flag, cat_buffer, path, idx, i: int, dof: int +) -> Tuple[torch.Tensor, torch.Tensor, int]: + new_idx = torch.as_tensor( + [i + x for x in range(new_nodes.shape[0])], + device=new_nodes.device, + dtype=new_nodes.dtype, + ) + + old_node_idx = idx[flag] + + node_set = torch.cat((nodes, cat_buffer.unsqueeze(0).repeat(nodes.shape[0], 1)), dim=-1) + # node_set[flag][:, self.dof + 1] = old_node_idx.to(dtype=node_set.dtype) + node_set[flag, dof + 1] = old_node_idx.to(dtype=node_set.dtype) + + path[i : i + new_nodes.shape[0], :dof] = new_nodes + path[i : i + new_nodes.shape[0], dof + 1] = new_idx + node_temp = node_set[~flag] + node_temp[:, dof + 1] = new_idx + node_set[~flag] = node_temp + return path, node_set, new_nodes.shape[0] + + +@torch.jit.script +def add_all_nodes_jit( + nodes, cat_buffer, path, i: int, dof: int +) -> Tuple[torch.Tensor, torch.Tensor, int]: + new_idx = torch.as_tensor( + [i + x for x in range(nodes.shape[0])], + device=nodes.device, + dtype=nodes.dtype, + ) + + node_set = torch.cat((nodes, cat_buffer.unsqueeze(0).repeat(nodes.shape[0], 1)), dim=-1) + + path[i : i + nodes.shape[0], :dof] = nodes + path[i : i + nodes.shape[0], dof + 1] = new_idx + node_set[:, dof + 1] = new_idx + return path, node_set, nodes.shape[0] + + +@torch.jit.script +def 
compute_distance_norm_jit(pt, batch_pts, distance_weight): + vec = (batch_pts - pt) * distance_weight + dist = torch.norm(vec, dim=-1) + return dist + + +@torch.jit.script +def compute_distance_jit(pt, batch_pts, distance_weight): + vec = (batch_pts - pt) * distance_weight + return vec + + +@torch.jit.script +def compute_rotation_frame_jit( + x_start: torch.Tensor, x_goal: torch.Tensor, rot_frame_col: torch.Tensor +) -> torch.Tensor: + a = ((x_goal - x_start) / torch.norm(x_start - x_goal)).unsqueeze(1) + + M = a @ rot_frame_col # .T + + # with torch.cuda.amp.autocast(enabled=False): + U, _, V = torch.svd(M, compute_uv=True, some=False) + vec = a.flatten() * 0.0 + 1.0 + vec[-1] = torch.det(U) * torch.det(V) + + C = U @ torch.diag(vec) @ V.T + return C + + +@torch.jit.script +def biased_vertex_projection_jit( + x_start, + x_goal, + distance_weight, + c_max: float, + c_min: float, + dof: int, + rot_frame_col: torch.Tensor, + unit_ball: torch.Tensor, + bounds: torch.Tensor, +) -> torch.Tensor: + C = compute_rotation_frame_jit( + x_start * distance_weight, + x_goal * distance_weight, + rot_frame_col, + ) + + r = x_start * 0.0 + r[0] = c_max / 2.0 + r[1:] = (c_max**2 - c_min**2) / 2.0 + L = torch.diag(r) + x_center = (x_start[..., :dof] + x_goal[..., :dof]) / 2.0 + x_samples = ((C @ L @ unit_ball.T).T) / distance_weight + x_center + # clamp at joint angles: + x_samples = torch.clamp(x_samples, bounds[0, :], bounds[1, :]).contiguous() + + return x_samples + + +@torch.jit.script +def cat_xc_jit(x, n: int): + c = x[:, 0:1] * 0.0 + xc_search = torch.cat((x, c), dim=1)[:n, :] + return xc_search diff --git a/src/curobo/graph/graph_nx.py b/src/curobo/graph/graph_nx.py new file mode 100644 index 00000000..86ec22e8 --- /dev/null +++ b/src/curobo/graph/graph_nx.py @@ -0,0 +1,87 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+
+
+# Third Party
+import networkx as nx
+import torch
+
+
+class NetworkxGraph(object):
+    def __init__(self):
+        self.graph = nx.Graph()
+        # maintain node buffer
+        self.node_list = []
+        # maintain edge buffer
+        self.edge_list = []
+
+    def reset_graph(self):
+        self.graph.clear()
+        self.edge_list = []
+        self.node_list = []
+
+    def add_node(self, i):
+        self.node_list.append(i)
+
+    def add_edges(self, edge_list):
+        self.edge_list += edge_list
+
+    def add_nodes(self, node_list):
+        self.node_list += node_list
+
+    def add_edge(self, start_i, end_i, weight):
+        self.edge_list.append([start_i, end_i, weight])
+
+    def update_graph(self):
+        if len(self.edge_list) > 0:
+            self.graph.add_weighted_edges_from(self.edge_list)
+            self.edge_list = []
+        if len(self.node_list) > 0:
+            self.graph.add_nodes_from(self.node_list)
+            self.node_list = []
+
+    def get_edges(self, attribute="weight"):
+        edge_list = list(self.graph.edges.data(attribute))
+        return edge_list
+
+    def path_exists(self, start_node_idx, goal_node_idx):
+        self.update_graph()
+        return nx.has_path(self.graph, start_node_idx, goal_node_idx)
+
+    def get_shortest_path(self, start_node_idx, goal_node_idx, return_length=False):
+        self.update_graph()
+        length, path = nx.bidirectional_dijkstra(
+            self.graph, start_node_idx, goal_node_idx, weight="weight"
+        )
+        if return_length:
+            return path, length
+        return path
+
+    def get_path_lengths(self, goal_node_idx):
+        self.update_graph()
+        path_length_dict = nx.shortest_path_length(
+            self.graph, source=goal_node_idx, weight="weight"
+        )
+        dict_keys = list(path_length_dict.keys())
+        max_n = max(dict_keys) + 1
+
+        # nodes that are unreachable from goal_node_idx keep a length of -1:
+        path_lengths = [-1.0 for x in range(max_n)]
+        for k in dict_keys:
+            path_lengths[k] = path_length_dict[k]
+        path_lengths = torch.as_tensor(path_lengths)
+        return path_lengths.cpu().tolist()
diff --git a/src/curobo/graph/prm.py b/src/curobo/graph/prm.py
new file mode 100644
index 00000000..c2693645
--- /dev/null
+++ b/src/curobo/graph/prm.py
@@ -0,0 +1,526 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# + +# Standard Library +import random +from cmath import inf +from typing import Optional + +# Third Party +import numpy as np +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.graph.graph_base import GraphConfig, GraphPlanBase, GraphResult +from curobo.util.logger import log_error, log_info, log_warn + + +class PRMStar(GraphPlanBase): + def __init__(self, config: GraphConfig): + super().__init__(config) + + @torch.no_grad() + def _find_paths(self, x_init_batch, x_goal_batch, all_paths=False): + if all_paths: + return self._find_all_path(x_init_batch, x_goal_batch) + else: + return self._find_one_path(x_init_batch, x_goal_batch) + + @profiler.record_function("geometric_planner/prm/add_bias_graph") + def _add_bias_graph(self, x_init_batch, x_goal_batch, node_set_batch, node_set): + # if retract state is not in collision add it: + if self._check_bias_node: + bias_mask = self.mask_samples(self.bias_node) + if bias_mask.all() == True: + self._valid_bias_node = True + else: + log_warn("Bias node is not valid, not using bias node") + self._check_bias_node = False + + if self._valid_bias_node: + # add retract config to node set: + start_retract = torch.cat( + ( + x_init_batch.unsqueeze(1), + self.bias_node.repeat(x_init_batch.shape[0], 1).unsqueeze(1), + ), + dim=1, + ) + goal_retract = torch.cat( + ( + x_goal_batch.unsqueeze(1), + self.bias_node.repeat(x_init_batch.shape[0], 1).unsqueeze(1), + ), + dim=1, + ) + retract_set = torch.cat((start_retract, goal_retract), dim=0) + + b_retract, _, _ = retract_set.shape + retract_set = self.add_nodes_to_graph( + retract_set.view(retract_set.shape[0] * 2, self.dof) + ) + retract_set_batch = retract_set.view(b_retract, 2, retract_set.shape[-1]) + # create an edge set: + # connecting start to goal and also goal to start: + edge_set = torch.cat( + ( + node_set_batch, + torch.flip(node_set_batch, dims=[1]), + retract_set_batch, + torch.flip(retract_set_batch, dims=[1]), + ), + dim=0, + ) + else: + edge_set = torch.cat( + ( + node_set_batch, + torch.flip(node_set_batch, dims=[1]), + ), + dim=0, + ) + self.connect_nodes( + node_set[:, : self.dof], + edge_set=edge_set, + k_nn=self.k_nn, + connect_mode="knn", + add_exact_node=False, + ) + + @torch.no_grad() + def _find_one_path(self, x_init_batch, x_goal_batch): + """Find path from a batch of initial and goal configs + + Args: + x_init ([type]): batch of start + x_goal ([type]): batch of goal + return_path_lengths (bool, optional): [description]. Defaults to False. 
+ + Returns: + [type]: b, h, dof + """ + result = GraphResult( + start_q=x_init_batch, + goal_q=x_goal_batch, + success=[False for x in range(x_init_batch.shape[0])], + path_length=self.tensor_args.to_device([inf for x in range(x_init_batch.shape[0])]), + ) + # check if start and goal are same, if so just return false + if self.i > (self.max_nodes * 0.75): + self.reset_buffer() + # add start and goal nodes to graph: + node_set = torch.cat((x_init_batch.unsqueeze(1), x_goal_batch.unsqueeze(1)), dim=1) + + b, _, dof = node_set.shape + node_set = node_set.view(b * 2, dof) + # check if start and goal are in freespace: + mask = self.mask_samples(node_set) + if mask.all() != True: + log_warn("Start or End state in collision", exc_info=False) + node_set_batch = node_set.view(b, 2, node_set.shape[-1]) + result.plan = [node_set_batch[i, :, : self.dof] for i in range(node_set_batch.shape[0])] + result.valid_query = False + result.debug_info = "Start or End state in collision" + return result + node_set = self.add_nodes_to_graph(node_set, add_exact_node=True) + node_set_batch = node_set.view(b, 2, node_set.shape[-1]) + if ( + torch.min( + torch.abs(node_set_batch[:, 0, self.dof + 1] - node_set_batch[:, 1, self.dof + 1]) + ) + == 0.0 + ): + log_warn("WARNING: Start and Goal are same") + result.success = [False for x in range(x_init_batch.shape[0])] + result.plan = [node_set_batch[i, :, : self.dof] for i in range(node_set_batch.shape[0])] + return result + + self._add_bias_graph(x_init_batch, x_goal_batch, node_set_batch, node_set) + + batch_start_idx = ( + node_set_batch[:, 0, self.dof + 1].to(dtype=torch.int64, device="cpu").tolist() + ) + batch_goal_idx = ( + node_set_batch[:, 1, self.dof + 1].to(dtype=torch.int64, device="cpu").tolist() + ) + + graph_attempt = 0 + path_exists, exist_label = self.batch_path_exists(batch_start_idx, batch_goal_idx) + k_nn = self.k_nn + s_path = [[x, x] for x in batch_start_idx] + c_max_all = [inf for _ in batch_start_idx] + c_min = self.distance(x_init_batch, x_goal_batch).cpu().numpy() + + # NOTE: c_max is scaled by 10.0, this could be replaced by reading c_min + c_max = np.ravel([self.c_max * c_min[i] for i in range(x_init_batch.shape[0])]) + if path_exists: + idx_list = np.where(exist_label)[0].tolist() + batch_start_ = [batch_start_idx[x] for x in idx_list] + batch_goal_ = [batch_goal_idx[x] for x in idx_list] + g_path, c_max_t = self.batch_get_graph_shortest_path( + batch_start_, batch_goal_, return_length=True + ) + + len_min = min([len(g) for g in g_path]) + if len_min > 2: + g_path, c_max_t = self.batch_shortcut_path(g_path, batch_start_, batch_goal_) + len_min = min([len(g) for g in g_path]) + for i, idx in enumerate(idx_list): + s_path[idx] = g_path[i] + c_max[idx] = c_max_t[i] + c_max_all[idx] = c_max_t[i] + + s_new_path = [] + + # only take paths that are valid: + for g_i, g_p in enumerate(s_path): + if exist_label[g_i]: + s_new_path.append(g_p) + s_path = s_new_path + if len_min <= 2: + paths = self.get_paths(s_path) + result.plan = paths + result.success = exist_label + result.path_length = self.tensor_args.to_device(c_max_all) + return result + + n_nodes = self.init_nodes + # find paths + idx = 0 + while not path_exists or graph_attempt <= (self.graph_min_attempts): + no_path_label = exist_label + if not any(exist_label): + no_path_label = [not x for x in exist_label] + no_path_idx = np.where(no_path_label)[0].tolist() + idx = random.choice(no_path_idx) + self.build_graph( + x_start=x_init_batch[idx], + x_goal=x_goal_batch[idx], + bias_samples=True, + 
k_nn=k_nn, + c_max=c_max[idx], + c_min=c_min[idx], + number_of_nodes=n_nodes, + lazy_nodes=False, + ) + graph_attempt += 1 + + path_exists, exist_label = self.batch_path_exists(batch_start_idx, batch_goal_idx) + if path_exists: + idx_list = np.where(exist_label)[0].tolist() + batch_start_ = [batch_start_idx[x] for x in idx_list] + batch_goal_ = [batch_goal_idx[x] for x in idx_list] + + g_path, c_max_ = self.batch_get_graph_shortest_path( + batch_start_, batch_goal_, return_length=True + ) + len_min = min([len(g) for g in g_path]) + if len_min > 2: + g_path, c_max_ = self.batch_shortcut_path(g_path, batch_start_, batch_goal_) + len_min = min([len(g) for g in g_path]) + for i, idx in enumerate(idx_list): + c_max[idx] = c_max_[i] + + if len_min <= 2: + break + else: + if graph_attempt == 1: + n_nodes = self.vertex_n + c_max[idx] += c_min[idx] * 0.05 + + k_nn += int(0.1 * k_nn) + n_nodes += int(0.1 * n_nodes) + if graph_attempt > self.graph_max_attempts: + break + path_exists, exist_label = self.batch_path_exists( + batch_start_idx, batch_goal_idx, all_paths=True + ) + + if not path_exists: + s_path = [[x, x] for x in batch_start_idx] + c_max = [inf for _ in batch_start_idx] + if any(exist_label): + # do shortcut for only possible paths: + # get true indices: + idx_list = np.where(exist_label)[0].tolist() + batch_start_idx = [batch_start_idx[x] for x in idx_list] + batch_goal_idx = [batch_goal_idx[x] for x in idx_list] + path_list = self.batch_get_graph_shortest_path(batch_start_idx, batch_goal_idx) + + path_list, c_list = self.batch_shortcut_path( + path_list, batch_start_idx, batch_goal_idx + ) + # add this back + for i, idx in enumerate(idx_list): + s_path[idx] = path_list[i] + c_max[idx] = c_list[i] + g_path = [] + + # only take paths that are valid: + for g_i, g_p in enumerate(s_path): + if exist_label[g_i]: + g_path.append(g_p) + else: + g_path, c_max = self.batch_get_graph_shortest_path( + batch_start_idx, batch_goal_idx, return_length=True + ) + len_max = max([len(g) for g in g_path]) + if len_max > 3: + g_path, c_max = self.batch_shortcut_path(g_path, batch_start_idx, batch_goal_idx) + len_max = max([len(g) for g in g_path]) + paths = self.get_paths(g_path) + result.plan = paths + result.success = exist_label + + # Debugging check: + # if torch.count_nonzero(torch.as_tensor(result.success)) != len(paths): + # log_warn("Error here") + + result.path_length = torch.as_tensor( + c_max, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + return result + + @torch.no_grad() + def _find_all_path(self, x_init_batch, x_goal_batch): + """Find path from a batch of initial and goal configs + + Args: + x_init ([type]): batch of start + x_goal ([type]): batch of goal + return_path_lengths (bool, optional): [description]. Defaults to False. 
+ + Returns: + [type]: b, h, dof + """ + + result = GraphResult( + start_q=x_init_batch, + goal_q=x_goal_batch, + success=[False for x in range(x_init_batch.shape[0])], + path_length=self.tensor_args.to_device([inf for x in range(x_init_batch.shape[0])]), + ) + # check if start and goal are same, if so just return false + if self.i > (self.max_nodes * 0.75): + self.reset_buffer() + # add start and goal nodes to graph: + node_set = torch.cat((x_init_batch.unsqueeze(1), x_goal_batch.unsqueeze(1)), dim=1) + + b, _, dof = node_set.shape + node_set = node_set.view(b * 2, dof) + # check if start and goal are in freespace: + mask = self.mask_samples(node_set) + if mask.all() != True: + log_warn("Start or End state in collision", exc_info=False) + node_set_batch = node_set.view(b, 2, node_set.shape[-1]) + result.plan = [node_set_batch[i, :, : self.dof] for i in range(node_set_batch.shape[0])] + result.valid_query = False + result.debug_info = "Start or End state in collision" + return result + node_set = self.add_nodes_to_graph(node_set, add_exact_node=True) + node_set_batch = node_set.view(b, 2, node_set.shape[-1]) + if ( + torch.min( + torch.abs(node_set_batch[:, 0, self.dof + 1] - node_set_batch[:, 1, self.dof + 1]) + ) + == 0.0 + ): + # print("WARNING: Start and Goal are same") + result.plan = [node_set_batch[i, :, : self.dof] for i in range(node_set_batch.shape[0])] + return result + self._add_bias_graph(x_init_batch, x_goal_batch, node_set_batch, node_set) + + batch_start_idx = ( + node_set_batch[:, 0, self.dof + 1].to(dtype=torch.int64, device="cpu").tolist() + ) + batch_goal_idx = ( + node_set_batch[:, 1, self.dof + 1].to(dtype=torch.int64, device="cpu").tolist() + ) + + graph_attempt = 0 + path_exists, exist_label = self.batch_path_exists( + batch_start_idx, batch_goal_idx, all_paths=True + ) + k_nn = self.k_nn + + if path_exists: + g_path, c_max = self.batch_get_graph_shortest_path( + batch_start_idx, batch_goal_idx, return_length=True + ) + len_max = max([len(g) for g in g_path]) + if len_max > 2: + g_path, c_max = self.batch_shortcut_path(g_path, batch_start_idx, batch_goal_idx) + len_max = max([len(g) for g in g_path]) + exist_label = [len(g) <= 3 for g in g_path] + + if len_max <= 2: + paths = self.get_paths(g_path) + result.plan = paths + result.success = exist_label + result.path_length = self.tensor_args.to_device(c_max) + return result + + c_min = self.distance(x_init_batch, x_goal_batch).cpu().numpy() + c_max = np.ravel([self.c_max * c_min[i] for i in range(x_init_batch.shape[0])]) + + n_nodes = self.init_nodes + # find paths + idx = 0 + # print("Initial", path_exists, exist_label) + while not path_exists or graph_attempt < (self.graph_min_attempts): + if all(exist_label): + no_path_label = exist_label + else: + no_path_label = [not x for x in exist_label] + # choose x_init, x_goal from the ones that don't have a path: + no_path_idx = np.where(no_path_label)[0].tolist() + idx = random.choice(no_path_idx) + self.build_graph( + x_start=x_init_batch[idx], + x_goal=x_goal_batch[idx], + bias_samples=True, + k_nn=k_nn, + c_max=c_max[idx], + c_min=c_min[idx], + number_of_nodes=n_nodes, + lazy_nodes=False, + ) + graph_attempt += 1 + path_exists, exist_label = self.batch_path_exists( + batch_start_idx, batch_goal_idx, all_paths=True + ) + + if path_exists: + g_path, c_max = self.batch_get_graph_shortest_path( + batch_start_idx, batch_goal_idx, return_length=True + ) + len_max = max([len(g) for g in g_path]) + if len_max > 2: + g_path, c_max = self.batch_shortcut_path( + g_path, 
batch_start_idx, batch_goal_idx + ) + len_max = max([len(g) for g in g_path]) + exist_label = [len(g) <= 3 for g in g_path] + + if len_max <= 2: + break + else: + if graph_attempt == 1: + n_nodes = self.vertex_n + c_max[idx] += c_min[idx] * 0.05 + + k_nn += int(0.1 * k_nn) + n_nodes += int(0.1 * n_nodes) + if graph_attempt > self.graph_max_attempts: + break + path_exists, exist_label = self.batch_path_exists( + batch_start_idx, batch_goal_idx, all_paths=True + ) + + if not path_exists: + s_path = [[x, x] for x in batch_start_idx] + c_max = [inf for _ in batch_start_idx] + if any(exist_label): + # do shortcut for only possible paths: + # get true indices: + idx_list = np.where(exist_label)[0].tolist() + batch_start_idx = [batch_start_idx[x] for x in idx_list] + batch_goal_idx = [batch_goal_idx[x] for x in idx_list] + path_list = self.batch_get_graph_shortest_path(batch_start_idx, batch_goal_idx) + + path_list, c_list = self.batch_shortcut_path( + path_list, batch_start_idx, batch_goal_idx + ) + # add this back + for i, idx in enumerate(idx_list): + s_path[idx] = path_list[i] + c_max[idx] = c_list[i] + g_path = [] + + # only take paths that are valid: + for g_i, g_p in enumerate(s_path): + if exist_label[g_i]: + g_path.append(g_p) + # g_path = s_path + else: + g_path, c_max = self.batch_get_graph_shortest_path( + batch_start_idx, batch_goal_idx, return_length=True + ) + len_max = max([len(g) for g in g_path]) + if len_max > 3: + g_path, c_max = self.batch_shortcut_path(g_path, batch_start_idx, batch_goal_idx) + len_max = max([len(g) for g in g_path]) + paths = self.get_paths(g_path) + result.plan = paths + result.success = exist_label + result.path_length = torch.as_tensor( + c_max, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + return result + + def build_graph( + self, + x_start=None, + x_goal=None, + number_of_nodes=None, + lazy=False, + bias_samples=False, + k_nn=5, + c_max=10, + c_min=1, + lazy_nodes=False, + ): + # get samples to search in: + dof = self.dof + path = self.path + lazy_samples = lazy or lazy_nodes + # add few nodes to path: + if number_of_nodes is None: + number_of_nodes = self.vertex_n + if x_start is None or x_goal is None: + log_warn("Start and goal is not given, not using biased sampling") + bias_samples = False + # sample some points for vertex + if bias_samples: + v_set = self.get_biased_vertex_set( + x_start=x_start, + x_goal=x_goal, + n=number_of_nodes, + lazy=lazy_samples, + c_max=c_max, + c_min=c_min, + ) + else: + v_set = self.get_new_vertex_set(n=number_of_nodes, lazy=lazy_samples) + number_of_nodes = v_set.shape[0] + if not lazy_samples: + if self.i + number_of_nodes >= path.shape[0]: + raise ValueError( + "Path memory buffer is too small", path.shape[0], self.i + number_of_nodes + ) + path[self.i : self.i + number_of_nodes, : dof + 1] = v_set + for i in range(number_of_nodes): + path[self.i + i, dof + 1] = self.i + i + + self.i = self.i + number_of_nodes + sample_nodes = v_set[:, : self.dof] + self.connect_nodes(sample_nodes, lazy=lazy, k_nn=k_nn) + + def warmup(self, x_start: Optional[torch.Tensor] = None, x_goal: Optional[torch.Tensor] = None): + for _ in range(3): + self.build_graph( + x_start=x_start.view(-1), + x_goal=x_goal.view(-1), + bias_samples=True, + k_nn=self.k_nn, + ) + super().warmup() diff --git a/src/curobo/opt/__init__.py b/src/curobo/opt/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/opt/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
diff --git a/src/curobo/opt/newton/__init__.py b/src/curobo/opt/newton/__init__.py
new file mode 100644
index 00000000..3bc9b899
--- /dev/null
+++ b/src/curobo/opt/newton/__init__.py
@@ -0,0 +1,14 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+"""
+This module contains Newton-based optimization solvers, including L-BFGS.
+"""
diff --git a/src/curobo/opt/newton/lbfgs.py b/src/curobo/opt/newton/lbfgs.py
new file mode 100644
index 00000000..4370cef5
--- /dev/null
+++ b/src/curobo/opt/newton/lbfgs.py
@@ -0,0 +1,215 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+from dataclasses import dataclass
+from typing import Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.curobolib.opt import LBFGScu
+from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig
+from curobo.util.logger import log_warn
+
+
+# jit-compiled two-loop recursion for l-bfgs:
+@torch.jit.script
+def compute_step_direction(
+    alpha_buffer, rho_buffer, y_buffer, s_buffer, grad_q, m: int, epsilon, stable_mode: bool = True
+):
+    # m: history length (e.g., 15)
+    # y_buffer, s_buffer: m x b x d_opt
+    # q, grad_q: b x d_opt
+    # rho_buffer: m x b x 1
+    # alpha_buffer: m x b x 1 # this can be dynamically created
+    gq = grad_q.detach().clone()
+    rho_s = rho_buffer * s_buffer.transpose(-1, -2)  # batched m_scalar-m_vector product
+    for i in range(m - 1, -1, -1):
+        alpha_buffer[i] = rho_s[i] @ gq  # batched vector-vector dot product
+        gq = gq - (alpha_buffer[i] * y_buffer[i])  # batched scalar-vector product
+
+    var1 = (s_buffer[-1].transpose(-1, -2) @ y_buffer[-1]) / (
+        y_buffer[-1].transpose(-1, -2) @ y_buffer[-1]
+    )
+    if stable_mode:
+        var1 = torch.nan_to_num(var1, epsilon, epsilon, epsilon)
+    gamma = torch.nn.functional.relu(var1)
+    r = gamma * gq.clone()
+    rho_y = rho_buffer * y_buffer.transpose(-1, -2)  # batched m_scalar-m_vector product
+    for i in range(m):
+        # batched dot product, scalar-vector product
+        r = r + (alpha_buffer[i] - (rho_y[i] @ r)) * s_buffer[i]
+    return -1.0 * r
+
+
+@dataclass
+class LBFGSOptConfig(NewtonOptConfig):
+    history: int = 10
+    epsilon: float = 0.01
+    use_cuda_kernel: bool = True
+    stable_mode: bool = True
+
+    def __post_init__(self):
+        return super().__post_init__()
+
+
+class LBFGSOpt(NewtonOptBase, LBFGSOptConfig):
+    @profiler.record_function("lbfgs_opt/init")
+    def __init__(self, config: Optional[LBFGSOptConfig] = None):
+        if config is not None:
+            LBFGSOptConfig.__init__(self, **vars(config))
+        NewtonOptBase.__init__(self)
+        if self.d_opt >= 1024 or self.history >= 512:
+            log_warn("LBFGS: Not using LBFGS CUDA kernel as d_opt >= 1024 or history >= 512")
+            self.use_cuda_kernel = False
+        if self.history > self.d_opt:
+            log_warn("LBFGS: history > d_opt, reducing history to d_opt")
+            self.history = self.d_opt
+
+    @profiler.record_function("lbfgs/reset")
+    def reset(self):
+        self.x_0[:] = 0.0
+        self.grad_0[:] = 0.0
+        self.s_buffer[:] = 0.0
+        self.y_buffer[:] = 0.0
+        self.rho_buffer[:] = 0.0
+        self.alpha_buffer[:] = 0.0
+        self.step_q_buffer[:] = 0.0
+        return super().reset()
+
+    def update_nenvs(self, n_envs):
+        self.init_hessian(b=n_envs)
+        return super().update_nenvs(n_envs)
+
+    def init_hessian(self, b=1):
+        self.x_0 = torch.zeros(
+            (b, self.d_opt, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self.grad_0 = torch.zeros(
+            (b, self.d_opt, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self.y_buffer = torch.zeros(
+            (self.history, b, self.d_opt, 1),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+        self.s_buffer = torch.zeros(
+            (self.history, b, self.d_opt, 1),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+        self.rho_buffer = torch.zeros(
+            (self.history, b, 1, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self.step_q_buffer = torch.zeros(
+            (b, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self.alpha_buffer = torch.zeros(
+            (self.history, b, 1, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+
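+    # history buffer layout: s_buffer holds iterate differences (x_k - x_{k-1})
+    # and y_buffer gradient differences (g_k - g_{k-1}), each with shape
+    # (history, batch, d_opt, 1); rho_buffer stores 1 / (y^T s). _update_buffers
+    # rolls the buffers so the newest pair sits at index -1, which is the entry
+    # the gamma scaling in the two-loop recursion reads.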
+    @torch.no_grad()
+    def _get_step_direction(self, cost, q, grad_q):
+        if self.use_cuda_kernel:
+            with profiler.record_function("lbfgs/fused"):
+                dq = LBFGScu._call_cuda(
+                    self.step_q_buffer,
+                    self.rho_buffer,
+                    self.y_buffer,
+                    self.s_buffer,
+                    q,
+                    grad_q,
+                    self.x_0,
+                    self.grad_0,
+                    self.epsilon,
+                    self.stable_mode,
+                )
+
+        else:
+            grad_q = grad_q.transpose(-1, -2)
+            q = q.unsqueeze(-1)
+            self._update_buffers(q, grad_q)
+            dq = compute_step_direction(
+                self.alpha_buffer,
+                self.rho_buffer,
+                self.y_buffer,
+                self.s_buffer,
+                grad_q,
+                self.history,
+                self.epsilon,
+                self.stable_mode,
+            )
+
+        dq = dq.view(-1, self.d_opt)
+        return dq
+
+    def _update_q(self, grad_q):
+        q = grad_q.detach().clone()
+        rho_s = self.rho_buffer * self.s_buffer.transpose(-1, -2)
+        for i in range(self.history - 1, -1, -1):
+            self.alpha_buffer[i] = rho_s[i] @ q
+            q = q - (self.alpha_buffer[i] * self.y_buffer[i])
+        return q
+
+    def _update_r(self, r_init):
+        r = r_init.clone()
+        rho_y = self.rho_buffer * self.y_buffer.transpose(-1, -2)
+        for i in range(self.history):
+            r = r + self.s_buffer[i] * (self.alpha_buffer[i] - rho_y[i] @ r)
+        return -1.0 * r
+
+    def _update_buffers(self, q, grad_q):
+        y = grad_q - self.grad_0
+        s = q - self.x_0
+        rho = 1 / (y.transpose(-1, -2) @ s)
+        if self.stable_mode:
+            rho = torch.nan_to_num(rho, 0, 0, 0)
+        self.s_buffer[0] = s
+        self.s_buffer[:] = torch.roll(self.s_buffer, -1, dims=0)
+        self.y_buffer[0] = y
+        self.y_buffer[:] = torch.roll(self.y_buffer, -1, dims=0)
+        self.rho_buffer[0] = rho
+        self.rho_buffer[:] = torch.roll(self.rho_buffer, -1, dims=0)
+
+        self.x_0.copy_(q)
+        self.grad_0.copy_(grad_q)
+
+    def _shift(self, shift_steps=0):
+        """Shift the optimizer buffers by shift_steps * d_action.
+
+        Args:
+            shift_steps (int, optional): Number of timesteps to shift the stored
+                buffers by. Defaults to 0.
+        """
+        if shift_steps == 0:
+            return
+        self.reset()
+        shift_d = shift_steps * self.d_action
+        self.x_0 = self._shift_buffer(self.x_0, shift_d, shift_steps)
+        self.grad_0 = self._shift_buffer(self.grad_0, shift_d, shift_steps)
+        self.y_buffer = self._shift_buffer(self.y_buffer, shift_d, shift_steps)
+        self.s_buffer = self._shift_buffer(self.s_buffer, shift_d, shift_steps)
+        super()._shift(shift_steps=shift_steps)
+
+    def _shift_buffer(self, buffer, shift_d, shift_steps: int = 1):
+        buffer = buffer.roll(-shift_d, -2)
+        end_value = -(shift_steps - 1) * self.d_action
+        if end_value == 0:
+            end_value = buffer.shape[-2]
+        buffer[..., -shift_d:end_value, :] = buffer[
+            ..., -shift_d - self.d_action : -shift_d, :
+        ].clone()
+
+        return buffer
diff --git a/src/curobo/opt/newton/newton_base.py b/src/curobo/opt/newton/newton_base.py
new file mode 100644
index 00000000..adb4ab67
--- /dev/null
+++ b/src/curobo/opt/newton/newton_base.py
@@ -0,0 +1,604 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
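+#
+# NOTE (illustrative summary): the line searches in this file test the standard
+# Armijo/Wolfe conditions. For step direction p, step size a, cost f, and
+# gradient g = grad f(x):
+#   Armijo (sufficient decrease):  f(x + a*p) <= f(x) + c_1 * a * g'p
+#   Wolfe (curvature):             g(x + a*p)'p >= c_2 * g'p
+#   strong Wolfe:                  |g(x + a*p)'p| <= c_2 * |g'p|
+# with 0 < c_1 < c_2 < 1 (c_1 = 1e-5 and c_2 = 0.9 below).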
+#
+
+# Standard Library
+import math
+import time
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional, Union
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.curobolib.ls import update_best, wolfe_line_search
+from curobo.opt.opt_base import Optimizer, OptimizerConfig
+from curobo.rollout.dynamics_model.integration_utils import build_fd_matrix
+from curobo.types.base import TensorDeviceType
+from curobo.types.tensor import T_BDOF, T_BHDOF_float, T_BHValue_float, T_BValue_float, T_HDOF_float
+
+
+class LineSearchType(Enum):
+    GREEDY = "greedy"
+    ARMIJO = "armijo"
+    WOLFE = "wolfe"
+    STRONG_WOLFE = "strong_wolfe"
+    APPROX_WOLFE = "approx_wolfe"
+
+
+@dataclass
+class NewtonOptConfig(OptimizerConfig):
+    line_search_scale: List[float]
+    cost_convergence: float
+    cost_delta_threshold: float
+    fixed_iters: bool
+    inner_iters: int  # used for cuda graph
+    line_search_type: LineSearchType
+    use_cuda_line_search_kernel: bool
+    use_cuda_update_best_kernel: bool
+    min_iters: int
+    step_scale: float
+    last_best: float = 0.0
+    use_temporal_smooth: bool = False
+    cost_relative_threshold: float = 0.999
+
+    def __post_init__(self):
+        self.num_particles = len(self.line_search_scale)
+        self.line_search_type = LineSearchType(self.line_search_type)
+        if self.fixed_iters:
+            self.cost_delta_threshold = 0.0001
+            self.cost_relative_threshold = 1.0
+        return super().__post_init__()
+
+
+class NewtonOptBase(Optimizer, NewtonOptConfig):
+    @profiler.record_function("newton_opt/init")
+    def __init__(
+        self,
+        config: Optional[NewtonOptConfig] = None,
+    ):
+        if config is not None:
+            NewtonOptConfig.__init__(self, **vars(config))
+        self.d_opt = self.horizon * self.d_action
+        self.line_scale = self._create_box_line_search(self.line_search_scale)
+        Optimizer.__init__(self)
+        self.i = -1
+        self.outer_iters = math.ceil(self.n_iters / self.inner_iters)
+
+        # create line search
+        self.update_nenvs(self.n_envs)
+
+        self.reset()
+
+        # reshape action lows and highs:
+        self.action_lows = self.action_lows.repeat(self.horizon)
+        self.action_highs = self.action_highs.repeat(self.horizon)
+        self.action_range = self.action_highs - self.action_lows
+        self.action_step_max = self.step_scale * torch.abs(self.action_range)
+        self.c_1 = 1e-5
+        self.c_2 = 0.9
+        self._out_m_idx = None
+        self._out_best_x = None
+        self._out_best_c = None
+        self._out_best_grad = None
+        self.cu_opt_graph = None
+        if self.d_opt >= 1024:
+            self.use_cuda_line_search_kernel = False
+        if self.use_temporal_smooth:
+            self._temporal_mat = build_fd_matrix(
+                self.horizon, order=2, device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            ).unsqueeze(0)
+            eye_mat = torch.eye(
+                self.horizon, device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            ).unsqueeze(0)
+            self._temporal_mat += eye_mat
+
+    def reset_cuda_graph(self):
+        if self.cu_opt_graph is not None:
+            self.cu_opt_graph.reset()
+        super().reset_cuda_graph()
+
+    @torch.no_grad()
+    def _get_step_direction(self, cost, q, grad_q):
+        """
+        Reimplement this in a derived class; the base implementation is plain gradient descent.
+ """ + return -1.0 * grad_q.view(-1, self.d_opt) + + def _shift(self, shift_steps=1): + # TODO: shift best q?: + self.best_cost[:] = 500000.0 + self.best_iteration[:] = 0 + self.current_iteration[:] = 0 + return True + + def _optimize(self, q: T_BHDOF_float, shift_steps=0, n_iters=None): + with profiler.record_function("newton_base/shift"): + self._shift(shift_steps) + # reshape q: + if self.store_debug: + self.debug.append(q.view(-1, self.horizon, self.d_action).clone()) + with profiler.record_function("newton_base/init_opt"): + q = q.view(self.n_envs, self.horizon * self.d_action) + grad_q = q.detach() * 0.0 + # run opt graph + if not self.cu_opt_init: + self._initialize_opt_iters_graph(q, grad_q, shift_steps=shift_steps) + for i in range(self.outer_iters): + best_q, best_cost, q, grad_q = self._call_opt_iters_graph(q, grad_q) + if ( + not self.fixed_iters + and self.use_cuda_update_best_kernel + and (i + 1) * self.inner_iters >= self.min_iters + ): + if check_convergence(self.best_iteration, self.current_iteration, self.last_best): + break + + best_q = best_q.view(self.n_envs, self.horizon, self.d_action) + return best_q + + def reset(self): + with profiler.record_function("newton/reset"): + self.i = -1 + self._opt_finished = False + self.best_cost[:] = 500000.0 + self.best_iteration[:] = 0 + + super().reset() + + def _opt_iters(self, q, grad_q, shift_steps=0): + q = q.detach() # .clone() + grad_q = grad_q.detach() # .clone() + for _ in range(self.inner_iters): + self.i += 1 + cost_n, q, grad_q = self._opt_step(q.detach(), grad_q.detach()) + if self.store_debug: + self.debug.append(self.best_q.view(-1, self.horizon, self.d_action).clone()) + self.debug_cost.append(self.best_cost.detach().view(-1, 1).clone()) + + return self.best_q.detach(), self.best_cost.detach(), q.detach(), grad_q.detach() + + def _opt_step(self, q, grad_q): + with profiler.record_function("newton/line_search"): + q_n, cost_n, grad_q_n = self._approx_line_search(q, grad_q) + with profiler.record_function("newton/step_direction"): + grad_q = self._get_step_direction(cost_n, q_n, grad_q_n) + with profiler.record_function("newton/update_best"): + self._update_best(q_n, grad_q_n, cost_n) + return cost_n, q_n, grad_q + + def clip_bounds(self, x): + x = torch.clamp(x, self.action_lows, self.action_highs) + return x + + def scale_step_direction(self, dx): + if self.use_temporal_smooth: + dx_v = dx.view(-1, self.horizon, self.d_action) + dx_new = self._temporal_mat @ dx_v # 1,h,h x b, h, dof -> b, h, dof + dx = dx_new.view(-1, self.horizon * self.d_action) + dx_scaled = scale_action(dx, self.action_step_max) + + return dx_scaled + + def project_bounds(self, x): + # Find maximum value along all joint angles: + max_tensor = torch.maximum((self.action_lows - x), (x - self.action_highs)) / ( + (self.action_highs - self.action_lows) + ) + + # all values greater than 0 are out of bounds: + scale = torch.max(max_tensor, dim=-1, keepdim=True)[0] + scale = torch.nan_to_num(scale, nan=1.0, posinf=1.0, neginf=1.0) + scale[scale <= 0.0] = 1.0 + + x = (1.0 / scale) * x + + # If we hit nans in scaling: + x = torch.nan_to_num(x, nan=0.0) + # + # max_val = torch.max() + # x = torch.clamp(x, self.action_lows, self.action_highs) + return x + + def _compute_cost_gradient(self, x): + x_n = x.detach().requires_grad_(True) + x_in = x_n.view( + self.n_envs * self.num_particles, self.rollout_fn.horizon, self.rollout_fn.d_action + ) + trajectories = self.rollout_fn(x_in) # x_n = (batch*line_search_scale) x horizon x d_action + cost = torch.sum( + 
trajectories.costs.view(self.n_envs, self.num_particles, self.rollout_fn.horizon),
+            dim=-1,
+            keepdim=True,
+        )
+        cost.backward(gradient=self.l_vec, retain_graph=False)
+        g_x = x_n.grad.detach()
+        return (
+            cost,
+            g_x,
+        )  # cost: [n_envs, n_particles, 1], g_x: [n_envs, n_particles, horizon*d_action]
+
+    def _wolfe_line_search(self, x, step_direction):
+        step_direction = step_direction.detach()
+        step_vec = step_direction.unsqueeze(-2)
+        x_set = get_x_set_jit(step_vec, x, self.alpha_list, self.action_lows, self.action_highs)
+        x_set = x_set.detach().requires_grad_(True)
+
+        b, h, _ = x_set.shape
+        c, g_x = self._compute_cost_gradient(x_set)
+        with torch.no_grad():
+            if not self.use_cuda_line_search_kernel:
+                c_0 = c[:, 0:1]
+                step_vec_T = step_vec.transpose(-1, -2)
+                g_full_step = g_x @ step_vec_T
+
+                g_step = g_full_step[:, 0:1]
+                # condition 1 (sufficient decrease):
+                wolfe_1 = c <= c_0 + self.c_1 * self.zero_alpha_list * g_step  # dot product
+
+                # condition 2 (curvature):
+                if self.line_search_type == LineSearchType.STRONG_WOLFE:
+                    wolfe_2 = torch.abs(g_full_step) <= self.c_2 * torch.abs(g_step)
+                else:
+                    wolfe_2 = g_full_step >= self.c_2 * g_step  # dot product
+
+                wolfe = torch.logical_and(wolfe_1, wolfe_2)
+
+                # get the last occurrence of True (this will be the largest admissible
+                # alpha value); wolfe has 1 for indices that satisfy both conditions:
+                step_success = wolfe * (self.zero_alpha_list + 0.1)
+
+                _, m_idx = torch.max(step_success, dim=-2)
+
+                # The below can also be moved into approx wolfe?
+                if self.line_search_type != LineSearchType.APPROX_WOLFE:
+                    step_success_w1 = wolfe_1 * (self.zero_alpha_list + 0.1)
+
+                    _, m1_idx = torch.max(step_success_w1, dim=-2)
+
+                    m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
+
+                # From ICRA23, we know that a noisy update helps, so we don't check for zero here
+
+                if self.line_search_type != LineSearchType.APPROX_WOLFE:
+                    m_idx[m_idx == 0] = 1
+
+                m = m_idx.squeeze() + self.c_idx
+                g_x_1 = g_x.view(b * h, -1)
+                xs_1 = x_set.view(b * h, -1)
+                cs_1 = c.view(b * h, -1)
+                best_c = cs_1[m]
+
+                best_x = xs_1[m]
+                best_grad = g_x_1[m].view(b, 1, self.d_opt)
+                return best_x.detach(), best_c.detach(), best_grad.detach()
+            else:
+                if (
+                    self._out_best_x is None
+                    or self._out_best_x.shape[0] * self._out_best_x.shape[1]
+                    != x_set.shape[0] * x_set.shape[2]
+                ):
+                    self._out_best_x = torch.zeros(
+                        (x_set.shape[0], x_set.shape[2]),
+                        dtype=self.tensor_args.dtype,
+                        device=self.tensor_args.device,
+                    )
+                if (
+                    self._out_best_c is None
+                    or self._out_best_c.shape[0] * self._out_best_c.shape[1]
+                    != c.shape[0] * c.shape[2]
+                ):
+                    self._out_best_c = torch.zeros(
+                        (c.shape[0], c.shape[2]),
+                        dtype=self.tensor_args.dtype,
+                        device=self.tensor_args.device,
+                    )
+                if (
+                    self._out_best_grad is None
+                    or self._out_best_grad.shape[0] * self._out_best_grad.shape[1]
+                    != g_x.shape[0] * g_x.shape[2]
+                ):
+                    self._out_best_grad = torch.zeros(
+                        (g_x.shape[0], g_x.shape[2]),
+                        dtype=self.tensor_args.dtype,
+                        device=self.tensor_args.device,
+                    )
+
+                (best_x_n, best_c_n, best_grad_n) = wolfe_line_search(
+                    self._out_best_x,
+                    self._out_best_c,
+                    self._out_best_grad,
+                    g_x,
+                    x_set,
+                    step_vec,
+                    c,
+                    self.c_idx,
+                    self.c_1,
+                    self.c_2,
+                    self.zero_alpha_list,
+                    self.line_search_type == LineSearchType.STRONG_WOLFE,
+                    self.line_search_type == LineSearchType.APPROX_WOLFE,
+                )
+                best_grad_n = best_grad_n.view(b, 1, self.d_opt)
+                return best_x_n, best_c_n, best_grad_n
+
+    def _greedy_line_search(self, x, step_direction):
+        step_direction = step_direction.detach()
+        x_set = x.unsqueeze(-2) + self.alpha_list * step_direction.unsqueeze(-2)
+        x_set = self.clip_bounds(x_set)
+        x_set = x_set.detach().requires_grad_(True)
+        b, h, _ = x_set.shape
+
+        c, g_x = self._compute_cost_gradient(x_set)
+
+        best_c, m_idx = torch.min(c, dim=-2)
+
+        m = m_idx.squeeze() + self.c_idx
+        g_x = g_x.view(b * h, -1)
+        xs = x_set.view(b * h, -1)
+
+        best_x = xs[m]
+        best_grad = g_x[m].view(b, 1, self.d_opt)
+
+        return best_x, best_c, best_grad
+
+    def _armijo_line_search(self, x, step_direction):
+        step_direction = step_direction.detach()
+        step_vec = step_direction.unsqueeze(-2)
+        x_set = x.unsqueeze(-2) + self.alpha_list * step_vec
+        x_set = self.clip_bounds(x_set)
+        x_set = x_set.detach().requires_grad_(True)
+        b, h, _ = x_set.shape
+
+        c, g_x = self._compute_cost_gradient(x_set)
+
+        c_0 = c[:, 0:1]
+        g_0 = g_x[:, 0:1]
+
+        step_vec_T = step_vec.transpose(-1, -2)
+        g_step = g_0 @ step_vec_T
+        # condition 1 (sufficient decrease):
+        armijo_1 = c <= c_0 + self.c_1 * self.zero_alpha_list * g_step  # dot product
+
+        # get the last occurrence of True (this will be the largest admissible
+        # alpha value); armijo_1 has 1 for indices that satisfy the condition:
+        step_success = armijo_1 * (self.zero_alpha_list + 0.1)
+
+        _, m_idx = torch.max(step_success, dim=-2)
+
+        m_idx[m_idx == 0] = 1
+
+        m = m_idx.squeeze() + self.c_idx
+
+        g_x = g_x.view(b * h, -1)
+        xs = x_set.view(b * h, -1)
+        cs = c.view(b * h, -1)
+        best_c = cs[m]
+
+        best_x = xs[m]
+        best_grad = g_x[m].view(b, 1, self.d_opt)
+        return best_x, best_c, best_grad
+
+    def _approx_line_search(self, x, step_direction):
+        if self.step_scale != 0.0 and self.step_scale != 1.0:
+            step_direction = self.scale_step_direction(step_direction)
+        if self.line_search_type == LineSearchType.GREEDY:
+            return self._greedy_line_search(x, step_direction)
+        elif self.line_search_type == LineSearchType.ARMIJO:
+            return self._armijo_line_search(x, step_direction)
+        elif self.line_search_type in [
+            LineSearchType.WOLFE,
+            LineSearchType.STRONG_WOLFE,
+            LineSearchType.APPROX_WOLFE,
+        ]:
+            return self._wolfe_line_search(x, step_direction)
+
+    def check_convergence(self, cost):
+        above_threshold = cost > self.cost_convergence
+        above_threshold = torch.count_nonzero(above_threshold)
+        if above_threshold == 0:
+            self._opt_finished = True
+            return True
+        return False
+
+    def _update_best(self, q, grad_q, cost):
+        if self.use_cuda_update_best_kernel:
+            (self.best_cost, self.best_q, self.best_iteration) = update_best(
+                self.best_cost,
+                self.best_q,
+                self.best_iteration,
+                self.current_iteration,
+                cost,
+                q,
+                self.d_opt,
+                self.last_best,
+                self.cost_delta_threshold,
+                self.cost_relative_threshold,
+            )
+        else:
+            cost = cost.detach()
+            q = q.detach()
+            mask = cost < self.best_cost
+            self.best_cost.copy_(torch.where(mask, cost, self.best_cost))
+            mask = mask.view(mask.shape[0])
+            mask_q = mask.unsqueeze(-1).expand(-1, self.d_opt)
+            self.best_q.copy_(torch.where(mask_q, q, self.best_q))
+
+    def update_nenvs(self, n_envs):
+        self.l_vec = torch.ones(
+            (n_envs, self.num_particles, 1),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
+        )
+        self.best_cost = (
+            torch.ones((n_envs, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype)
+            * 5000000.0
+        )
+        self.best_q = torch.zeros(
+            (n_envs, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self.best_grad_q = torch.zeros(
+            (n_envs, 1, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+
+        # create list:
+        self.alpha_list = self.line_scale.repeat(n_envs, 1, 1)
+        self.zero_alpha_list = self.alpha_list[:, :, 0:1].contiguous()
+        h = self.alpha_list.shape[1]
+        self.c_idx = torch.arange(
+            0, n_envs * h, step=(h), device=self.tensor_args.device, dtype=torch.long
+        )
+        self.best_iteration = torch.zeros(
+            (n_envs), device=self.tensor_args.device, dtype=torch.int16
+        )
+        self.current_iteration = torch.zeros((1), device=self.tensor_args.device, dtype=torch.int16)
+        self.cu_opt_init = False
+        super().update_nenvs(n_envs)
+
+    def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
+        if self.use_cuda_graph:
+            self._create_opt_iters_graph(q, grad_q, shift_steps)
+        self.cu_opt_init = True
+
+    def _create_box_line_search(self, line_search_scale):
+        """Create a (1, n, d_opt) tensor of line-search step scales.
+
+        Args:
+            line_search_scale (List[float]): should have n values
+        """
+        d = []
+        dof_vec = torch.zeros(
+            (self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        for i in line_search_scale:
+            d.append(dof_vec + i)
+        d = torch.stack(d, dim=0).unsqueeze(0)
+        return d
+
+    def _call_opt_iters_graph(self, q, grad_q):
+        if self.use_cuda_graph:
+            self._cu_opt_q_in.copy_(q.detach())
+            self._cu_opt_gq_in.copy_(grad_q.detach())
+            self.cu_opt_graph.replay()
+            return (
+                self._cu_opt_q.clone(),
+                self._cu_opt_cost.clone(),
+                self._cu_q.clone(),
+                self._cu_gq.clone(),
+            )
+        else:
+            return self._opt_iters(q, grad_q)
+
+    def _create_opt_iters_graph(self, q, grad_q, shift_steps):
+        # create a new stream:
+        self._cu_opt_q_in = q.detach().clone()
+        self._cu_opt_gq_in = grad_q.detach().clone()
+        s = torch.cuda.Stream(device=self.tensor_args.device)
+        s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device))
+
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
+                    self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
+                )
+        torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
+
+        self.cu_opt_graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self.cu_opt_graph, stream=s):
+            self._cu_opt_q, self._cu_opt_cost, self._cu_q, self._cu_gq = self._opt_iters(
+                self._cu_opt_q_in, self._cu_opt_gq_in, shift_steps
+            )
+
+
+@torch.jit.script
+def get_x_set_jit(step_vec, x, alpha_list, action_lows, action_highs):
+    x_set = torch.clamp(x.unsqueeze(-2) + alpha_list * step_vec, action_lows, action_highs)
+    return x_set
+
+
+@torch.jit.script
+def _armijo_line_search_tail_jit(
+    c, g_x, step_direction, c_1: float, alpha_list, c_idx, x_set, d_opt: int
+):
+    c_0 = c[:, 0:1]
+    g_0 = g_x[:, 0:1]
+    step_vec = step_direction.unsqueeze(-2)
+    step_vec_T = step_vec.transpose(-1, -2)
+    g_step = g_0 @ step_vec_T
+    # condition 1 (sufficient decrease):
+    armijo_1 = c <= c_0 + c_1 * alpha_list * g_step  # dot product
+
+    # get the last occurrence of True (this will be the largest admissible
+    # alpha value); armijo_1 has 1 for indices that satisfy the condition:
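+    # (the +0.1 offset keeps a satisfying entry at alpha = 0 strictly positive,
+    # so torch.max never picks an entry that fails the condition)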
+    step_success = armijo_1 * (alpha_list[:, :, 0:1] + 0.1)
+
+    _, m_idx = torch.max(step_success, dim=-2)
+
+    m_idx[m_idx == 0] = 1
+
+    m = m_idx.squeeze() + c_idx
+    b, h, _ = x_set.shape
+    g_x = g_x.view(b * h, -1)
+    xs = x_set.view(b * h, -1)
+    cs = c.view(b * h, -1)
+    best_c = cs[m]
+
+    best_x = xs[m]
+    best_grad = g_x[m].view(b, 1, d_opt)
+    return (best_x, best_c, best_grad)
+
+
+@torch.jit.script
+def _wolfe_search_tail_jit(c, g_x, x_set, m, d_opt: int):
+    b, h, _ = x_set.shape
+    g_x = g_x.view(b * h, -1)
+    xs = x_set.view(b * h, -1)
+    cs = c.view(b * h, -1)
+    best_c = cs[m]
+    best_x = xs[m]
+    best_grad = g_x[m].view(b, 1, d_opt)
+    return (best_x, best_c, best_grad)
+
+
+@torch.jit.script
+def scale_action(dx, action_step_max):
+    scale_value = torch.max(torch.abs(dx) / action_step_max, dim=-1, keepdim=True)[0]
+    scale_value = torch.clamp(scale_value, 1.0)
+    dx_scaled = dx / scale_value
+    return dx_scaled
+
+
+@torch.jit.script
+def check_convergence(
+    best_iteration: torch.Tensor, current_iteration: torch.Tensor, last_best: int
+) -> bool:
+    success = False
+    if torch.max(best_iteration).item() <= (-1.0 * (last_best)):
+        success = True
+    return success
diff --git a/src/curobo/opt/opt_base.py b/src/curobo/opt/opt_base.py
new file mode 100644
index 00000000..45cf3a81
--- /dev/null
+++ b/src/curobo/opt/opt_base.py
@@ -0,0 +1,202 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
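+#
+# NOTE (illustrative example): _update_env_kernel() below builds a fixed kernel
+# matrix that expands per-environment data to per-particle data. For n_envs = 2
+# and num_particles = 3, env_kernel_ is the 6 x 2 matrix
+#   [[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]]
+# so get_nenv_tensor(x) == env_kernel_ @ x repeats each environment's row
+# num_particles times.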
+# +# Standard Library +import time +from abc import abstractmethod +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.rollout.rollout_base import Goal, RolloutBase +from curobo.types.base import TensorDeviceType +from curobo.util.logger import log_info +from curobo.util.torch_utils import is_cuda_graph_available + + +@dataclass +class OptimizerConfig: + d_action: int + action_lows: List[float] + action_highs: List[float] + horizon: int + n_iters: int + rollout_fn: RolloutBase + tensor_args: TensorDeviceType + use_cuda_graph: bool + store_debug: bool + debug_info: Any + cold_start_n_iters: int + num_particles: Union[int, None] + n_envs: int + sync_cuda_time: bool + use_coo_sparse: bool + + def __post_init__(self): + object.__setattr__(self, "action_highs", self.tensor_args.to_device(self.action_highs)) + object.__setattr__(self, "action_lows", self.tensor_args.to_device(self.action_lows)) + # check cuda graph version: + if self.use_cuda_graph: + self.use_cuda_graph = is_cuda_graph_available() + if self.num_particles is None: + self.num_particles = 1 + + @staticmethod + def create_data_dict( + data_dict: Dict, + rollout_fn: RolloutBase, + tensor_args: TensorDeviceType = TensorDeviceType(), + child_dict: Optional[Dict] = None, + ): + if child_dict is None: + child_dict = deepcopy(data_dict) + child_dict["d_action"] = rollout_fn.d_action + child_dict["action_lows"] = rollout_fn.action_bound_lows + child_dict["action_highs"] = rollout_fn.action_bound_highs + child_dict["rollout_fn"] = rollout_fn + child_dict["tensor_args"] = tensor_args + child_dict["horizon"] = rollout_fn.horizon + if "num_particles" not in child_dict: + child_dict["num_particles"] = 1 + return child_dict + + +class Optimizer(OptimizerConfig): + def __init__(self, config: Optional[OptimizerConfig] = None) -> None: + if config is not None: + super().__init__(**vars(config)) + self.opt_dt = 0.0 + self.COLD_START = True + self.update_nenvs(self.n_envs) + self._batch_goal = None + self._rollout_list = None + self.debug = [] + self.debug_cost = [] + + def optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor: + if self.COLD_START: + n_iters = self.cold_start_n_iters + self.COLD_START = False + st_time = time.time() + out = self._optimize(opt_tensor, shift_steps, n_iters) + if self.sync_cuda_time: + torch.cuda.synchronize() + self.opt_dt = time.time() - st_time + return out + + @abstractmethod + def _optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor: + pass + + def _shift(self, shift_steps=0): + """ + Shift the variables in the solver to hotstart the next timestep + """ + return + + def update_params(self, goal: Goal): + with profiler.record_function("OptBase/batch_goal"): + if self._batch_goal is not None: + self._batch_goal.copy_(goal, update_idx_buffers=True) # why True? 
+ else: + self._batch_goal = goal.repeat_seeds(self.num_particles) + self.rollout_fn.update_params(self._batch_goal) + + def reset(self): + """ + Reset the controller + """ + self.rollout_fn.reset() + self.debug = [] + self.debug_cost = [] + # self.COLD_START = True + + def update_nenvs(self, n_envs): + assert n_envs > 0 + self._update_env_kernel(n_envs, self.num_particles) + self.n_envs = n_envs + + def _update_env_kernel(self, n_envs, num_particles): + log_info( + "Updating env kernel [n_envs: " + + str(n_envs) + + " , num_particles: " + + str(num_particles) + + " ]" + ) + + self.env_col = torch.arange( + 0, n_envs, step=1, dtype=torch.long, device=self.tensor_args.device + ) + self.n_select_ = torch.tensor( + [x * n_envs + x for x in range(n_envs)], + device=self.tensor_args.device, + dtype=torch.long, + ) + + # create sparse tensor: + sparse_indices = [] + for i in range(n_envs): + sparse_indices.extend([[i * num_particles + x, i] for x in range(num_particles)]) + + sparse_values = torch.ones(len(sparse_indices)) + sparse_indices = torch.tensor(sparse_indices) + if self.use_coo_sparse: + self.env_kernel_ = torch.sparse_coo_tensor( + sparse_indices.t(), + sparse_values, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + else: + self.env_kernel_ = torch.sparse_coo_tensor( + sparse_indices.t(), + sparse_values, + device="cpu", + dtype=self.tensor_args.dtype, + ) + self.env_kernel_ = self.env_kernel_.to_dense().to(device=self.tensor_args.device) + self._env_seeds = self.num_particles + + def get_nenv_tensor(self, x): + """This function takes an input tensor of shape (n_env,....) and converts it into + (n_particles,...) + """ + + # if x.shape[0] != self.n_envs: + # x_env = x.unsqueeze(0).repeat(self.n_envs, 1) + # else: + # x_env = x + + # create a tensor + nx_env = self.env_kernel_ @ x + + return nx_env + + def reset_seed(self): + return True + + def reset_cuda_graph(self): + if self.use_cuda_graph: + self.cu_opt_init = False + else: + log_info("Cuda Graph was not enabled") + self._batch_goal = None + self.rollout_fn.reset_cuda_graph() + + def get_all_rollout_instances(self) -> List[RolloutBase]: + if self._rollout_list is None: + self._rollout_list = [self.rollout_fn] + return self._rollout_list diff --git a/src/curobo/opt/particle/__init__.py b/src/curobo/opt/particle/__init__.py new file mode 100644 index 00000000..42ff7baa --- /dev/null +++ b/src/curobo/opt/particle/__init__.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +This module contains particle-based optimization solvers. +""" diff --git a/src/curobo/opt/particle/parallel_es.py b/src/curobo/opt/particle/parallel_es.py new file mode 100644 index 00000000..e3d955bb --- /dev/null +++ b/src/curobo/opt/particle/parallel_es.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+from dataclasses import dataclass
+from typing import Optional
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.opt.particle.parallel_mppi import CovType, ParallelMPPI, ParallelMPPIConfig
+
+
+@dataclass
+class ParallelESConfig(ParallelMPPIConfig):
+    learning_rate: float = 0.1
+
+
+class ParallelES(ParallelMPPI, ParallelESConfig):
+    def __init__(self, config: Optional[ParallelESConfig] = None):
+        if config is not None:
+            ParallelESConfig.__init__(self, **vars(config))
+        ParallelMPPI.__init__(self)
+
+    def _compute_mean(self, w, actions):
+        if self.cov_type not in [CovType.SIGMA_I, CovType.DIAG_A]:
+            raise NotImplementedError()
+        # use an Evolution Strategies style mean update instead of the MPPI one:
+        new_mean = compute_es_mean(
+            w, actions, self.mean_action, self.full_inv_cov, self.num_particles, self.learning_rate
+        )
+        return new_mean
+
+    def _exp_util(self, total_costs):
+        w = calc_exp(total_costs)
+        return w
+
+
+@torch.jit.script
+def calc_exp(
+    total_costs,
+):
+    total_costs = -1.0 * total_costs
+
+    w = (total_costs - torch.mean(total_costs, keepdim=True, dim=-1)) / torch.std(
+        total_costs, keepdim=True, dim=-1
+    )
+    return w
+
+
+@torch.jit.script
+def compute_es_mean(
+    w, actions, mean_action, full_inv_cov, num_particles: int, learning_rate: float
+):
+    std_w = torch.std(w, dim=[1, 2, 3], keepdim=True, unbiased=False)
+    a_og = (actions - mean_action.unsqueeze(1)) / std_w
+    weighted_seq = (
+        (torch.sum(w * a_og, dim=-3, keepdim=True)) @ (full_inv_cov / num_particles)
+    ).squeeze(1)
+
+    # scale the update by the learning rate (default 0.1):
+    new_mean = mean_action + learning_rate * weighted_seq
+    return new_mean
diff --git a/src/curobo/opt/particle/parallel_mppi.py b/src/curobo/opt/particle/parallel_mppi.py
new file mode 100644
index 00000000..09abb603
--- /dev/null
+++ b/src/curobo/opt/particle/parallel_mppi.py
@@ -0,0 +1,615 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
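+#
+# NOTE (illustrative summary): the distribution update in this file follows the
+# standard MPPI recipe. With sampled action sequences a_i, per-sample costs S_i,
+# and temperature beta (see the jit kernels at the end of this file):
+#   w_i   = softmax(-S_i / beta)                          (jit_calculate_exp_util)
+#   mu    <- (1 - eta_mu) * mu + eta_mu * sum_i w_i a_i   (jit_blend_mean)
+#   Sigma <- (1 - eta_S) * Sigma + eta_S * update + kappa (jit_blend_cov)
+# where eta_mu = step_size_mean and eta_S = step_size_cov.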
+# +# Standard Library +from copy import deepcopy +from dataclasses import dataclass +from enum import Enum +from typing import Dict, Optional + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.opt.particle.particle_opt_base import ParticleOptBase, ParticleOptConfig, SampleMode +from curobo.opt.particle.particle_opt_utils import ( + SquashType, + cost_to_go, + gaussian_entropy, + matrix_cholesky, + scale_ctrl, +) +from curobo.rollout.rollout_base import RolloutBase, Trajectory +from curobo.types.base import TensorDeviceType +from curobo.types.robot import State +from curobo.util.logger import log_info +from curobo.util.sample_lib import HaltonSampleLib, SampleConfig, SampleLib +from curobo.util.tensor_util import copy_tensor + + +class BaseActionType(Enum): + REPEAT = 0 + NULL = 1 + RANDOM = 2 + + +class CovType(Enum): + SIGMA_I = 0 + DIAG_A = 1 + FULL_A = 2 + FULL_HA = 3 + + +@dataclass +class ParallelMPPIConfig(ParticleOptConfig): + init_mean: float + init_cov: float + base_action: BaseActionType + step_size_mean: float + step_size_cov: float + null_act_frac: float + squash_fn: SquashType + cov_type: CovType + sample_params: SampleConfig + update_cov: bool + random_mean: bool + beta: float + alpha: float + gamma: float + kappa: float + sample_per_env: bool + + def __post_init__(self): + self.init_cov = self.tensor_args.to_device(self.init_cov).unsqueeze(0) + self.init_mean = self.tensor_args.to_device(self.init_mean).clone() + return super().__post_init__() + + @staticmethod + @profiler.record_function("parallel_mppi_config/create_data_dict") + def create_data_dict( + data_dict: Dict, + rollout_fn: RolloutBase, + tensor_args: TensorDeviceType = TensorDeviceType(), + child_dict: Optional[Dict] = None, + ): + if child_dict is None: + child_dict = deepcopy(data_dict) + child_dict = ParticleOptConfig.create_data_dict( + data_dict, rollout_fn, tensor_args, child_dict + ) + child_dict["base_action"] = BaseActionType[child_dict["base_action"]] + child_dict["squash_fn"] = SquashType[child_dict["squash_fn"]] + child_dict["cov_type"] = CovType[child_dict["cov_type"]] + child_dict["sample_params"]["d_action"] = rollout_fn.d_action + child_dict["sample_params"]["horizon"] = child_dict["horizon"] + child_dict["sample_params"]["tensor_args"] = tensor_args + child_dict["sample_params"] = SampleConfig(**child_dict["sample_params"]) + + # init_mean: + if "init_mean" not in child_dict: + child_dict["init_mean"] = rollout_fn.get_init_action_seq() + return child_dict + + +class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig): + @profiler.record_function("parallel_mppi/init") + def __init__(self, config: Optional[ParallelMPPIConfig] = None): + if config is not None: + ParallelMPPIConfig.__init__(self, **vars(config)) + ParticleOptBase.__init__(self) + + self.sample_lib = SampleLib(self.sample_params) + self._sample_set = None + self._sample_iter = None + # initialize covariance types: + if self.cov_type == CovType.FULL_HA: + self.I = torch.eye( + self.horizon * self.d_action, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + else: # AxA + self.I = torch.eye( + self.d_action, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + self.Z_seq = torch.zeros( + 1, + self.horizon, + self.d_action, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + self.delta = None + self.mean_action = None + self.act_seq = None + self.cov_action = None + self.best_traj = None + self.scale_tril = None + self.visual_traj 
= None + if self.debug_info is not None and "visual_traj" in self.debug_info.keys(): + self.visual_traj = self.debug_info["visual_traj"] + self.top_values = None + self.top_idx = None + self.top_trajs = None + + self.mean_lib = HaltonSampleLib( + SampleConfig( + self.horizon, + self.d_action, + tensor_args=self.tensor_args, + **{"fixed_samples": False, "seed": 2567, "filter_coeffs": None} + ) + ) + + self.reset_distribution() + self.update_samples() + self._use_cuda_graph = False + self._init_cuda_graph = False + self.info = dict(rollout_time=0.0, entropy=[]) + self._batch_size = -1 + self._store_debug = False + + def get_rollouts(self): + return self.top_trajs + + def reset_distribution(self): + """ + Reset control distribution + """ + + self.reset_mean() + self.reset_covariance() + + def _compute_total_cost(self, costs): + """ + Calculate weights using exponential utility + """ + + # cost_seq = self.gamma_seq * costs + # cost_seq = torch.sum(cost_seq, dim=-1, keepdim=False) / self.gamma_seq[..., 0] + # print(self.gamma_seq.shape, costs.shape) + cost_seq = jit_compute_total_cost(self.gamma_seq, costs) + return cost_seq + + def _exp_util(self, total_costs): + w = jit_calculate_exp_util(self.beta, total_costs) + # w = torch.softmax((-1.0 / self.beta) * total_costs, dim=-1) + return w + + def _compute_mean(self, w, actions): + # get the new means from here + new_mean = torch.sum(w * actions, dim=-3) + return new_mean + + def _compute_covariance(self, w, actions): + if not self.update_cov: + return + # w = w.squeeze(-1).squeeze(-1) + # w = w[0, :] + if self.cov_type == CovType.SIGMA_I: + delta_actions = actions - self.mean_action.unsqueeze(-3) + + # weighted_delta = w * (delta ** 2).T + # cov_update = torch.ean(torch.sum(weighted_delta.T, dim=0)) + # print(cov_update.shape, self.cov_action) + weighted_delta = w * (delta_actions**2) + cov_update = torch.mean( + torch.sum(torch.sum(weighted_delta, dim=-2), dim=-1), dim=-1, keepdim=True + ) + # raise NotImplementedError("Need to implement covariance update of form sigma*I") + + elif self.cov_type == CovType.DIAG_A: + # Diagonal covariance of size AxA + # n, b, h, d = delta_actions.shape + # delta_actions = delta_actions.view(n,b,h*d) + + # weighted_delta = w * (delta_actions**2) + # weighted_delta = + # sum across horizon and mean across particles: + # cov_update = torch.diag(torch.mean(torch.sum(weighted_delta.T , dim=0), dim=0)) + # cov_update = torch.mean(torch.sum(weighted_delta, dim=-2), dim=-2).unsqueeze( + # -2 + # ) # .expand(-1,-1,-1) + cov_update = jit_diag_a_cov_update(w, actions, self.mean_action) + + elif self.cov_type == CovType.FULL_A: + delta_actions = actions - self.mean_action.unsqueeze(-3) + + delta = delta_actions[0, ...] + + raise NotImplementedError + elif self.cov_type == CovType.FULL_HA: + delta_actions = actions - self.mean_action.unsqueeze(-3) + + delta = delta_actions[0, ...] 
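+            # full (horizon * d_action) covariance: flatten each particle's
+            # deviation from the mean and form a weighted outer product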
+ + weighted_delta = ( + torch.sqrt(w) * delta.view(delta.shape[0], delta.shape[1] * delta.shape[2]).T + ) # .unsqueeze(-1) + cov_update = torch.matmul(weighted_delta, weighted_delta.T) + + else: + raise ValueError("Unidentified covariance type in update_distribution") + return cov_update + + def _update_cov_scale(self): + if not self.update_cov: + return + if self.cov_type == CovType.SIGMA_I: + self.scale_tril = torch.sqrt(self.cov_action) + + elif self.cov_type == CovType.DIAG_A: + self.scale_tril.copy_(torch.sqrt(self.cov_action)) + + elif self.cov_type == CovType.FULL_A: + self.scale_tril = matrix_cholesky(self.cov_action) + + elif self.cov_type == CovType.FULL_HA: + raise NotImplementedError + + @torch.no_grad() + def _update_distribution(self, trajectories: Trajectory): + costs = trajectories.costs + actions = trajectories.actions + + total_costs = self._compute_total_cost(costs) + # Let's reshape to n_envs now: + + # first find the means before doing exponential utility: + w = self._exp_util(total_costs) + + # Update best action + if self.sample_mode == SampleMode.BEST: + best_idx = torch.argmax(w, dim=-1) + self.best_traj.copy_(actions[self.env_col, best_idx]) + + if self.store_rollouts and self.visual_traj is not None: + vis_seq = getattr(trajectories.state, self.visual_traj) + top_values, top_idx = torch.topk(total_costs, 20, dim=1) + self.top_values = top_values + self.top_idx = top_idx + top_trajs = torch.index_select(vis_seq, 0, top_idx[0]) + for i in range(1, top_idx.shape[0]): + trajs = torch.index_select(vis_seq, 0, top_idx[i] + (self.particles_per_env * i)) + top_trajs = torch.cat((top_trajs, trajs), dim=0) + if self.top_trajs is None or top_trajs.shape != self.top_trajs: + self.top_trajs = top_trajs + else: + self.top_trajs.copy_(top_trajs) + + w = w.unsqueeze(-1).unsqueeze(-1) + + new_mean = self._compute_mean(w, actions) + # print(new_mean) + if self.update_cov: + cov_update = self._compute_covariance(w, actions) + new_cov = jit_blend_cov(self.cov_action, cov_update, self.step_size_cov, self.kappa) + self.cov_action.copy_(new_cov) + + self._update_cov_scale() + new_mean = jit_blend_mean(self.mean_action, new_mean, self.step_size_mean) + self.mean_action.copy_(new_mean) + + @torch.no_grad() + def sample_actions(self, init_act): + delta = torch.index_select(self._sample_set, 0, self._sample_iter).squeeze(0) + if not self.sample_params.fixed_samples: + self._sample_iter[:] += 1 + self._sample_iter_n += 1 + if self._sample_iter_n >= self.n_iters: + self._sample_iter_n = 0 + self._sample_iter[:] = 0 + log_info( + "Resetting sample iterations in particle opt base to 0, this is okay during graph capture" + ) + scaled_delta = delta * self.full_scale_tril + act_seq = self.mean_action.unsqueeze(-3) + scaled_delta + cat_list = [act_seq] + + if self.neg_per_env > 0: + neg_action = -1.0 * self.mean_action + neg_act_seqs = neg_action.unsqueeze(-3).expand(-1, self.neg_per_env, -1, -1) + cat_list.append(neg_act_seqs) + if self.null_per_env > 0: + cat_list.append( + self.null_act_seqs[: self.null_per_env].unsqueeze(0).expand(self.n_envs, -1, -1, -1) + ) + + act_seq = torch.cat( + (cat_list), + dim=-3, + ) + act_seq = act_seq.reshape(self.total_num_particles, self.horizon, self.d_action) + act_seq = scale_ctrl(act_seq, self.action_lows, self.action_highs, squash_fn=self.squash_fn) + + # if not copy_tensor(act_seq, self.act_seq): + # self.act_seq = act_seq + return act_seq # self.act_seq + + def update_seed(self, init_act): + self.update_init_mean(init_act) + + def update_init_mean(self, 
init_mean):
+        # update mean:
+        if init_mean.shape[0] != self.n_envs:
+            init_mean = init_mean.expand(self.n_envs, -1, -1)
+        if not copy_tensor(init_mean, self.mean_action):
+            self.mean_action = init_mean.clone()
+        if not copy_tensor(init_mean, self.best_traj):
+            self.best_traj = init_mean.clone()
+
+    def reset_mean(self):
+        with profiler.record_function("mppi/reset_mean"):
+            if self.random_mean:
+                mean = self.mean_lib.get_samples([self.n_envs])
+                self.update_init_mean(mean)
+            else:
+                self.update_init_mean(self.init_mean)
+
+    def reset_covariance(self):
+        with profiler.record_function("mppi/reset_cov"):
+            # init_cov can either be a single value, or n_envs x 1 or n_envs x d_action
+
+            if self.cov_type == CovType.SIGMA_I:
+                # init_cov can either be a single value, or n_envs x 1
+                self.cov_action = self.init_cov
+                if self.init_cov.shape[0] != self.n_envs:
+                    self.cov_action = self.init_cov.unsqueeze(0).expand(self.n_envs, -1)
+                self.inv_cov_action = 1.0 / self.cov_action
+                a = torch.sqrt(self.cov_action)
+                if not copy_tensor(a, self.scale_tril):
+                    self.scale_tril = a
+
+            elif self.cov_type == CovType.DIAG_A:
+                # init_cov can either be a single value, or n_envs x 1 or n_envs x d_action
+                init_cov = self.init_cov.clone()
+
+                if len(init_cov.shape) == 1:
+                    init_cov = init_cov.unsqueeze(-1).expand(-1, self.d_action)
+                if len(init_cov.shape) == 2 and init_cov.shape[-1] != self.d_action:
+                    init_cov = init_cov.expand(-1, self.d_action)
+                init_cov = init_cov.unsqueeze(1)
+                if init_cov.shape[0] != self.n_envs:
+                    init_cov = init_cov.expand(self.n_envs, -1, -1)
+                if not copy_tensor(init_cov.clone(), self.cov_action):
+                    self.cov_action = init_cov.clone()
+                self.inv_cov_action = 1.0 / self.cov_action
+                a = torch.sqrt(self.cov_action)
+                if not copy_tensor(a, self.scale_tril):
+                    self.scale_tril = a
+
+            else:
+                raise ValueError("Unidentified covariance type in reset_covariance")
+
+    def _get_action_seq(self, mode: SampleMode):
+        if mode == SampleMode.MEAN:
+            act_seq = self.mean_action
+        elif mode == SampleMode.SAMPLE:
+            delta = self.generate_noise(
+                shape=torch.Size((1, self.horizon)), base_seed=self.seed + 123 * self.num_steps
+            )
+            act_seq = self.mean_action + torch.matmul(delta, self.full_scale_tril)
+        elif mode == SampleMode.BEST:
+            act_seq = self.best_traj
+        else:
+            raise ValueError("Unidentified sampling mode in get_next_action")
+
+        return act_seq
+
+    def generate_noise(self, shape, base_seed=None):
+        """
+        Generate correlated noisy samples using an autoregressive process
+        """
+        delta = self.sample_lib.get_samples(sample_shape=shape, seed=base_seed)
+        return delta
+
+    @property
+    def full_scale_tril(self):
+        """Returns the scale tril of the sampling distribution, expanded over the
+        horizon for the per-action covariance types.
+        """
+        if self.cov_type == CovType.SIGMA_I:
+            return self.scale_tril.unsqueeze(-2).unsqueeze(-2).expand(-1, -1, self.horizon, -1)
+        elif self.cov_type == CovType.DIAG_A:
+            return self.scale_tril.unsqueeze(-2).expand(-1, -1, self.horizon, -1)
+        elif self.cov_type == CovType.FULL_A:
+            return self.scale_tril
+        elif self.cov_type == CovType.FULL_HA:
+            return self.scale_tril
+
+    def _calc_val(self, trajectories: Trajectory):
+        costs = trajectories.costs
+        actions = trajectories.actions
+        delta = actions - self.mean_action.unsqueeze(0)
+
+        traj_costs = cost_to_go(costs, self.gamma_seq)[:, 0]
+        control_costs = self._control_costs(delta)
+        total_costs = traj_costs + self.beta * control_costs
+
+        val = -self.beta * torch.logsumexp((-1.0 / self.beta) * total_costs, dim=-1)
+        return val
+
+    def reset(self):
+        self.reset_distribution()
+
+        self._sample_iter[:] = 0
+        self._sample_iter_n = 0
+        self.update_samples()  # this helps in restarting optimization
+        super().reset()
+
+    @property
+    def squashed_mean(self):
+        return scale_ctrl(
+            self.mean_action, self.action_lows, self.action_highs, squash_fn=self.squash_fn
+        )
+
+    @property
+    def full_cov(self):
+        if self.cov_type == CovType.SIGMA_I:
+            return self.cov_action * self.I
+        elif self.cov_type == CovType.DIAG_A:
+            return torch.diag_embed(self.cov_action)
+        elif self.cov_type == CovType.FULL_A:
+            return self.cov_action
+        elif self.cov_type == CovType.FULL_HA:
+            return self.cov_action
+
+    @property
+    def full_inv_cov(self):
+        if self.cov_type == CovType.SIGMA_I:
+            return self.inv_cov_action * self.I
+        elif self.cov_type == CovType.DIAG_A:
+            return torch.diag_embed(self.inv_cov_action)
+        elif self.cov_type == CovType.FULL_A:
+            return self.inv_cov_action
+        elif self.cov_type == CovType.FULL_HA:
+            return self.inv_cov_action
+
+    @property
+    def entropy(self):
+        ent_L = gaussian_entropy(L=self.full_scale_tril)
+        return ent_L
+
+    def reset_seed(self):
+        self.sample_lib = SampleLib(self.sample_params)
+        self.mean_lib = HaltonSampleLib(
+            SampleConfig(
+                self.horizon,
+                self.d_action,
+                tensor_args=self.tensor_args,
+                **{"fixed_samples": False, "seed": 2567, "filter_coeffs": None}
+            )
+        )
+        # resample if not fixed samples:
+        self.update_samples()
+        super().reset_seed()
+
+    def update_samples(self):
+        with profiler.record_function("mppi/update_samples"):
+            if self.sample_params.fixed_samples:
+                n_iters = 1
+            else:
+                n_iters = self.n_iters
+            if self.sample_per_env:
+                s_set = (
+                    self.sample_lib.get_samples(
+                        sample_shape=[self.sampled_particles_per_env * self.n_envs * n_iters],
+                        base_seed=self.seed,
+                    )
+                    .view(
+                        n_iters,
+                        self.n_envs,
+                        self.sampled_particles_per_env,
+                        self.horizon,
+                        self.d_action,
+                    )
+                    .clone()
+                )
+            else:
+                s_set = self.sample_lib.get_samples(
+                    sample_shape=[n_iters * (self.sampled_particles_per_env)],
+                    base_seed=self.seed,
+                )
+                s_set = s_set.view(
+                    n_iters, 1, self.sampled_particles_per_env, self.horizon, self.d_action
+                )
+                s_set = s_set.repeat(1, self.n_envs, 1, 1, 1).clone()
+            s_set[:, :, -1, :, :] = 0.0
+            if not copy_tensor(s_set, self._sample_set):
+                log_info("ParallelMPPI: Updating sample set")
+                self._sample_set = s_set
+            if self._sample_iter is None:
+                log_info("ParallelMPPI: Resetting sample iterations")
+
+                self._sample_iter = torch.zeros(
+                    (1), dtype=torch.long, device=self.tensor_args.device
+                )
+            else:
+                self._sample_iter[:] = 0
+
+            self._sample_iter_n = 0
+
+    @torch.no_grad()
+    def generate_rollouts(self, init_act=None):
+        """
+        Samples a batch of actions, rolls out trajectories for each particle
+        and returns the resulting observations, costs, and actions
+
+        Args:
+            init_act : torch.Tensor
+                initial action sequence used to seed the samples
+        """
+
+        return super().generate_rollouts(init_act)
+
+
+@torch.jit.script
+def jit_calculate_exp_util(beta: float, total_costs):
+    w = torch.softmax((-1.0 / beta) * total_costs, dim=-1)
+    return w
+
+
+@torch.jit.script
+def jit_compute_total_cost(gamma_seq, costs):
+    cost_seq = gamma_seq * costs
+    cost_seq = torch.sum(cost_seq, dim=-1, keepdim=False) / gamma_seq[..., 0]
+    return cost_seq
+
+
+@torch.jit.script
+def jit_diag_a_cov_update(w, actions, mean_action):
+    delta_actions = actions - mean_action.unsqueeze(-3)
+
+    weighted_delta = w * (delta_actions**2)
+    # sum across horizon and mean across particles:
+    cov_update = torch.mean(torch.sum(weighted_delta, dim=-2), dim=-2).unsqueeze(-2)
+    return cov_update
+
+
+@torch.jit.script
+def jit_blend_cov(cov_action, cov_update, step_size_cov: float, kappa: float):
+    new_cov = (1.0 - step_size_cov) * cov_action + step_size_cov * cov_update + kappa
+    return new_cov
+
+
+@torch.jit.script
+def jit_blend_mean(mean_action, new_mean, step_size_mean: float):
+    mean_update = (1.0 - step_size_mean) * mean_action + step_size_mean * new_mean
+    return mean_update
diff --git a/src/curobo/opt/particle/particle_opt_base.py b/src/curobo/opt/particle/particle_opt_base.py
new file mode 100644
index 00000000..f292bad0
--- /dev/null
+++ b/src/curobo/opt/particle/particle_opt_base.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
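+#
+# NOTE (illustrative summary): derived solvers (e.g., ParallelMPPI) implement
+# only the abstract hooks below; the optimization loop in _run_opt_iters is
+# fixed and, per iteration, amounts to:
+#   act_seq      = sample_actions(init_act)    # draw particles
+#   trajectories = rollout_fn(act_seq)         # evaluate costs
+#   _update_distribution(trajectories)         # re-fit the sampling distribution
+# and finally returns _get_action_seq(sample_mode) (mean / best / sample).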
+#
+
+# Standard Library
+from abc import abstractmethod
+from copy import deepcopy
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.opt.opt_base import Optimizer, OptimizerConfig
+from curobo.rollout.rollout_base import RolloutBase, Trajectory
+from curobo.types.base import TensorDeviceType
+from curobo.types.tensor import T_BHDOF_float, T_HDOF_float
+from curobo.util.logger import log_error, log_info
+
+
+class SampleMode(Enum):
+    MEAN = 0
+    BEST = 1
+    SAMPLE = 2
+
+
+@dataclass
+class ParticleOptInfo:
+    info: Optional[Dict] = None
+
+
+@dataclass
+class ParticleOptConfig(OptimizerConfig):
+    gamma: float
+    sample_mode: SampleMode
+    seed: int
+    calculate_value: bool
+    store_rollouts: bool
+
+    def __post_init__(self):
+        object.__setattr__(self, "action_highs", self.tensor_args.to_device(self.action_highs))
+        object.__setattr__(self, "action_lows", self.tensor_args.to_device(self.action_lows))
+
+        if self.calculate_value and self.use_cuda_graph:
+            log_error("Cannot calculate_value when cuda graph is enabled")
+        return super().__post_init__()
+
+    @staticmethod
+    def create_data_dict(
+        data_dict: Dict,
+        rollout_fn: RolloutBase,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+        child_dict: Optional[Dict] = None,
+    ):
+        if child_dict is None:
+            child_dict = deepcopy(data_dict)
+        child_dict = OptimizerConfig.create_data_dict(
+            data_dict, rollout_fn, tensor_args, child_dict
+        )
+
+        child_dict["sample_mode"] = SampleMode[child_dict["sample_mode"]]
+        if "calculate_value" not in child_dict:
+            child_dict["calculate_value"] = False
+        if "store_rollouts" not in child_dict:
+            child_dict["store_rollouts"] = False
+        return child_dict
+
+
+class ParticleOptBase(Optimizer, ParticleOptConfig):
+    """Base class for sampling-based controllers."""
+
+    @profiler.record_function("particle_opt/init")
+    def __init__(
+        self,
+        config: Optional[ParticleOptConfig] = None,
+    ):
+        if config is not None:
+            super().__init__(**vars(config))
+        Optimizer.__init__(self)
+        self.gamma_seq = torch.cumprod(
+            torch.tensor([1.0] + [self.gamma] * (self.horizon - 1)), dim=0
+        ).reshape(1, self.horizon)
+        self.gamma_seq = self.tensor_args.to_device(self.gamma_seq)
+        self.num_steps = 0
+        self.trajectories = None
+        self.cu_opt_init = False
+        self.info = ParticleOptInfo()
+        self.update_num_particles_per_env(self.num_particles)
+
+    @abstractmethod
+    def _get_action_seq(self, mode=SampleMode):
+        """
+        Get the action sequence to execute on the system based
+        on the current control distribution
+
+        Args:
+            mode : {'mean', 'sample'}
+                how to choose the action to be executed:
+                'mean' plays the mean action and
+                'sample' samples from the distribution
+        """
+        pass
+
+    @abstractmethod
+    def sample_actions(self, init_act: T_BHDOF_float):
+        """
+        Sample actions from the current control distribution
+        """
+        raise NotImplementedError("sample_actions function not implemented")
+
+    def update_seed(self, init_act):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _update_distribution(self, trajectories: Trajectory):
+        """
+        Update the current control distribution using rollout trajectories
+
+        Args:
+            trajectories : dict
+                Rollout trajectories. Contains the following fields:
+                observations : torch.tensor
+                    observations along rollouts
+                actions : torch.tensor
+                    actions sampled from control distribution along rollouts
+                costs : torch.tensor
+                    step costs along rollouts
+        """
+        pass
+
+    def reset(self):
+        """
+        Reset the optimizer
+        """
+        self.num_steps = 0
+        super().reset()
+
+    @abstractmethod
+    def _calc_val(self, trajectories: Trajectory):
+        """
+        Calculate the value of a state given rollouts from a policy
+        """
+        pass
+
+    def check_convergence(self):
+        """
+        Checks if the controller has converged; returns False by default
+        """
+        return False
+
+    def generate_rollouts(self, init_act=None):
+        """
+        Samples a batch of actions, rolls out trajectories for each particle
+        and returns the resulting observations, costs, and actions
+
+        Args:
+            init_act : torch.Tensor
+                initial action sequence used to seed the samples
+        """
+
+        act_seq = self.sample_actions(init_act)
+        trajectories = self.rollout_fn(act_seq)
+        return trajectories
+
+    def _optimize(self, init_act: torch.Tensor, shift_steps=0, n_iters=None):
+        """
+        Optimize for the best action at the current state
+
+        Args:
+            init_act : torch.Tensor
+                action sequence to initialize the optimization from
+            shift_steps : int
+                number of timesteps to shift the previous solution by
+            n_iters : Optional[int]
+                number of optimization iterations; defaults to self.n_iters
+
+        Returns:
+            curr_action_seq : torch.Tensor
+                optimized action sequence; when calculate_value is enabled, the
+                value estimate is stored in self.info["value"]
+        """
+
+        n_iters = n_iters if n_iters is not None else self.n_iters
+
+        # create cuda graph:
+        if self.use_cuda_graph and self.cu_opt_init:
+            curr_action_seq = self._call_cuda_opt_iters(init_act)
+        else:
+            curr_action_seq = self._run_opt_iters(
+                init_act, n_iters=n_iters, shift_steps=shift_steps
+            )
+            if self.use_cuda_graph:
+                if not self.cu_opt_init:
+                    self._initialize_cuda_graph(init_act, shift_steps=shift_steps)
+
+        self.num_steps += 1
+        if self.calculate_value:
+            trajectories = self.generate_rollouts(init_act)
+            value = self._calc_val(trajectories)
+            self.info["value"] = value
+        return curr_action_seq
+
+    def _initialize_cuda_graph(self, init_act: T_HDOF_float, shift_steps=0):
+        log_info("ParticleOptBase: Creating Cuda Graph")
+        self._cu_act_in = init_act.detach().clone()
+
+        # create a new stream:
+        s = torch.cuda.Stream(device=self.tensor_args.device)
+        s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device))
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                self._cu_act_seq = self._run_opt_iters(self._cu_act_in, shift_steps=shift_steps)
+        torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
+
+        self.reset()
+        self.cu_opt_graph = torch.cuda.CUDAGraph()
+
+        with torch.cuda.graph(self.cu_opt_graph, stream=s):
+            self._cu_act_seq = self._run_opt_iters(self._cu_act_in, shift_steps=shift_steps)
+        self.cu_opt_init = True
+
+    def _call_cuda_opt_iters(self, init_act: T_HDOF_float):
+        self._cu_act_in.copy_(init_act.detach())
+        self.cu_opt_graph.replay()
+        return self._cu_act_seq.detach().clone()
+
+    def _run_opt_iters(self, init_act: T_HDOF_float, shift_steps=0, n_iters=None):
+        n_iters = n_iters if n_iters is not None else self.n_iters
+
+        self._shift(shift_steps)
+        self.update_seed(init_act)
+        if not self.use_cuda_graph and self.store_debug:
+            self.debug.append(self._get_action_seq(mode=self.sample_mode).clone())
+
+        for _ in range(n_iters):
+            # generate random simulated trajectories
+            trajectory = 
self.generate_rollouts() + trajectory.actions = trajectory.actions.view( + self.n_envs, self.particles_per_env, self.horizon, self.d_action + ) + trajectory.costs = trajectory.costs.view( + self.n_envs, self.particles_per_env, self.horizon + ) + with profiler.record_function("mpc/mppi/update_distribution"): + self._update_distribution(trajectory) + if not self.use_cuda_graph and self.store_debug: + self.debug.append(self._get_action_seq(mode=self.sample_mode).clone()) + self.debug_cost.append( + torch.min(torch.sum(trajectory.costs, dim=-1), dim=-1)[0].unsqueeze(-1).clone() + ) + + curr_action_seq = self._get_action_seq(mode=self.sample_mode) + return curr_action_seq + + def update_nenvs(self, n_envs): + assert n_envs > 0 + self.total_num_particles = n_envs * self.num_particles + self.cu_opt_init = False + super().update_nenvs(n_envs) + + def update_num_particles_per_env(self, num_particles_per_env): + self.null_per_env = round(int(self.null_act_frac * num_particles_per_env * 0.5)) + + self.neg_per_env = ( + round(int(self.null_act_frac * num_particles_per_env)) - self.null_per_env + ) + + self.sampled_particles_per_env = ( + num_particles_per_env - self.null_per_env - self.neg_per_env + ) + self.particles_per_env = num_particles_per_env + if self.null_per_env > 0: + self.null_act_seqs = torch.zeros( + self.null_per_env, + self.horizon, + self.d_action, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) diff --git a/src/curobo/opt/particle/particle_opt_utils.py b/src/curobo/opt/particle/particle_opt_utils.py new file mode 100644 index 00000000..6785e751 --- /dev/null +++ b/src/curobo/opt/particle/particle_opt_utils.py @@ -0,0 +1,306 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from enum import Enum + +# Third Party +import numpy as np +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.types.base import TensorDeviceType + + +class SquashType(Enum): + CLAMP = 0 + CLAMP_RESCALE = 1 + TANH = 2 + IDENTITY = 3 + + +def scale_ctrl(ctrl, action_lows, action_highs, squash_fn: SquashType = SquashType.CLAMP): + if len(ctrl.shape) == 1: + ctrl = ctrl.unsqueeze(0).unsqueeze(-1) + # ctrl = ctrl[np.newaxis, :, np.newaxis] # TODO: does this work with gpu pytorch? 
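The branches below all end in the same affine map: a control squashed into [-1, 1] is scaled by half the actuator range and shifted to its midpoint. A minimal sketch of that arithmetic with invented bounds (plain tensors, not cuRobo's API):

```python
import torch

# Invented limits for two actuators; real values come from the robot model.
action_lows = torch.tensor([-2.0, -1.0])
action_highs = torch.tensor([2.0, 3.0])

half_range = (action_highs - action_lows) / 2.0  # per-dof half width
mid_range = (action_highs + action_lows) / 2.0   # per-dof center

raw = torch.tensor([[0.0, 5.0]])  # unconstrained samples
squashed = torch.tanh(raw)        # TANH mode: squash into (-1, 1)
ctrl = mid_range.unsqueeze(0) + squashed * half_range.unsqueeze(0)
print(ctrl)  # tensor([[0.0000, 2.9998]]), saturating near the upper bound
```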
+ act_half_range = (action_highs - action_lows) / 2.0 + act_mid_range = (action_highs + action_lows) / 2.0 + if squash_fn == SquashType.CLAMP: + # ctrl = torch.clamp(ctrl, action_lows[0], action_highs[0]) + ctrl = torch.max(torch.min(ctrl, action_highs), action_lows) + return ctrl + elif squash_fn == SquashType.CLAMP_RESCALE: + ctrl = torch.clamp(ctrl, -1.0, 1.0) + elif squash_fn == SquashType.TANH: + ctrl = torch.tanh(ctrl) + elif squash_fn == SquashType.IDENTITY: + return ctrl + return act_mid_range.unsqueeze(0) + ctrl * act_half_range.unsqueeze(0) + + +####################### +## STOMP Covariance ## +####################### + + +@profiler.record_function("particle_opt_utils/get_stomp_cov") +def get_stomp_cov( + horizon: int, + d_action: int, + tensor_args=TensorDeviceType(), + cov_mode="acc", + RETURN_M=False, +): + """Computes the covariance matrix following STOMP motion planner + + Coefficients from here: https://en.wikipedia.org/wiki/Finite_difference_coefficient + More info here: https://github.com/ros-industrial/stomp_ros/blob/7fe40fbe6ad446459d8d4889916c64e276dbf882/stomp_core/src/utils.cpp#L36 + """ + cov, scale_tril, scaled_M = get_stomp_cov_jit(horizon, d_action, cov_mode) + cov = tensor_args.to_device(cov) + scale_tril = tensor_args.to_device(scale_tril) + if RETURN_M: + return cov, scale_tril, tensor_args.to_device(scaled_M) + return cov, scale_tril + + +@torch.jit.script +def get_stomp_cov_jit( + horizon: int, + d_action: int, + cov_mode: str = "acc", +): + vel_fd_array = [0.0, 0.0, 1.0, -2.0, 1.0, 0.0, 0.0] + + fd_array = vel_fd_array + A = torch.zeros( + (d_action * horizon, d_action * horizon), + dtype=torch.float64, + ) + + if cov_mode == "vel": + for k in range(d_action): + for i in range(0, horizon): + for j in range(-3, 4): + # print(j) + index = i + j + if index < 0: + index = 0 + continue + if index >= horizon: + index = horizon - 1 + continue + A[k * horizon + i, k * horizon + index] = fd_array[j + 3] + elif cov_mode == "acc": + for k in range(d_action): + for i in range(0, horizon): + for j in range(-3, 4): + index = i + j + if index < 0: + index = 0 + continue + if index >= horizon: + index = horizon - 1 + continue + + if index >= horizon / 2: + A[k * horizon + i, k * horizon - index - horizon // 2 - 1] = fd_array[j + 3] + else: + A[k * horizon + i, k * horizon + index] = fd_array[j + 3] + + R = torch.matmul(A.transpose(-2, -1), A) + + M = torch.inverse(R) + scaled_M = (1 / horizon) * M / (torch.max(torch.abs(M), dim=1)[0].unsqueeze(0)) + cov = M / torch.max(torch.abs(M)) + + # also compute the cholesky decomposition: + # scale_tril = torch.zeros((d_action * horizon, d_action * horizon), **tensor_args) + scale_tril = torch.linalg.cholesky(cov) + """ + k = 0 + act_cov_matrix = cov[k * horizon:k * horizon + horizon, k * horizon:k * horizon + horizon] + print(act_cov_matrix.shape) + print(torch.det(act_cov_matrix)) + local_cholesky = matrix_cholesky(act_cov_matrix) + for k in range(d_action): + + scale_tril[k * horizon:k * horizon + horizon,k * horizon:k * horizon + horizon] = local_cholesky + """ + + return cov, scale_tril, scaled_M + + +######################## +## Gaussian Utilities ## +######################## + + +def gaussian_logprob(mean, cov, x, cov_type="full"): + """ + Calculate gaussian log prob for given input batch x + Parameters + ---------- + mean (np.ndarray): [N x num_samples] batch of means + cov (np.ndarray): [N x N] covariance matrix + x (np.ndarray): [N x num_samples] batch of sample values + + Returns + -------- + log_prob (np.ndarray): 
[num_samples] log probability of each sample
+    """
+    N = cov.shape[0]
+    if cov_type == "diagonal":
+        cov_diag = cov.diagonal()
+        cov_inv = np.diag(1.0 / cov_diag)
+        cov_logdet = np.sum(np.log(cov_diag))
+    else:
+        cov_logdet = np.log(np.linalg.det(cov))
+        cov_inv = np.linalg.inv(cov)
+    diff = (x - mean).T
+    mahalanobis_dist = -0.5 * np.sum((diff @ cov_inv) * diff, axis=1)
+    const1 = -0.5 * N * np.log(2.0 * np.pi)
+    const2 = -0.5 * cov_logdet
+    log_prob = mahalanobis_dist + const1 + const2
+    return log_prob
+
+
+def gaussian_logprobgrad(mean, cov, x, cov_type="full"):
+    if cov_type == "diagonal":
+        cov_inv = np.diag(1.0 / cov.diagonal())
+    else:
+        cov_inv = np.linalg.inv(cov)
+    diff = (x - mean).T
+    grad = diff @ cov_inv
+    return grad
+
+
+def gaussian_entropy(cov=None, L=None):
+    """
+    Entropy of a multivariate Gaussian given either its covariance
+    matrix or the Cholesky decomposition of its covariance.
+    """
+    if cov is not None:
+        inp_device = cov.device
+        cov_logdet = torch.log(torch.det(cov))
+        N = cov.shape[0]
+    else:
+        inp_device = L.device
+        cov_logdet = 2.0 * torch.sum(torch.log(torch.diagonal(L)))
+        N = L.shape[0]
+
+    term1 = 0.5 * cov_logdet
+    # pre-computed constant: 1.0 + log(2 * pi) = 2.837877066
+    term2 = 0.5 * N * 2.837877066
+
+    ent = term1 + term2
+    return ent.to(inp_device)
+
+
+def gaussian_kl(mean0, cov0, mean1, cov1, cov_type="full"):
+    """
+    KL-divergence between Gaussians given means and covariances:
+    KL(p||q) = E_{p}[log(p) - log(q)]
+    """
+    N = cov0.shape[0]
+    if cov_type == "diagonal":
+        cov1_diag = cov1.diagonal()
+        cov1_inv = np.diag(1.0 / cov1_diag)
+        cov0_logdet = np.sum(np.log(cov0.diagonal()))
+        cov1_logdet = np.sum(np.log(cov1_diag))
+    else:
+        cov1_inv = np.linalg.inv(cov1)
+        cov0_logdet = np.log(np.linalg.det(cov0))
+        cov1_logdet = np.log(np.linalg.det(cov1))
+
+    term1 = 0.5 * np.trace(cov1_inv @ cov0)
+    diff = (mean1 - mean0).T
+    mahalanobis_dist = 0.5 * np.sum((diff @ cov1_inv) * diff, axis=1)
+    term3 = 0.5 * (-1.0 * N + cov1_logdet - cov0_logdet)
+    return term1 + mahalanobis_dist + term3
+
+
+# @torch.jit.script
+def cost_to_go(cost_seq, gamma_seq, only_first=False):
+    # type: (Tensor, Tensor, bool) -> Tensor
+    """
+    Calculate the (discounted) cost to go for a given cost sequence.
+    """
+    cost_seq = gamma_seq * cost_seq  # discounted cost sequence
+    if only_first:
+        cost_seq = torch.sum(cost_seq, dim=-1, keepdim=True) / gamma_seq[..., 0]
+    else:
+        # cost to go, scaled by [1, gamma, gamma**2, ...]:
+        cost_seq = torch.fliplr(torch.cumsum(torch.fliplr(cost_seq), dim=-1))
+        cost_seq /= gamma_seq  # un-scale to get the true discounted cost to go
+    return cost_seq
+
+
+def cost_to_go_np(cost_seq, gamma_seq):
+    """
+    Calculate the (discounted) cost to go for a given cost sequence.
+    """
+    cost_seq = gamma_seq * cost_seq  # discounted cost sequence
+    # cost to go, scaled by [1, gamma, gamma**2, ...]:
+    cost_seq = np.cumsum(cost_seq[:, ::-1], axis=-1)[:, ::-1]
+    cost_seq /= gamma_seq  # un-scale to get the true discounted cost to go
+    return cost_seq
+
+
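The flip/cumsum/flip in `cost_to_go` computes, at every step t, the discounted tail sum over the remaining costs in one vectorized pass. A quick numeric check with an invented horizon and discount (illustrative only, not part of the file):

```python
import torch

gamma, horizon = 0.9, 4
# Same construction as ParticleOptBase.__init__: [1, g, g^2, g^3]
gamma_seq = torch.cumprod(
    torch.tensor([1.0] + [gamma] * (horizon - 1)), dim=0
).reshape(1, horizon)

costs = torch.ones(1, horizon)
scaled = gamma_seq * costs
cost_to_go = torch.fliplr(torch.cumsum(torch.fliplr(scaled), dim=-1)) / gamma_seq
print(cost_to_go)  # tensor([[3.4390, 2.7100, 1.9000, 1.0000]])
# cost_to_go[0, 0] == 1 + 0.9 + 0.81 + 0.729, the full discounted return.
```

+############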
+##Cholesky## +############ +def matrix_cholesky(A): + L = torch.zeros_like(A) + for i in range(A.shape[-1]): + for j in range(i + 1): + s = 0.0 + for k in range(j): + s = s + L[i, k] * L[j, k] + + L[i, j] = torch.sqrt(A[i, i] - s) if (i == j) else (1.0 / L[j, j] * (A[i, j] - s)) + return L + + +# Batched Cholesky decomp +def batch_cholesky(A): + L = torch.zeros_like(A) + + for i in range(A.shape[-1]): + for j in range(i + 1): + s = 0.0 + for k in range(j): + s = s + L[..., i, k] * L[..., j, k] + + L[..., i, j] = ( + torch.sqrt(A[..., i, i] - s) + if (i == j) + else (1.0 / L[..., j, j] * (A[..., i, j] - s)) + ) + return L diff --git a/src/curobo/py.typed b/src/curobo/py.typed new file mode 100644 index 00000000..19d2bfb3 --- /dev/null +++ b/src/curobo/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 (https://www.python.org/dev/peps/pep-0561/) stating that this package uses inline types. \ No newline at end of file diff --git a/src/curobo/rollout/__init__.py b/src/curobo/rollout/__init__.py new file mode 100644 index 00000000..d262a334 --- /dev/null +++ b/src/curobo/rollout/__init__.py @@ -0,0 +1,14 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +This module contains rollout classes +""" diff --git a/src/curobo/rollout/arm_base.py b/src/curobo/rollout/arm_base.py new file mode 100644 index 00000000..da6f6094 --- /dev/null +++ b/src/curobo/rollout/arm_base.py @@ -0,0 +1,751 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
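The matrix_cholesky and batch_cholesky loops above are straightforward ports of the textbook Cholesky-Banachiewicz recurrence; when a hand-rolled batched factorization is not needed, torch.linalg.cholesky returns the same lower-triangular factor. A small sanity check on an invented SPD matrix:

```python
import torch

A = torch.tensor([[4.0, 2.0], [2.0, 3.0]])  # symmetric positive definite
L = torch.linalg.cholesky(A)                # built-in lower-triangular factor
assert torch.allclose(L @ L.T, A)           # L @ L^T reconstructs A
print(L)  # tensor([[2.0000, 0.0000], [1.0000, 1.4142]])
```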
+# +# Standard Library +from abc import abstractmethod +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.geom.sdf.utils import create_collision_checker +from curobo.geom.sdf.world import WorldCollision, WorldCollisionConfig +from curobo.geom.types import WorldConfig +from curobo.rollout.cost.bound_cost import BoundCost, BoundCostConfig +from curobo.rollout.cost.dist_cost import DistCost, DistCostConfig +from curobo.rollout.cost.manipulability_cost import ManipulabilityCost, ManipulabilityCostConfig +from curobo.rollout.cost.primitive_collision_cost import ( + PrimitiveCollisionCost, + PrimitiveCollisionCostConfig, +) +from curobo.rollout.cost.self_collision_cost import SelfCollisionCost, SelfCollisionCostConfig +from curobo.rollout.cost.stop_cost import StopCost, StopCostConfig +from curobo.rollout.dynamics_model.kinematic_model import ( + KinematicModel, + KinematicModelConfig, + KinematicModelState, +) +from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutConfig, RolloutMetrics, Trajectory +from curobo.types.base import TensorDeviceType +from curobo.types.robot import CSpaceConfig, RobotConfig +from curobo.types.state import JointState +from curobo.util.logger import log_info, log_warn +from curobo.util.tensor_util import cat_sum + + +@dataclass +class ArmCostConfig: + bound_cfg: Optional[BoundCostConfig] = None + null_space_cfg: Optional[DistCostConfig] = None + manipulability_cfg: Optional[ManipulabilityCostConfig] = None + stop_cfg: Optional[StopCostConfig] = None + self_collision_cfg: Optional[SelfCollisionCostConfig] = None + primitive_collision_cfg: Optional[PrimitiveCollisionCostConfig] = None + + @staticmethod + def _get_base_keys(): + k_list = { + "null_space_cfg": DistCostConfig, + "manipulability_cfg": ManipulabilityCostConfig, + "stop_cfg": StopCostConfig, + "self_collision_cfg": SelfCollisionCostConfig, + "bound_cfg": BoundCostConfig, + } + return k_list + + @staticmethod + def from_dict( + data_dict: Dict, + robot_config: RobotConfig, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + k_list = ArmCostConfig._get_base_keys() + data = ArmCostConfig._get_formatted_dict( + data_dict, + k_list, + robot_config, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + return ArmCostConfig(**data) + + @staticmethod + def _get_formatted_dict( + data_dict: Dict, + cost_key_list: Dict, + robot_config: RobotConfig, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + data = {} + for k in cost_key_list: + if k in data_dict: + data[k] = cost_key_list[k](**data_dict[k], tensor_args=tensor_args) + if "primitive_collision_cfg" in data_dict and world_coll_checker is not None: + data["primitive_collision_cfg"] = PrimitiveCollisionCostConfig( + **data_dict["primitive_collision_cfg"], + world_coll_checker=world_coll_checker, + tensor_args=tensor_args + ) + + return data + + +@dataclass +class ArmBaseConfig(RolloutConfig): + model_cfg: KinematicModelConfig + cost_cfg: ArmCostConfig + constraint_cfg: ArmCostConfig + convergence_cfg: ArmCostConfig + world_coll_checker: Optional[WorldCollision] = None + + @staticmethod + def model_from_dict( + model_data_dict: Dict, + robot_cfg: RobotConfig, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + return KinematicModelConfig.from_dict(model_data_dict, 
robot_cfg, tensor_args=tensor_args) + + @staticmethod + def cost_from_dict( + cost_data_dict: Dict, + robot_cfg: RobotConfig, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + return ArmCostConfig.from_dict( + cost_data_dict, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + @staticmethod + def world_coll_checker_from_dict( + world_coll_checker_dict: Optional[Dict] = None, + world_model_dict: Optional[Union[WorldConfig, Dict]] = None, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + # TODO: Check which type of collision checker and load that. + if ( + world_coll_checker is None + and world_model_dict is not None + and world_coll_checker_dict is not None + ): + world_coll_cfg = WorldCollisionConfig.load_from_dict( + world_coll_checker_dict, world_model_dict, tensor_args + ) + + world_coll_checker = create_collision_checker(world_coll_cfg) + else: + log_info("*******USING EXISTING COLLISION CHECKER***********") + return world_coll_checker + + @classmethod + @profiler.record_function("arm_base_config/from_dict") + def from_dict( + cls, + robot_cfg: Union[Dict, RobotConfig], + model_data_dict: Dict, + cost_data_dict: Dict, + constraint_data_dict: Dict, + convergence_data_dict: Dict, + world_coll_checker_dict: Optional[Dict] = None, + world_model_dict: Optional[Dict] = None, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + """Create ArmBase class from dictionary + + NOTE: We declare this as a classmethod to allow for derived classes to use it. + + Args: + robot_cfg (Union[Dict, RobotConfig]): _description_ + model_data_dict (Dict): _description_ + cost_data_dict (Dict): _description_ + constraint_data_dict (Dict): _description_ + convergence_data_dict (Dict): _description_ + world_coll_checker_dict (Optional[Dict], optional): _description_. Defaults to None. + world_model_dict (Optional[Dict], optional): _description_. Defaults to None. + world_coll_checker (Optional[WorldCollision], optional): _description_. Defaults to None. + tensor_args (TensorDeviceType, optional): _description_. Defaults to TensorDeviceType(). 
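Taken together, from_dict builds one collision checker and threads it through the cost, constraint, and convergence configs so they all query the same world. A hedged sketch of the call (the cfg dictionary and its keys are hypothetical; real configs are YAML files shipped with cuRobo):

```python
from curobo.types.base import TensorDeviceType

tensor_args = TensorDeviceType()  # bundles the torch device and dtype

# Hypothetical parsed YAML; key names are illustrative only.
# cfg = yaml.safe_load(open("arm_task.yml"))
# config = ArmBaseConfig.from_dict(
#     robot_cfg=cfg["robot_cfg"],
#     model_data_dict=cfg["model"],
#     cost_data_dict=cfg["cost"],
#     constraint_data_dict=cfg["constraint"],
#     convergence_data_dict=cfg["convergence"],
#     world_coll_checker_dict=cfg.get("world_collision_checker_cfg"),
#     world_model_dict=cfg.get("world_model"),
#     tensor_args=tensor_args,
# )
```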
+ + Returns: + _type_: _description_ + """ + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + world_coll_checker = cls.world_coll_checker_from_dict( + world_coll_checker_dict, world_model_dict, world_coll_checker, tensor_args + ) + model = cls.model_from_dict(model_data_dict, robot_cfg, tensor_args=tensor_args) + cost = cls.cost_from_dict( + cost_data_dict, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + constraint = cls.cost_from_dict( + constraint_data_dict, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + convergence = cls.cost_from_dict( + convergence_data_dict, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + return cls( + model_cfg=model, + cost_cfg=cost, + constraint_cfg=constraint, + convergence_cfg=convergence, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + +class ArmBase(RolloutBase, ArmBaseConfig): + """ + This rollout function is for reaching a cartesian pose for a robot + """ + + @profiler.record_function("arm_base/init") + def __init__(self, config: Optional[ArmBaseConfig] = None): + if config is not None: + ArmBaseConfig.__init__(self, **vars(config)) + RolloutBase.__init__(self) + self._init_after_config_load() + + @profiler.record_function("arm_base/init_after_config_load") + def _init_after_config_load(self): + # self.current_state = None + # self.retract_state = None + self._goal_buffer = Goal() + self._goal_idx_update = True + # Create the dynamical system used for rollouts + self.dynamics_model = KinematicModel(self.model_cfg) + + self.n_dofs = self.dynamics_model.n_dofs + self.traj_dt = self.dynamics_model.traj_dt + if self.cost_cfg.bound_cfg is not None: + self.cost_cfg.bound_cfg.set_bounds( + self.dynamics_model.get_state_bounds(), + teleport_mode=self.dynamics_model.teleport_mode, + ) + self.cost_cfg.bound_cfg.cspace_distance_weight = ( + self.dynamics_model.cspace_distance_weight + ) + self.cost_cfg.bound_cfg.state_finite_difference_mode = ( + self.dynamics_model.state_finite_difference_mode + ) + self.cost_cfg.bound_cfg.update_vec_weight(self.dynamics_model.null_space_weight) + + if self.cost_cfg.null_space_cfg is not None: + self.cost_cfg.bound_cfg.null_space_weight = self.cost_cfg.null_space_cfg.weight + log_warn( + "null space cost is deprecated, use null_space_weight in bound cost instead" + ) + + self.bound_cost = BoundCost(self.cost_cfg.bound_cfg) + + if self.cost_cfg.manipulability_cfg is not None: + self.manipulability_cost = ManipulabilityCost(self.cost_cfg.manipulability_cfg) + + if self.cost_cfg.stop_cfg is not None: + self.cost_cfg.stop_cfg.horizon = self.dynamics_model.horizon + self.cost_cfg.stop_cfg.dt_traj_params = self.dynamics_model.dt_traj_params + self.stop_cost = StopCost(self.cost_cfg.stop_cfg) + self._goal_buffer.retract_state = self.retract_state + if self.cost_cfg.primitive_collision_cfg is not None: + self.primitive_collision_cost = PrimitiveCollisionCost( + self.cost_cfg.primitive_collision_cfg + ) + if self.dynamics_model.robot_model.total_spheres == 0: + self.primitive_collision_cost.disable_cost() + + if self.cost_cfg.self_collision_cfg is not None: + self.cost_cfg.self_collision_cfg.self_collision_kin_config = ( + self.dynamics_model.robot_model.get_self_collision_config() + ) + self.robot_self_collision_cost = SelfCollisionCost(self.cost_cfg.self_collision_cfg) + if self.dynamics_model.robot_model.total_spheres == 0: + 
self.robot_self_collision_cost.disable_cost() + + # setup constraint terms: + if self.constraint_cfg.primitive_collision_cfg is not None: + self.primitive_collision_constraint = PrimitiveCollisionCost( + self.constraint_cfg.primitive_collision_cfg + ) + if self.dynamics_model.robot_model.total_spheres == 0: + self.primitive_collision_constraint.disable_cost() + + if self.constraint_cfg.self_collision_cfg is not None: + self.constraint_cfg.self_collision_cfg.self_collision_kin_config = ( + self.dynamics_model.robot_model.get_self_collision_config() + ) + self.robot_self_collision_constraint = SelfCollisionCost( + self.constraint_cfg.self_collision_cfg + ) + + if self.dynamics_model.robot_model.total_spheres == 0: + self.robot_self_collision_constraint.disable_cost() + + self.constraint_cfg.bound_cfg.set_bounds( + self.dynamics_model.get_state_bounds(), teleport_mode=self.dynamics_model.teleport_mode + ) + self.constraint_cfg.bound_cfg.cspace_distance_weight = ( + self.dynamics_model.cspace_distance_weight + ) + self.cost_cfg.bound_cfg.state_finite_difference_mode = ( + self.dynamics_model.state_finite_difference_mode + ) + + self.bound_constraint = BoundCost(self.constraint_cfg.bound_cfg) + + if self.convergence_cfg.null_space_cfg is not None: + self.null_convergence = DistCost(self.convergence_cfg.null_space_cfg) + + # set start state: + start_state = torch.randn((1, self.dynamics_model.d_state), **vars(self.tensor_args)) + self._start_state = JointState( + position=start_state[:, : self.dynamics_model.d_action], + velocity=start_state[:, : self.dynamics_model.d_action], + acceleration=start_state[:, : self.dynamics_model.d_action], + ) + self.update_cost_dt(self.dynamics_model.dt_traj_params.base_dt) + return RolloutBase._init_after_config_load(self) + + def cost_fn(self, state: KinematicModelState, action_batch=None, return_list=False): + # ee_pos_batch, ee_rot_batch = state_dict["ee_pos_seq"], state_dict["ee_rot_seq"] + state_batch = state.state_seq + cost_list = [] + + # compute state bound cost: + if self.bound_cost.enabled: + with profiler.record_function("cost/bound"): + c = self.bound_cost.forward( + state_batch, + self._goal_buffer.retract_state, + self._goal_buffer.batch_retract_state_idx, + ) + cost_list.append(c) + if self.cost_cfg.manipulability_cfg is not None and self.manipulability_cost.enabled: + raise NotImplementedError("Manipulability Cost is not implemented") + if self.cost_cfg.stop_cfg is not None and self.stop_cost.enabled: + st_cost = self.stop_cost.forward(state_batch.velocity) + cost_list.append(st_cost) + if self.cost_cfg.self_collision_cfg is not None and self.robot_self_collision_cost.enabled: + with profiler.record_function("cost/self_collision"): + coll_cost = self.robot_self_collision_cost.forward(state.robot_spheres) + # cost += coll_cost + cost_list.append(coll_cost) + if ( + self.cost_cfg.primitive_collision_cfg is not None + and self.primitive_collision_cost.enabled + ): + with profiler.record_function("cost/collision"): + coll_cost = self.primitive_collision_cost.forward( + state.robot_spheres, + env_query_idx=self._goal_buffer.batch_world_idx, + ) + cost_list.append(coll_cost) + if return_list: + return cost_list + cost = cat_sum(cost_list) + return cost + + def constraint_fn( + self, + state: KinematicModelState, + out_metrics: Optional[RolloutMetrics] = None, + use_batch_env: bool = True, + ) -> RolloutMetrics: + # setup constraint terms: + + constraint = self.bound_constraint.forward(state.state_seq) + constraint_list = [constraint] + if ( + 
self.constraint_cfg.primitive_collision_cfg is not None + and self.primitive_collision_constraint.enabled + ): + if use_batch_env and self._goal_buffer.batch_world_idx is not None: + coll_constraint = self.primitive_collision_constraint.forward( + state.robot_spheres, + env_query_idx=self._goal_buffer.batch_world_idx, + ) + else: + coll_constraint = self.primitive_collision_constraint.forward( + state.robot_spheres, env_query_idx=None + ) + + constraint_list.append(coll_constraint) + if ( + self.constraint_cfg.self_collision_cfg is not None + and self.robot_self_collision_constraint.enabled + ): + self_constraint = self.robot_self_collision_constraint.forward(state.robot_spheres) + constraint_list.append(self_constraint) + constraint = cat_sum(constraint_list) + feasible = constraint == 0.0 + if out_metrics is None: + out_metrics = RolloutMetrics() + out_metrics.feasible = feasible + out_metrics.constraint = constraint + return out_metrics + + def get_metrics(self, state: Union[JointState, KinematicModelState]): + """Compute metrics given state + #TODO: Currently does not compute velocity and acceleration costs. + Args: + state (Union[JointState, URDFModelState]): _description_ + + Returns: + _type_: _description_ + + """ + if isinstance(state, JointState): + state = self._get_augmented_state(state) + out_metrics = self.constraint_fn(state) + out_metrics.state = state + out_metrics = self.convergence_fn(state, out_metrics) + return out_metrics + + def get_metrics_cuda_graph(self, state: JointState): + """Use a CUDA Graph to compute metrics + + Args: + state: _description_ + + Raises: + ValueError: _description_ + + Returns: + _description_ + """ + if not self._metrics_cuda_graph_init: + # create new cuda graph for metrics: + self._cu_metrics_state_in = state.detach().clone() + s = torch.cuda.Stream(device=self.tensor_args.device) + s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device)) + with torch.cuda.stream(s): + for _ in range(3): + self._cu_out_metrics = self.get_metrics(self._cu_metrics_state_in) + torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s) + self.cu_metrics_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self.cu_metrics_graph, stream=s): + self._cu_out_metrics = self.get_metrics(self._cu_metrics_state_in) + self._metrics_cuda_graph_init = True + self._cu_metrics_state_in.copy_(state) + self.cu_metrics_graph.replay() + out_metrics = self._cu_out_metrics + return out_metrics.clone() + + @abstractmethod + def convergence_fn( + self, state: KinematicModelState, out_metrics: Optional[RolloutMetrics] = None + ): + if out_metrics is None: + out_metrics = RolloutMetrics() + if ( + self.convergence_cfg.null_space_cfg is not None + and self.null_convergence.enabled + and self._goal_buffer.batch_retract_state_idx is not None + ): + out_metrics.cost = self.null_convergence.forward_target_idx( + self._goal_buffer.retract_state, + state.state_seq.position, + self._goal_buffer.batch_retract_state_idx, + ) + + return out_metrics + + def _get_augmented_state(self, state: JointState) -> KinematicModelState: + aug_state = self.compute_kinematics(state) + if len(aug_state.state_seq.position.shape) == 2: + aug_state.state_seq = aug_state.state_seq.unsqueeze(1) + aug_state.ee_pos_seq = aug_state.ee_pos_seq.unsqueeze(1) + aug_state.ee_quat_seq = aug_state.ee_quat_seq.unsqueeze(1) + if aug_state.lin_jac_seq is not None: + aug_state.lin_jac_seq = aug_state.lin_jac_seq.unsqueeze(1) + if aug_state.ang_jac_seq is not None: + aug_state.ang_jac_seq = 
aug_state.ang_jac_seq.unsqueeze(1) + aug_state.robot_spheres = aug_state.robot_spheres.unsqueeze(1) + aug_state.link_pos_seq = aug_state.link_pos_seq.unsqueeze(1) + aug_state.link_quat_seq = aug_state.link_quat_seq.unsqueeze(1) + return aug_state + + def compute_kinematics(self, state: JointState) -> KinematicModelState: + # assume input is joint state? + h = 0 + current_state = state # .detach().clone() + if len(current_state.position.shape) == 1: + current_state = current_state.unsqueeze(0) + + q = current_state.position + if len(q.shape) == 3: + b, h, _ = q.shape + q = q.view(b * h, -1) + + ( + ee_pos_seq, + ee_rot_seq, + lin_jac_seq, + ang_jac_seq, + link_pos_seq, + link_rot_seq, + link_spheres, + ) = self.dynamics_model.robot_model.forward(q) + + if h != 0: + ee_pos_seq = ee_pos_seq.view(b, h, 3) + ee_rot_seq = ee_rot_seq.view(b, h, 4) + if lin_jac_seq is not None: + lin_jac_seq = lin_jac_seq.view(b, h, 3, self.n_dofs) + if ang_jac_seq is not None: + ang_jac_seq = ang_jac_seq.view(b, h, 3, self.n_dofs) + link_spheres = link_spheres.view(b, h, link_spheres.shape[-2], link_spheres.shape[-1]) + link_pos_seq = link_pos_seq.view(b, h, -1, 3) + link_rot_seq = link_rot_seq.view(b, h, -1, 4) + + state = KinematicModelState( + current_state, + ee_pos_seq, + ee_rot_seq, + link_spheres, + link_pos_seq, + link_rot_seq, + lin_jac_seq, + ang_jac_seq, + link_names=self.kinematics.link_names, + ) + return state + + def rollout_constraint( + self, act_seq: torch.Tensor, use_batch_env: bool = True + ) -> RolloutMetrics: + state = self.dynamics_model.forward(self.start_state, act_seq) + metrics = self.constraint_fn(state, use_batch_env=use_batch_env) + return metrics + + def rollout_constraint_cuda_graph(self, act_seq: torch.Tensor, use_batch_env: bool = True): + # TODO: move this to RolloutBase + if not self._rollout_constraint_cuda_graph_init: + # create new cuda graph for metrics: + self._cu_rollout_constraint_act_in = act_seq.clone() + s = torch.cuda.Stream(device=self.tensor_args.device) + s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device)) + with torch.cuda.stream(s): + for _ in range(3): + state = self.dynamics_model.forward(self.start_state, act_seq) + self._cu_rollout_constraint_out_metrics = self.constraint_fn( + state, use_batch_env=use_batch_env + ) + torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s) + self.cu_rollout_constraint_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self.cu_rollout_constraint_graph, stream=s): + state = self.dynamics_model.forward(self.start_state, act_seq) + self._cu_rollout_constraint_out_metrics = self.constraint_fn( + state, use_batch_env=use_batch_env + ) + self._rollout_constraint_cuda_graph_init = True + self._cu_rollout_constraint_act_in.copy_(act_seq) + self.cu_rollout_constraint_graph.replay() + out_metrics = self._cu_rollout_constraint_out_metrics + return out_metrics.clone() + + def rollout_fn(self, act_seq) -> Trajectory: + """ + Return sequence of costs and states encountered + by simulating a batch of action sequences + + Parameters + ---------- + action_seq: torch.Tensor [num_particles, horizon, d_act] + """ + # print(act_seq.shape, self._goal_buffer.batch_current_state_idx) + if self.start_state is None: + raise ValueError("start_state is not set in rollout") + with profiler.record_function("robot_model/rollout"): + state = self.dynamics_model.forward( + self.start_state, act_seq, self._goal_buffer.batch_current_state_idx + ) + with profiler.record_function("cost/all"): + cost_seq = 
self.cost_fn(state, act_seq)
+
+        sim_trajs = Trajectory(actions=act_seq, costs=cost_seq, state=state)
+
+        return sim_trajs
+
+    def update_params(self, goal: Goal):
+        """
+        Updates the goal targets for the cost functions.
+        """
+        with profiler.record_function("arm_base/update_params"):
+            self._goal_buffer.copy_(
+                goal, update_idx_buffers=self._goal_idx_update
+            )  # TODO: convert this to a reference to avoid extra copy
+
+        # TODO: move start state also inside Goal instance
+        if goal.current_state is not None:
+            if self.start_state is None:
+                self.start_state = goal.current_state.clone()
+            else:
+                self.start_state = self.start_state.copy_(goal.current_state)
+        self.batch_size = goal.batch
+        return True
+
+    def get_ee_pose(self, current_state):
+        current_state = current_state.to(**self.tensor_args)
+
+        (ee_pos_batch, ee_quat_batch) = self.dynamics_model.robot_model.forward(
+            current_state[:, : self.dynamics_model.n_dofs]
+        )[0:2]
+
+        state = KinematicModelState(current_state, ee_pos_batch, ee_quat_batch)
+        return state
+
+    def current_cost(self, current_state: JointState, no_coll=False, return_state=True, **kwargs):
+        state = self._get_augmented_state(current_state)
+
+        if "horizon_cost" not in kwargs:
+            kwargs["horizon_cost"] = False
+
+        cost = self.cost_fn(state, None, no_coll=no_coll, **kwargs)
+
+        if return_state:
+            return cost, state
+        else:
+            return cost
+
+    def filter_robot_state(self, current_state: JointState) -> JointState:
+        return self.dynamics_model.filter_robot_state(current_state)
+
+    def get_robot_command(
+        self,
+        current_state: JointState,
+        act_seq: torch.Tensor,
+        shift_steps: int = 1,
+        state_idx: Optional[torch.Tensor] = None,
+    ) -> JointState:
+        return self.dynamics_model.get_robot_command(
+            current_state,
+            act_seq,
+            shift_steps=shift_steps,
+            state_idx=state_idx,
+        )
+
+    def reset(self):
+        self.dynamics_model.state_filter.reset()
+        super().reset()
+
+    @property
+    def d_action(self):
+        return self.dynamics_model.d_action
+
+    @property
+    def action_bound_lows(self):
+        return self.dynamics_model.action_bound_lows
+
+    @property
+    def action_bound_highs(self):
+        return self.dynamics_model.action_bound_highs
+
+    @property
+    def state_bounds(self) -> Dict[str, List[float]]:
+        return self.dynamics_model.get_state_bounds()
+
+    @property
+    def dt(self):
+        return self.dynamics_model.dt
+
+    @property
+    def horizon(self):
+        return self.dynamics_model.horizon
+
+    def get_init_action_seq(self) -> torch.Tensor:
+        act_seq = self.dynamics_model.init_action_mean.unsqueeze(0).repeat(self.batch_size, 1, 1)
+        return act_seq
+
+    def reset_cuda_graph(self):
+        self._goal_idx_update = True
+
+        super().reset_cuda_graph()
+
+    def get_action_from_state(self, state: JointState):
+        return self.dynamics_model.get_action_from_state(state)
+
+    def get_state_from_action(
+        self,
+        start_state: JointState,
+        act_seq: torch.Tensor,
+        state_idx: Optional[torch.Tensor] = None,
+    ):
+        return self.dynamics_model.get_state_from_action(start_state, act_seq, state_idx)
+
+    @property
+    def kinematics(self):
+        return self.dynamics_model.robot_model
+
+    @property
+    def cspace_config(self) -> CSpaceConfig:
+        return self.dynamics_model.robot_model.kinematics_config.cspace
+
+    def get_full_dof_from_solution(self, q_js: JointState) -> JointState:
+        """This function adds back all the dofs that are locked during optimization.
+ + + Args: + q_sol: _description_ + + Returns: + _description_ + """ + if self.kinematics.lock_jointstate is None: + return q_js + all_joint_names = self.kinematics.all_articulated_joint_names + lock_joint_state = self.kinematics.lock_jointstate + + new_js = q_js.get_augmented_joint_state(all_joint_names, lock_joint_state) + return new_js + + @property + def joint_names(self) -> List[str]: + return self.kinematics.joint_names + + @property + def retract_state(self): + return self.dynamics_model.retract_config + + def update_traj_dt( + self, + dt: Union[float, torch.Tensor], + base_dt: Optional[float] = None, + max_dt: Optional[float] = None, + base_ratio: Optional[float] = None, + ): + self.dynamics_model.update_traj_dt(dt, base_dt, max_dt, base_ratio) + self.update_cost_dt(dt) + + def update_cost_dt(self, dt: float): + # scale any temporal costs by dt: + self.bound_cost.update_dt(dt) + if self.cost_cfg.primitive_collision_cfg is not None: + self.primitive_collision_cost.update_dt(dt) diff --git a/src/curobo/rollout/arm_reacher.py b/src/curobo/rollout/arm_reacher.py new file mode 100644 index 00000000..e99b699c --- /dev/null +++ b/src/curobo/rollout/arm_reacher.py @@ -0,0 +1,403 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import Dict, Optional + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.geom.sdf.world import WorldCollision +from curobo.rollout.cost.cost_base import CostConfig +from curobo.rollout.cost.dist_cost import DistCost, DistCostConfig +from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig +from curobo.rollout.cost.straight_line_cost import StraightLineCost +from curobo.rollout.cost.zero_cost import ZeroCost +from curobo.rollout.dynamics_model.kinematic_model import KinematicModelState +from curobo.rollout.rollout_base import Goal, RolloutMetrics +from curobo.types.base import TensorDeviceType +from curobo.types.robot import RobotConfig +from curobo.types.tensor import T_BValue_float +from curobo.util.helpers import list_idx_if_not_none +from curobo.util.tensor_util import cat_max, cat_sum + +# Local Folder +from .arm_base import ArmBase, ArmBaseConfig, ArmCostConfig + + +@dataclass +class ArmReacherMetrics(RolloutMetrics): + cspace_error: Optional[T_BValue_float] = None + position_error: Optional[T_BValue_float] = None + rotation_error: Optional[T_BValue_float] = None + pose_error: Optional[T_BValue_float] = None + + def __getitem__(self, idx): + d_list = [ + self.cost, + self.constraint, + self.feasible, + self.state, + self.cspace_error, + self.position_error, + self.rotation_error, + self.pose_error, + ] + idx_vals = list_idx_if_not_none(d_list, idx) + return ArmReacherMetrics(*idx_vals) + + def clone(self, clone_state=False): + if clone_state: + raise NotImplementedError() + return ArmReacherMetrics( + cost=None if self.cost is None else self.cost.clone(), + constraint=None if self.constraint is None else self.constraint.clone(), + feasible=None if self.feasible is None else self.feasible.clone(), + state=None 
if self.state is None else self.state, + cspace_error=None if self.cspace_error is None else self.cspace_error, + position_error=None if self.position_error is None else self.position_error, + rotation_error=None if self.rotation_error is None else self.rotation_error, + pose_error=None if self.pose_error is None else self.pose_error, + ) + + +@dataclass +class ArmReacherCostConfig(ArmCostConfig): + pose_cfg: Optional[PoseCostConfig] = None + cspace_cfg: Optional[DistCostConfig] = None + straight_line_cfg: Optional[CostConfig] = None + zero_acc_cfg: Optional[CostConfig] = None + zero_vel_cfg: Optional[CostConfig] = None + zero_jerk_cfg: Optional[CostConfig] = None + + @staticmethod + def _get_base_keys(): + base_k = ArmCostConfig._get_base_keys() + # add new cost terms: + new_k = { + "pose_cfg": PoseCostConfig, + "cspace_cfg": DistCostConfig, + "straight_line_cfg": CostConfig, + "zero_acc_cfg": CostConfig, + "zero_vel_cfg": CostConfig, + "zero_jerk_cfg": CostConfig, + } + new_k.update(base_k) + return new_k + + @staticmethod + def from_dict( + data_dict: Dict, + robot_cfg: RobotConfig, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + k_list = ArmReacherCostConfig._get_base_keys() + data = ArmCostConfig._get_formatted_dict( + data_dict, + k_list, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + return ArmReacherCostConfig(**data) + + +@dataclass +class ArmReacherConfig(ArmBaseConfig): + cost_cfg: ArmReacherCostConfig + constraint_cfg: ArmReacherCostConfig + convergence_cfg: ArmReacherCostConfig + + @staticmethod + def cost_from_dict( + cost_data_dict: Dict, + robot_cfg: RobotConfig, + world_coll_checker: Optional[WorldCollision] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + ): + return ArmReacherCostConfig.from_dict( + cost_data_dict, + robot_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + +@torch.jit.script +def _compute_g_dist_jit(rot_err_norm, goal_dist): + # goal_cost = goal_cost.view(cost.shape) + # rot_err_norm = rot_err_norm.view(cost.shape) + # goal_dist = goal_dist.view(cost.shape) + g_dist = goal_dist.unsqueeze(-1) + 10.0 * rot_err_norm.unsqueeze(-1) + return g_dist + + +class ArmReacher(ArmBase, ArmReacherConfig): + """ + .. 
inheritance-diagram:: curobo.rollout.arm_reacher.ArmReacher + """ + + @profiler.record_function("arm_reacher/init") + def __init__(self, config: Optional[ArmReacherConfig] = None): + if config is not None: + ArmReacherConfig.__init__(self, **vars(config)) + ArmBase.__init__(self) + + # self.goal_state = None + # self.goal_ee_pos = None + # self.goal_ee_rot = None + # self.goal_ee_quat = None + self._compute_g_dist = False + self._n_goalset = 1 + + if self.cost_cfg.cspace_cfg is not None: + # self.cost_cfg.cspace_cfg.update_vec_weight(self.dynamics_model.cspace_distance_weight) + self.dist_cost = DistCost(self.cost_cfg.cspace_cfg) + if self.cost_cfg.pose_cfg is not None: + self.goal_cost = PoseCost(self.cost_cfg.pose_cfg) + self._link_pose_costs = {} + for i in self.kinematics.link_names: + if i != self.kinematics.ee_link: + self._link_pose_costs[i] = PoseCost(self.cost_cfg.pose_cfg) + if self.cost_cfg.straight_line_cfg is not None: + self.straight_line_cost = StraightLineCost(self.cost_cfg.straight_line_cfg) + if self.cost_cfg.zero_vel_cfg is not None: + self.zero_vel_cost = ZeroCost(self.cost_cfg.zero_vel_cfg) + self._max_vel = self.state_bounds["velocity"][1] + if self.zero_vel_cost.hinge_value is not None: + self._compute_g_dist = True + if self.cost_cfg.zero_acc_cfg is not None: + self.zero_acc_cost = ZeroCost(self.cost_cfg.zero_acc_cfg) + self._max_vel = self.state_bounds["velocity"][1] + if self.zero_acc_cost.hinge_value is not None: + self._compute_g_dist = True + + if self.cost_cfg.zero_jerk_cfg is not None: + self.zero_jerk_cost = ZeroCost(self.cost_cfg.zero_jerk_cfg) + self._max_vel = self.state_bounds["velocity"][1] + if self.zero_jerk_cost.hinge_value is not None: + self._compute_g_dist = True + + self.z_tensor = torch.tensor( + 0, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.convergence_cfg.pose_cfg is not None: + self.pose_convergence = PoseCost(self.convergence_cfg.pose_cfg) + self._link_pose_convergence = {} + for i in self.kinematics.link_names: + if i != self.kinematics.ee_link: + self._link_pose_convergence[i] = PoseCost(self.convergence_cfg.pose_cfg) + if self.convergence_cfg.cspace_cfg is not None: + self.cspace_convergence = DistCost(self.convergence_cfg.cspace_cfg) + + # check if g_dist is required in any of the cost terms: + self.update_params(Goal(current_state=self._start_state)) + + def cost_fn(self, state: KinematicModelState, action_batch=None): + """ + Compute cost given that state dictionary and actions + + + :class:`curobo.rollout.cost.PoseCost` + :class:`curobo.rollout.cost.DistCost` + + """ + state_batch = state.state_seq + with profiler.record_function("cost/base"): + cost_list = super(ArmReacher, self).cost_fn(state, action_batch, return_list=True) + ee_pos_batch, ee_quat_batch = state.ee_pos_seq, state.ee_quat_seq + g_dist = None + with profiler.record_function("cost/pose"): + if ( + self._goal_buffer.goal_pose.position is not None + and self.cost_cfg.pose_cfg is not None + and self.goal_cost.enabled + ): + if self._compute_g_dist: + goal_cost, rot_err_norm, goal_dist = self.goal_cost.forward_out_distance( + ee_pos_batch, + ee_quat_batch, + self._goal_buffer, + ) + + g_dist = _compute_g_dist_jit(rot_err_norm, goal_dist) + else: + goal_cost = self.goal_cost.forward( + ee_pos_batch, ee_quat_batch, self._goal_buffer + ) + cost_list.append(goal_cost) + with profiler.record_function("cost/link_poses"): + if self._goal_buffer.links_goal_pose is not None and self.cost_cfg.pose_cfg is not None: + link_poses = state.link_pose + + 
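Conceptually, every constrained link contributes its own pose term that is appended to cost_list, just like the end-effector cost above. A toy stand-in for the accumulation that follows, with invented link names and a squared-error cost in place of PoseCost:

```python
import torch

def toy_pose_cost(pos: torch.Tensor, goal: torch.Tensor) -> torch.Tensor:
    # Stand-in for PoseCost.forward: squared position error only.
    return torch.sum((pos - goal) ** 2, dim=-1)

link_goals = {"left_tool": torch.tensor([0.30, 0.20, 0.50]),
              "right_tool": torch.tensor([0.30, -0.20, 0.50])}
link_pos = {"left_tool": torch.tensor([0.31, 0.18, 0.50]),
            "right_tool": torch.tensor([0.28, -0.20, 0.52])}

cost_list = [toy_pose_cost(link_pos[k], g) for k, g in link_goals.items()]
print(torch.stack(cost_list).sum())  # tensor(0.0013)
```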
for k in self._goal_buffer.links_goal_pose.keys(): + if k != self.kinematics.ee_link: + current_fn = self._link_pose_costs[k] + if current_fn.enabled: + # get link pose + current_pose = link_poses[k] + current_pos = current_pose.position + current_quat = current_pose.quaternion + + c = current_fn.forward(current_pos, current_quat, self._goal_buffer, k) + cost_list.append(c) + + if ( + self._goal_buffer.goal_state is not None + and self.cost_cfg.cspace_cfg is not None + and self.dist_cost.enabled + ): + joint_cost = self.dist_cost.forward_target_idx( + self._goal_buffer.goal_state.position, + state_batch.position, + self._goal_buffer.batch_goal_state_idx, + ) + cost_list.append(joint_cost) + if self.cost_cfg.straight_line_cfg is not None and self.straight_line_cost.enabled: + st_cost = self.straight_line_cost.forward(ee_pos_batch) + cost_list.append(st_cost) + + if ( + self.cost_cfg.zero_acc_cfg is not None + and self.zero_acc_cost.enabled + # and g_dist is not None + ): + z_acc = self.zero_acc_cost.forward( + state_batch.acceleration, + g_dist, + ) + + # cost += z_acc + cost_list.append(z_acc) + # print(self.cost_cfg.zero_jerk_cfg) + if ( + self.cost_cfg.zero_jerk_cfg is not None + and self.zero_jerk_cost.enabled + # and g_dist is not None + ): + # jerk = self.dynamics_model._aux_matrix @ state_batch.acceleration + z_jerk = self.zero_jerk_cost.forward( + state_batch.jerk, + g_dist, + ) + cost_list.append(z_jerk) + # cost += z_jerk + + if ( + self.cost_cfg.zero_vel_cfg is not None + and self.zero_vel_cost.enabled + # and g_dist is not None + ): + z_vel = self.zero_vel_cost.forward( + state_batch.velocity, + g_dist, + ) + # cost += z_vel + # print(z_vel.shape) + cost_list.append(z_vel) + cost = cat_sum(cost_list) + return cost + + def convergence_fn( + self, state: KinematicModelState, out_metrics: Optional[ArmReacherMetrics] = None + ) -> ArmReacherMetrics: + if out_metrics is None: + out_metrics = ArmReacherMetrics() + if not isinstance(out_metrics, ArmReacherMetrics): + out_metrics = ArmReacherMetrics(**vars(out_metrics)) + # print(self._goal_buffer.batch_retract_state_idx) + out_metrics = super(ArmReacher, self).convergence_fn(state, out_metrics) + + # compute error with pose? 
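The metrics that follow reduce to per-goal pose errors rather than costs. The position part is a plain Euclidean distance (cuRobo's rotation error uses a quaternion metric, elided here); with toy numbers:

```python
import torch

ee_pos = torch.tensor([[0.40, 0.10, 0.52]])    # rollout end-effector position
goal_pos = torch.tensor([[0.40, 0.12, 0.50]])  # goal position
position_error = torch.linalg.norm(ee_pos - goal_pos, dim=-1)
print(position_error)  # tensor([0.0283]); converged once below a threshold
```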
+ if ( + self._goal_buffer.goal_pose.position is not None + and self.convergence_cfg.pose_cfg is not None + ): + ( + out_metrics.pose_error, + out_metrics.rotation_error, + out_metrics.position_error, + ) = self.pose_convergence.forward_out_distance( + state.ee_pos_seq, state.ee_quat_seq, self._goal_buffer + ) + if ( + self._goal_buffer.links_goal_pose is not None + and self.convergence_cfg.pose_cfg is not None + ): + pose_error = [out_metrics.pose_error] + position_error = [out_metrics.position_error] + quat_error = [out_metrics.rotation_error] + link_poses = state.link_pose + + for k in self._goal_buffer.links_goal_pose.keys(): + if k != self.kinematics.ee_link: + current_fn = self._link_pose_convergence[k] + if current_fn.enabled: + # get link pose + current_pos = link_poses[k].position + current_quat = link_poses[k].quaternion + + pose_err, pos_err, quat_err = current_fn.forward_out_distance( + current_pos, current_quat, self._goal_buffer, k + ) + pose_error.append(pose_err) + position_error.append(pos_err) + quat_error.append(quat_err) + out_metrics.pose_error = cat_max(pose_error) + out_metrics.rotation_error = cat_max(quat_error) + out_metrics.position_error = cat_max(position_error) + + if ( + self._goal_buffer.goal_state is not None + and self.convergence_cfg.cspace_cfg is not None + and self.cspace_convergence.enabled + ): + _, out_metrics.cspace_error = self.cspace_convergence.forward_target_idx( + self._goal_buffer.goal_state.position, + state.state_seq.position, + self._goal_buffer.batch_goal_state_idx, + True, + ) + + return out_metrics + + def update_params( + self, + goal: Goal, + ): + """ + Update params for the cost terms and dynamics model. + + """ + + super(ArmReacher, self).update_params(goal) + if goal.batch_pose_idx is not None: + self._goal_idx_update = False + if goal.goal_pose.position is not None: + self.enable_cspace_cost(False) + return True + + def enable_pose_cost(self, enable: bool = True): + if enable: + self.goal_cost.enable_cost() + else: + self.goal_cost.disable_cost() + + def enable_cspace_cost(self, enable: bool = True): + if enable: + self.dist_cost.enable_cost() + self.cspace_convergence.enable_cost() + else: + self.dist_cost.disable_cost() + self.cspace_convergence.disable_cost() diff --git a/src/curobo/rollout/cost/__init__.py b/src/curobo/rollout/cost/__init__.py new file mode 100644 index 00000000..4b86de0f --- /dev/null +++ b/src/curobo/rollout/cost/__init__.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +""" +""" diff --git a/src/curobo/rollout/cost/bound_cost.py b/src/curobo/rollout/cost/bound_cost.py new file mode 100644 index 00000000..7d18efca --- /dev/null +++ b/src/curobo/rollout/cost/bound_cost.py @@ -0,0 +1,1124 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Union + +# Third Party +import torch +import warp as wp + +# CuRobo +from curobo.cuda_robot_model.types import JointLimits +from curobo.types.robot import JointState +from curobo.types.tensor import T_DOF +from curobo.util.warp import init_warp + +# Local Folder +from .cost_base import CostBase, CostConfig + +wp.set_module_options({"fast_math": False}) + + +class BoundCostType(Enum): + POSITION = 0 + BOUNDS = 1 + BOUNDS_SMOOTH = 2 + + +@dataclass +class BoundCostConfig(CostConfig): + joint_limits: Optional[JointLimits] = None + smooth_weight: Optional[List[float]] = None + run_weight_velocity: float = 0.0 + run_weight_acceleration: float = 0.0 + run_weight_jerk: float = 0.0 + cspace_distance_weight: Optional[T_DOF] = None + cost_type: Optional[BoundCostType] = None + activation_distance: Union[torch.Tensor, float] = 0.0 + state_finite_difference_mode: str = "BACKWARD" + null_space_weight: Optional[List[float]] = None + + def set_bounds(self, bounds: JointLimits, teleport_mode: bool = False): + self.joint_limits = bounds.clone() + if teleport_mode: + self.cost_type = BoundCostType.POSITION + + def __post_init__(self): + if isinstance(self.activation_distance, List): + self.activation_distance = self.tensor_args.to_device(self.activation_distance) + elif isinstance(self.activation_distance, float): + raise ValueError("Activation distance is a list for bound cost.") + if self.smooth_weight is not None: + self.smooth_weight = self.tensor_args.to_device(self.smooth_weight) + + if self.cost_type is None: + if self.smooth_weight is not None: + self.cost_type = BoundCostType.BOUNDS_SMOOTH + else: + self.cost_type = BoundCostType.BOUNDS + + if self.cspace_distance_weight is not None: + self.cspace_distance_weight = 1.0 + self.cspace_distance_weight / torch.max( + self.cspace_distance_weight + ) + if self.null_space_weight is None: + self.null_space_weight = self.tensor_args.to_device([0.0]) + else: + self.null_space_weight = self.tensor_args.to_device(self.null_space_weight) + if self.vec_weight is None: + self.vec_weight = self.tensor_args.to_device([0.0]) + return super().__post_init__() + + +class BoundCost(CostBase, BoundCostConfig): + def __init__(self, config: BoundCostConfig): + BoundCostConfig.__init__(self, **vars(config)) + CostBase.__init__(self) + init_warp() + self._batch_size = -1 + self._horizon = -1 + self._dof = -1 + empty_buffer = torch.tensor( + (0, 0, 0), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._out_gv_buffer = self._out_ga_buffer = self._out_gj_buffer = empty_buffer + + def update_batch_size(self, batch, horizon, dof): + if self._batch_size != batch or self._horizon != horizon or self._dof != dof: + self._out_c_buffer = torch.zeros( + (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._out_gp_buffer = torch.zeros( + (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if not self.cost_type == BoundCostType.POSITION: + self._out_gv_buffer = torch.zeros( + (batch, horizon, dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._out_ga_buffer = torch.zeros( + (batch, horizon, dof), + device=self.tensor_args.device, + 
dtype=self.tensor_args.dtype, + ) + self._out_gj_buffer = torch.zeros( + (batch, horizon, dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + # create terminal buffers: + if self.cost_type == BoundCostType.BOUNDS_SMOOTH: + self._run_weight_vel = torch.ones( + (1, horizon), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + # 2: -3 + if self.state_finite_difference_mode == "BACKWARD": + self._run_weight_vel[:, :-4] *= self.run_weight_velocity + elif self.state_finite_difference_mode == "CENTRAL": + self._run_weight_vel[:, :] *= self.run_weight_velocity + + # print(self._run_weight_vel) + # exit() + self._run_weight_acc = torch.ones( + (1, horizon), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + # self._run_weight_acc[:, 3:-3] *= self.run_weight_acceleration + self._run_weight_jerk = torch.ones( + (1, horizon), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + + # self._run_weight_jerk[:, :] *= self.run_weight_jerk + + self._batch_size = batch + self._horizon = horizon + self._dof = dof + self._retract_cfg = torch.zeros( + (dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._retract_cfg_idx = torch.zeros( + (batch), device=self.tensor_args.device, dtype=torch.int32 + ) + + def forward( + self, + state_batch: JointState, + retract_config: Optional[torch.Tensor] = None, + retract_idx: Optional[torch.Tensor] = None, + ): + b, h, dof = state_batch.position.shape + self.update_batch_size(b, h, dof) + if retract_config is None: + retract_config = self._retract_cfg + if retract_idx is None: + retract_idx = self._retract_cfg_idx + + if self.cost_type == BoundCostType.BOUNDS_SMOOTH: + # print(self.joint_limits.jerk.shape, self.joint_limits.position.shape) + cost = WarpBoundSmoothFunction.apply( + state_batch.position, + state_batch.velocity, + state_batch.acceleration, + state_batch.jerk, + retract_config, + retract_idx, + self.joint_limits.position, + self.joint_limits.velocity, + self.joint_limits.acceleration, + self.joint_limits.jerk, + self.weight, + self.activation_distance, + self.smooth_weight, + self.cspace_distance_weight, + self.null_space_weight, + self.vec_weight, + self._run_weight_vel, + self._run_weight_acc, + self._run_weight_jerk, + self._out_c_buffer, + self._out_gp_buffer, + self._out_gv_buffer, + self._out_ga_buffer, + self._out_gj_buffer, + ) + # print(self.cspace_distance_weight.shape) + # print(cost) + # print(self._run_weight_acc) + elif self.cost_type == BoundCostType.BOUNDS: + cost = WarpBoundFunction.apply( + state_batch.position, + state_batch.velocity, + state_batch.acceleration, + state_batch.jerk, + retract_config, + retract_idx, + self.joint_limits.position, + self.joint_limits.velocity, + self.joint_limits.acceleration, + self.joint_limits.jerk, + self.weight, + self.activation_distance, + self.null_space_weight, + self.vec_weight, + self._out_c_buffer, + self._out_gp_buffer, + self._out_gv_buffer, + self._out_ga_buffer, + self._out_gj_buffer, + ) + elif self.cost_type == BoundCostType.POSITION: + if self.return_loss: + cost = WarpBoundPosLoss.apply( + state_batch.position, + retract_config, + retract_idx, + self.joint_limits.position, + self.weight, + self.activation_distance, + self.null_space_weight, + self.vec_weight, + self._out_c_buffer, + self._out_gp_buffer, + ) + else: + cost = WarpBoundPosFunction.apply( + state_batch.position, + retract_config, + retract_idx, + self.joint_limits.position, + self.weight, + self.activation_distance, + 
self.null_space_weight, + self.vec_weight, + self._out_c_buffer, + self._out_gp_buffer, + ) + + else: + raise ValueError("No bounds set in BoundCost") + return cost + + def update_dt(self, dt: Union[float, torch.Tensor]): + # return super().update_dt(dt) + if self.cost_type == BoundCostType.BOUNDS_SMOOTH: + v_scale = dt / self._dt + a_scale = v_scale**2 + j_scale = v_scale**3 + self.smooth_weight[1] *= a_scale + self.smooth_weight[2] *= j_scale + return super().update_dt(dt) + + +@torch.jit.script +def forward_bound_cost(p, lower_bounds, upper_bounds, weight): + # c = weight * torch.sum(torch.nn.functional.relu(torch.max(lower_bounds - p, p - upper_bounds)), dim=-1) + + # This version does more work but fuses to 1 kernel + # c = torch.sum(weight * torch.nn.functional.relu(torch.max(lower_bounds - p, p - upper_bounds)), dim=-1) + c = torch.sum( + weight + * (torch.nn.functional.relu(lower_bounds - p) + torch.nn.functional.relu(p - upper_bounds)), + dim=(-1), + ) + return c + + +@torch.jit.script +def forward_all_bound_cost( + p, + v, + a, + p_lower_bounds, + p_upper_bounds, + v_lower_bounds, + v_upper_bounds, + a_lower_bounds, + a_upper_bounds, + weight, +): + # c = torch.sum( + # weight * + # + # ( + # torch.nn.functional.relu(torch.max(p_lower_bounds - p, p - p_upper_bounds)) + # + torch.nn.functional.relu(torch.max(v_lower_bounds - v, v - v_upper_bounds)) + # + torch.nn.functional.relu(torch.max(a_lower_bounds - a, a - a_upper_bounds))), + # dim=-1, + # ) + + c = torch.sum( + weight + * ( + torch.nn.functional.relu(p_lower_bounds - p) + + torch.nn.functional.relu(p - p_upper_bounds) + + torch.nn.functional.relu(v_lower_bounds - v) + + torch.nn.functional.relu(v - v_upper_bounds) + + torch.nn.functional.relu(a_lower_bounds - a) + + torch.nn.functional.relu(a - a_upper_bounds) + ), + dim=-1, + ) + + return c + + +# create a bound cost tensor: +class WarpBoundSmoothFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + pos, + vel, + acc, + jerk, + retract_config, + retract_idx, + p_b, + v_b, + a_b, + j_b, + weight, + activation_distance, + smooth_weight, + cspace_weight, + null_space_weight, + vec_weight, + run_weight_vel, + run_weight_acc, + run_weight_jerk, + out_cost, + out_gp, + out_gv, + out_ga, + out_gj, + ): + # scale the weights for smoothness by this dt: + wp_device = wp.device_from_torch(vel.device) + # assert smooth_weight.shape[0] == 7 + b, h, dof = vel.shape + wp.launch( + kernel=forward_bound_smooth_warp, + dim=b * h * dof, + inputs=[ + wp.from_torch(pos.detach().view(-1), dtype=wp.float32), + wp.from_torch(vel.detach().view(-1), dtype=wp.float32), + wp.from_torch(acc.detach().view(-1), dtype=wp.float32), + wp.from_torch(jerk.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32), + wp.from_torch(p_b.view(-1), dtype=wp.float32), + wp.from_torch(v_b.view(-1), dtype=wp.float32), + wp.from_torch(a_b.view(-1), dtype=wp.float32), + wp.from_torch(j_b.view(-1), dtype=wp.float32), + wp.from_torch(weight, dtype=wp.float32), + wp.from_torch(activation_distance, dtype=wp.float32), + wp.from_torch(smooth_weight, dtype=wp.float32), + wp.from_torch(cspace_weight, dtype=wp.float32), + wp.from_torch(null_space_weight.view(-1), dtype=wp.float32), + wp.from_torch(vec_weight.view(-1), dtype=wp.float32), + wp.from_torch(run_weight_vel.view(-1), dtype=wp.float32), + wp.from_torch(run_weight_acc.view(-1), dtype=wp.float32), + wp.from_torch(run_weight_jerk.view(-1), 
dtype=wp.float32), + wp.from_torch(out_cost.view(-1), dtype=wp.float32), + wp.from_torch(out_gp.view(-1), dtype=wp.float32), + wp.from_torch(out_gv.view(-1), dtype=wp.float32), + wp.from_torch(out_ga.view(-1), dtype=wp.float32), + wp.from_torch(out_gj.view(-1), dtype=wp.float32), + pos.requires_grad, + b, + h, + dof, + ], + device=wp_device, + stream=wp.stream_from_torch(vel.device), + ) + ctx.save_for_backward(out_gp, out_gv, out_ga, out_gj) + # out_c = out_cost + # out_c = torch.linalg.norm(out_cost, dim=-1) + out_c = torch.sum(out_cost, dim=-1) + return out_c + + @staticmethod + def backward(ctx, grad_out_cost): + ( + p_grad, + v_grad, + a_grad, + j_grad, + ) = ctx.saved_tensors + v_g = None + a_g = None + p_g = None + j_g = None + if ctx.needs_input_grad[0]: + p_g = p_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[1]: + v_g = v_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[2]: + a_g = a_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[3]: + j_g = j_grad # * grad_out_cost#.unsqueeze(-1) + return ( + p_g, + v_g, + a_g, + j_g, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class WarpBoundFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + pos, + vel, + acc, + jerk, + retract_config, + retract_idx, + p_b, + v_b, + a_b, + j_b, + weight, + activation_distance, + null_space_weight, + vec_weight, + out_cost, + out_gp, + out_gv, + out_ga, + out_gj, + ): + wp_device = wp.device_from_torch(vel.device) + b, h, dof = vel.shape + wp.launch( + kernel=forward_bound_warp, + dim=b * h * dof, + inputs=[ + wp.from_torch(pos.detach().view(-1), dtype=wp.float32), + wp.from_torch(vel.detach().view(-1), dtype=wp.float32), + wp.from_torch(acc.detach().view(-1), dtype=wp.float32), + wp.from_torch(jerk.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32), + wp.from_torch(p_b.view(-1), dtype=wp.float32), + wp.from_torch(v_b.view(-1), dtype=wp.float32), + wp.from_torch(a_b.view(-1), dtype=wp.float32), + wp.from_torch(j_b.view(-1), dtype=wp.float32), + wp.from_torch(weight, dtype=wp.float32), + wp.from_torch(activation_distance, dtype=wp.float32), + wp.from_torch(null_space_weight.view(-1), dtype=wp.float32), + wp.from_torch(vec_weight.view(-1), dtype=wp.float32), + wp.from_torch(out_cost.view(-1), dtype=wp.float32), + wp.from_torch(out_gp.view(-1), dtype=wp.float32), + wp.from_torch(out_gv.view(-1), dtype=wp.float32), + wp.from_torch(out_ga.view(-1), dtype=wp.float32), + wp.from_torch(out_gj.view(-1), dtype=wp.float32), + pos.requires_grad, + b, + h, + dof, + ], + device=wp_device, + stream=wp.stream_from_torch(vel.device), + ) + ctx.save_for_backward(out_gp, out_gv, out_ga, out_gj) + # out_c = out_cost + # out_c = torch.linalg.norm(out_cost, dim=-1) + out_c = torch.sum(out_cost, dim=-1) + return out_c + + @staticmethod + def backward(ctx, grad_out_cost): + ( + p_grad, + v_grad, + a_grad, + j_grad, + ) = ctx.saved_tensors + v_g = None + a_g = None + p_g = None + j_g = None + if ctx.needs_input_grad[0]: + p_g = p_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[1]: + v_g = v_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[2]: + a_g = a_grad # * grad_out_cost#.unsqueeze(-1) + if ctx.needs_input_grad[3]: + j_g = j_grad + return ( + p_g, + v_g, + a_g, + j_g, + None, + None, + None, 
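+ # autograd requires one return value per input to forward(): gradients for + # pos, vel, acc and jerk, then None for every non-differentiated input + # (limits, weights and output buffers). As in the smooth variant above, the + # upstream grad_out_cost is not multiplied in here (see the commented-out + # code); this assumes the summed cost reaches the loss with unit gradient.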
+ None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +# create a bound cost tensor: +class WarpBoundPosFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + pos, + retract_config, + retract_idx, + p_l, + weight, + activation_distance, + null_space_weight, + vec_weight, + out_cost, + out_gp, + ): + wp_device = wp.device_from_torch(pos.device) + b, h, dof = pos.shape + wp.launch( + kernel=forward_bound_pos_warp, + dim=b * h * dof, + inputs=[ + # wp.from_torch(pos.detach().view(-1).contiguous(), dtype=wp.float32), + wp.from_torch(pos.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32), + wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32), + wp.from_torch(p_l.view(-1), dtype=wp.float32), + wp.from_torch(weight, dtype=wp.float32), + wp.from_torch(activation_distance, dtype=wp.float32), + wp.from_torch(null_space_weight.view(-1), dtype=wp.float32), + wp.from_torch(vec_weight.view(-1), dtype=wp.float32), + wp.from_torch(out_cost.view(-1), dtype=wp.float32), + wp.from_torch(out_gp.view(-1), dtype=wp.float32), + pos.requires_grad, + b, + h, + dof, + ], + device=wp_device, + stream=wp.stream_from_torch(pos.device), + ) + ctx.save_for_backward(out_gp) + # cost = torch.linalg.norm(out_cost, dim=-1) + cost = torch.sum(out_cost, dim=-1) + # cost = out_cost + return cost + + @staticmethod + def backward(ctx, grad_out_cost): + (p_grad,) = ctx.saved_tensors + + p_g = None + if ctx.needs_input_grad[0]: + p_g = p_grad # * grad_out_cost.unsqueeze(-1) + return p_g, None, None, None, None, None, None, None, None, None + + +# create a bound cost tensor: +class WarpBoundPosLoss(WarpBoundPosFunction): + @staticmethod + def backward(ctx, grad_out_cost): + (p_grad,) = ctx.saved_tensors + + p_g = None + if ctx.needs_input_grad[0]: + p_g = p_grad * grad_out_cost.unsqueeze(-1) + return p_g, None, None, None, None, None, None, None, None, None + + +@wp.kernel +def forward_bound_pos_warp( + pos: wp.array(dtype=wp.float32), + retract_config: wp.array(dtype=wp.float32), + retract_idx: wp.array(dtype=wp.int32), + p_b: wp.array(dtype=wp.float32), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), + null_weight: wp.array(dtype=wp.float32), + vec_weight: wp.array(dtype=wp.float32), + out_cost: wp.array(dtype=wp.float32), + out_grad_p: wp.array(dtype=wp.float32), + write_grad: wp.uint8, # this should be a bool + batch_size: wp.int32, + horizon: wp.int32, + dof: wp.int32, +): + tid = wp.tid() + # initialize variables: + b_id = wp.int32(0) + h_id = wp.int32(0) + d_id = wp.int32(0) + b_addrs = int(0) + + w = wp.float32(0.0) + c_p = wp.float32(0.0) + g_p = wp.float32(0.0) + c_total = wp.float32(0.0) + + # we launch batch * horizon * dof kernels + b_id = tid / (horizon * dof) + h_id = (tid - (b_id * horizon * dof)) / dof + d_id = tid - (b_id * horizon * dof + h_id * dof) + if b_id >= batch_size or h_id >= horizon or d_id >= dof: + return + + # read weights: + eta_p = activation_distance[0] + w = weight[0] + + n_w = wp.float32(0.0) + n_w = null_weight[0] + target_p = wp.float32(0.0) + target_id = wp.int32(0.0) + if n_w > 0.0: + n_w *= vec_weight[d_id] + target_id = retract_idx[b_id] + target_id = target_id * dof + d_id + target_p = retract_config[target_id] + p_l = p_b[d_id] + p_u = p_b[dof + d_id] + p_l += eta_p + p_u -= eta_p + # compute cost: + b_addrs = b_id * horizon * dof + h_id * dof + d_id + + # read buffers: + + c_p = pos[b_addrs] + + if n_w > 0.0: + 
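# Null-space (retract) term: a quadratic pull of each joint toward its + # retract configuration, c = n_w * (q - q_retract)^2, with analytic gradient + # g = 2 * n_w * (q - q_retract), computed below. The b_addrs index computed + # above flattens the (batch, horizon, dof) layout: e.g. with horizon=30 and + # dof=7, thread tid=227 maps to b_id=1, h_id=2, d_id=3 (illustrative values). +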
error = c_p - target_p + c_total = n_w * error * error + g_p = 2.0 * n_w * error + + # bound cost: + if c_p < p_l: + delta = p_l - c_p + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) + g_p += -w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += -w * (1.0 / eta_p) * delta + elif c_p > p_u: + delta = c_p - p_u + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) + g_p += w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += w * (1.0 / eta_p) * delta + + out_cost[b_addrs] = c_total + + # compute gradient + if write_grad == 1: + out_grad_p[b_addrs] = g_p + + +@wp.kernel +def forward_bound_warp( + pos: wp.array(dtype=wp.float32), + vel: wp.array(dtype=wp.float32), + acc: wp.array(dtype=wp.float32), + jerk: wp.array(dtype=wp.float32), + retract_config: wp.array(dtype=wp.float32), + retract_idx: wp.array(dtype=wp.int32), + p_b: wp.array(dtype=wp.float32), + v_b: wp.array(dtype=wp.float32), + a_b: wp.array(dtype=wp.float32), + j_b: wp.array(dtype=wp.float32), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), + null_weight: wp.array(dtype=wp.float32), + vec_weight: wp.array(dtype=wp.float32), + out_cost: wp.array(dtype=wp.float32), + out_grad_p: wp.array(dtype=wp.float32), + out_grad_v: wp.array(dtype=wp.float32), + out_grad_a: wp.array(dtype=wp.float32), + out_grad_j: wp.array(dtype=wp.float32), + write_grad: wp.uint8, # this should be a bool + batch_size: wp.int32, + horizon: wp.int32, + dof: wp.int32, +): + tid = wp.tid() + # initialize variables: + b_id = wp.int32(0) + h_id = wp.int32(0) + d_id = wp.int32(0) + b_addrs = int(0) + + w = wp.float32(0.0) + c_v = wp.float32(0.0) + c_a = wp.float32(0.0) + c_p = wp.float32(0.0) + g_p = wp.float32(0.0) + g_v = wp.float32(0.0) + g_a = wp.float32(0.0) + g_j = wp.float32(0.0) # must be initialized here: out_grad_j is written unconditionally below + b_wv = float(0.0) + b_wa = float(0.0) + b_wj = float(0.0) + c_total = wp.float32(0.0) + + # we launch batch * horizon * dof kernels + b_id = tid / (horizon * dof) + h_id = (tid - (b_id * horizon * dof)) / dof + d_id = tid - (b_id * horizon * dof + h_id * dof) + if b_id >= batch_size or h_id >= horizon or d_id >= dof: + return + n_w = wp.float32(0.0) + n_w = null_weight[0] + target_p = wp.float32(0.0) + target_id = wp.int32(0.0) + if n_w > 0.0: + n_w *= vec_weight[d_id] + target_id = retract_idx[b_id] + target_id = target_id * dof + d_id + target_p = retract_config[target_id] + + # read weights: + w = weight[0] + b_wv = weight[1] + b_wa = weight[2] + b_wj = weight[3] + + # compute cost: + b_addrs = b_id * horizon * dof + h_id * dof + d_id + + # read buffers: + c_v = vel[b_addrs] + c_a = acc[b_addrs] + c_p = pos[b_addrs] + + # if w_j > 0.0: + eta_p = activation_distance[0] + eta_v = activation_distance[1] + eta_a = activation_distance[2] + eta_j = activation_distance[3] + + c_j = jerk[b_addrs] + + p_l = p_b[d_id] + eta_p + p_u = p_b[dof + d_id] - eta_p + + v_l = v_b[d_id] + eta_v + v_u = v_b[dof + d_id] - eta_v + a_l = a_b[d_id] + eta_a + a_u = a_b[dof + d_id] - eta_a + + j_l = j_b[d_id] + eta_j + j_u = j_b[dof + d_id] - eta_j + + delta = float(0.0) + if n_w > 0.0: + error = c_p - target_p + c_total = n_w * error * error + g_p = 2.0 * n_w * error + + # bound cost: + if c_p < p_l: + delta = p_l - c_p + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) + g_p += -w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += -w * (1.0 / eta_p) * delta + elif c_p > p_u: + delta = c_p - p_u + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) +
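# Smoothed hinge: inside the activation band the penalty is quadratic, + # w * (0.5 / eta) * delta^2; beyond it, linear, w * (delta - 0.5 * eta). + # Value and slope match at delta == eta, e.g. for eta_p = 0.1 the quadratic + # branch gives (0.5 / 0.1) * 0.1^2 = 0.05 and the linear branch gives + # 0.1 - 0.05 = 0.05, with gradient w on both sides (illustrative values). +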
g_p += w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += w * (1.0 / eta_p) * delta + + if c_v < v_l: + delta = v_l - c_v + if (delta) > eta_v or eta_v == 0.0: + c_total += b_wv * (delta - 0.5 * eta_v) + g_v = -b_wv + else: + c_total += b_wv * (0.5 / eta_v) * delta * delta + g_v = -b_wv * (1.0 / eta_v) * delta + elif c_v > v_u: + delta = c_v - v_u + if (delta) > eta_v or eta_v == 0.0: + c_total += b_wv * (delta - 0.5 * eta_v) + g_v = b_wv + else: + c_total += b_wv * (0.5 / eta_v) * delta * delta + g_v = b_wv * (1.0 / eta_v) * delta + + if c_a < a_l: + delta = a_l - c_a + if (delta) > eta_a or eta_a == 0.0: + c_total += b_wa * (delta - 0.5 * eta_a) + g_a = -b_wa + else: + c_total += b_wa * (0.5 / eta_a) * delta * delta + g_a = -b_wa * (1.0 / eta_a) * delta + elif c_a > a_u: + delta = c_a - a_u + if (delta) > eta_a or eta_a == 0.0: + c_total += b_wa * (delta - 0.5 * eta_a) + g_a = b_wa + else: + c_total += b_wa * (0.5 / eta_a) * delta * delta + g_a = b_wa * (1.0 / eta_a) * delta + + if c_j < j_l: + delta = j_l - c_j + if (delta) > eta_j or eta_j == 0.0: + c_total += b_wj * (delta - 0.5 * eta_j) + g_j = -b_wj + else: + c_total += b_wj * (0.5 / eta_j) * delta * delta + g_j = -b_wj * (1.0 / eta_j) * delta + elif c_j > j_u: + delta = c_j - j_u + if (delta) > eta_j or eta_j == 0.0: + c_total += b_wj * (delta - 0.5 * eta_j) + g_j = b_wj + else: + c_total += b_wj * (0.5 / eta_j) * delta * delta + g_j = b_wj * (1.0 / eta_j) * delta + + out_cost[b_addrs] = c_total + + # compute gradient + if write_grad == 1: + out_grad_p[b_addrs] = g_p + out_grad_v[b_addrs] = g_v + out_grad_a[b_addrs] = g_a + out_grad_j[b_addrs] = g_j + + +@wp.kernel +def forward_bound_smooth_warp( + pos: wp.array(dtype=wp.float32), + vel: wp.array(dtype=wp.float32), + acc: wp.array(dtype=wp.float32), + jerk: wp.array(dtype=wp.float32), + retract_config: wp.array(dtype=wp.float32), + retract_idx: wp.array(dtype=wp.int32), + p_b: wp.array(dtype=wp.float32), + v_b: wp.array(dtype=wp.float32), + a_b: wp.array(dtype=wp.float32), + j_b: wp.array(dtype=wp.float32), + weight: wp.array(dtype=wp.float32), + activation_distance: wp.array(dtype=wp.float32), + smooth_weight: wp.array(dtype=wp.float32), + cspace_weight: wp.array(dtype=wp.float32), + null_weight: wp.array(dtype=wp.float32), + vec_weight: wp.array(dtype=wp.float32), + run_weight_vel: wp.array(dtype=wp.float32), + run_weight_acc: wp.array(dtype=wp.float32), + run_weight_jerk: wp.array(dtype=wp.float32), + out_cost: wp.array(dtype=wp.float32), + out_grad_p: wp.array(dtype=wp.float32), + out_grad_v: wp.array(dtype=wp.float32), + out_grad_a: wp.array(dtype=wp.float32), + out_grad_j: wp.array(dtype=wp.float32), + write_grad: wp.uint8, # this should be a bool + batch_size: wp.int32, + horizon: wp.int32, + dof: wp.int32, +): + tid = wp.tid() + # initialize variables: + b_id = wp.int32(0) + h_id = wp.int32(0) + d_id = wp.int32(0) + b_addrs = int(0) + + w = wp.float32(0.0) + b_wv = float(0.0) + b_wa = float(0.0) + b_wj = float(0.0) + cspace_w = wp.float32(0.0) + c_p = wp.float32(0.0) + c_v = wp.float32(0.0) + c_a = wp.float32(0.0) + c_j = wp.float32(0.0) + g_p = wp.float32(0.0) + g_v = wp.float32(0.0) + g_a = wp.float32(0.0) + g_j = wp.float32(0.0) + r_wv = wp.float32(0.0) + r_wa = wp.float32(0.0) + r_wj = wp.float32(0.0) + alpha_v = wp.float32(2.0) + + w_v = wp.float32(0.0) + w_a = wp.float32(0.0) + w_j = wp.float32(0.0) + + s_v = wp.float32(0.0) + s_a = wp.float32(0.0) + s_j = wp.float32(0.0) + + c_total = wp.float32(0.0) + + # we launch batch * horizon * dof kernels + b_id 
= tid / (horizon * dof) + h_id = (tid - (b_id * horizon * dof)) / dof + d_id = tid - (b_id * horizon * dof + h_id * dof) + if b_id >= batch_size or h_id >= horizon or d_id >= dof: + return + + n_w = wp.float32(0.0) + n_w = null_weight[0] + target_p = wp.float32(0.0) + target_id = wp.int32(0.0) + if n_w > 0.0: + n_w *= vec_weight[d_id] + target_id = retract_idx[b_id] + target_id = target_id * dof + d_id + target_p = retract_config[target_id] + # read weights: + w = weight[0] + b_wv = weight[1] + b_wa = weight[2] + b_wj = weight[3] + cspace_w = cspace_weight[d_id] + r_wv = run_weight_vel[h_id] + r_wa = run_weight_acc[h_id] + + w_v = smooth_weight[0] + w_a = smooth_weight[1] + w_j = smooth_weight[2] + alpha_v = smooth_weight[3] + + # scale all smooth weights by cspace weight: + if r_wv < 1.0: + r_wv *= cspace_w + w_v *= cspace_w + + r_wa *= cspace_w + r_wj *= cspace_w + w_a *= cspace_w + w_j *= cspace_w + + # compute cost: + b_addrs = b_id * horizon * dof + h_id * dof + d_id + + # read buffers: + c_v = vel[b_addrs] + c_a = acc[b_addrs] + c_p = pos[b_addrs] + # if w_j > 0.0: + eta_p = activation_distance[0] + eta_v = activation_distance[1] + eta_a = activation_distance[2] + eta_j = activation_distance[3] + + r_wj = run_weight_jerk[h_id] + c_j = jerk[b_addrs] + + p_l = p_b[d_id] + eta_p + p_u = p_b[dof + d_id] - eta_p + + v_l = v_b[d_id] + eta_v + v_u = v_b[dof + d_id] - eta_v + a_l = a_b[d_id] + eta_a + a_u = a_b[dof + d_id] - eta_a + + j_l = j_b[d_id] + eta_j + j_u = j_b[dof + d_id] - eta_j + + delta = float(0.0) + + # position: + if n_w > 0.0: + error = c_p - target_p + c_total = n_w * error * error + g_p = 2.0 * n_w * error + + # bound cost: + if c_p < p_l: + delta = p_l - c_p + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) + g_p += -w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += -w * (1.0 / eta_p) * delta + elif c_p > p_u: + delta = c_p - p_u + if (delta) > eta_p or eta_p == 0.0: + c_total += w * (delta - 0.5 * eta_p) + g_p += w + else: + c_total += w * (0.5 / eta_p) * delta * delta + g_p += w * (1.0 / eta_p) * delta + + if c_v < v_l: + delta = v_l - c_v + if (delta) > eta_v or eta_v == 0.0: + c_total += b_wv * (delta - 0.5 * eta_v) + g_v = -b_wv + else: + c_total += b_wv * (0.5 / eta_v) * delta * delta + g_v = -b_wv * (1.0 / eta_v) * delta + elif c_v > v_u: + delta = c_v - v_u + if (delta) > eta_v or eta_v == 0.0: + c_total += b_wv * (delta - 0.5 * eta_v) + g_v = b_wv + else: + c_total += b_wv * (0.5 / eta_v) * delta * delta + g_v = b_wv * (1.0 / eta_v) * delta + if c_a < a_l: + delta = a_l - c_a + if (delta) > eta_a or eta_a == 0.0: + c_total += b_wa * (delta - 0.5 * eta_a) + g_a = -b_wa + else: + c_total += b_wa * (0.5 / eta_a) * delta * delta + g_a = -b_wa * (1.0 / eta_a) * delta + elif c_a > a_u: + delta = c_a - a_u + if (delta) > eta_a or eta_a == 0.0: + c_total += b_wa * (delta - 0.5 * eta_a) + g_a = b_wa + else: + c_total += b_wa * (0.5 / eta_a) * delta * delta + g_a = b_wa * (1.0 / eta_a) * delta + if c_j < j_l: + delta = j_l - c_j + if (delta) > eta_j or eta_j == 0.0: + c_total += b_wj * (delta - 0.5 * eta_j) + g_j = -b_wj + else: + c_total += b_wj * (0.5 / eta_j) * delta * delta + g_j = -b_wj * (1.0 / eta_j) * delta + elif c_j > j_u: + delta = c_j - j_u + if (delta) > eta_j or eta_j == 0.0: + c_total += b_wj * (delta - 0.5 * eta_j) + g_j = b_wj + else: + c_total += b_wj * (0.5 / eta_j) * delta * delta + g_j = b_wj * (1.0 / eta_j) * delta + + # g_v = -1.0 * g_v + # g_a = -1.0 * g_a + # g_j = - 1.0 + # do l2 regularization for 
velocity: + if r_wv < 1.0: + s_v = w_v * r_wv * c_v * c_v + g_v += 2.0 * r_wv * w_v * c_v + else: + s_v = w_v * r_wv * wp.log2(wp.cosh(alpha_v * c_v)) + g_v += w_v * r_wv * alpha_v * wp.sinh(alpha_v * c_v) / wp.cosh(alpha_v * c_v) + + s_a = w_a * (r_wa) * c_a * c_a + g_a += 2.0 * w_a * (r_wa) * c_a + s_j = w_j * r_wj * c_j * c_j + g_j += 2.0 * w_j * r_wj * c_j + + c_total += s_v + s_a + s_j + + out_cost[b_addrs] = c_total + + # compute gradient + if write_grad == 1: + out_grad_p[b_addrs] = g_p + out_grad_v[b_addrs] = g_v + out_grad_a[b_addrs] = g_a + out_grad_j[b_addrs] = g_j diff --git a/src/curobo/rollout/cost/cost_base.py b/src/curobo/rollout/cost/cost_base.py new file mode 100644 index 00000000..7bb9e9a0 --- /dev/null +++ b/src/curobo/rollout/cost/cost_base.py @@ -0,0 +1,121 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import List, Optional, Union + +# Third Party +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType + + +@dataclass +class CostConfig: + weight: Union[torch.Tensor, float, List[float]] + tensor_args: TensorDeviceType = None + distance_threshold: float = 0.0 + classify: bool = False + terminal: bool = False + run_weight: Optional[float] = None + dof: int = 7 + vec_weight: Optional[Union[torch.Tensor, List[float], float]] = None + max_value: Optional[float] = None + hinge_value: Optional[float] = None + vec_convergence: Optional[List[float]] = None + threshold_value: Optional[float] = None + return_loss: bool = False + + def __post_init__(self): + self.weight = self.tensor_args.to_device(self.weight) + if len(self.weight.shape) == 0: + self.weight = torch.tensor( + [self.weight], device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.vec_weight is not None: + self.vec_weight = self.tensor_args.to_device(self.vec_weight) + if self.max_value is not None: + self.max_value = self.tensor_args.to_device(self.max_value) + if self.hinge_value is not None: + self.hinge_value = self.tensor_args.to_device(self.hinge_value) + if self.threshold_value is not None: + self.threshold_value = self.tensor_args.to_device(self.threshold_value) + + def update_vec_weight(self, vec_weight): + self.vec_weight = self.tensor_args.to_device(vec_weight) + + +class CostBase(torch.nn.Module, CostConfig): + def __init__(self, config: Optional[CostConfig] = None): + """Initialize class + + Args: + config (Optional[CostConfig], optional): To initialize this class directly, pass a config. + If this is a base class, it's assumed that you will initialize the child class with `CostConfig`. + Defaults to None. 
+ """ + self._run_weight_vec = None + super(CostBase, self).__init__() + if config is not None: + CostConfig.__init__(self, **vars(config)) + CostBase._init_post_config(self) + self._batch_size = -1 + self._horizon = -1 + self._dof = -1 + self._dt = 1 + + def _init_post_config(self): + self._weight = self.weight.clone() + self.cost_fn = None + self._cost_enabled = True + self._z_scalar = self.tensor_args.to_device(0.0) + if torch.sum(self.weight) == 0.0: + self.disable_cost() + + def forward(self, q): + batch_size = q.shape[0] + horizon = q.shape[1] + q = q.view(batch_size * horizon, q.shape[2]) + + res = self.cost_fn(q) + + res = res.view(batch_size, horizon) + res += self.distance_threshold + res = torch.nn.functional.relu(res, inplace=True) + if self.classify: + res = torch.where(res > 0, res + 1.0, res) + cost = self.weight * res + return cost + + def disable_cost(self): + self.weight.copy_(self._weight * 0.0) + self._cost_enabled = False + + def enable_cost(self): + self.weight.copy_(self._weight.clone()) + if torch.sum(self.weight) == 0.0: + self._cost_enabled = False + else: + self._cost_enabled = True + + def update_weight(self, weight: float): + if weight == 0.0: + self.disable_cost() + else: + self.weight.copy_(self._weight * 0.0 + weight) + + @property + def enabled(self): + return self._cost_enabled + + def update_dt(self, dt: Union[float, torch.Tensor]): + self._dt = dt diff --git a/src/curobo/rollout/cost/dist_cost.py b/src/curobo/rollout/cost/dist_cost.py new file mode 100644 index 00000000..3c387adc --- /dev/null +++ b/src/curobo/rollout/cost/dist_cost.py @@ -0,0 +1,315 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Standard Library +from dataclasses import dataclass +from enum import Enum +from typing import Optional + +# Third Party +import torch +import warp as wp + +# CuRobo +from curobo.util.warp import init_warp + +# Local Folder +from .cost_base import CostBase, CostConfig + +wp.set_module_options({"fast_math": False}) + + +class DistType(Enum): + L1 = 0 + L2 = 1 + SQUARED_L2 = 2 + + +@dataclass +class DistCostConfig(CostConfig): + dist_type: DistType = DistType.L2 + use_null_space: bool = False + + def __post_init__(self): + return super().__post_init__() + + +@torch.jit.script +def L2_DistCost_jit(vec_weight, disp_vec): + return torch.norm(vec_weight * disp_vec, p=2, dim=-1, keepdim=False) + + +@torch.jit.script +def fwd_SQL2_DistCost_jit(vec_weight, disp_vec): + return torch.sum(torch.square(vec_weight * disp_vec), dim=-1, keepdim=False) + + +@torch.jit.script +def fwd_L1_DistCost_jit(vec_weight, disp_vec): + return torch.sum(torch.abs(vec_weight * disp_vec), dim=-1, keepdim=False) + + +@torch.jit.script +def L2_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): + return torch.norm(weight * vec_weight * (g_vec - c_vec), p=2, dim=-1, keepdim=False) + + +@torch.jit.script +def fwd_SQL2_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): + return torch.sum(torch.square(weight * vec_weight * (g_vec - c_vec)), dim=-1, keepdim=False) + + +@torch.jit.script +def fwd_L1_DistCost_target_jit(vec_weight, g_vec, c_vec, weight): + return torch.sum(torch.abs(weight * vec_weight * (g_vec - c_vec)), dim=-1, keepdim=False) + + +@wp.kernel +def forward_l2_warp( + pos: wp.array(dtype=wp.float32), + target: wp.array(dtype=wp.float32), + target_idx: wp.array(dtype=wp.int32), + weight: wp.array(dtype=wp.float32), + run_weight: wp.array(dtype=wp.float32), + vec_weight: wp.array(dtype=wp.float32), + out_cost: wp.array(dtype=wp.float32), + out_grad_p: wp.array(dtype=wp.float32), + write_grad: wp.uint8, # this should be a bool + batch_size: wp.int32, + horizon: wp.int32, + dof: wp.int32, +): + tid = wp.tid() + # initialize variables: + b_id = wp.int32(0) + h_id = wp.int32(0) + d_id = wp.int32(0) + b_addrs = wp.int32(0) + target_id = wp.int32(0) + w = wp.float32(0.0) + c_p = wp.float32(0.0) + target_p = wp.float32(0.0) + g_p = wp.float32(0.0) + r_w = wp.float32(0.0) + c_total = wp.float32(0.0) + + # we launch batch * horizon * dof kernels + b_id = tid / (horizon * dof) + h_id = (tid - (b_id * horizon * dof)) / dof + d_id = tid - (b_id * horizon * dof + h_id * dof) + + if b_id >= batch_size or h_id >= horizon or d_id >= dof: + return + + # read weights: + w = weight[0] + r_w = run_weight[h_id] + w = r_w * w + r_w = vec_weight[d_id] + w = r_w * w + if w == 0.0: + return + # compute cost: + b_addrs = b_id * horizon * dof + h_id * dof + d_id + + # read buffers: + + c_p = pos[b_addrs] + target_id = target_idx[b_id] + target_id = target_id * dof + d_id + target_p = target[target_id] + error = c_p - target_p + + if r_w >= 1.0 and w > 100.0: + c_total = w * wp.log2(wp.cosh(50.0 * error)) + g_p = w * 50.0 * wp.sinh(50.0 * error) / (wp.cosh(50.0 * error)) + else: + c_total = w * error * error + g_p = 2.0 * w * error + + out_cost[b_addrs] = c_total + + # compute gradient + if write_grad == 1: + out_grad_p[b_addrs] = g_p + + +# create a bound cost tensor: +class L2DistFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + pos, + target, + target_idx, + weight, + run_weight, + vec_weight, + out_cost, + out_cost_v, + out_gp, + ): + wp_device = wp.device_from_torch(pos.device) + b, h, dof = pos.shape 
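+ # The kernel launched below switches to a robust log-cosh cost when the + # running weight is >= 1 and the weight is large (> 100): + # c = w * log2(cosh(50 * e)), with gradient + # g = 50 * w * sinh(50 * e) / cosh(50 * e) = 50 * w * tanh(50 * e), + # which is bounded by 50 * w and so stays stable near convergence.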
+ # print(target) + + wp.launch( + kernel=forward_l2_warp, + dim=b * h * dof, + inputs=[ + wp.from_torch(pos.detach().reshape(-1), dtype=wp.float32), + wp.from_torch(target.view(-1), dtype=wp.float32), + wp.from_torch(target_idx.view(-1), dtype=wp.int32), + wp.from_torch(weight, dtype=wp.float32), + wp.from_torch(run_weight.view(-1), dtype=wp.float32), + wp.from_torch(vec_weight.view(-1), dtype=wp.float32), + wp.from_torch(out_cost_v.view(-1), dtype=wp.float32), + wp.from_torch(out_gp.view(-1), dtype=wp.float32), + pos.requires_grad, + b, + h, + dof, + ], + device=wp_device, + stream=wp.stream_from_torch(pos.device), + ) + # cost = torch.linalg.norm(out_cost_v, dim=-1) + # if pos.requires_grad: + # out_gp = out_gp * torch.nan_to_num( 1.0/cost.unsqueeze(-1), 0.0) + cost = torch.sum(out_cost_v, dim=-1) + + ctx.save_for_backward(out_gp) + return cost + + @staticmethod + def backward(ctx, grad_out_cost): + (p_grad,) = ctx.saved_tensors + + p_g = None + if ctx.needs_input_grad[0]: + p_g = p_grad + return p_g, None, None, None, None, None, None, None, None + + +class DistCost(CostBase, DistCostConfig): + def __init__(self, config: Optional[DistCostConfig] = None): + if config is not None: + DistCostConfig.__init__(self, **vars(config)) + CostBase.__init__(self) + self._init_post_config() + init_warp() + + def _init_post_config(self): + if self.vec_weight is not None: + self.vec_weight = self.tensor_args.to_device(self.vec_weight) + if not self.use_null_space: + self.vec_weight = self.vec_weight * 0.0 + 1.0 + + def update_batch_size(self, batch, horizon, dof): + if self._batch_size != batch or self._horizon != horizon or self._dof != dof: + self._out_cv_buffer = torch.zeros( + (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._out_c_buffer = torch.zeros( + (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + self._out_g_buffer = torch.zeros( + (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + self._batch_size = batch + self._horizon = horizon + self._dof = dof + if self.vec_weight is None: + self.vec_weight = torch.ones( + (1, 1, self._dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + def forward(self, disp_vec, RETURN_GOAL_DIST=False): + if self.dist_type == DistType.L2: + # dist = torch.norm(disp_vec, p=2, dim=-1, keepdim=False) + dist = L2_DistCost_jit(self.vec_weight, disp_vec) + elif self.dist_type == DistType.SQUARED_L2: + # cost = weight * (0.5 * torch.square(torch.norm(disp_vec, p=2, dim=-1))) + # dist = torch.sum(torch.square(disp_vec), dim=-1, keepdim=False) + dist = fwd_SQL2_DistCost_jit(self.vec_weight, disp_vec) + elif self.dist_type == DistType.L1: + # dist = torch.sum(torch.abs(disp_vec), dim=-1, keepdim=False) + dist = fwd_L1_DistCost_jit(self.vec_weight, disp_vec) + + cost = self.weight * dist + if self.terminal and self.run_weight is not None: + if self._run_weight_vec is None or self._run_weight_vec.shape[1] != cost.shape[1]: + self._run_weight_vec = torch.ones( + (1, cost.shape[1]), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._run_weight_vec[:, :-1] *= self.run_weight + if RETURN_GOAL_DIST: + return cost, dist + return cost + + def forward_target(self, goal_vec, current_vec, RETURN_GOAL_DIST=False): + if self.dist_type == DistType.L2: + # dist = torch.norm(disp_vec, p=2, dim=-1, keepdim=False) + cost = L2_DistCost_target_jit(self.vec_weight, goal_vec, current_vec, self.weight) + elif self.dist_type ==
DistType.SQUARED_L2: + # cost = weight * (0.5 * torch.square(torch.norm(disp_vec, p=2, dim=-1))) + # dist = torch.sum(torch.square(disp_vec), dim=-1, keepdim=False) + cost = fwd_SQL2_DistCost_target_jit(self.vec_weight, goal_vec, current_vec, self.weight) + elif self.dist_type == DistType.L1: + # dist = torch.sum(torch.abs(disp_vec), dim=-1, keepdim=False) + cost = fwd_L1_DistCost_target_jit(self.vec_weight, goal_vec, current_vec, self.weight) + dist = cost + if self.terminal and self.run_weight is not None: + if self._run_weight_vec is None or self._run_weight_vec.shape[1] != cost.shape[1]: + self._run_weight_vec = torch.ones( + (1, cost.shape[1]), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._run_weight_vec[:, :-1] *= self.run_weight + cost = self._run_weight_vec * dist + if RETURN_GOAL_DIST: + return cost, dist / self.weight + return cost + + def forward_target_idx(self, goal_vec, current_vec, goal_idx, RETURN_GOAL_DIST=False): + b, h, dof = current_vec.shape + self.update_batch_size(b, h, dof) + + if self.terminal and self.run_weight is not None: + if self._run_weight_vec is None or self._run_weight_vec.shape[1] != h: + self._run_weight_vec = torch.ones( + (1, h), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._run_weight_vec[:, :-1] *= self.run_weight + else: + raise NotImplementedError("terminal flag needs to be set to true") + + if self.dist_type == DistType.L2: + # print(goal_idx.shape, goal_vec.shape) + cost = L2DistFunction.apply( + current_vec, + goal_vec, + goal_idx, + self.weight, + self._run_weight_vec, + self.vec_weight, + self._out_c_buffer, + self._out_cv_buffer, + self._out_g_buffer, + ) + # cost = torch.linalg.norm(cost, dim=-1) + + else: + raise NotImplementedError() + # print(cost.shape, cost[:,-1]) + if RETURN_GOAL_DIST: + return cost, (cost / torch.sqrt((self.weight * self._run_weight_vec))) + return cost diff --git a/src/curobo/rollout/cost/manipulability_cost.py b/src/curobo/rollout/cost/manipulability_cost.py new file mode 100644 index 00000000..3799f503 --- /dev/null +++ b/src/curobo/rollout/cost/manipulability_cost.py @@ -0,0 +1,107 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
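+# +# This module implements ManipulabilityCost around the Yoshikawa +# manipulability measure m(q) = sqrt(det(J(q) J(q)^T)); larger values mean +# more isotropic end-effector mobility. forward() hinges the score so that +# configurations scoring above hinge_value incur zero cost.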
+# + + # Standard Library + from dataclasses import dataclass + from itertools import product + from typing import Optional + + # Third Party + import torch + + # Local Folder + from .cost_base import CostBase, CostConfig + + + @dataclass + class ManipulabilityCostConfig(CostConfig): + use_joint_limits: bool = False + # NOTE: assumed fields, added because the cost functions below reference + # them: joint_limits as a (dof, 2) tensor of [lower, upper] bounds, and + # use_nn to flatten (b, h, dof) batches before scoring. + joint_limits: Optional[torch.Tensor] = None + use_nn: bool = False + + def __post_init__(self): + return super().__post_init__() + + + class ManipulabilityCost(CostBase, ManipulabilityCostConfig): + def __init__(self, config: ManipulabilityCostConfig): + ManipulabilityCostConfig.__init__(self, **vars(config)) + CostBase.__init__(self) + self.i_mat = torch.ones( + (6, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + self.delta_vector = torch.zeros( + (64, 1, 1, 6, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + x = [i for i in product(range(2), repeat=6)] + self.delta_vector[:, 0, 0, :, 0] = torch.as_tensor( + x, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self.delta_vector[self.delta_vector == 0] = -1.0 + + if self.cost_fn is None: + if self.use_joint_limits and self.joint_limits is not None: + self.cost_fn = self.joint_limited_manipulability_delta + else: + self.cost_fn = self.manipulability + + def forward(self, jac_batch, q, qdot): + b, h, n = q.shape + if self.use_nn: + q = q.view(b * h, n) + score = self.cost_fn(q, jac_batch, qdot) + if self.use_nn: + score = score.view(b, h) + score[score > self.hinge_value] = self.hinge_value + score = (self.hinge_value / score) - 1 + cost = self.weight * score + + return cost + + def manipulability(self, q, jac_batch, qdot=None): + with torch.cuda.amp.autocast(enabled=False): + J_J_t = torch.matmul(jac_batch, jac_batch.transpose(-2, -1)) + score = torch.sqrt(torch.det(J_J_t)) + + score[score != score] = 0.0 + + return score + + def joint_limited_manipulability_delta(self, q, jac_batch, qdot=None): + # q is [b,h,dof] + q_low = q - self.joint_limits[:, 0] + q_high = q - self.joint_limits[:, 1] + + d_h_1 = torch.square(self.joint_limits[:, 1] - self.joint_limits[:, 0]) * (q_low + q_high) + d_h_2 = 4.0 * (torch.square(q_low) * torch.square(q_high)) + d_h = torch.div(d_h_1, d_h_2) + + dh_term = 1.0 / torch.sqrt(1 + torch.abs(d_h)) + # tensor_args is a dataclass, not a mapping, so pass device/dtype explicitly: + f_ten = torch.tensor( + 1.0, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + q_low = torch.abs(q_low) + q_high = torch.abs(q_high) + p_plus = torch.where(q_low > q_high, dh_term, f_ten).unsqueeze(-2) + p_minus = torch.where(q_low > q_high, f_ten, dh_term).unsqueeze(-2) + + j_sign = torch.sign(jac_batch) + l_delta = torch.sign(self.delta_vector) * j_sign + + L = torch.where(l_delta < 0.0, p_minus, p_plus) + + with torch.cuda.amp.autocast(enabled=False): + w_J = L * jac_batch + J_J_t = torch.matmul(w_J, w_J.transpose(-2, -1)) + score = torch.sqrt(torch.det(J_J_t)) + + # get actual score: + min_score = torch.min(score, dim=0)[0] + max_score = torch.max(score, dim=0)[0] + score = min_score / max_score + score[score != score] = 0.0 + return score diff --git a/src/curobo/rollout/cost/pose_cost.py b/src/curobo/rollout/cost/pose_cost.py new file mode 100644 index 00000000..e445fab0 --- /dev/null +++ b/src/curobo/rollout/cost/pose_cost.py @@ -0,0 +1,765 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto.
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional + +# Third Party +import torch +from torch.autograd import Function + +# CuRobo +from curobo.curobolib.geom import get_pose_distance, get_pose_distance_backward +from curobo.rollout.rollout_base import Goal +from curobo.types.math import OrientationError, Pose + +# Local Folder +from .cost_base import CostBase, CostConfig + + +class PoseErrorType(Enum): + SINGLE_GOAL = 0 #: Distance will be computed to a single goal pose + BATCH_GOAL = 1 #: Distance will be computed pairwise between query batch and goal batch + GOALSET = 2 #: Shortest Distance will be computed to a goal set + BATCH_GOALSET = 3 #: Shortest Distance to a batch goal set + + +@dataclass +class PoseCostConfig(CostConfig): + cost_type: PoseErrorType = PoseErrorType.BATCH_GOAL + use_metric: bool = False + run_vec_weight: Optional[List[float]] = None + + def __post_init__(self): + if self.run_vec_weight is not None: + self.run_vec_weight = self.tensor_args.to_device(self.run_vec_weight) + else: + self.run_vec_weight = torch.ones( + 6, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.vec_weight is None: + self.vec_weight = torch.ones( + 6, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.vec_convergence is None: + self.vec_convergence = torch.zeros( + 2, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + return super().__post_init__() + + +@torch.jit.script +def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec): + grad_vec = grad_g_dist + (grad_out_distance * weight) + grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec + return grad + + +# full method: +@torch.jit.script +def backward_full_PoseError_jit( + grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q +): + p_grad = (grad_g_dist + (grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p + q_grad = (grad_r_err + (grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q + # p_grad = ((grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p + # q_grad = ((grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q + + return p_grad, q_grad + + +class PoseErrorDistance(Function): + @staticmethod + def forward( + ctx, + current_position, + goal_position, + current_quat, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + out_p_grad, + out_q_grad, + batch_size, + horizon, + mode=PoseErrorType.BATCH_GOAL.value, + num_goals=1, + use_metric=False, + ): + # out_distance = current_position[..., 0].detach().clone() * 0.0 + # out_position_distance = out_distance.detach().clone() + # out_rotation_distance = out_distance.detach().clone() + # out_vec = ( + # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) + # * 0.0 + # ) + # out_idx = out_distance.clone().to(dtype=torch.long) + + ( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + ) = get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + current_position.contiguous(), + goal_position, + current_quat.contiguous(), + goal_quat, + vec_weight, 
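+ # The out_* buffers are preallocated in PoseCost.update_batch_size() and + # reused across calls; get_pose_distance fills them in place, so + # steady-state queries allocate nothing.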
+ weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + current_position.requires_grad, + True, + use_metric, + ) + ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad) + return out_distance, out_position_distance, out_rotation_distance, out_idx # .view(-1,1) + + @staticmethod + def backward(ctx, grad_out_distance, grad_g_dist, grad_r_err, grad_out_idx): + (g_vec_p, g_vec_q, weight, out_grad_p, out_grad_q) = ctx.saved_tensors + pos_grad = None + quat_grad = None + batch_size = g_vec_p.shape[0] * g_vec_p.shape[1] + if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: + pos_grad, quat_grad = get_pose_distance_backward( + out_grad_p, + out_grad_q, + grad_out_distance.contiguous(), + grad_g_dist.contiguous(), + grad_r_err.contiguous(), + weight, + g_vec_p, + g_vec_q, + batch_size, + use_distance=True, + ) + # pos_grad, quat_grad = backward_full_PoseError_jit( + # grad_out_distance, + # grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q + # ) + elif ctx.needs_input_grad[0]: + pos_grad = backward_PoseError_jit(grad_g_dist, grad_out_distance, weight[1], g_vec_p) + # grad_vec = grad_g_dist + (grad_out_distance * weight[1]) + # pos_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., 4:] + elif ctx.needs_input_grad[2]: + quat_grad = backward_PoseError_jit(grad_r_err, grad_out_distance, weight[0], g_vec_q) + # grad_vec = grad_r_err + (grad_out_distance * weight[0]) + # quat_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., :4] + return ( + pos_grad, + None, + quat_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class PoseLoss(Function): + @staticmethod + def forward( + ctx, + current_position, + goal_position, + current_quat, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + out_p_grad, + out_q_grad, + batch_size, + horizon, + mode=PoseErrorType.BATCH_GOAL.value, + num_goals=1, + use_metric=False, + ): + # out_distance = current_position[..., 0].detach().clone() * 0.0 + # out_position_distance = out_distance.detach().clone() + # out_rotation_distance = out_distance.detach().clone() + # out_vec = ( + # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) + # * 0.0 + # ) + # out_idx = out_distance.clone().to(dtype=torch.long) + + ( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + ) = get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + current_position.contiguous(), + goal_position, + current_quat.contiguous(), + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + current_position.requires_grad, + False, + use_metric, + ) + ctx.save_for_backward(out_p_vec, out_r_vec) + return out_distance + + @staticmethod + def backward(ctx, grad_out_distance): # , grad_g_dist, grad_r_err, grad_out_idx): + pos_grad = None + quat_grad = None + if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + pos_grad = g_vec_p * grad_out_distance.unsqueeze(1) + quat_grad = g_vec_q * grad_out_distance.unsqueeze(1) + pos_grad = pos_grad.unsqueeze(-2) + quat_grad =
quat_grad.unsqueeze(-2) + elif ctx.needs_input_grad[0]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + pos_grad = g_vec_p * grad_out_distance.unsqueeze(1) + pos_grad = pos_grad.unsqueeze(-2) + elif ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + quat_grad = g_vec_q * grad_out_distance.unsqueeze(1) + quat_grad = quat_grad.unsqueeze(-2) + + return ( + pos_grad, + None, + quat_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class PoseError(Function): + @staticmethod + def forward( + ctx, + current_position, + goal_position, + current_quat, + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + out_p_grad, + out_q_grad, + batch_size, + horizon, + mode=PoseErrorType.BATCH_GOAL.value, + num_goals=1, + use_metric=False, + ): + # out_distance = current_position[..., 0].detach().clone() * 0.0 + # out_position_distance = out_distance.detach().clone() + # out_rotation_distance = out_distance.detach().clone() + # out_vec = ( + # torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1) + # * 0.0 + # ) + # out_idx = out_distance.clone().to(dtype=torch.long) + + ( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + ) = get_pose_distance( + out_distance, + out_position_distance, + out_rotation_distance, + out_p_vec, + out_r_vec, + out_idx, + current_position.contiguous(), + goal_position, + current_quat.contiguous(), + goal_quat, + vec_weight, + weight, + vec_convergence, + run_weight, + run_vec_weight, + batch_pose_idx, + batch_size, + horizon, + mode, + num_goals, + current_position.requires_grad, + False, + use_metric, + ) + ctx.save_for_backward(out_p_vec, out_r_vec) + return out_distance + + @staticmethod + def backward(ctx, grad_out_distance): # , grad_g_dist, grad_r_err, grad_out_idx): + pos_grad = None + quat_grad = None + if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + pos_grad = g_vec_p + quat_grad = g_vec_q + elif ctx.needs_input_grad[0]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + pos_grad = g_vec_p + elif ctx.needs_input_grad[2]: + (g_vec_p, g_vec_q) = ctx.saved_tensors + + quat_grad = g_vec_q + return ( + pos_grad, + None, + quat_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class PoseCost(CostBase, PoseCostConfig): + def __init__(self, config: PoseCostConfig): + PoseCostConfig.__init__(self, **vars(config)) + CostBase.__init__(self) + self.rot_weight = self.vec_weight[0:3] + self.pos_weight = self.vec_weight[3:6] + + self._vec_convergence = self.tensor_args.to_device(self.vec_convergence) + self._batch_size = 0 + self._horizon = 0 + + def update_batch_size(self, batch_size, horizon): + if batch_size != self._batch_size or horizon != self._horizon: + # batch_size = b*h + self.out_distance = torch.zeros( + (batch_size, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self.out_position_distance = torch.zeros( + (batch_size, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self.out_rotation_distance = torch.zeros( + (batch_size, horizon), device=self.tensor_args.device, 
dtype=self.tensor_args.dtype + ) + self.out_idx = torch.zeros( + (batch_size, horizon), device=self.tensor_args.device, dtype=torch.int32 + ) + self.out_p_vec = torch.zeros( + (batch_size, horizon, 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.out_q_vec = torch.zeros( + (batch_size, horizon, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.out_p_grad = torch.zeros( + (batch_size, horizon, 3), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self.out_q_grad = torch.zeros( + (batch_size, horizon, 4), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + if self._run_weight_vec is None or self._run_weight_vec.shape[1] != horizon: + self._run_weight_vec = torch.ones( + (1, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.terminal and self.run_weight is not None: + self._run_weight_vec[:, :-1] *= self.run_weight + + self._batch_size = batch_size + self._horizon = horizon + + def _forward_goal_distribution(self, ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot): + ee_goal_pos = ee_goal_pos.unsqueeze(1) + ee_goal_pos = ee_goal_pos.unsqueeze(1) + ee_goal_rot = ee_goal_rot.unsqueeze(1) + ee_goal_rot = ee_goal_rot.unsqueeze(1) + error, rot_error, pos_error = self._forward_single_goal( + ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot + ) + min_idx = torch.argmin(error[:, :, -1], dim=0) + min_idx = min_idx.unsqueeze(1).expand(error.shape[1], error.shape[2]) + if len(min_idx.shape) == 2: + min_idx = min_idx[0, 0] + error = error[min_idx] + rot_error = rot_error[min_idx] + pos_error = pos_error[min_idx] + return error, rot_error, pos_error, min_idx + + def _forward_single_goal(self, ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot): + # b, h, _ = ee_pos_batch.shape + d_g_ee = ee_pos_batch - ee_goal_pos + + position_err = torch.norm(self.pos_weight * d_g_ee, dim=-1) + + goal_dist = position_err # .clone() + rot_err = OrientationError.apply(ee_goal_rot, ee_rot_batch, ee_rot_batch.clone()).squeeze( + -1 + ) + + rot_err_c = rot_err.clone() + goal_dist_c = goal_dist.clone() + # clamp: + if self.vec_convergence[1] > 0.0: + position_err = torch.where( + position_err > self.vec_convergence[1], position_err, position_err * 0.0 + ) + if self.vec_convergence[0] > 0.0: + rot_err = torch.where(rot_err > self.vec_convergence[0], rot_err, rot_err * 0.0) + + # rot_err = torch.norm(goal_orient_vec, dim = -1) + cost = self.weight[0] * rot_err + self.weight[1] * position_err + + # dimension should be batch * traj_length + return cost, rot_err_c, goal_dist_c + + def _forward_pytorch(self, ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot): + if self.cost_type == PoseErrorType.SINGLE_GOAL: + cost, r_err, g_dist = self._forward_single_goal( + ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot + ) + elif self.cost_type == PoseErrorType.BATCH_GOAL: + cost, r_err, g_dist = self._forward_single_goal( + ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot + ) + else: + cost, r_err, g_dist, _ = self._forward_goal_distribution( + ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot + ) + if self.terminal and self.run_weight is not None: + cost[:, :-1] *= self.run_weight + return cost, r_err, g_dist + + def _update_cost_type(self, ee_goal_pos, ee_pos_batch, num_goals): + d_g = len(ee_goal_pos.shape) + b_sze = ee_goal_pos.shape[0] + if d_g == 2 and b_sze == 1: # 1, 3 + self.cost_type = PoseErrorType.SINGLE_GOAL + elif d_g == 2 and b_sze == ee_pos_batch.shape[0]: # b, 3 +
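# Goal-shape dispatch (illustrative shapes): (1, 3) -> SINGLE_GOAL, one goal + # for all queries; (b, 3) -> BATCH_GOAL, one goal per batch item; + # (b, n_goalset, 3) -> GOALSET, shortest distance over a goal set; a 4-D + # goal tensor -> BATCH_GOALSET, one goal set per batch item. +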
self.cost_type = PoseErrorType.BATCH_GOAL + elif d_g == 3: + self.cost_type = PoseErrorType.GOALSET + elif d_g == 4 and b_sze == ee_pos_batch.shape[0]: + self.cost_type = PoseErrorType.BATCH_GOALSET + + def forward_out_distance( + self, ee_pos_batch, ee_rot_batch, goal: Goal, link_name: Optional[str] = None + ): + if link_name is None: + goal_pose = goal.goal_pose + else: + goal_pose = goal.links_goal_pose[link_name] + + ee_goal_pos = goal_pose.position + ee_goal_rot = goal_pose.quaternion + num_goals = goal_pose.n_goalset + self._update_cost_type(ee_goal_pos, ee_pos_batch, num_goals) + + b, h, _ = ee_pos_batch.shape + + self.update_batch_size(b, h) + + distance, g_dist, r_err, idx = PoseErrorDistance.apply( + ee_pos_batch, # .view(-1, 3).contiguous(), + ee_goal_pos, + ee_rot_batch, # .view(-1, 4).contiguous(), + ee_goal_rot, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + goal.batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + ) + # print(goal.batch_pose_idx.shape) + cost = distance + + return cost, r_err, g_dist + + def forward(self, ee_pos_batch, ee_rot_batch, goal: Goal, link_name: Optional[str] = None): + if link_name is None: + goal_pose = goal.goal_pose + else: + goal_pose = goal.links_goal_pose[link_name] + + ee_goal_pos = goal_pose.position + ee_goal_rot = goal_pose.quaternion + num_goals = goal_pose.n_goalset + self._update_cost_type(ee_goal_pos, ee_pos_batch, num_goals) + + b, h, _ = ee_pos_batch.shape + self.update_batch_size(b, h) + # return self.out_distance + # print(b,h, ee_goal_pos.shape) + if self.return_loss: + distance = PoseLoss.apply( + ee_pos_batch, + ee_goal_pos, + ee_rot_batch, # .view(-1, 4).contiguous(), + ee_goal_rot, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + goal.batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + ) + else: + distance = PoseError.apply( + ee_pos_batch, + ee_goal_pos, + ee_rot_batch, # .view(-1, 4).contiguous(), + ee_goal_rot, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + goal.batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + ) + + cost = distance + + # print(cost.shape) + return cost + + def forward_pose( + self, + goal_pose: Pose, + query_pose: Pose, + batch_pose_idx: torch.Tensor, + mode: PoseErrorType = PoseErrorType.BATCH_GOAL, + ): + ee_goal_pos = goal_pose.position + ee_goal_quat = goal_pose.quaternion + self.cost_type = mode + + self.update_batch_size(query_pose.position.shape[0], query_pose.position.shape[1]) + b = query_pose.position.shape[0] + h = query_pose.position.shape[1] + num_goals = 1 + if self.return_loss: + distance = PoseLoss.apply( + query_pose.position.unsqueeze(1), + ee_goal_pos, +
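# unsqueeze(1) inserts a horizon axis so single query poses match the + # (batch, horizon, 3) / (batch, horizon, 4) layout the distance kernel + # expects elsewhere in this file. +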
query_pose.quaternion.unsqueeze(1), + ee_goal_quat, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + ) + else: + distance = PoseError.apply( + query_pose.position.unsqueeze(1), + ee_goal_pos, + query_pose.quaternion.unsqueeze(1), + ee_goal_quat, + self.vec_weight, + self.weight, + self._vec_convergence, + self._run_weight_vec, + self.run_vec_weight, + batch_pose_idx, + self.out_distance, + self.out_position_distance, + self.out_rotation_distance, + self.out_p_vec, + self.out_q_vec, + self.out_idx, + self.out_p_grad, + self.out_q_grad, + b, + h, + self.cost_type.value, + num_goals, + self.use_metric, + ) + return distance diff --git a/src/curobo/rollout/cost/primitive_collision_cost.py b/src/curobo/rollout/cost/primitive_collision_cost.py new file mode 100644 index 00000000..7f9f6f22 --- /dev/null +++ b/src/curobo/rollout/cost/primitive_collision_cost.py @@ -0,0 +1,214 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import Optional, Union + +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollision +from curobo.rollout.cost.cost_base import CostBase, CostConfig +from curobo.rollout.dynamics_model.integration_utils import interpolate_kernel, sum_matrix + + +@dataclass +class PrimitiveCollisionCostConfig(CostConfig): + """Create Collision Cost Configuration.""" + + #: WorldCollision instance to use for distance queries. + world_coll_checker: Optional[WorldCollision] = None + + #: Sweep for collisions between timesteps in a trajectory. + use_sweep: bool = False + use_sweep_kernel: bool = False + sweep_steps: int = 4 + + #: Speed metric scales the collision distance by sphere velocity (similar to CHOMP Planner + #: ICRA'09). This prevents the optimizer from speeding through obstacles to minimize cost and + #: instead encourages the robot to move around the obstacle. + use_speed_metric: bool = False + + #: dt to use for computation of velocity and acceleration through central difference for + #: speed metric. Value less than 1 is better as that leads to different scaling between + #: acceleration and velocity. + speed_dt: Union[torch.Tensor, float] = 0.01 + + #: The distance outside collision at which to activate the cost. Having a non-zero value enables + #: the robot to move slowly when within this distance to an obstacle. This enables our + #: post optimization interpolation to not hit any obstacles. + activation_distance: Union[torch.Tensor, float] = 0.0 + + #: Setting this flag to true will sum the distance across spheres of the robot. 
+    sum_distance: bool = True
+
+    def __post_init__(self):
+        if isinstance(self.speed_dt, float):
+            self.speed_dt = self.tensor_args.to_device([self.speed_dt])
+        if isinstance(self.activation_distance, float):
+            self.activation_distance = self.tensor_args.to_device([self.activation_distance])
+        return super().__post_init__()
+
+
+class PrimitiveCollisionCost(CostBase, PrimitiveCollisionCostConfig):
+    def __init__(self, config: PrimitiveCollisionCostConfig):
+        """Creates a primitive collision cost instance.
+
+        See note on :ref:`collision_checking_note` for details on the cost formulation.
+
+        Args:
+            config: Cost configuration parameters.
+        """
+
+        PrimitiveCollisionCostConfig.__init__(self, **vars(config))
+        CostBase.__init__(self)
+        self._og_speed_dt = self.speed_dt.clone()
+        self.batch_size = -1
+        self._horizon = -1
+        self._n_spheres = -1
+
+        self.t_mat = None
+        if self.classify:
+            self.coll_check_fn = self.world_coll_checker.get_sphere_collision
+            self.sweep_check_fn = self.world_coll_checker.get_swept_sphere_collision
+        else:
+            self.coll_check_fn = self.world_coll_checker.get_sphere_distance
+            self.sweep_check_fn = self.world_coll_checker.get_swept_sphere_distance
+        self.sampled_spheres = None
+        self.sum_mat = None
+        if self.use_sweep:
+            # TODO: Implement sweep for nvblox collision checker; until then
+            # the swept-sphere kernel path is used for all checker types.
+            self.forward = self.sweep_kernel_fn
+        else:
+            self.forward = self.discrete_fn
+        self.int_mat = None
+        self._fd_matrix = None
+        self._collision_query_buffer = CollisionQueryBuffer()
+
+    def sweep_kernel_fn(self, robot_spheres_in, env_query_idx: Optional[torch.Tensor] = None):
+        self._collision_query_buffer.update_buffer_shape(
+            robot_spheres_in.shape, self.tensor_args, self.world_coll_checker.collision_types
+        )
+        dist = self.sweep_check_fn(
+            robot_spheres_in,
+            self._collision_query_buffer,
+            self.weight,
+            sweep_steps=self.sweep_steps,
+            activation_distance=self.activation_distance,
+            speed_dt=self.speed_dt,
+            enable_speed_metric=self.use_speed_metric,
+            env_query_idx=env_query_idx,
+            return_loss=self.return_loss,
+        )
+        if self.classify:
+            cost = weight_collision(dist, self.weight, self.sum_distance)
+        else:
+            cost = weight_distance(dist, self.weight, self.sum_distance)
+        return cost
+
+    def sweep_fn(self, robot_spheres_in, env_query_idx: Optional[torch.Tensor] = None):
+        batch_size, horizon, n_spheres, _ = robot_spheres_in.shape
+        # add intermediate spheres to account for discretization:
+        new_horizon = (horizon - 1) * self.sweep_steps
+        if self.int_mat is None:
+            self.int_mat = interpolate_kernel(horizon, self.sweep_steps, self.tensor_args)
+            self.int_mat_t = self.int_mat.transpose(0, 1)
+            self.int_sum_mat = sum_matrix(horizon, self.sweep_steps, self.tensor_args)
+        sampled_spheres = (
+            (robot_spheres_in.transpose(1, 2).transpose(2, 3) @ self.int_mat_t)
+            .transpose(2, 3)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+        self._collision_query_buffer.update_buffer_shape(
+            sampled_spheres.shape, self.tensor_args, self.world_coll_checker.collision_types
+        )
+        dist = self.coll_check_fn(
+            sampled_spheres.contiguous(),
+            self._collision_query_buffer,
+            self.weight,
+            activation_distance=self.activation_distance,
+            env_query_idx=env_query_idx,
+            return_loss=self.return_loss,
+        )
+        dist = dist.view(batch_size,
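+            # remaining dims of the view: new_horizon interpolated sphere
+            # sets, i.e. (horizon - 1) * sweep_steps, with n_spheres per set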
new_horizon, n_spheres) + + if self.classify: + cost = weight_sweep_collision(self.int_sum_mat, dist, self.weight, self.sum_distance) + else: + cost = weight_sweep_distance(self.int_sum_mat, dist, self.weight, self.sum_distance) + + return cost + + def discrete_fn(self, robot_spheres_in, env_query_idx: Optional[torch.Tensor] = None): + self._collision_query_buffer.update_buffer_shape( + robot_spheres_in.shape, self.tensor_args, self.world_coll_checker.collision_types + ) + dist = self.coll_check_fn( + robot_spheres_in, + self._collision_query_buffer, + self.weight, + env_query_idx=env_query_idx, + activation_distance=self.activation_distance, + return_loss=self.return_loss, + ) + + if self.classify: + cost = weight_collision(dist, self.weight, self.sum_distance) + else: + cost = weight_distance(dist, self.weight, self.sum_distance) + return cost + + def update_dt(self, dt: Union[float, torch.Tensor]): + self.speed_dt[:] = dt # / self._og_speed_dt + return super().update_dt(dt) + + def get_gradient_buffer(self): + return self._collision_query_buffer.get_gradient_buffer() + + +@torch.jit.script +def weight_sweep_distance(int_mat, dist, weight, sum_cost: bool): + dist = torch.sum(dist, dim=-1) + dist = dist @ int_mat + return dist + + +@torch.jit.script +def weight_sweep_collision(int_mat, dist, weight, sum_cost: bool): + dist = torch.sum(dist, dim=-1) + dist = torch.where(dist > 0, dist + 1.0, dist) + dist = dist @ int_mat + return dist + + +@torch.jit.script +def weight_distance(dist, weight, sum_cost: bool): + if sum_cost: + dist = torch.sum(dist, dim=-1) + return dist + + +@torch.jit.script +def weight_collision(dist, weight, sum_cost: bool): + if sum_cost: + dist = torch.sum(dist, dim=-1) + dist = torch.where(dist > 0, dist + 1.0, dist) + return dist diff --git a/src/curobo/rollout/cost/projected_dist_cost.py b/src/curobo/rollout/cost/projected_dist_cost.py new file mode 100644 index 00000000..27eb8f50 --- /dev/null +++ b/src/curobo/rollout/cost/projected_dist_cost.py @@ -0,0 +1,69 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +""" +Distance cost projected into the null-space of the Jacobian +""" + +# Standard Library +from dataclasses import dataclass +from enum import Enum + +# Third Party +import torch + +# Local Folder +from .dist_cost import DistCost, DistCostConfig + + +class ProjType(Enum): + IDENTITY = 0 + PSEUDO_INVERSE = 1 + + +@dataclass +class ProjectedDistCostConfig(DistCostConfig): + eps: float = 1e-4 + proj_type: ProjType = ProjType.IDENTITY + + def __post_init__(self): + return super().__post_init__() + + +class ProjectedDistCost(DistCost, ProjectedDistCostConfig): + def __init__(self, config: ProjectedDistCostConfig): + ProjectedDistCostConfig.__init__(self, **vars(config)) + DistCost.__init__(self) + self.I = torch.eye(self.dof, device=self.tensor_args.device, dtype=self.tensor_args.dtype) + self.task_I = torch.eye(6, device=self.tensor_args.device, dtype=self.tensor_args.dtype) + + def forward(self, disp_vec, jac_batch=None): + disp_vec = self.vec_weight * disp_vec + + if self.proj_type == ProjType.PSEUDO_INVERSE: + disp_vec_projected = self.get_pinv_null_disp(disp_vec, jac_batch) + elif self.proj_type == ProjType.IDENTITY: + disp_vec_projected = disp_vec + + return super().forward(disp_vec_projected) + + def get_pinv_null_disp(self, disp_vec, jac_batch): + jac_batch_t = jac_batch.transpose(-2, -1) + + J_J_t = torch.matmul(jac_batch, jac_batch_t) + + J_pinv = jac_batch_t @ torch.inverse(J_J_t + self.eps * self.task_I.expand_as(J_J_t)) + + J_pinv_J = torch.matmul(J_pinv, jac_batch) + + null_proj = self.I.expand_as(J_pinv_J) - J_pinv_J + + null_disp = torch.matmul(null_proj, disp_vec.unsqueeze(-1)).squeeze(-1) + return null_disp diff --git a/src/curobo/rollout/cost/self_collision_cost.py b/src/curobo/rollout/cost/self_collision_cost.py new file mode 100644 index 00000000..c4469f9c --- /dev/null +++ b/src/curobo/rollout/cost/self_collision_cost.py @@ -0,0 +1,79 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import Optional + +# Third Party +import torch + +# CuRobo +from curobo.cuda_robot_model.types import SelfCollisionKinematicsConfig +from curobo.curobolib.geom import SelfCollisionDistance + +# Local Folder +from .cost_base import CostBase, CostConfig + + +@dataclass +class SelfCollisionCostConfig(CostConfig): + self_collision_kin_config: Optional[SelfCollisionKinematicsConfig] = None + + def __post_init__(self): + return super().__post_init__() + + +class SelfCollisionCost(CostBase, SelfCollisionCostConfig): + def __init__(self, config: SelfCollisionCostConfig): + SelfCollisionCostConfig.__init__(self, **vars(config)) + CostBase.__init__(self) + self._batch_size = None + + def update_batch_size(self, robot_spheres): + # Assuming n stays constant + # TODO: use collision buffer here? 
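+        # Buffers below are (re)allocated only when the incoming sphere tensor
+        # changes shape, so steady-state calls are allocation-free. Expected
+        # shape (assumed from the unpack below): robot_spheres is
+        # (batch, horizon, n_spheres, k), with k presumably holding
+        # (x, y, z, radius).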
+ + if self._batch_size is None or self._batch_size != robot_spheres.shape: + b, h, n, k = robot_spheres.shape + self._out_distance = torch.zeros( + (b, h), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._out_vec = torch.zeros( + (b, h, n, k), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._batch_size = robot_spheres.shape + self._sparse_sphere_idx = torch.zeros( + (b, h, n), device=self.tensor_args.device, dtype=torch.uint8 + ) + + def forward(self, robot_spheres): + self.update_batch_size(robot_spheres) + + dist = SelfCollisionDistance.apply( + self._out_distance, + self._out_vec, + self._sparse_sphere_idx, + robot_spheres, + self.self_collision_kin_config.offset, + self.weight, + self.self_collision_kin_config.collision_matrix, + self.self_collision_kin_config.thread_location, + self.self_collision_kin_config.thread_max, + self.self_collision_kin_config.checks_per_thread, + # False, + self.self_collision_kin_config.experimental_kernel, + self.return_loss, + ) + + if self.classify: + dist = torch.where(dist > 0, dist + 1.0, dist) + + return dist diff --git a/src/curobo/rollout/cost/stop_cost.py b/src/curobo/rollout/cost/stop_cost.py new file mode 100644 index 00000000..f2aa54ce --- /dev/null +++ b/src/curobo/rollout/cost/stop_cost.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+# Standard Library
+from dataclasses import dataclass
+from typing import Optional
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.rollout.dynamics_model.kinematic_model import TimeTrajConfig
+
+# Local Folder
+from .cost_base import CostBase, CostConfig
+
+
+@dataclass
+class StopCostConfig(CostConfig):
+    max_limit: Optional[float] = None
+    max_nlimit: Optional[float] = None
+    dt_traj_params: Optional[TimeTrajConfig] = None
+    horizon: int = 1
+
+    def __post_init__(self):
+        return super().__post_init__()
+
+
+class StopCost(CostBase, StopCostConfig):
+    def __init__(self, config: StopCostConfig):
+        StopCostConfig.__init__(self, **vars(config))
+        CostBase.__init__(self)
+        # lower-triangular (transposed) matrix that accumulates per-step
+        # velocity deltas along the horizon; computed once and shared by both
+        # branches below:
+        sum_matrix = torch.tril(
+            torch.ones(
+                (self.horizon, self.horizon),
+                device=self.tensor_args.device,
+                dtype=self.tensor_args.dtype,
+            )
+        ).T
+        traj_dt = self.tensor_args.to_device(self.dt_traj_params.get_dt_array(self.horizon))
+        if self.max_nlimit is not None:
+            # per-timestep maximum acceleration: the velocity bound grows by
+            # max_nlimit * dt at every step
+            delta_vel = traj_dt * self.max_nlimit
+            self.max_vel = (sum_matrix @ delta_vel).unsqueeze(-1)
+        elif self.max_limit is not None:
+            delta_vel = torch.ones_like(traj_dt) * self.max_limit
+            self.max_vel = (sum_matrix @ delta_vel).unsqueeze(-1)
+
+    def forward(self, vels):
+        vel_abs = torch.abs(vels)
+        vel_abs = torch.nn.functional.relu(vel_abs - self.max_vel)
+
+        cost = self.weight * (torch.sum(vel_abs**2, dim=-1))
+
+        return cost
diff --git a/src/curobo/rollout/cost/straight_line_cost.py b/src/curobo/rollout/cost/straight_line_cost.py
new file mode 100644
index 00000000..15abe17a
--- /dev/null
+++ b/src/curobo/rollout/cost/straight_line_cost.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# + + +# Third Party +import torch + +# Local Folder +from .cost_base import CostBase, CostConfig + + +@torch.jit.script +def st_cost(ee_pos_batch, vec_weight, weight): + ee_plus_one = torch.roll(ee_pos_batch, 1, dims=1) + + xdot_current = ee_pos_batch - ee_plus_one + 1e-8 + + err_vec = vec_weight * xdot_current / 0.02 + error = torch.sum(torch.square(err_vec), dim=-1) + + # compute distance vector + cost = weight * error + return cost + + +class StraightLineCost(CostBase): + def __init__(self, config: CostConfig): + CostBase.__init__(self, config) + + self.vel_idxs = torch.arange( + self.dof, 2 * self.dof, dtype=torch.long, device=self.tensor_args.device + ) + self.I = torch.eye(self.dof, device=self.tensor_args.device, dtype=self.tensor_args.dtype) + + def forward(self, ee_pos_batch): + cost = st_cost(ee_pos_batch, self.vec_weight, self.weight) + return cost diff --git a/src/curobo/rollout/cost/zero_cost.py b/src/curobo/rollout/cost/zero_cost.py new file mode 100644 index 00000000..c1228531 --- /dev/null +++ b/src/curobo/rollout/cost/zero_cost.py @@ -0,0 +1,116 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import torch + +# Local Folder +from .cost_base import CostBase + + +@torch.jit.script +def squared_sum(cost: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + # return weight * torch.square(torch.linalg.norm(cost, dim=-1, ord=1)) + # return weight * torch.sum(torch.square(cost), dim=-1) + + # return torch.sum(torch.abs(cost) * weight, dim=-1) + return torch.sum(torch.square(cost) * weight, dim=-1) + + +@torch.jit.script +def run_squared_sum( + cost: torch.Tensor, weight: torch.Tensor, run_weight: torch.Tensor +) -> torch.Tensor: + # return torch.sum(torch.abs(cost)* weight * run_weight.unsqueeze(-1), dim=-1) + ## below is smaller compute but more kernels + return torch.sum(torch.square(cost) * weight * run_weight.unsqueeze(-1), dim=-1) + + # return torch.sum(torch.square(cost), dim=-1) * weight * run_weight + + +@torch.jit.script +def backward_squared_sum(cost_vec, w): + return 2.0 * w * cost_vec # * g_out.unsqueeze(-1) + # return w * g_out.unsqueeze(-1) + + +@torch.jit.script +def backward_run_squared_sum(cost_vec, w, r_w): + return 2.0 * w * r_w.unsqueeze(-1) * cost_vec # * g_out.unsqueeze(-1) + # return w * r_w.unsqueeze(-1) * cost_vec * g_out.unsqueeze(-1) + + +class SquaredSum(torch.autograd.Function): + @staticmethod + def forward( + ctx, + cost_vec, + weight, + ): + cost = squared_sum(cost_vec, weight) + ctx.save_for_backward(cost_vec, weight) + return cost + + @staticmethod + def backward(ctx, grad_out_cost): + (cost_vec, w) = ctx.saved_tensors + c_grad = None + if ctx.needs_input_grad[0]: + c_grad = backward_squared_sum(cost_vec, w) + return c_grad, None + + +class RunSquaredSum(torch.autograd.Function): + @staticmethod + def forward( + ctx, + cost_vec, + weight, + run_weight, + ): + cost = run_squared_sum(cost_vec, weight, run_weight) + ctx.save_for_backward(cost_vec, weight, run_weight) + return cost + + @staticmethod + def backward(ctx, grad_out_cost): + (cost_vec, w, r_w) = ctx.saved_tensors + c_grad = None + if 
ctx.needs_input_grad[0]: + c_grad = backward_run_squared_sum(cost_vec, w, r_w) + return c_grad, None, None + + +class ZeroCost(CostBase): + """Zero Cost""" + + def forward(self, x, goal_dist): + err = x + + if self.max_value is not None: + err = torch.nn.functional.relu(torch.abs(err) - self.max_value) + + if self.hinge_value is not None: + err = torch.where(goal_dist <= self.hinge_value, err, self._z_scalar) # soft hinge + if self.threshold_value is not None: + err = torch.where(err <= self.distance_threshold, self._z_scalar, err) + if not self.terminal: # or self.run_weight is not None: + cost = SquaredSum.apply(err, self.weight) + else: + if self._run_weight_vec is None or self._run_weight_vec.shape[1] != err.shape[1]: + self._run_weight_vec = torch.ones( + (1, err.shape[1]), device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self._run_weight_vec[:, 1:-1] *= self.run_weight + cost = RunSquaredSum.apply( + err, self.weight, self._run_weight_vec + ) # cost * self._run_weight_vec + + return cost diff --git a/src/curobo/rollout/dynamics_model/__init__.py b/src/curobo/rollout/dynamics_model/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/rollout/dynamics_model/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# diff --git a/src/curobo/rollout/dynamics_model/integration_utils.py b/src/curobo/rollout/dynamics_model/integration_utils.py new file mode 100644 index 00000000..c7e9e994 --- /dev/null +++ b/src/curobo/rollout/dynamics_model/integration_utils.py @@ -0,0 +1,905 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Standard Library +from typing import List + +# Third Party +import torch +from packaging import version + +# CuRobo +from curobo.curobolib.tensor_step import ( + tensor_step_acc_fwd, + tensor_step_acc_idx_fwd, + tensor_step_pos_clique_bwd, + tensor_step_pos_clique_fwd, + tensor_step_pos_clique_idx_fwd, +) +from curobo.types.base import TensorDeviceType +from curobo.types.robot import JointState + + +def build_clique_matrix(horizon, dt, device="cpu", dtype=torch.float32): + diag_dt = torch.diag(1 / dt) + one_t = torch.ones(horizon - 1, device=device, dtype=dtype) + fd_mat_pos = torch.diag_embed(one_t, offset=-1) + + fd_mat_vel = -1.0 * torch.diag_embed(one_t, offset=-1) + one_t = torch.ones(horizon - 1, device=device, dtype=dtype) + + fd_mat_vel += torch.eye(horizon, device=device, dtype=dtype) + fd_mat_vel[0, 0] = 0.0 + fd_mat_vel = diag_dt @ fd_mat_vel + fd_mat_acc = diag_dt @ fd_mat_vel.clone() + + fd_mat = torch.cat((fd_mat_pos, fd_mat_vel, fd_mat_acc), dim=0) + return fd_mat + + +def build_fd_matrix( + horizon, + device="cpu", + dtype=torch.float32, + order=1, + PREV_STATE=False, + FULL_RANK=False, + SHIFT=False, +): + if PREV_STATE: + # build order 1 fd matrix of horizon+order size + fd1_mat = build_fd_matrix(horizon + order, device, dtype, order=1) + + # multiply order times to get fd_order matrix [h+order, h+order] + fd_mat = fd1_mat + fd_single = fd_mat.clone() + for _ in range(order - 1): + fd_mat = fd_single @ fd_mat + # return [horizon,h+order] + fd_mat = -1.0 * fd_mat[:horizon, :] + # fd_mat = torch.zeros((horizon, horizon + order),device=device, dtype=dtype) + # one_t = torch.ones(horizon, device=device, dtype=dtype) + # fd_mat[:horizon, :horizon] = torch.diag_embed(one_t) + # print(torch.diag_embed(one_t, offset=1).shape, fd_mat.shape) + # fd_mat += - torch.diag_embed(one_t, offset=1)[:-1,:] + + elif FULL_RANK: + fd_mat = torch.eye(horizon, device=device, dtype=dtype) + + one_t = torch.ones(horizon // 2, device=device, dtype=dtype) + fd_mat[: horizon // 2, : horizon // 2] = torch.diag_embed(one_t) + fd_mat[: horizon // 2 + 1, : horizon // 2 + 1] += -torch.diag_embed(one_t, offset=1) + one_t = torch.ones(horizon // 2, device=device, dtype=dtype) + fd_mat[horizon // 2 :, horizon // 2 :] += -torch.diag_embed(one_t, offset=-1) + fd_mat[horizon // 2, horizon // 2] = 0.0 + fd_mat[horizon // 2, horizon // 2 - 1] = -1.0 + fd_mat[horizon // 2, horizon // 2 + 1] = 1.0 + else: + fd_mat = torch.zeros((horizon, horizon), device=device, dtype=dtype) + if horizon > 1: + one_t = torch.ones(horizon - 1, device=device, dtype=dtype) + if not SHIFT: + fd_mat[: horizon - 1, : horizon - 1] = -1.0 * torch.diag_embed(one_t) + fd_mat += torch.diag_embed(one_t, offset=1) + else: + fd_mat[1:, : horizon - 1] = -1.0 * torch.diag_embed(one_t) + fd_mat[1:, 1:] += torch.diag_embed(one_t) + fd_og = fd_mat.clone() + for _ in range(order - 1): + fd_mat = fd_og @ fd_mat + # if order > 1: + # #print(order, fd_mat) + # for i in range(order): + # fd_mat[i,:] /= (2**(i+2)) + # #print(order, fd_mat[order]) + # #print(order, fd_mat) + + # fd_mat[:order] + # if order > 1: + # fd_mat[:order-1, :] = 0.0 + + # recreate this as a sparse tensor? 
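+    # Worked example (sketch): for horizon=3, order=1, SHIFT=False the code
+    # above builds the forward-difference matrix
+    #   [[-1,  1,  0],
+    #    [ 0, -1,  1],
+    #    [ 0,  0,  0]]
+    # so each row of fd_mat @ q is q[t+1] - q[t] (callers scale by 1/dt where
+    # a time derivative is needed); higher orders chain this matrix.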
+ # print(fd_mat) + # sparse_indices = [] + # sparse_values = [] + # for i in range(horizon-1): + # sparse_indices.extend([[i,i], [i,i+1]]) + # sparse_values.extend([-1.0, 1.0]) + # sparse_indices.extend([[horizon-1, horizon-1]]) + # sparse_values.extend([0.0]) + # fd_kernel = torch.sparse_coo_tensor(torch.tensor(sparse_indices).t(), + # torch.tensor(sparse_values), device=device, dtype=dtype) + # fd_mat = fd_kernel.to_dense() + return fd_mat + + +def build_int_matrix(horizon, diagonal=0, device="cpu", dtype=torch.float32, order=1, traj_dt=None): + integrate_matrix = torch.tril( + torch.ones((horizon, horizon), device=device, dtype=dtype), diagonal=diagonal + ) + chain_list = [torch.eye(horizon, device=device, dtype=dtype)] + if traj_dt is None: + chain_list.extend([integrate_matrix for i in range(order)]) + else: + diag_dt = torch.diag(traj_dt) + + for _ in range(order): + chain_list.append(integrate_matrix) + chain_list.append(diag_dt) + if len(chain_list) == 1: + integrate_matrix = chain_list[0] + elif version.parse(torch.__version__) < version.parse("1.9.0"): + integrate_matrix = torch.chain_matmul(*chain_list) + else: + integrate_matrix = torch.linalg.multi_dot(chain_list) + + return integrate_matrix + + +def build_start_state_mask(horizon, tensor_args: TensorDeviceType): + mask = torch.zeros((horizon, 1), device=tensor_args.device, dtype=tensor_args.dtype) + # n_mask = torch.eye(horizon, device=tensor_args.device, dtype=tensor_args.dtype) + n_mask = torch.diag_embed( + torch.ones((horizon - 1), device=tensor_args.device, dtype=tensor_args.dtype), offset=-1 + ) + mask[0, 0] = 1.0 + # n_mask[0,0] = 0.0 + return mask, n_mask + + +# @torch.jit.script +def tensor_step_jerk(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix=None): + # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Optional[Tensor]) -> Tensor + + # This is batch,n_dof + q = state[:, :n_dofs] + qd = state[:, n_dofs : 2 * n_dofs] + qdd = state[:, 2 * n_dofs : 3 * n_dofs] + + diag_dt = torch.diag(dt_h) + # qd_new = act + # integrate velocities: + qdd_new = qdd + torch.matmul(integrate_matrix, torch.matmul(diag_dt, act)) + qd_new = qd + torch.matmul(integrate_matrix, torch.matmul(diag_dt, qdd_new)) + q_new = q + torch.matmul(integrate_matrix, torch.matmul(diag_dt, qd_new)) + state_seq[:, :, :n_dofs] = q_new + state_seq[:, :, n_dofs : n_dofs * 2] = qd_new + state_seq[:, :, n_dofs * 2 : n_dofs * 3] = qdd_new + + return state_seq + + +# @torch.jit.script +def euler_integrate(q_0, u, diag_dt, integrate_matrix): + # q_new = q_0 + torch.matmul(integrate_matrix, torch.matmul(diag_dt, u)) + q_new = q_0 + torch.matmul(integrate_matrix, u) + # q_new = torch.addmm(q_0,integrate_matrix,torch.matmul(diag_dt, u)) + return q_new + + +# @torch.jit.script +def tensor_step_acc(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix=None): + # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Optional[Tensor]) -> Tensor + # This is batch,n_dof + q = state[..., :n_dofs] + qd = state[..., n_dofs : 2 * n_dofs] + qdd_new = act + diag_dt = torch.diag(dt_h) + diag_dt_2 = torch.diag(dt_h**2) + qd_new = euler_integrate(qd, qdd_new, diag_dt, integrate_matrix) + q_new = euler_integrate(q, qd_new, diag_dt, integrate_matrix) + + state_seq[..., n_dofs * 2 : n_dofs * 3] = qdd_new + state_seq[..., n_dofs : n_dofs * 2] = qd_new + # state_seq[:,1:, n_dofs: n_dofs * 2] = qd_new[:,:-1,:] + # state_seq[:,0:1, n_dofs: n_dofs * 2] = qd + # state_seq[:,1:, :n_dofs] = q_new[:,:-1,:] #+ 0.5 * torch.matmul(diag_dt_2,qdd_new) + state_seq[..., 
:n_dofs] = q_new # + 0.5 * torch.matmul(diag_dt_2,qdd_new) + # state_seq[:,0:1, :n_dofs] = q #state[...,:n_dofs] + + return state_seq + + +@torch.jit.script +def jit_tensor_step_pos_clique_contiguous(pos_act, start_position, mask, n_mask, fd_1, fd_2, fd_3): + state_position = (start_position.unsqueeze(1).transpose(1, 2) @ mask.transpose(0, 1)) + ( + pos_act.transpose(1, 2) @ n_mask.transpose(0, 1) + ) + # state_position = mask @ start_position.unsqueeze(1) + n_mask @ pos_act + # print(state_position.shape, fd_1.shape) + # # below 3 can be done in parallel: + state_vel = (state_position @ fd_1.transpose(0, 1)).transpose(1, 2).contiguous() + state_acc = (state_position @ fd_2.transpose(0, 1)).transpose(1, 2).contiguous() + state_jerk = (state_position @ fd_3.transpose(0, 1)).transpose(1, 2).contiguous() + state_position = state_position.transpose(1, 2).contiguous() + return state_position, state_vel, state_acc, state_jerk + + +@torch.jit.script +def jit_tensor_step_pos_clique(pos_act, start_position, mask, n_mask, fd_1, fd_2, fd_3): + state_position = mask @ start_position.unsqueeze(1) + n_mask @ pos_act + state_vel = fd_1 @ state_position + state_acc = fd_2 @ state_position + state_jerk = fd_3 @ state_position + return state_position, state_vel, state_acc, state_jerk + + +@torch.jit.script +def jit_backward_pos_clique(grad_p, grad_v, grad_a, grad_j, n_mask, fd_1, fd_2, fd_3): + p_grad = ( + grad_p + + (fd_3).transpose(-1, -2) @ grad_j + + (fd_2).transpose(-1, -2) @ grad_a + + (fd_1).transpose(-1, -2) @ grad_v + ) + u_grad = (n_mask).transpose(-1, -2) @ p_grad + # u_grad = n_mask @ p_grad + # p_grad = fd_3 @ grad_j + fd_2 @ grad_a + fd_1 @ grad_v + grad_p + # u_grad = n_mask @ p_grad + + return u_grad + + +@torch.jit.script +def jit_backward_pos_clique_contiguous(grad_p, grad_v, grad_a, grad_j, n_mask, fd_1, fd_2, fd_3): + p_grad = grad_p + ( + grad_j.transpose(-1, -2) @ fd_3 + + grad_a.transpose(-1, -2) @ fd_2 + + grad_v.transpose(-1, -2) @ fd_1 + ).transpose(-1, -2) + # u_grad = (n_mask).transpose(-1, -2) @ p_grad + + u_grad = (p_grad.transpose(-1, -2) @ n_mask).transpose(-1, -2).contiguous() + return u_grad + + +class CliqueTensorStep(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + mask, + n_mask, + fd_1, + fd_2, + fd_3, + ): + state_position, state_vel, state_acc, state_jerk = jit_tensor_step_pos_clique( + u_act, start_position, mask, n_mask, fd_1, fd_2, fd_3 + ) + ctx.save_for_backward(n_mask, fd_1, fd_2, fd_3) + return state_position, state_vel, state_acc, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + (n_mask, fd_1, fd_2, fd_3) = ctx.saved_tensors + if ctx.needs_input_grad[0]: + u_grad = jit_backward_pos_clique( + grad_out_p, grad_out_v, grad_out_a, grad_out_j, n_mask, fd_1, fd_2, fd_3 + ) + return u_grad, None, None, None, None, None, None + + +class CliqueTensorStepKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + ( + state_position, + state_velocity, + state_acceleration, + state_jerk, + ) = tensor_step_pos_clique_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + ) + ctx.save_for_backward(traj_dt, 
out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + + if ctx.needs_input_grad[0]: + (traj_dt, out_grad_position) = ctx.saved_tensors + + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a, + grad_out_j, + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + ) + return ( + u_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class CliqueTensorStepIdxKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + ( + state_position, + state_velocity, + state_acceleration, + state_jerk, + ) = tensor_step_pos_clique_idx_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + ) + + ctx.save_for_backward(traj_dt, out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + + if ctx.needs_input_grad[0]: + (traj_dt, out_grad_position) = ctx.saved_tensors + + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a, + grad_out_j, + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + ) + return ( + u_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) # , None, None, None, None,None + + +class CliqueTensorStepCentralDifferenceKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + ( + state_position, + state_velocity, + state_acceleration, + state_jerk, + ) = tensor_step_pos_clique_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + 0, + ) + ctx.save_for_backward(traj_dt, out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + + if ctx.needs_input_grad[0]: + (traj_dt, out_grad_position) = ctx.saved_tensors + + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a.contiguous(), + grad_out_j.contiguous(), + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + 0, + ) + return ( + u_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class CliqueTensorStepIdxCentralDifferenceKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + ( + state_position, + state_velocity, + state_acceleration, + 
state_jerk, + ) = tensor_step_pos_clique_idx_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + 0, + ) + + ctx.save_for_backward(traj_dt, out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + + if ctx.needs_input_grad[0]: + (traj_dt, out_grad_position) = ctx.saved_tensors + + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a.contiguous(), + grad_out_j.contiguous(), + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + 0, + ) + return ( + u_grad, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) # , None, None, None, None,None + + +class CliqueTensorStepCoalesceKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + state_position, state_velocity, state_acceleration, state_jerk = tensor_step_pos_clique_fwd( + out_position.transpose(-1, -2).contiguous(), + out_velocity.transpose(-1, -2).contiguous(), + out_acceleration.transpose(-1, -2).contiguous(), + out_jerk.transpose(-1, -2).contiguous(), + u_act.transpose(-1, -2).contiguous(), + start_position, + start_velocity, + start_acceleration, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + ) + ctx.save_for_backward(traj_dt, out_grad_position) + return ( + state_position.transpose(-1, -2).contiguous(), + state_velocity.transpose(-1, -2).contiguous(), + state_acceleration.transpose(-1, -2).contiguous(), + state_jerk.transpose(-1, -2).contiguous(), + ) + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + (traj_dt, out_grad_position) = ctx.saved_tensors + if ctx.needs_input_grad[0]: + u_grad = tensor_step_pos_clique_bwd( + out_grad_position.transpose(-1, -2).contiguous(), + grad_out_p.transpose(-1, -2).contiguous(), + grad_out_v.transpose(-1, -2).contiguous(), + grad_out_a.transpose(-1, -2).contiguous(), + grad_out_j.transpose(-1, -2).contiguous(), + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + ) + return ( + u_grad.transpose(-1, -2).contiguous(), + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class AccelerationTensorStepKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + state_position, state_velocity, state_acceleration, state_jerk = tensor_step_acc_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + ) + ctx.save_for_backward(traj_dt, out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + (traj_dt, out_grad_position) = ctx.saved_tensors + if 
ctx.needs_input_grad[0]: + raise NotImplementedError() + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a, + grad_out_j, + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + ) + return u_grad, None, None, None, None, None, None, None, None, None + + +class AccelerationTensorStepIdxKernel(torch.autograd.Function): + @staticmethod + def forward( + ctx, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + out_position, + out_velocity, + out_acceleration, + out_jerk, + traj_dt, + out_grad_position, + ): + state_position, state_velocity, state_acceleration, state_jerk = tensor_step_acc_idx_fwd( + out_position, + out_velocity, + out_acceleration, + out_jerk, + u_act, + start_position, + start_velocity, + start_acceleration, + start_idx, + traj_dt, + out_position.shape[0], + out_position.shape[1], + out_position.shape[-1], + ) + ctx.save_for_backward(traj_dt, out_grad_position) + return state_position, state_velocity, state_acceleration, state_jerk + + @staticmethod + def backward(ctx, grad_out_p, grad_out_v, grad_out_a, grad_out_j): + u_grad = None + (traj_dt, out_grad_position) = ctx.saved_tensors + if ctx.needs_input_grad[0]: + raise NotImplementedError() + u_grad = tensor_step_pos_clique_bwd( + out_grad_position, + grad_out_p, + grad_out_v, + grad_out_a, + grad_out_j, + traj_dt, + out_grad_position.shape[0], + out_grad_position.shape[1], + out_grad_position.shape[2], + ) + return u_grad, None, None, None, None, None, None, None, None, None, None + + +# @torch.jit.script +def tensor_step_pos_clique( + state: JointState, + act: torch.Tensor, + state_seq: JointState, + mask_matrix: List[torch.Tensor], + fd_matrix: List[torch.Tensor], +): + ( + state_seq.position, + state_seq.velocity, + state_seq.acceleration, + state_seq.jerk, + ) = CliqueTensorStep.apply( + act, + state.position, + mask_matrix[0], + mask_matrix[1], + fd_matrix[0], + fd_matrix[1], + fd_matrix[2], + ) + return state_seq + + +def step_acc_semi_euler(state, act, diag_dt, n_dofs, integrate_matrix): + q = state[..., :n_dofs] + qd = state[..., n_dofs : 2 * n_dofs] + qdd_new = act + # diag_dt = torch.diag(dt_h) + qd_new = euler_integrate(qd, qdd_new, diag_dt, integrate_matrix) + q_new = euler_integrate(q, qd_new, diag_dt, integrate_matrix) + state_seq = torch.cat((q_new, qd_new, qdd_new), dim=-1) + return state_seq + + +# @torch.jit.script +def tensor_step_acc_semi_euler( + state, act, state_seq, diag_dt, integrate_matrix, integrate_matrix_pos +): + # type: (Tensor, Tensor, Tensor, int, Tensor, Optional[Tensor]) -> Tensor + # This is batch,n_dof + state = state.unsqueeze(1) + q = state.position # [..., :n_dofs] + qd = state.velocity # [..., n_dofs : 2 * n_dofs] + qdd_new = act + # diag_dt = torch.diag(dt_h) + qd_new = euler_integrate(qd, qdd_new, diag_dt, integrate_matrix) + q_new = euler_integrate(q, qd_new, diag_dt, integrate_matrix_pos) + state_seq.acceleration = qdd_new + state_seq.velocity = qd_new + state_seq.position = q_new + + return state_seq + + +# @torch.jit.script +def tensor_step_vel(state, act, state_seq, dt_h, n_dofs, integrate_matrix, fd_matrix): + # type: (Tensor, Tensor, Tensor, Tensor, int, Tensor, Tensor) -> Tensor + + # This is batch,n_dof + state_seq[:, 0:1, : n_dofs * 3] = state + q = state[..., :n_dofs] + + qd_new = act[:, :-1, :] + # integrate velocities: + dt_diag = torch.diag(dt_h) + state_seq[:, 1:, n_dofs : n_dofs * 2] = qd_new + qd = state_seq[:, :, n_dofs : n_dofs * 2] + + 
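+    # positions come from integrating the assembled velocity sequence; note
+    # that euler_integrate ignores its diag_dt argument (the dt-scaled variant
+    # is commented out there), so the per-step dt is assumed to be folded into
+    # integrate_matrix by the caller.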
q_new = euler_integrate(q, qd, dt_diag, integrate_matrix) + + state_seq[:, :, :n_dofs] = q_new + + qdd = (torch.diag(1 / dt_h)) @ fd_matrix @ qd + state_seq[:, 1:, n_dofs * 2 : n_dofs * 3] = qdd[:, :-1, :] + + return state_seq + + +# @torch.jit.script +def tensor_step_pos(state, act, state_seq, fd_matrix): + # This is batch,n_dof + state_seq.position[:, 0, :] = state.position + state_seq.velocity[:, 0, :] = state.velocity + state_seq.acceleration[:, 0, :] = state.acceleration + + # integrate velocities: + state_seq.position[:, 1:] = act[:, :-1, :] + + qd = fd_matrix @ state_seq.position # [:, :, :n_dofs] + state_seq.velocity[:, 1:] = qd[:, :-1, :] # qd_new + qdd = fd_matrix @ state_seq.velocity # [:, :, n_dofs : n_dofs * 2] + + state_seq.acceleration[:, 1:] = qdd[:, :-1, :] + # jerk = fd_matrix @ state_seq.acceleration + + return state_seq + + +# @torch.jit.script +def tensor_step_pos_ik(act, state_seq): + state_seq.position = act + return state_seq + + +def tensor_linspace(start_tensor, end_tensor, steps=10): + dist = end_tensor - start_tensor + interpolate_matrix = ( + torch.ones((steps), device=start_tensor.device, dtype=start_tensor.dtype) / steps + ) + cum_matrix = torch.cumsum(interpolate_matrix, dim=0) + + interp_tensor = start_tensor + cum_matrix * dist + + return interp_tensor + + +def sum_matrix(h, int_steps, tensor_args): + sum_mat = torch.zeros(((h - 1) * int_steps, h), **vars(tensor_args)) + for i in range(h - 1): + sum_mat[i * int_steps : i * int_steps + int_steps, i] = 1.0 + # hack: + # sum_mat[-1, -1] = 1.0 + return sum_mat + + +def interpolate_kernel(h, int_steps, tensor_args: TensorDeviceType): + mat = torch.zeros( + ((h - 1) * (int_steps), h), device=tensor_args.device, dtype=tensor_args.dtype + ) + delta = torch.arange(0, int_steps, device=tensor_args.device, dtype=tensor_args.dtype) / ( + int_steps - 1 + ) + for i in range(h - 1): + mat[i * int_steps : i * int_steps + int_steps, i] = delta.flip(0) + mat[i * int_steps : i * int_steps + int_steps, i + 1] = delta + return mat + + +def action_interpolate_kernel(h, int_steps, tensor_args: TensorDeviceType): + mat = torch.zeros( + ((h - 1) * (int_steps), h), device=tensor_args.device, dtype=tensor_args.dtype + ) + delta = torch.arange(0, int_steps - 2, device=tensor_args.device, dtype=tensor_args.dtype) / ( + int_steps - 1.0 - 2 + ) + for i in range(h - 1): + mat[i * int_steps : i * (int_steps) + int_steps - 1 - 2, i] = delta.flip(0)[1:] + mat[i * int_steps : i * (int_steps) + int_steps - 1 - 2, i + 1] = delta[1:] + mat[-3:, 1] = 1.0 + + return mat diff --git a/src/curobo/rollout/dynamics_model/kinematic_model.py b/src/curobo/rollout/dynamics_model/kinematic_model.py new file mode 100644 index 00000000..4cfa655f --- /dev/null +++ b/src/curobo/rollout/dynamics_model/kinematic_model.py @@ -0,0 +1,605 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Standard Library +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, List, Optional, Sequence, Union + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.rollout.dynamics_model.tensor_step import ( + TensorStepAccelerationKernel, + TensorStepPosition, + TensorStepPositionClique, + TensorStepPositionCliqueKernel, + TensorStepPositionTeleport, +) +from curobo.types.base import TensorDeviceType +from curobo.types.enum import StateType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.helpers import list_idx_if_not_none +from curobo.util.logger import log_error, log_info +from curobo.util.state_filter import FilterConfig, JointStateFilter + + +@dataclass +class TimeTrajConfig: + base_dt: float + base_ratio: float + max_dt: float + + def get_dt_array(self, num_points: int): + dt_array = [self.base_dt] * int(self.base_ratio * num_points) + + smooth_blending = torch.linspace( + self.base_dt, + self.max_dt, + steps=int((1 - self.base_ratio) * num_points), + ).tolist() + dt_array += smooth_blending + if len(dt_array) != num_points: + dt_array.insert(0, dt_array[0]) + return dt_array + + def update_dt( + self, + all_dt: float = None, + base_dt: float = None, + max_dt: float = None, + base_ratio: float = None, + ): + if all_dt is not None: + self.base_dt = all_dt + self.max_dt = all_dt + return + if base_dt is not None: + self.base_dt = base_dt + if base_ratio is not None: + self.base_ratio = base_ratio + if max_dt is not None: + self.max_dt = max_dt + + +@dataclass +class KinematicModelState(Sequence): + # TODO: subclass this from State + state_seq: JointState + ee_pos_seq: Optional[torch.Tensor] = None + ee_quat_seq: Optional[torch.Tensor] = None + robot_spheres: Optional[torch.Tensor] = None + link_pos_seq: Optional[torch.Tensor] = None + link_quat_seq: Optional[torch.Tensor] = None + lin_jac_seq: Optional[torch.Tensor] = None + ang_jac_seq: Optional[torch.Tensor] = None + link_names: Optional[List[str]] = None + + def __getitem__(self, idx): + d_list = [ + self.state_seq, + self.ee_pos_seq, + self.ee_quat_seq, + self.robot_spheres, + self.link_pos_seq, + self.link_quat_seq, + self.lin_jac_seq, + self.ang_jac_seq, + ] + idx_vals = list_idx_if_not_none(d_list, idx) + return KinematicModelState(*idx_vals, link_names=self.link_names) + + def __len__(self): + return len(self.state_seq) + + @property + def ee_pose(self) -> Pose: + return Pose(self.ee_pos_seq, self.ee_quat_seq, normalize_rotation=False) + + @property + def link_pose(self): + if self.link_names is not None: + link_pos_seq = self.link_pos_seq.contiguous() + link_quat_seq = self.link_quat_seq.contiguous() + link_poses = {} + for i, v in enumerate(self.link_names): + link_poses[v] = Pose( + link_pos_seq[..., i, :], link_quat_seq[..., i, :], normalize_rotation=False + ) + else: + link_poses = None + return link_poses + + +@dataclass(frozen=False) +class KinematicModelConfig: + robot_config: RobotConfig + dt_traj_params: TimeTrajConfig + tensor_args: TensorDeviceType + vel_scale: float = 1.0 + state_estimation_variance: float = 0.0 + batch_size: int = 1 + horizon: int = 5 + control_space: StateType = StateType.ACCELERATION + state_filter_cfg: Optional[FilterConfig] = None + teleport_mode: bool = False + return_full_act_buffer: bool = False + state_finite_difference_mode: str = "BACKWARD" + filter_robot_command: bool = False + # 
tensor_step_type: TensorStepType = TensorStepType.ACCELERATION + + @staticmethod + def from_dict( + data_dict_in, robot_cfg: Union[Dict, RobotConfig], tensor_args=TensorDeviceType() + ): + data_dict = deepcopy(data_dict_in) + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + data_dict["robot_config"] = robot_cfg + data_dict["dt_traj_params"] = TimeTrajConfig(**data_dict["dt_traj_params"]) + data_dict["control_space"] = StateType[data_dict["control_space"]] + data_dict["state_filter_cfg"] = FilterConfig.from_dict( + data_dict["state_filter_cfg"]["filter_coeff"], + enable=data_dict["state_filter_cfg"]["enable"], + dt=data_dict["dt_traj_params"].base_dt, + control_space=data_dict["control_space"], + tensor_args=tensor_args, + teleport_mode=data_dict["teleport_mode"], + ) + + return KinematicModelConfig(**data_dict, tensor_args=tensor_args) + + +class KinematicModel(KinematicModelConfig): + def __init__( + self, + kinematic_model_config: KinematicModelConfig, + ): + super().__init__(**vars(kinematic_model_config)) + self.dt = self.dt_traj_params.base_dt + self.robot_model = CudaRobotModel(self.robot_config.kinematics) + # update cspace to store joint names in the order given by robot model: + self.n_dofs = self.robot_model.get_dof() + self._use_clique = True + self._use_bmm_tensor_step = False + self._use_clique_kernel = True + self.d_state = 4 * self.n_dofs # + 1 + self.d_action = self.n_dofs + + # Variables for enforcing joint limits + self.joint_names = self.robot_model.joint_names + self.joint_limits = self.robot_model.get_joint_limits() + + # #pre-allocating memory for rollouts + self.state_seq = JointState.from_state_tensor( + torch.zeros( + self.batch_size, + self.horizon, + self.d_state, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ), + dof=int(self.d_state / 3), + ) + self.Z = torch.tensor([0.0], device=self.tensor_args.device, dtype=self.tensor_args.dtype) + + dt_array = self.dt_traj_params.get_dt_array(self.horizon) + self.traj_dt = torch.tensor( + dt_array, dtype=self.tensor_args.dtype, device=self.tensor_args.device + ) + # TODO: choose tensor integration type here: + if self.control_space == StateType.ACCELERATION: + # self._rollout_step_fn = TensorStepAcceleration(self.tensor_args, self._dt_h) + # self._cmd_step_fn = TensorStepAcceleration(self.tensor_args, self.traj_dt) + + self._rollout_step_fn = TensorStepAccelerationKernel( + self.tensor_args, self.traj_dt, self.n_dofs + ) + self._cmd_step_fn = TensorStepAccelerationKernel( + self.tensor_args, self.traj_dt, self.n_dofs + ) + elif self.control_space == StateType.VELOCITY: + raise NotImplementedError() + elif self.control_space == StateType.JERK: + raise NotImplementedError() + elif self.control_space == StateType.POSITION: + if self.teleport_mode: + self._rollout_step_fn = TensorStepPositionTeleport(self.tensor_args) + self._cmd_step_fn = TensorStepPositionTeleport(self.tensor_args) + else: + if self._use_clique: + if self._use_clique_kernel: + if self.state_finite_difference_mode == "BACKWARD": + finite_difference = -1 + elif self.state_finite_difference_mode == "CENTRAL": + finite_difference = 0 + else: + log_error( + "unknown state finite difference mode: " + + self.state_finite_difference_mode + ) + self._rollout_step_fn = TensorStepPositionCliqueKernel( + self.tensor_args, + self.traj_dt, + self.n_dofs, + finite_difference_mode=finite_difference, + filter_velocity=False, + filter_acceleration=False, + filter_jerk=False, + ) + self._cmd_step_fn = 
TensorStepPositionCliqueKernel( + self.tensor_args, + self.traj_dt, + self.n_dofs, + finite_difference_mode=finite_difference, + filter_velocity=False, + filter_acceleration=self.filter_robot_command, + filter_jerk=self.filter_robot_command, + ) + + else: + self._rollout_step_fn = TensorStepPositionClique( + self.tensor_args, self.traj_dt + ) + self._cmd_step_fn = TensorStepPositionClique(self.tensor_args, self.traj_dt) + else: + self._rollout_step_fn = TensorStepPosition(self.tensor_args, self.traj_dt) + self._cmd_step_fn = TensorStepPosition(self.tensor_args, self.traj_dt) + + self.update_batch_size(self.batch_size) + + self.state_filter = JointStateFilter(self.state_filter_cfg) + self._robot_cmd_state_seq = JointState.zeros( + (1, self.horizon, self.d_action), self.tensor_args + ) + self._cmd_batch_size = -1 + if not self.teleport_mode: + self._max_joint_vel = ( + self.get_state_bounds() + .velocity.view(2, self.d_action)[1, :] + .reshape(1, 1, self.d_action) + ) - 0.2 + self._max_joint_acc = self.get_state_bounds().acceleration[1, :] - 0.2 + self._max_joint_jerk = self.get_state_bounds().jerk[1, :] - 0.2 + + def update_traj_dt( + self, + dt: Union[float, torch.Tensor], + base_dt: Optional[float] = None, + max_dt: Optional[float] = None, + base_ratio: Optional[float] = None, + ): + self.dt_traj_params.update_dt(dt, base_dt, max_dt, base_ratio) + dt_array = self.dt_traj_params.get_dt_array(self.horizon) + self.traj_dt[:] = torch.tensor( + dt_array, dtype=self.tensor_args.dtype, device=self.tensor_args.device + ) + self._cmd_step_fn.update_dt(self.traj_dt) + self._rollout_step_fn.update_dt(self.traj_dt) + + def get_next_state(self, curr_state: torch.Tensor, act: torch.Tensor, dt): + """Does a single step from the current state + Args: + curr_state: current state + act: action + dt: time to integrate + Returns: + next_state + TODO: Move this into tensorstep class? 
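+            (Sketch of the semantics, inferred from the branches below: JERK,
+            ACCELERATION and VELOCITY integrate act into the [q, qd, qdd]
+            slices of curr_state with explicit Euler steps; POSITION
+            overwrites q and zeroes qd and qdd.)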
+        """
+
+        if self.control_space == StateType.JERK:
+            # qdd += jerk * dt
+            curr_state[2 * self.n_dofs : 3 * self.n_dofs] = (
+                curr_state[2 * self.n_dofs : 3 * self.n_dofs] + act * dt
+            )
+            # qd += qdd * dt
+            curr_state[self.n_dofs : 2 * self.n_dofs] = (
+                curr_state[self.n_dofs : 2 * self.n_dofs]
+                + curr_state[self.n_dofs * 2 : self.n_dofs * 3] * dt
+            )
+            # q += qd * dt
+            curr_state[: self.n_dofs] = (
+                curr_state[: self.n_dofs] + curr_state[self.n_dofs : 2 * self.n_dofs] * dt
+            )
+        elif self.control_space == StateType.ACCELERATION:
+            curr_state[2 * self.n_dofs : 3 * self.n_dofs] = act
+            curr_state[self.n_dofs : 2 * self.n_dofs] = (
+                curr_state[self.n_dofs : 2 * self.n_dofs]
+                + curr_state[self.n_dofs * 2 : self.n_dofs * 3] * dt
+            )
+            curr_state[: self.n_dofs] = (
+                curr_state[: self.n_dofs]
+                + curr_state[self.n_dofs : 2 * self.n_dofs] * dt
+                + 0.5 * act * dt * dt
+            )
+        elif self.control_space == StateType.VELOCITY:
+            curr_state[2 * self.n_dofs : 3 * self.n_dofs] = 0.0
+            curr_state[self.n_dofs : 2 * self.n_dofs] = act * dt
+            curr_state[: self.n_dofs] = (
+                curr_state[: self.n_dofs] + curr_state[self.n_dofs : 2 * self.n_dofs] * dt
+            )
+        elif self.control_space == StateType.POSITION:
+            curr_state[2 * self.n_dofs : 3 * self.n_dofs] = 0.0
+            curr_state[1 * self.n_dofs : 2 * self.n_dofs] = 0.0
+            curr_state[: self.n_dofs] = act
+        return curr_state
+
+    def tensor_step(
+        self,
+        state: JointState,
+        act: torch.Tensor,
+        state_seq: JointState,
+        state_idx: Optional[torch.Tensor] = None,
+    ) -> JointState:
+        """
+        Args:
+            state: [1,N]
+            act: [H,N]
+
+        TODO: support integration with a variable dt along the trajectory.
+        """
+        state_seq = self._rollout_step_fn.forward(state, act, state_seq, state_idx)
+
+        return state_seq
+
+    def robot_cmd_tensor_step(
+        self,
+        state: JointState,
+        act: torch.Tensor,
+        state_seq: JointState,
+        state_idx: Optional[torch.Tensor] = None,
+    ) -> JointState:
+        """
+        Args:
+            state: [1,N]
+            act: [H,N]
+
+        TODO: support integration with a variable dt along the trajectory.
+        """
+        state_seq = self._cmd_step_fn.forward(state, act, state_seq, state_idx)
+        state_seq.joint_names = self.joint_names
+        return state_seq
+
+    def update_cmd_batch_size(self, batch_size):
+        if self._cmd_batch_size != batch_size:
+            self._robot_cmd_state_seq = JointState.zeros(
+                (batch_size, self.horizon, self.d_action), self.tensor_args
+            )
+            self._cmd_step_fn.update_batch_size(batch_size, self.horizon)
+            self._cmd_batch_size = batch_size
+
+    def update_batch_size(self, batch_size, force_update=False):
+        if self.batch_size != batch_size:
+            # TODO: Remove tensor recreation upon force update?
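+            # force_update is set when act_seq requires grad (see forward()):
+            # the state buffer is recreated or detached below so that autograd
+            # graphs from a previous optimization iteration are not kept alive.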
+ self.state_seq = JointState.zeros( + (batch_size, self.horizon, self.d_action), self.tensor_args + ) + + log_info("Updating state_seq buffer reference (created new tensor)") + # print("Creating new tensor") + if force_update: + self.state_seq = self.state_seq.detach() + + self._rollout_step_fn.update_batch_size(batch_size, self.horizon, force_update) + self.batch_size = batch_size + + def forward( + self, + start_state: JointState, + act_seq: torch.Tensor, + start_state_idx: Optional[torch.Tensor] = None, + ) -> KinematicModelState: + # filter state if needed: + start_state_shaped = start_state # .unsqueeze(1) + # batch_size, horizon, d_act = act_seq.shape + + batch_size = act_seq.shape[0] + self.update_batch_size(batch_size, force_update=act_seq.requires_grad) + state_seq = self.state_seq + curr_batch_size = self.batch_size + num_traj_points = self.horizon + + with profiler.record_function("tensor_step"): + # forward step with step matrix: + state_seq = self.tensor_step(start_state_shaped, act_seq, state_seq, start_state_idx) + + shape_tup = (curr_batch_size * num_traj_points, self.n_dofs) + with profiler.record_function("fk + jacobian"): + ( + ee_pos_seq, + ee_quat_seq, + lin_jac_seq, + ang_jac_seq, + link_pos_seq, + link_quat_seq, + link_spheres, + ) = self.robot_model.forward(state_seq.position.view(shape_tup)) + link_pos_seq = link_pos_seq.view( + ((curr_batch_size, num_traj_points, link_pos_seq.shape[1], 3)) + ) + link_quat_seq = link_quat_seq.view( + ((curr_batch_size, num_traj_points, link_quat_seq.shape[1], 4)) + ) + link_spheres = link_spheres.view( + (curr_batch_size, num_traj_points, link_spheres.shape[1], link_spheres.shape[-1]) + ) + ee_pos_seq = ee_pos_seq.view((curr_batch_size, num_traj_points, 3)) + ee_quat_seq = ee_quat_seq.view((curr_batch_size, num_traj_points, 4)) + if lin_jac_seq is not None: + lin_jac_seq = lin_jac_seq.view((curr_batch_size, num_traj_points, 3, self.n_dofs)) + if ang_jac_seq is not None: + ang_jac_seq = ang_jac_seq.view((curr_batch_size, num_traj_points, 3, self.n_dofs)) + + state = KinematicModelState( + state_seq, + ee_pos_seq, + ee_quat_seq, + link_spheres, + link_pos_seq, + link_quat_seq, + lin_jac_seq, + ang_jac_seq, + link_names=self.robot_model.link_names, + ) + + return state + + def integrate_action(self, act_seq): + if self.action_order == 0: + return act_seq + + nth_act_seq = self._integrate_matrix_nth @ act_seq + return nth_act_seq + + def integrate_action_step(self, act, dt): + for i in range(self.action_order): + act = act * dt + + return act + + def filter_robot_state(self, current_state: JointState): + filtered_state = self.state_filter.filter_joint_state(current_state) + return filtered_state + + @torch.no_grad() + def get_robot_command( + self, + current_state: JointState, + act_seq: torch.Tensor, + shift_steps: int = 1, + state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if self.return_full_act_buffer: + if act_seq.shape[0] != self._cmd_batch_size: + self.update_cmd_batch_size(act_seq.shape[0]) + full_state = self.robot_cmd_tensor_step( + current_state, + act_seq, + self._robot_cmd_state_seq, + state_idx, + ) + return full_state + if shift_steps == 1: + if self.control_space == StateType.POSITION: + act_step = act_seq[..., 0, :].clone() + else: + act_step = act_seq[..., 0, :].clone() + cmd = self.state_filter.integrate_action(act_step, current_state) + return cmd + + # get the first timestep in action buffer + cmd = current_state.clone() + + for i in range(shift_steps): + act_step = act_seq[..., i, :] + # we 
integrate the action with the current belief: + cmd = self.state_filter.integrate_action(act_step, cmd) + if i == 0: + cmd_buffer = cmd.clone() + else: + cmd_buffer = cmd_buffer.stack(cmd) + + return cmd_buffer + + @property + def action_bound_lows(self): + if self.control_space == StateType.POSITION: + # use joint limits: + return self.joint_limits.position[0] + if self.control_space == StateType.VELOCITY: + # use joint limits: + return self.joint_limits.velocity[0] + if self.control_space == StateType.ACCELERATION: + # use joint limits: + return self.joint_limits.acceleration[0] + + @property + def action_bound_highs(self): + if self.control_space == StateType.POSITION: + # use joint limits: + return self.joint_limits.position[1] + if self.control_space == StateType.VELOCITY: + # use joint limits: + return self.joint_limits.velocity[1] + if self.control_space == StateType.ACCELERATION: + # use joint limits: + return self.joint_limits.acceleration[1] + + @property + def init_action_mean(self): + # output should be d_action * horizon + if self.control_space == StateType.POSITION: + # use joint limits: + return self.retract_config.unsqueeze(0).repeat(self.horizon, 1) + if self.control_space == StateType.VELOCITY or self.control_space == StateType.ACCELERATION: + # use joint limits: + return self.retract_config.unsqueeze(0).repeat(self.horizon, 1) * 0.0 + + @property + def retract_config(self): + return self.robot_model.kinematics_config.cspace.retract_config + + @property + def cspace_distance_weight(self): + return self.robot_model.kinematics_config.cspace.cspace_distance_weight + + @property + def null_space_weight(self): + return self.robot_model.kinematics_config.cspace.null_space_weight + + @property + def max_acceleration(self): + return self.get_state_bounds().acceleration[1, 0].item() + + @property + def max_jerk(self): + return self.get_state_bounds().jerk[1, 0].item() + + def get_state_bounds(self): + joint_limits = self.robot_model.get_joint_limits() + return joint_limits + + def get_action_from_state(self, state: JointState) -> torch.Tensor: + if self.control_space == StateType.POSITION: + return state.position + if self.control_space == StateType.VELOCITY: + return state.velocity + if self.control_space == StateType.ACCELERATION: + return state.acceleration + + def get_state_from_action( + self, + start_state: JointState, + act_seq: torch.Tensor, + state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + """Compute State sequence from an action trajectory + + Args: + start_state (JointState): _description_ + act_seq (torch.Tensor): _description_ + + Returns: + JointState: _description_ + """ + if act_seq.shape[0] != self._cmd_batch_size: + self.update_cmd_batch_size(act_seq.shape[0]) + full_state = self.robot_cmd_tensor_step( + start_state, + act_seq, + self._robot_cmd_state_seq, + state_idx, + ) + return full_state diff --git a/src/curobo/rollout/dynamics_model/model_base.py b/src/curobo/rollout/dynamics_model/model_base.py new file mode 100644 index 00000000..91c3a6bc --- /dev/null +++ b/src/curobo/rollout/dynamics_model/model_base.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+from abc import ABC, abstractmethod
+
+
+class DynamicsModelBase(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def forward(self, start_state, act_seq, *args):
+        pass
+
+    @abstractmethod
+    def get_next_state(self, current_state, act, dt):
+        pass
+
+    @abstractmethod
+    def filter_robot_state(self, current_state):
+        pass
+
+    @abstractmethod
+    def get_robot_command(self, current_state, act_seq):
+        pass
diff --git a/src/curobo/rollout/dynamics_model/tensor_step.py b/src/curobo/rollout/dynamics_model/tensor_step.py
new file mode 100644
index 00000000..7df54f4f
--- /dev/null
+++ b/src/curobo/rollout/dynamics_model/tensor_step.py
@@ -0,0 +1,524 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+from abc import abstractmethod
+from enum import Enum
+from typing import Optional
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.types.robot import JointState
+
+# Local Folder
+from .integration_utils import (
+    AccelerationTensorStepIdxKernel,
+    AccelerationTensorStepKernel,
+    CliqueTensorStepCentralDifferenceKernel,
+    CliqueTensorStepIdxCentralDifferenceKernel,
+    CliqueTensorStepIdxKernel,
+    CliqueTensorStepKernel,
+    build_fd_matrix,
+    build_int_matrix,
+    build_start_state_mask,
+    tensor_step_acc_semi_euler,
+    tensor_step_pos,
+    tensor_step_pos_clique,
+)
+
+
+class TensorStepType(Enum):
+    POSITION_TELEPORT = 0
+    POSITION_CLIQUE_KERNEL = 1
+    VELOCITY = 2  # Not implemented
+    ACCELERATION_KERNEL = 3
+    JERK = 4  # Not implemented
+    POSITION = 5  # deprecated
+    POSITION_CLIQUE = 6  # deprecated
+    ACCELERATION = 7  # deprecated
+
+
+class TensorStepBase:
+    def __init__(self, tensor_args: TensorDeviceType) -> None:
+        self.batch_size = -1
+        self.horizon = -1
+        self.tensor_args = tensor_args
+        self._diag_dt = None
+        self._inv_dt_h = None
+
+    def update_dt(self, dt: float):
+        self._dt_h[:] = dt
+        if self._inv_dt_h is not None:
+            self._inv_dt_h[:] = 1.0 / dt
+
+    @abstractmethod
+    def update_batch_size(
+        self,
+        batch_size: Optional[int] = None,
+        horizon: Optional[int] = None,
+        force_update: bool = False,
+    ) -> None:
+        self.horizon = horizon
+        self.batch_size = batch_size
+
+    @abstractmethod
+    def forward(
+        self,
+        start_state: JointState,
+        u_act: torch.Tensor,
+        out_state_seq: JointState,
+        start_state_idx: Optional[torch.Tensor] = None,
+    ) -> JointState:
+        pass
+
+
+class TensorStepAcceleration(TensorStepBase):
+    def __init__(self, tensor_args: TensorDeviceType, dt_h: torch.Tensor) -> None:
+        super().__init__(tensor_args)
+        self._dt_h = dt_h
+        self._diag_dt_h = torch.diag(self._dt_h)
+        self._integrate_matrix_pos = None
+        self._integrate_matrix_vel = None
+
+    def update_batch_size(
+        self,
+        batch_size: Optional[int] = None,
+        horizon: Optional[int] = None,
+        force_update: bool = False,
+    ) -> None:
+        if self.horizon != horizon:
+            self._integrate_matrix_pos = (
build_int_matrix( + horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + diagonal=0, + ) + @ self._diag_dt_h + ) + self._integrate_matrix_vel = self._integrate_matrix_pos @ self._diag_dt_h + return super().update_batch_size(batch_size, horizon) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if start_state_idx is None: + state_seq = tensor_step_acc_semi_euler( + start_state, + u_act, + out_state_seq, + self._diag_dt_h, + self._integrate_matrix_vel, + self._integrate_matrix_pos, + ) + else: + state_seq = tensor_step_acc_semi_euler( + start_state[start_state_idx], + u_act, + out_state_seq, + self._diag_dt_h, + self._integrate_matrix_vel, + self._integrate_matrix_pos, + ) + return state_seq + + +class TensorStepPositionTeleport(TensorStepBase): + def __init__(self, tensor_args: TensorDeviceType) -> None: + super().__init__(tensor_args) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + out_state_seq.position = u_act + return out_state_seq + + +class TensorStepPosition(TensorStepBase): + def __init__(self, tensor_args: TensorDeviceType, dt_h: torch.Tensor) -> None: + super().__init__(tensor_args) + + self._dt_h = dt_h + # self._diag_dt_h = torch.diag(1 / self._dt_h) + self._fd_matrix = None + + def update_dt(self, dt: float): + super().update_dt(dt) + self._fd_matrix = build_fd_matrix( + self.horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + order=1, + ) + self._fd_matrix = torch.diag(1.0 / self._dt_h) @ self._fd_matrix + + def update_batch_size( + self, + batch_size: Optional[int] = None, + horizon: Optional[int] = None, + force_update: bool = False, + ) -> None: + if horizon != self.horizon: + self._fd_matrix = build_fd_matrix( + horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + order=1, + ) + self._fd_matrix = torch.diag(1.0 / self._dt_h) @ self._fd_matrix + + # self._fd_matrix = self._diag_dt_h @ self._fd_matrix + return super().update_batch_size(batch_size, horizon) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if start_state_idx is None: + state_seq = tensor_step_pos(start_state, u_act, out_state_seq, self._fd_matrix) + else: + state_seq = tensor_step_pos( + start_state[start_state_idx], u_act, out_state_seq, self._fd_matrix + ) + return state_seq + + +class TensorStepPositionClique(TensorStepBase): + def __init__(self, tensor_args: TensorDeviceType, dt_h: torch.Tensor) -> None: + super().__init__(tensor_args) + + self._dt_h = dt_h + self._inv_dt_h = 1.0 / dt_h + self._fd_matrix = None + self._start_mask_matrix = None + + def update_dt(self, dt: float): + super().update_dt(dt) + self._fd_matrix = [] + for i in range(3): + self._fd_matrix.append( + build_fd_matrix( + self.horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + order=i + 1, + SHIFT=True, + ) + ) + self._diag_dt_h = torch.diag(self._inv_dt_h) + + self._fd_matrix[0] = self._diag_dt_h @ self._fd_matrix[0] + self._fd_matrix[1] = self._diag_dt_h**2 @ self._fd_matrix[1] + self._fd_matrix[2] = self._diag_dt_h**3 @ self._fd_matrix[2] + + def update_batch_size( + self, + batch_size: Optional[int] = None, + horizon: Optional[int] = None, + force_update: bool = 
False, + ) -> None: + if self.horizon != horizon: + self._fd_matrix = [] + for i in range(3): + self._fd_matrix.append( + build_fd_matrix( + horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + order=i + 1, + SHIFT=True, + ) + ) + self._diag_dt_h = torch.diag(self._inv_dt_h) + + self._fd_matrix[0] = self._diag_dt_h @ self._fd_matrix[0] + self._fd_matrix[1] = self._diag_dt_h**2 @ self._fd_matrix[1] + self._fd_matrix[2] = self._diag_dt_h**3 @ self._fd_matrix[2] + self._start_mask_matrix = list(build_start_state_mask(horizon, self.tensor_args)) + return super().update_batch_size(batch_size, horizon) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if start_state_idx is None: + state_seq = tensor_step_pos_clique( + start_state, u_act, out_state_seq, self._start_mask_matrix, self._fd_matrix + ) + else: + state_seq = tensor_step_pos_clique( + start_state[start_state_idx], + u_act, + out_state_seq, + self._start_mask_matrix, + self._fd_matrix, + ) + return state_seq + + +class TensorStepAccelerationKernel(TensorStepBase): + def __init__(self, tensor_args: TensorDeviceType, dt_h: torch.Tensor, dof: int) -> None: + super().__init__(tensor_args) + + self._dt_h = dt_h + self._u_grad = None + self.dof = dof + + def update_batch_size( + self, + batch_size: Optional[int] = None, + horizon: Optional[int] = None, + force_update: bool = False, + ) -> None: + if batch_size != self.batch_size or horizon != self.horizon: + self._u_grad = torch.zeros( + (batch_size, horizon, self.dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + if force_update: + self._u_grad = self._u_grad.detach() + return super().update_batch_size(batch_size, horizon) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if start_state_idx is None: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = AccelerationTensorStepKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._dt_h, + self._u_grad, + ) + + else: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = AccelerationTensorStepIdxKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + start_state_idx, + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._dt_h, + self._u_grad, + ) + + return out_state_seq + + +class TensorStepPositionCliqueKernel(TensorStepBase): + def __init__( + self, + tensor_args: TensorDeviceType, + dt_h: torch.Tensor, + dof: int, + finite_difference_mode: int = -1, + filter_velocity: bool = False, + filter_acceleration: bool = False, + filter_jerk: bool = False, + ) -> None: + super().__init__(tensor_args) + self._dt_h = dt_h + self._inv_dt_h = 1.0 / dt_h + self._u_grad = None + self.dof = dof + self._fd_mode = finite_difference_mode + self._filter_velocity = 
filter_velocity + self._filter_acceleration = filter_acceleration + self._filter_jerk = filter_jerk + + if self._filter_velocity or self._filter_acceleration or self._filter_jerk: + kernel = self.tensor_args.to_device([[[0.06136, 0.24477, 0.38774, 0.24477, 0.06136]]]) + self._sma = torch.nn.functional.conv1d + + weights = kernel + self._sma_kernel = weights + + # self._sma = torch.nn.AvgPool1d(kernel_size=5, stride=2, padding=1).to( + # device=self.tensor_args.device + # ) + + def update_batch_size( + self, + batch_size: Optional[int] = None, + horizon: Optional[int] = None, + force_update: bool = False, + ) -> None: + if batch_size != self.batch_size or horizon != self.horizon: + self._u_grad = torch.zeros( + (batch_size, horizon, self.dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + if force_update: + self._u_grad = self._u_grad.detach() + return super().update_batch_size(batch_size, horizon) + + def forward( + self, + start_state: JointState, + u_act: torch.Tensor, + out_state_seq: JointState, + start_state_idx: Optional[torch.Tensor] = None, + ) -> JointState: + if start_state_idx is None: + if self._fd_mode == -1: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = CliqueTensorStepKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._inv_dt_h, + self._u_grad, + ) + else: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = CliqueTensorStepCentralDifferenceKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._inv_dt_h, + self._u_grad, + ) + + else: + if self._fd_mode == -1: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = CliqueTensorStepIdxKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + start_state_idx, + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._inv_dt_h, + self._u_grad, + ) + else: + ( + out_state_seq.position, + out_state_seq.velocity, + out_state_seq.acceleration, + out_state_seq.jerk, + ) = CliqueTensorStepIdxCentralDifferenceKernel.apply( + u_act, + start_state.position, # .contiguous(), + start_state.velocity, # .contiguous(), + start_state.acceleration, # .contiguous(), + start_state_idx, + out_state_seq.position, # .contiguous(), + out_state_seq.velocity, # .contiguous(), + out_state_seq.acceleration, # .contiguous(), + out_state_seq.jerk, # .contiguous(), + self._inv_dt_h, + self._u_grad, + ) + if self._filter_velocity: + out_state_seq.velocity = self.filter_signal(out_state_seq.velocity) + + if self._filter_acceleration: + out_state_seq.acceleration = self.filter_signal(out_state_seq.acceleration) + + if self._filter_jerk: + out_state_seq.jerk = self.filter_signal(out_state_seq.jerk) 
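+        # Note: the 5-tap kernel [0.06136, 0.24477, 0.38774, 0.24477, 0.06136]
+        # used by filter_signal is a normalized Gaussian (weights sum to 1.0),
+        # so smoothing preserves the mean of each filtered signal. Shape walk
+        # (illustrative): a (b, h, dof) signal is reshaped to (b * dof, 1, h),
+        # convolved with conv1d(..., padding="same"), and reshaped back.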
+        return out_state_seq
+
+    def filter_signal(self, signal: torch.Tensor):
+        # Smoothing runs through the TorchScript helper below (conv1d with a
+        # fixed smoothing kernel).
+        return filter_signal_jit(signal, self._sma_kernel)
+
+
+@torch.jit.script
+def filter_signal_jit(signal, kernel):
+    b, h, dof = signal.shape
+
+    new_signal = (
+        torch.nn.functional.conv1d(
+            signal.transpose(-1, -2).reshape(b * dof, 1, h), kernel, padding="same"
+        )
+        .view(b, dof, h)
+        .transpose(-1, -2)
+        .contiguous()
+    )
+    return new_signal
diff --git a/src/curobo/rollout/rollout_base.py b/src/curobo/rollout/rollout_base.py
new file mode 100644
index 00000000..078ff4ac
--- /dev/null
+++ b/src/curobo/rollout/rollout_base.py
@@ -0,0 +1,563 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+from __future__ import annotations
+
+# Standard Library
+from abc import abstractmethod, abstractproperty
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import CSpaceConfig, JointState, State
+from curobo.types.tensor import (
+    T_BDOF,
+    T_DOF,
+    T_BHDOF_float,
+    T_BHValue_float,
+    T_BValue_bool,
+    T_BValue_float,
+)
+from curobo.util.helpers import list_idx_if_not_none
+from curobo.util.sample_lib import HaltonGenerator
+from curobo.util.tensor_util import copy_tensor
+
+
+@dataclass
+class RolloutMetrics(Sequence):
+    cost: Optional[T_BValue_float] = None
+    constraint: Optional[T_BValue_float] = None
+    feasible: Optional[T_BValue_bool] = None
+    state: Optional[State] = None
+
+    def __getitem__(self, idx):
+        d_list = [self.cost, self.constraint, self.feasible, self.state]
+        idx_vals = list_idx_if_not_none(d_list, idx)
+        return RolloutMetrics(idx_vals[0], idx_vals[1], idx_vals[2], idx_vals[3])
+
+    def __len__(self):
+        if self.cost is not None:
+            return self.cost.shape[0]
+        else:
+            return -1
+
+    def clone(self, clone_state=False):
+        if clone_state:
+            raise NotImplementedError()
+        return RolloutMetrics(
+            cost=None if self.cost is None else self.cost.clone(),
+            constraint=None if self.constraint is None else self.constraint.clone(),
+            feasible=None if self.feasible is None else self.feasible.clone(),
+            state=None if self.state is None else self.state,
+        )
+
+
+@dataclass
+class Trajectory:
+    actions: T_BHDOF_float
+    costs: T_BHValue_float
+    state: Optional[State] = None
+    debug: Optional[dict] = None
+
+
+@dataclass
+class Goal(Sequence):
+    """Goal data class used to update the optimization target.
+
+    NOTE:
+    Goal can be parallelized in two ways:
+    1. Solve for each (current_state, pose) pair in the same environment.
+    2. Solve for each (current_state, pose) pair in a different environment.
+    For case (1), batch_pose_idx finds the memory address of each
+    (current_state, pose) pair while batch_world_idx stays [0].
+    For case (2), batch_world_idx becomes [0, 1, 2, ...].
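+
+    For example (illustrative): with batch=2 and num_seeds=3, repeat_seeds
+    expands batch_pose_idx from [[0], [1]] to [[0], [0], [0], [1], [1], [1]],
+    so every seed of a problem indexes the same goal memory without copying
+    the underlying pose tensors.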
+ """ + + name: str = "goal" + goal_state: Optional[State] = None + goal_pose: Pose = Pose() + links_goal_pose: Optional[Dict[str, Pose]] = None + current_state: Optional[State] = None + retract_state: Optional[T_DOF] = None + batch: int = -1 # NOTE: add another variable for size of index tensors? + # this should also contain a batch index tensor: + batch_pose_idx: Optional[torch.Tensor] = None # shape: [batch] + batch_goal_state_idx: Optional[torch.Tensor] = None + batch_retract_state_idx: Optional[torch.Tensor] = None + batch_current_state_idx: Optional[torch.Tensor] = None # shape: [batch] + batch_enable_idx: Optional[torch.Tensor] = None # shape: [batch, n] + batch_world_idx: Optional[torch.Tensor] = None # shape: [batch, n] + update_batch_idx_buffers: bool = True + n_goalset: int = 1 # NOTE: This currently does not get updated if goal_pose is updated later. + + def __getitem__(self, idx): + d_list = [ + self.goal_state, + self.goal_pose, + self.current_state, + self.retract_state, + self.batch_pose_idx, + self.batch_goal_state_idx, + self.batch_retract_state_idx, + self.batch_current_state_idx, + self.batch_enable_idx, + self.batch_world_idx, + ] + idx_vals = list_idx_if_not_none(d_list, idx) + return Goal( + name=self.name, + batch=self.batch, + n_goalset=self.n_goalset, + goal_state=idx_vals[0], + goal_pose=idx_vals[1], + current_state=idx_vals[2], + retract_state=idx_vals[3], + batch_pose_idx=idx_vals[4], + batch_goal_state_idx=idx_vals[5], + batch_retract_state_idx=idx_vals[6], + batch_current_state_idx=idx_vals[7], + batch_enable_idx=idx_vals[8], + batch_world_idx=idx_vals[9], + ) + + def __len__(self): + return self.batch + + def __post_init__(self): + self._update_batch_size() + if self.goal_pose.position is not None: + if self.batch_pose_idx is None: + self.batch_pose_idx = torch.arange( + 0, self.batch, 1, device=self.goal_pose.position.device, dtype=torch.int32 + ).unsqueeze(-1) + self.n_goalset = self.goal_pose.n_goalset + if self.current_state is not None: + if self.batch_current_state_idx is None: + self.batch_current_state_idx = torch.arange( + 0, + self.current_state.position.shape[0], + 1, + device=self.current_state.position.device, + dtype=torch.int32, + ).unsqueeze(-1) + if self.retract_state is not None: + if self.batch_retract_state_idx is None: + self.batch_retract_state_idx = torch.arange( + 0, + self.retract_state.shape[0], + 1, + device=self.retract_state.device, + dtype=torch.int32, + ).unsqueeze(-1) + + def _update_batch_size(self): + if self.goal_pose.position is not None: + self.batch = self.goal_pose.batch + elif self.goal_state is not None: + self.batch = self.goal_state.position.shape[0] + elif self.current_state is not None: + self.batch = self.current_state.position.shape[0] + + def repeat_seeds(self, num_seeds: int): + # across seeds, the data is the same, so could we just expand batch_idx + # TODO: + goal_pose = goal_state = current_state = links_goal_pose = retract_state = None + batch_enable_idx = batch_pose_idx = batch_world_idx = batch_current_state_idx = None + batch_retract_state_idx = batch_goal_state_idx = None + + if self.links_goal_pose is not None: + links_goal_pose = self.links_goal_pose + if self.goal_pose is not None: + goal_pose = self.goal_pose + # goal_pose = self.goal_pose.repeat_seeds(num_seeds) + if self.goal_state is not None: + goal_state = self.goal_state # .repeat_seeds(num_seeds) + if self.current_state is not None: + current_state = self.current_state # .repeat_seeds(num_seeds) + if self.retract_state is not None: + 
retract_state = self.retract_state + # repeat seeds for indexing: + if self.batch_pose_idx is not None: + batch_pose_idx = self._tensor_repeat_seeds(self.batch_pose_idx, num_seeds) + if self.batch_goal_state_idx is not None: + batch_goal_state_idx = self._tensor_repeat_seeds(self.batch_goal_state_idx, num_seeds) + if self.batch_retract_state_idx is not None: + batch_retract_state_idx = self._tensor_repeat_seeds( + self.batch_retract_state_idx, num_seeds + ) + if self.batch_enable_idx is not None: + batch_enable_idx = self._tensor_repeat_seeds(self.batch_enable_idx, num_seeds) + if self.batch_world_idx is not None: + batch_world_idx = self._tensor_repeat_seeds(self.batch_world_idx, num_seeds) + if self.batch_current_state_idx is not None: + batch_current_state_idx = self._tensor_repeat_seeds( + self.batch_current_state_idx, num_seeds + ) + + return Goal( + goal_state=goal_state, + goal_pose=goal_pose, + current_state=current_state, + retract_state=retract_state, + batch_pose_idx=batch_pose_idx, + batch_world_idx=batch_world_idx, + batch_enable_idx=batch_enable_idx, + batch_current_state_idx=batch_current_state_idx, + batch_retract_state_idx=batch_retract_state_idx, + batch_goal_state_idx=batch_goal_state_idx, + links_goal_pose=links_goal_pose, + ) + + def _tensor_repeat_seeds(self, tensor, num_seeds): + return tensor_repeat_seeds(tensor, num_seeds) + + def apply_kernel(self, kernel_mat): + # For each seed in optimization, we use kernel_mat to transform to many parallel goals + # This can be modified to just multiply self.batch and update self.batch by the shape of self.batch + # TODO: add other elements + goal_pose = goal_state = current_state = links_goal_pose = None + batch_enable_idx = batch_pose_idx = batch_world_idx = batch_current_state_idx = None + batch_retract_state_idx = batch_goal_state_idx = None + if self.links_goal_pose is not None: + links_goal_pose = self.links_goal_pose + if self.goal_pose is not None: + goal_pose = self.goal_pose # .apply_kernel(kernel_mat) + if self.goal_state is not None: + goal_state = self.goal_state # .apply_kernel(kernel_mat) + if self.current_state is not None: + current_state = self.current_state # .apply_kernel(kernel_mat) + if self.batch_enable_idx is not None: + batch_enable_idx = kernel_mat @ self.batch_enable_idx + if self.batch_retract_state_idx is not None: + batch_retract_state_idx = ( + kernel_mat @ self.batch_retract_state_idx.to(dtype=torch.float32) + ).to(dtype=torch.int32) + if self.batch_goal_state_idx is not None: + batch_goal_state_idx = ( + kernel_mat @ self.batch_goal_state_idx.to(dtype=torch.float32) + ).to(dtype=torch.int32) + + if self.batch_current_state_idx is not None: + batch_current_state_idx = ( + kernel_mat @ self.batch_current_state_idx.to(dtype=torch.float32) + ).to(dtype=torch.int32) + if self.batch_pose_idx is not None: + batch_pose_idx = (kernel_mat @ self.batch_pose_idx.to(dtype=torch.float32)).to( + dtype=torch.int32 + ) + if self.batch_world_idx is not None: + batch_world_idx = (kernel_mat @ self.batch_world_idx.to(dtype=torch.float32)).to( + dtype=torch.int32 + ) + + return Goal( + goal_state=goal_state, + goal_pose=goal_pose, + current_state=current_state, + batch_pose_idx=batch_pose_idx, + batch_enable_idx=batch_enable_idx, + batch_world_idx=batch_world_idx, + batch_current_state_idx=batch_current_state_idx, + batch_goal_state_idx=batch_goal_state_idx, + batch_retract_state_idx=batch_retract_state_idx, + links_goal_pose=links_goal_pose, + ) + + def to(self, tensor_args: TensorDeviceType): + if self.goal_pose 
is not None: + self.goal_pose = self.goal_pose.to(tensor_args) + if self.goal_state is not None: + self.goal_state = self.goal_state.to(**vars(tensor_args)) + if self.current_state is not None: + self.current_state = self.current_state.to(**vars(tensor_args)) + return self + + def copy_(self, goal: Goal, update_idx_buffers: bool = True): + """Copy data from another goal object. + + Args: + goal (Goal): _description_ + + Raises: + NotImplementedError: _description_ + NotImplementedError: _description_ + + Returns: + _type_: _description_ + """ + + self.goal_pose = self._copy_buffer(self.goal_pose, goal.goal_pose) + self.goal_state = self._copy_buffer(self.goal_state, goal.goal_state) + self.retract_state = self._copy_tensor(self.retract_state, goal.retract_state) + self.current_state = self._copy_buffer(self.current_state, goal.current_state) + if goal.links_goal_pose is not None: + if self.links_goal_pose is None: + self.links_goal_pose = goal.links_goal_pose + else: + for k in goal.links_goal_pose.keys(): + self.links_goal_pose[k] = self._copy_buffer( + self.links_goal_pose[k], goal.links_goal_pose[k] + ) + self._update_batch_size() + # copy pose indices as well? + if goal.update_batch_idx_buffers and update_idx_buffers: + self.batch_pose_idx = self._copy_tensor(self.batch_pose_idx, goal.batch_pose_idx) + self.batch_enable_idx = self._copy_tensor(self.batch_enable_idx, goal.batch_enable_idx) + self.batch_world_idx = self._copy_tensor(self.batch_world_idx, goal.batch_world_idx) + self.batch_current_state_idx = self._copy_tensor( + self.batch_current_state_idx, goal.batch_current_state_idx + ) + self.batch_retract_state_idx = self._copy_tensor( + self.batch_retract_state_idx, goal.batch_retract_state_idx + ) + self.batch_goal_state_idx = self._copy_tensor( + self.batch_goal_state_idx, goal.batch_goal_state_idx + ) + + def _copy_buffer(self, ref_buffer, buffer): + if buffer is not None: + if ref_buffer is not None: + ref_buffer = ref_buffer.copy_(buffer) + else: + ref_buffer = buffer.clone() + return ref_buffer + + def _copy_tensor(self, ref_buffer, buffer): + if buffer is not None: + if ref_buffer is not None: + if not copy_tensor(buffer, ref_buffer): + ref_buffer = buffer.clone() + else: + ref_buffer = buffer.clone() + return ref_buffer + + def get_batch_goal_state(self): + return self.goal_state[self.batch_pose_idx[:, 0]] + + def create_index_buffers( + self, + batch_size: int, + batch_env: bool, + batch_retract: bool, + num_seeds: int, + tensor_args: TensorDeviceType, + ): + new_goal = Goal.create_idx(batch_size, batch_env, batch_retract, num_seeds, tensor_args) + new_goal.copy_(self, update_idx_buffers=False) + return new_goal + + @classmethod + def create_idx( + cls, + pose_batch_size: int, + batch_env: bool, + batch_retract: bool, + num_seeds: int, + tensor_args: TensorDeviceType, + ): + batch_pose_idx = torch.arange( + 0, pose_batch_size, 1, device=tensor_args.device, dtype=torch.int32 + ).unsqueeze(-1) + if batch_env: + batch_world_idx = batch_pose_idx.clone() + else: + batch_world_idx = 0 * batch_pose_idx + if batch_retract: + batch_retract_state_idx = batch_pose_idx.clone() + else: + batch_retract_state_idx = 0 * batch_pose_idx.clone() + batch_currernt_state_idx = batch_pose_idx.clone() + batch_goal_state_idx = batch_pose_idx.clone() + + g = Goal( + batch_pose_idx=batch_pose_idx, + batch_retract_state_idx=batch_retract_state_idx, + batch_world_idx=batch_world_idx, + batch_current_state_idx=batch_currernt_state_idx, + batch_goal_state_idx=batch_goal_state_idx, + ) + g_seeds = 
g.repeat_seeds(num_seeds)
+        return g_seeds
+
+
+@dataclass
+class RolloutConfig:
+    tensor_args: TensorDeviceType
+
+
+class RolloutBase:
+    def __init__(self, config: Optional[RolloutConfig] = None):
+        self.start_state = None
+        self.batch_size = 1
+        self._metrics_cuda_graph_init = False
+        self.cu_metrics_graph = None
+        self._rollout_constraint_cuda_graph_init = False
+        self.cu_rollout_constraint_graph = None
+        if config is not None:
+            self.tensor_args = config.tensor_args
+
+    def _init_after_config_load(self):
+        self.act_sample_gen = HaltonGenerator(
+            self.d_action,
+            self.tensor_args,
+            up_bounds=self.action_bound_highs,
+            low_bounds=self.action_bound_lows,
+            seed=1312,
+        )
+
+    @abstractmethod
+    def cost_fn(self, state: State):
+        return
+
+    @abstractmethod
+    def constraint_fn(
+        self, state: State, out_metrics: Optional[RolloutMetrics] = None
+    ) -> RolloutMetrics:
+        return
+
+    @abstractmethod
+    def convergence_fn(
+        self, state: State, out_metrics: Optional[RolloutMetrics] = None
+    ) -> RolloutMetrics:
+        return
+
+    def get_metrics(self, state: State):
+        out_metrics = self.constraint_fn(state)
+        out_metrics = self.convergence_fn(state, out_metrics)
+        return out_metrics
+
+    def get_metrics_cuda_graph(self, state: State):
+        return self.get_metrics(state)
+
+    def rollout_fn(self, act):
+        pass
+
+    def current_cost(self, current_state):
+        pass
+
+    @abstractmethod
+    def update_params(self, goal: Goal):
+        return
+
+    def __call__(self, act: T_BHDOF_float) -> Trajectory:
+        return self.rollout_fn(act)
+
+    @abstractproperty
+    def action_bounds(self):
+        return self.tensor_args.to_device(
+            torch.stack([self.action_bound_lows, self.action_bound_highs])
+        )
+
+    @abstractmethod
+    def filter_robot_state(self, current_state: State) -> State:
+        return current_state
+
+    @abstractmethod
+    def get_robot_command(
+        self, current_state, act_seq, shift_steps: int = 1, state_idx: Optional[torch.Tensor] = None
+    ):
+        return act_seq
+
+    def reset_seed(self):
+        self.act_sample_gen.reset()
+
+    def reset(self):
+        return True
+
+    @abstractproperty
+    def d_action(self) -> int:
+        raise NotImplementedError
+
+    @abstractproperty
+    def action_bound_lows(self):
+        return 1
+
+    @abstractproperty
+    def action_bound_highs(self):
+        return 1
+
+    @abstractproperty
+    def dt(self):
+        return 0.1
+
+    @property
+    def horizon(self) -> int:
+        raise NotImplementedError
+
+    def update_start_state(self, start_state: torch.Tensor):
+        if self.start_state is None:
+            self.start_state = start_state
+        copy_tensor(start_state, self.start_state)
+
+    @abstractmethod
+    def get_init_action_seq(self):
+        raise NotImplementedError
+
+    @property
+    def state_bounds(self) -> Dict[str, List[float]]:
+        pass
+
+    # sample random actions
+    # @abstractmethod
+    def sample_random_actions(self, n: int = 0):
+        act_rand = self.act_sample_gen.get_samples(n, bounded=True)
+        return act_rand
+
+    # how to map act_seq to state?
+    # rollout for feasibility?
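+    # Typical call sequence for a concrete rollout (illustrative sketch, not a
+    # contract enforced here):
+    #   rollout.update_params(goal)      # set the optimization target
+    #   traj = rollout(act_seq)          # __call__ dispatches to rollout_fn
+    #   cmd = rollout.get_robot_command(current_state, act_seq, shift_steps=1)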
+    @abstractmethod
+    def rollout_constraint(self, act_seq: torch.Tensor) -> RolloutMetrics:
+        # get state by rolling out
+
+        # get feasibility:
+        pass
+
+    def reset_cuda_graph(self):
+        self._metrics_cuda_graph_init = False
+        if self.cu_metrics_graph is not None:
+            self.cu_metrics_graph.reset()
+
+        self._rollout_constraint_cuda_graph_init = False
+        if self.cu_rollout_constraint_graph is not None:
+            self.cu_rollout_constraint_graph.reset()
+
+    @abstractmethod
+    def get_action_from_state(self, state: State):
+        pass
+
+    @abstractmethod
+    def get_state_from_action(
+        self, start_state: State, act_seq: torch.Tensor, state_idx: Optional[torch.Tensor] = None
+    ):
+        pass
+
+    @abstractproperty
+    def cspace_config(self) -> CSpaceConfig:
+        pass
+
+    def get_full_dof_from_solution(self, q_js: JointState) -> JointState:
+        return q_js
+
+
+@torch.jit.script
+def tensor_repeat_seeds(tensor, num_seeds: int):
+    a = (
+        tensor.view(tensor.shape[0], 1, tensor.shape[-1])
+        .repeat(1, num_seeds, 1)
+        .view(tensor.shape[0] * num_seeds, tensor.shape[-1])
+    )
+    return a
diff --git a/src/curobo/types/__init__.py b/src/curobo/types/__init__.py
new file mode 100644
index 00000000..a08745f9
--- /dev/null
+++ b/src/curobo/types/__init__.py
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
diff --git a/src/curobo/types/base.py b/src/curobo/types/base.py
new file mode 100644
index 00000000..54bb1252
--- /dev/null
+++ b/src/curobo/types/base.py
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+from dataclasses import dataclass
+
+# Third Party
+import numpy as np
+import torch
+
+
+@dataclass(frozen=True)
+class TensorDeviceType:
+    device: torch.device = torch.device("cuda", 0)
+    dtype: torch.dtype = torch.float32
+
+    @staticmethod
+    def from_basic(device: str, dev_id: int):
+        return TensorDeviceType(torch.device(device, dev_id))
+
+    def to_device(self, data_tensor):
+        if isinstance(data_tensor, torch.Tensor):
+            return data_tensor.to(device=self.device, dtype=self.dtype)
+        else:
+            return torch.as_tensor(np.array(data_tensor), device=self.device, dtype=self.dtype)
+
+    def to_int8_device(self, data_tensor):
+        return data_tensor.to(device=self.device, dtype=torch.int8)
+
+    def cpu(self):
+        return TensorDeviceType(device=torch.device("cpu"), dtype=self.dtype)
diff --git a/src/curobo/types/camera.py b/src/curobo/types/camera.py
new file mode 100644
index 00000000..a07f4bd9
--- /dev/null
+++ b/src/curobo/types/camera.py
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +from __future__ import annotations + +# Standard Library +from dataclasses import dataclass +from typing import List, Optional + +# Third Party +import torch + +# CuRobo +from curobo.types.math import Pose + + +@dataclass +class CameraObservation: + name: str = "camera_image" + rgb_image: Optional[torch.Tensor] = None + depth_image: Optional[torch.Tensor] = None + image_segmentation: Optional[torch.Tensor] = None + projection_matrix: Optional[torch.Tensor] = None + resolution: Optional[List[int]] = None + pose: Optional[Pose] = None + intrinsics: Optional[torch.Tensor] = None + timestamp: float = 0.0 + + @property + def shape(self): + return self.rgb_image.shape + + def copy_(self, new_data: CameraObservation): + self.rgb_image.copy_(new_data.rgb_image) + self.depth_image.copy_(new_data.depth_image) + self.image_segmentation.copy_(new_data.image_segmentation) + self.projection_matrix.copy_(new_data.projection_matrix) + self.resolution = new_data.resolution + + def to(self, device: torch.device): + if self.rgb_image is not None: + self.rgb_image = self.rgb_image.to(device=device) + if self.depth_image is not None: + self.depth_image = self.depth_image.to(device=device) + return self diff --git a/src/curobo/types/enum.py b/src/curobo/types/enum.py new file mode 100644 index 00000000..ce587eea --- /dev/null +++ b/src/curobo/types/enum.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from enum import Enum + + +class StateType(Enum): + POSITION = 0 + VELOCITY = 1 + ACCELERATION = 2 + JERK = 3 diff --git a/src/curobo/types/math.py b/src/curobo/types/math.py new file mode 100644 index 00000000..a4015d10 --- /dev/null +++ b/src/curobo/types/math.py @@ -0,0 +1,519 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +from __future__ import annotations + +# Standard Library +from dataclasses import dataclass +from typing import List, Optional, Sequence + +# Third Party +import numpy as np +import torch +import torch.autograd.profiler as profiler +from torch.autograd import Function + +# CuRobo +from curobo.geom.transform import ( + batch_transform_points, + matrix_to_quaternion, + pose_inverse, + pose_multiply, + pose_to_matrix, + quaternion_to_matrix, + transform_points, +) +from curobo.types.base import TensorDeviceType +from curobo.util.helpers import list_idx_if_not_none +from curobo.util.logger import log_error, log_info, log_warn +from curobo.util.tensor_util import copy_if_not_none, copy_tensor + +# Local Folder +from .tensor import T_BPosition, T_BQuaternion, T_BRotation + + +@dataclass +class Pose(Sequence): + """Pose representation used in CuRobo. You can initialize a pose by calling + pose = Pose(position, quaternion). + """ + + #: Position is represented as x, y, z, in meters + position: Optional[T_BPosition] = None + #: Quaternion is represented as w, x, y, z. + quaternion: Optional[T_BQuaternion] = None + #: Rotation is represents orientation as a 3x3 rotation matrix + rotation: Optional[T_BRotation] = None + #: Batch size will be initialized from input. + batch: int = 1 + #: Goalset will be initialized from input when position shape is batch x n_goalset x 3 + n_goalset: int = 1 + #: name of link that this pose represents. + name: str = "ee_link" + #: quaternion input will be normalized when this flag is enabled. This is recommended when + #: a pose comes from an external source as some programs do not send normalized quaternions. + normalize_rotation: bool = True + + def __post_init__(self): + if self.rotation is not None and self.quaternion is None: + self.quaternion = matrix_to_quaternion(self.rotation) + if self.position is not None: + if len(self.position.shape) > 2: + self.n_goalset = self.position.shape[1] + if len(self.position.shape) > 1: + self.batch = self.position.shape[0] + else: + self.position = self.position.unsqueeze(0) + self.quaternion = self.quaternion.unsqueeze(0) + if self.quaternion is None and self.position is not None: + q_size = list(self.position.shape) + q_size[-1] = 4 + self.quaternion = torch.zeros( + q_size, device=self.position.device, dtype=self.position.dtype + ) + self.quaternion[..., 0] = 1.0 + if self.quaternion is not None and self.normalize_rotation: + self.quaternion = normalize_quaternion(self.quaternion) + + @staticmethod + def from_matrix(matrix: np.ndarray): + if len(matrix.shape) == 2: + matrix = matrix.view(-1, 4, 4) + return Pose(position=matrix[..., :3, 3], rotation=matrix[..., :3, :3]) + + def get_rotation(self): + if self.rotation is not None: + return self.rotation + elif self.quaternion is not None: + return quaternion_to_matrix(self.quaternion) + else: + return None + + def stack(self, other_pose: Pose): + position = torch.vstack((self.position, other_pose.position)) + quaternion = None + rotation = None + if self.quaternion is not None and other_pose.quaternion is not None: + quaternion = torch.vstack((self.quaternion, other_pose.quaternion)) + return Pose(position=position, quaternion=quaternion, rotation=rotation) + + def repeat(self, n): + """Repeat pose + + Args: + n ([type]): [description] + """ + if n <= 1: + return self + + position = self.position + quaternion = self.quaternion + + if self.n_goalset > 1: + position = self.position.repeat(n, 1, 1) + quaternion = self.quaternion.repeat(n, 1, 1) + else: + position = 
self.position.repeat(n, 1) + quaternion = self.quaternion.repeat(n, 1) + return Pose(position=position, quaternion=quaternion, batch=position.shape[0]) + + def unsqueeze(self, dim=-1): + if self.position is not None: + self.position = self.position.unsqueeze(dim) + if self.quaternion is not None: + self.quaternion = self.quaternion.unsqueeze(dim) + if self.rotation is not None: + self.rotation = self.rotation.unsqueeze(dim) + return self + + def repeat_seeds(self, num_seeds: int): + if (self.position is None and self.quaternion is None) or num_seeds <= 1: + return Pose(self.position, self.quaternion) + if self.n_goalset <= 1: + position = ( + self.position.view(self.batch, 1, 3) + .repeat(1, num_seeds, 1) + .reshape(self.batch * num_seeds, 3) + ) + quaternion = ( + self.quaternion.view(self.batch, 1, 4) + .repeat(1, num_seeds, 1) + .reshape(self.batch * num_seeds, 4) + ) + else: + position = ( + self.position.view(self.batch, 1, self.n_goalset, 3) + .repeat(1, num_seeds, 1, 1) + .reshape(self.batch * num_seeds, self.n_goalset, 3) + ) + quaternion = ( + self.quaternion.view(self.batch, 1, self.n_goalset, 4) + .repeat(1, num_seeds, 1, 1) + .reshape(self.batch * num_seeds, self.n_goalset, 4) + ) + return Pose(position=position, quaternion=quaternion) + + def __getitem__(self, idx): + idx_vals = list_idx_if_not_none([self.position, self.quaternion], idx) + + return Pose(position=idx_vals[0], quaternion=idx_vals[1], normalize_rotation=False) + + def __len__(self): + return self.batch + + def get_index(self, b: int, n: Optional[int] = None) -> Pose: + if n is None: + return Pose( + position=self.position[b, :].contiguous(), + quaternion=self.quaternion[b, :].contiguous(), + normalize_rotation=False, + ) + else: + return Pose( + position=self.position[b, n, :].contiguous(), + quaternion=self.quaternion[b, n, :].contiguous(), + normalize_rotation=False, + ) + + def apply_kernel(self, kernel_mat): + if self.position is None: + return self + if self.n_goalset <= 1: + return Pose( + position=kernel_mat @ self.position, quaternion=kernel_mat @ self.quaternion + ) + else: + pos = self.position.view(self.batch, -1) + quat = self.quaternion.view(self.batch, -1) + pos_k = kernel_mat @ pos + quat_k = kernel_mat @ quat + return Pose( + position=pos_k.view(-1, self.n_goalset, 3), + quaternion=quat_k.view(-1, self.n_goalset, 4), + normalize_rotation=False, + ) + + @classmethod + def from_list( + cls, pose: List[float], tensor_args: TensorDeviceType = TensorDeviceType(), q_xyzw=False + ): + position = torch.as_tensor( + pose[:3], device=tensor_args.device, dtype=tensor_args.dtype + ).unsqueeze(0) + if q_xyzw: + quaternion = torch.as_tensor( + [pose[6], pose[3], pose[4], pose[5]], + device=tensor_args.device, + dtype=tensor_args.dtype, + ).unsqueeze(0) + else: + quaternion = torch.as_tensor( + pose[3:], device=tensor_args.device, dtype=tensor_args.dtype + ).unsqueeze(0) + return Pose(position=position, quaternion=quaternion) + + @classmethod + def from_batch_list(cls, pose: List[List[float]], tensor_args: TensorDeviceType, q_xyzw=False): + # create a cpu pytorch tensor first + pose_mat = np.array(pose) + position = torch.as_tensor( + pose_mat[..., :3], device=tensor_args.device, dtype=tensor_args.dtype + ) + quaternion = torch.as_tensor( + pose_mat[..., 3:], device=tensor_args.device, dtype=tensor_args.dtype + ) + if q_xyzw: + q_c = quaternion.clone() + quaternion[..., 0] = q_c[..., 3] + quaternion[..., 1] = q_c[..., 0] + quaternion[..., 2] = q_c[..., 1] + quaternion[..., 3] = q_c[..., 2] + + return 
Pose(position=position, quaternion=quaternion) + + def to_list(self, q_xyzw=False): + return self.tolist(q_xyzw) + + def tolist(self, q_xyzw=False): + if q_xyzw: + q = self.quaternion.cpu().squeeze().tolist() + return self.position.cpu().squeeze().tolist() + [q[1], q[2], q[3], q[0]] + + return self.position.cpu().squeeze().tolist() + self.quaternion.cpu().squeeze().tolist() + + def clone(self): + return Pose( + position=copy_if_not_none(self.position), + quaternion=copy_if_not_none(self.quaternion), + normalize_rotation=False, + # rotation=copy_if_not_none(self.rotation), + ) + + def to(self, tensor_args: TensorDeviceType): + t_type = vars(tensor_args) + if self.position is not None: + self.position = self.position.to(**t_type) + if self.quaternion is not None: + self.quaternion = self.quaternion.to(**t_type) + if self.rotation is not None: + self.rotation = self.rotation.to(**t_type) + return self + + @profiler.record_function("pose/get_matrix") + def get_matrix(self, out_matrix: Optional[torch.Tensor] = None): + full_mat = pose_to_matrix(self.position, self.quaternion, out_matrix) + return full_mat + + def get_numpy_matrix(self): + return self.get_matrix().cpu().numpy() + + @profiler.record_function("pose/inverse") + def inverse(self): + """Inverse of pose + + Returns: + Pose: inverse pose + """ + # rot, position = get_inv_transform(self.get_rotation(), self.position) + # out = Pose(position, rotation=rot) + # return out + + position, quaternion = pose_inverse(self.position, self.quaternion) + out = Pose(position.clone(), quaternion.clone()) + return out + + def get_pose_vector(self): + return torch.cat((self.position, self.quaternion), dim=-1) + + def copy_(self, pose: Pose): + """Copies pose data from another memory buffer. + This will create a new instance if buffers are not same shape + + Args: + pose (Pose): _description_ + """ + if pose.position is None and pose.quaternion is None: + return self + if not copy_tensor(pose.position, self.position) or not copy_tensor( + pose.quaternion, self.quaternion + ): + log_info("Cloning math.Pose") + + position = pose.position.clone() + quaternion = pose.quaternion.clone() + batch = position.shape[0] + return Pose( + position=position, + quaternion=quaternion, + batch=batch, + normalize_rotation=pose.normalize_rotation, + ) + self.position.copy_(pose.position) + self.quaternion.copy_(pose.quaternion) + return self + + @staticmethod + def cat(pose_list: List[Pose]): + position_cat = torch.cat([i.position for i in pose_list]) + quaternion_cat = torch.cat([i.quaternion for i in pose_list]) + return Pose(position=position_cat, quaternion=quaternion_cat, normalize_rotation=False) + + def distance(self, other_pose: Pose, use_phi3: bool = False): + quat_distance = self.angular_distance(other_pose, use_phi3) + p_distance = self.linear_distance(other_pose) + return p_distance, quat_distance + + def angular_distance(self, other_pose: Pose, use_phi3: bool = False): + """This function computes the angular distance \phi_3. + + See Huynh, Du Q. "Metrics for 3D rotations: Comparison and analysis." Journal of Mathematical + Imaging and Vision 35 (2009): 155-164. 
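+
+        When use_phi3 is False (the default), the error is instead the norm of
+        the vector part of the geodesic quaternion difference computed by
+        OrientationError, which guards against division by zero in its
+        backward pass.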
+ + Args: + goal_quat: _description_ + current_quat: _description_ + + Returns: + Angular distance in range [0,1] + """ + if use_phi3: + quat_distance = angular_distance_phi3(self.quaternion, other_pose.quaternion) + + else: + quat_distance = OrientationError.apply( + self.quaternion, other_pose.quaternion, self.quaternion.clone() + ) + return quat_distance + + def linear_distance(self, other_pose: Pose): + p_distance = torch.linalg.norm(self.position - other_pose.position, dim=-1) + return p_distance + + @profiler.record_function("pose/multiply") + def multiply(self, other_pose: Pose): + if self.shape == other_pose.shape or (self.shape[0] == 1 and other_pose.shape[0] > 1): + p3, q3 = pose_multiply( + self.position, self.quaternion, other_pose.position, other_pose.quaternion + ) + return Pose(p3, q3) + else: + mat_mul = self.get_matrix() @ other_pose.get_matrix() + return Pose.from_matrix(mat_mul) + + def transform_point( + self, + points: torch.Tensor, + out_buffer: Optional[torch.Tensor] = None, + gp_out: Optional[torch.Tensor] = None, + gq_out: Optional[torch.Tensor] = None, + gpt_out: Optional[torch.Tensor] = None, + ): + log_warn("Deprecated, use Pose.transform_points instead") + if len(points.shape) > 2: + points = points.view(-1, 3) + return transform_points( + self.position, self.quaternion, points, out_buffer, gp_out, gq_out, gpt_out + ) + + def transform_points( + self, + points: torch.Tensor, + out_buffer: Optional[torch.Tensor] = None, + gp_out: Optional[torch.Tensor] = None, + gq_out: Optional[torch.Tensor] = None, + gpt_out: Optional[torch.Tensor] = None, + ): + if len(points.shape) > 2: + points = points.view(-1, 3) + return transform_points( + self.position.view(-1, 3), + self.quaternion.view(-1, 4), + points, + out_buffer, + gp_out, + gq_out, + gpt_out, + ) + + def batch_transform_points( + self, + points: torch.Tensor, + out_buffer: Optional[torch.Tensor] = None, + gp_out: Optional[torch.Tensor] = None, + gq_out: Optional[torch.Tensor] = None, + gpt_out: Optional[torch.Tensor] = None, + ): + if len(points.shape) <= 2: + log_error("batch_transform requires points to be b,n,3 shape") + return batch_transform_points( + self.position.view(-1, 3), + self.quaternion.view(-1, 4), + points, + out_buffer, + gp_out, + gq_out, + gpt_out, + ) + + @property + def shape(self): + return self.position.shape + + +def quat_multiply(q1, q2, q_res): + a_w = q1[..., 0] + a_x = q1[..., 1] + a_y = q1[..., 2] + a_z = q1[..., 3] + b_w = q2[..., 0] + b_x = q2[..., 1] + b_y = q2[..., 2] + b_z = q2[..., 3] + + q_res[..., 0] = a_w * b_w - a_x * b_x - a_y * b_y - a_z * b_z + + q_res[..., 1] = a_w * b_x + b_w * a_x + a_y * b_z - b_y * a_z + q_res[..., 2] = a_w * b_y + b_w * a_y + a_z * b_x - b_z * a_x + q_res[..., 3] = a_w * b_z + b_w * a_z + a_x * b_y - b_x * a_y + return q_res + + +@torch.jit.script +def angular_distance_phi3(goal_quat, current_quat): + """This function computes the angular distance \phi_3. + + See Huynh, Du Q. "Metrics for 3D rotations: Comparison and analysis." Journal of Mathematical + Imaging and Vision 35 (2009): 155-164. 
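+
+    Computed as arccos(|<q1, q2>|) / (pi / 2): 0 for identical rotations and 1
+    for rotations 180 degrees apart; the absolute value of the dot product
+    makes the metric invariant to the quaternion double cover (q vs. -q).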
+ + Args: + goal_quat: _description_ + current_quat: _description_ + + Returns: + Angular distance in range [0,1] + """ + dot_prod = ( + goal_quat[..., 0] * current_quat[..., 0] + + goal_quat[..., 1] * current_quat[..., 1] + + goal_quat[..., 2] * current_quat[..., 2] + + goal_quat[..., 3] * current_quat[..., 3] + ) + + dot_prod = torch.abs(dot_prod) + distance = dot_prod + distance = torch.arccos(dot_prod) / (torch.pi * 0.5) + return distance + + +class OrientationError(Function): + @staticmethod + def geodesic_distance(goal_quat, current_quat, quat_res): + conjugate_quat = current_quat.clone() + conjugate_quat[..., 1:] *= -1.0 + quat_res = quat_multiply(goal_quat, conjugate_quat, quat_res) + + quat_res = -1.0 * quat_res * torch.sign(quat_res[..., 0]).unsqueeze(-1) + quat_res[..., 0] = 0.0 + # quat_res = conjugate_quat * 0.0 + return quat_res + + @staticmethod + def forward(ctx, goal_quat, current_quat, quat_res): + quat_res = OrientationError.geodesic_distance(goal_quat, current_quat, quat_res) + rot_error = torch.norm(quat_res, dim=-1, keepdim=True) + ctx.save_for_backward(quat_res, rot_error) + return rot_error + + @staticmethod + def backward(ctx, grad_out): + grad_mul = None + if ctx.needs_input_grad[1]: + (quat_error, r_err) = ctx.saved_tensors + scale = 1 / r_err + scale = torch.nan_to_num(scale, 0, 0, 0) + + grad_mul = grad_out * scale * quat_error + # print(grad_out.shape) + # if grad_out.shape[0] == 6: + # #print(grad_out.view(-1)) + # #print(grad_mul.view(-1)[-6:]) + # #exit() + return None, grad_mul, None + + +@torch.jit.script +def normalize_quaternion(in_quaternion: torch.Tensor) -> torch.Tensor: + k = torch.sign(in_quaternion[..., 0:1]) + # NOTE: torch sign returns 0 as sign value when value is 0.0 + k = torch.where(k == 0, 1.0, k) + k2 = k / torch.linalg.norm(in_quaternion, dim=-1, keepdim=True) + # normalize quaternion + in_q = k2 * in_quaternion + return in_q diff --git a/src/curobo/types/robot.py b/src/curobo/types/robot.py new file mode 100644 index 00000000..61d1ede3 --- /dev/null +++ b/src/curobo/types/robot.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +from __future__ import annotations + +# Standard Library +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Optional + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_generator import CudaRobotGeneratorConfig +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModelConfig +from curobo.cuda_robot_model.types import CSpaceConfig +from curobo.types.base import TensorDeviceType +from curobo.types.state import JointState, State # For compatibility with older versions. +from curobo.util_file import write_yaml + + +@dataclass +class NNConfig: + ik_seeder: Optional[Any] = None + + +@dataclass +class RobotConfig: + """Robot configuration to load in CuRobo. + + Tutorial: :ref:`tut_robot_configuration` + """ + + #: Robot kinematics configuration to load into cuda robot model. 
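+    #: A minimal construction sketch (the file path and link names below are
+    #: illustrative assumptions, not shipped defaults):
+    #: ``RobotConfig.from_basic("robot.urdf", "base_link", "ee_link")``
+    #: builds this configuration directly from a URDF.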
+    kinematics: CudaRobotModelConfig
+
+    tensor_args: TensorDeviceType = TensorDeviceType()
+
+    @staticmethod
+    def from_dict(data_dict_in, tensor_args=TensorDeviceType()):
+        data_dict = deepcopy(data_dict_in)
+        if isinstance(data_dict["kinematics"], dict):
+            data_dict["kinematics"] = CudaRobotModelConfig.from_config(
+                CudaRobotGeneratorConfig(**data_dict_in["kinematics"], tensor_args=tensor_args)
+            )
+
+        return RobotConfig(**data_dict, tensor_args=tensor_args)
+
+    @staticmethod
+    def from_basic(urdf_path, base_link, ee_link, tensor_args=TensorDeviceType()):
+        cuda_robot_model_config = CudaRobotModelConfig.from_basic_urdf(
+            urdf_path, base_link, ee_link, tensor_args
+        )
+        # load other params:
+
+        return RobotConfig(
+            cuda_robot_model_config,
+            tensor_args=tensor_args,
+        )
+
+    def write_config(self, file_path):
+        # copy the attribute dict so serialization does not mutate this instance:
+        dictionary = vars(self).copy()
+        dictionary["kinematics"] = vars(dictionary["kinematics"])
+        write_yaml(dictionary, file_path)
+
+    @property
+    def cspace(self) -> CSpaceConfig:
+        return self.kinematics.cspace
diff --git a/src/curobo/types/state.py b/src/curobo/types/state.py
new file mode 100644
index 00000000..5bd4d9c8
--- /dev/null
+++ b/src/curobo/types/state.py
@@ -0,0 +1,517 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+from __future__ import annotations
+
+# Standard Library
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple, Union
+
+# Third Party
+import numpy as np
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.types.tensor import T_BDOF, T_DOF
+from curobo.util.logger import log_error, log_info, log_warn
+from curobo.util.tensor_util import (
+    check_tensor_shapes,
+    copy_if_not_none,
+    copy_tensor,
+    fd_tensor,
+    tensor_repeat_seeds,
+)
+
+
+@dataclass
+class FilterCoeff:
+    position: float = 0.0
+    velocity: float = 0.0
+    acceleration: float = 0.0
+    jerk: float = 0.0
+
+
+@dataclass
+class State(Sequence):
+    def blend(self, coeff: FilterCoeff, new_state: State):
+        return self
+
+    def to(self, tensor_args: TensorDeviceType):
+        return self
+
+    def get_state_tensor(self):
+        return torch.tensor([0.0])
+
+    def apply_kernel(self, kernel_mat):
+        return self
+
+    def __getitem__(self, idx):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def clone(self):
+        raise NotImplementedError
+
+
+@dataclass
+class JointState(State):
+    position: Union[List[float], T_DOF]
+    velocity: Union[List[float], T_DOF, None] = None
+    acceleration: Union[List[float], T_DOF, None] = None
+    joint_names: Optional[List[str]] = None
+    jerk: Union[List[float], T_DOF, None] = None  # Optional
+    tensor_args: TensorDeviceType = TensorDeviceType()
+
+    def __post_init__(self):
+        if isinstance(self.position, torch.Tensor):
+            self.tensor_args = TensorDeviceType(self.position.device)
+
+    @staticmethod
+    def from_numpy(
+        joint_names: List[str],
+        position: np.ndarray,
+        velocity: Optional[np.ndarray] = None,
+        acceleration: Optional[np.ndarray] = None,
+        jerk: Optional[np.ndarray] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        pos = tensor_args.to_device(position)
+        vel = acc = je = None
+        if velocity is not None:
+            vel = tensor_args.to_device(velocity)
+        if acceleration is not None:
+            acc = tensor_args.to_device(acceleration)
+        else:
+            acc = pos * 0.0
+        if jerk is not None:
+            je = tensor_args.to_device(jerk)
+        else:
+            je = pos * 0.0
+        return JointState(pos, vel, acc, joint_names=joint_names, jerk=je, tensor_args=tensor_args)
+
+    @staticmethod
+    def from_position(position: T_BDOF, joint_names: Optional[List[str]] = None):
+        return JointState(
+            position=position,
+            velocity=position * 0.0,
+            acceleration=position * 0.0,
+            jerk=position * 0.0,
+            joint_names=joint_names,
+        )
+
+    def apply_kernel(self, kernel_mat):
+        return JointState(
+            position=kernel_mat @ self.position,
+            velocity=kernel_mat @ self.velocity,
+            acceleration=kernel_mat @ self.acceleration,
+            joint_names=self.joint_names,
+        )
+
+    def repeat_seeds(self, num_seeds: int):
+        return JointState(
+            position=tensor_repeat_seeds(self.position, num_seeds),
+            velocity=tensor_repeat_seeds(self.velocity, num_seeds)
+            if self.velocity is not None
+            else None,
+            acceleration=tensor_repeat_seeds(self.acceleration, num_seeds)
+            if self.acceleration is not None
+            else None,
+            joint_names=self.joint_names,
+        )
+
+    def to(self, tensor_args: TensorDeviceType):
+        position = tensor_args.to_device(self.position)
+        velocity = acceleration = jerk = None
+        if self.velocity is not None:
+            velocity = tensor_args.to_device(self.velocity)
+        if self.acceleration is not None:
+            acceleration = tensor_args.to_device(self.acceleration)
+        if self.jerk is not None:
+            jerk = tensor_args.to_device(self.jerk)
+        return JointState(
+            position,
+            velocity,
+            acceleration,
+            jerk=jerk,
+            tensor_args=tensor_args,
+            joint_names=self.joint_names,
+        )
+
+    def clone(self):
+        j_names = None
+        if self.joint_names is not None:
+            j_names = self.joint_names.copy()
+        return JointState(
+            position=copy_if_not_none(self.position),
+            velocity=copy_if_not_none(self.velocity),
+            acceleration=copy_if_not_none(self.acceleration),
+            jerk=copy_if_not_none(self.jerk),
+            joint_names=j_names,
+            tensor_args=self.tensor_args,
+        )
+
+    def blend(self, coeff: FilterCoeff, new_state: JointState):
+        self.position[:] = (
+            coeff.position * new_state.position + (1.0 - coeff.position) * self.position
+        )
+        self.velocity[:] = (
+            coeff.velocity * new_state.velocity + (1.0 - coeff.velocity) * self.velocity
+        )
+        self.acceleration[:] = (
+            coeff.acceleration * new_state.acceleration
+            + (1.0 - coeff.acceleration) * self.acceleration
+        )
+        self.jerk[:] = coeff.jerk * new_state.jerk + (1.0 - coeff.jerk) * self.jerk
+        return self
+
+    def get_state_tensor(self):
+        velocity = self.velocity
+        acceleration = self.acceleration
+        jerk = self.jerk
+        if velocity is None:
+            velocity = self.position * 0.0
+        if acceleration is None:
+            acceleration = self.position * 0.0
+        if jerk is None:
+            jerk = self.position * 0.0
+        state_tensor = torch.cat((self.position, velocity, acceleration, jerk), dim=-1)
+        return state_tensor
+
+    @staticmethod
+    def from_state_tensor(state_tensor, joint_names=None, dof=7):
+        return JointState(
+            state_tensor[..., :dof],
+            state_tensor[..., dof : 2 * dof],
+            state_tensor[..., 2 * dof : 3 * dof],
+            jerk=state_tensor[..., 3 * dof : 4 * dof],
+            joint_names=joint_names,
+        )
+
+    def stack(self, new_state: JointState):
+        return JointState.from_state_tensor(
+            torch.cat((self.get_state_tensor(), new_state.get_state_tensor()), dim=-2),
+            joint_names=self.joint_names,
+            dof=self.position.shape[-1],
+        )
+    def __getitem__(self, idx):
+        j = None
+        v = a = None
+        max_idx = 0
+        if isinstance(idx, int):
+            max_idx = idx
+        elif isinstance(idx, List):
+            max_idx = max(idx)
+        elif isinstance(idx, torch.Tensor):
+            max_idx = torch.max(idx)
+        if max_idx >= self.position.shape[0]:
+            raise ValueError(
+                str(max_idx)
+                + " index out of range, current state is of length "
+                + str(self.position.shape)
+            )
+        p = self.position[idx]
+        if self.velocity is not None:
+            if max_idx >= self.velocity.shape[0]:
+                raise ValueError(
+                    str(max_idx)
+                    + " index out of range, current velocity is of length "
+                    + str(self.velocity.shape)
+                )
+            v = self.velocity[idx]
+        if self.acceleration is not None:
+            if max_idx >= self.acceleration.shape[0]:
+                raise ValueError(
+                    str(max_idx)
+                    + " index out of range, current acceleration is of length "
+                    + str(self.acceleration.shape)
+                )
+            a = self.acceleration[idx]
+        if self.jerk is not None:
+            if max_idx >= self.jerk.shape[0]:
+                raise ValueError(
+                    str(max_idx)
+                    + " index out of range, current jerk is of length "
+                    + str(self.jerk.shape)
+                )
+            j = self.jerk[idx]
+        return JointState(p, v, a, joint_names=self.joint_names, jerk=j)
+
+    def __len__(self):
+        return self.position.shape[0]
+
+    @staticmethod
+    def from_list(
+        position, velocity, acceleration, tensor_args: TensorDeviceType = TensorDeviceType()
+    ):
+        js = JointState(position, velocity, acceleration)
+        js = js.to(tensor_args)
+        return js
+
+    def copy_at_index(self, in_joint_state: JointState, idx: Union[int, torch.Tensor]):
+        """Copy joint state to a specific index.
+
+        Args:
+            in_joint_state (JointState): joint state to copy from.
+            idx (Union[int, torch.Tensor]): index (or indices) to write at.
+        """
+        max_idx = 0
+        if isinstance(idx, int):
+            max_idx = idx
+        elif isinstance(idx, List):
+            max_idx = max(idx)
+        elif isinstance(idx, torch.Tensor):
+            max_idx = torch.max(idx)
+        if self.position is not None:
+            if max_idx >= self.position.shape[0]:
+                raise ValueError(
+                    str(max_idx)
+                    + " index out of range, current state is of length "
+                    + str(self.position.shape[0])
+                )
+            self.position[idx] = in_joint_state.position
+        if self.velocity is not None:
+            self.velocity[idx] = in_joint_state.velocity
+        if self.acceleration is not None:
+            self.acceleration[idx] = in_joint_state.acceleration
+        if self.jerk is not None:
+            self.jerk[idx] = in_joint_state.jerk
+
+    def copy_data(self, in_joint_state: JointState):
+        """Copy data from in_joint_state to self, cloning buffers on shape mismatch.
+
+        Args:
+            in_joint_state (JointState): joint state to copy from.
+        """
+        log_warn("JointState.copy_data is deprecated, use JointState.copy_ instead")
+        if not copy_tensor(in_joint_state.position, self.position):
+            self.position = in_joint_state.position
+            log_info("Cloning JointState")
+
+        if not copy_tensor(in_joint_state.velocity, self.velocity):
+            self.velocity = in_joint_state.velocity
+        if not copy_tensor(in_joint_state.acceleration, self.acceleration):
+            self.acceleration = in_joint_state.acceleration
+        return self
+
+    def _same_shape(self, new_js: JointState):
+        same_shape = False
+
+        if (
+            check_tensor_shapes(new_js.position, self.position)
+            and check_tensor_shapes(new_js.velocity, self.velocity)
+            and check_tensor_shapes(new_js.acceleration, self.acceleration)
+        ):
+            same_shape = True
+
+        # optional jerk check:
+        # if self.jerk is not None and new_js.jerk is not None:
+        #     same_shape = same_shape and check_tensor_shapes(new_js.jerk, self.jerk)
+        return same_shape
+
+    def copy_(self, in_joint_state: JointState):
+        # copy data if tensor shapes are same:
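+        # a minimal usage sketch (buffer shapes here are illustrative):
+        #
+        #   buf = JointState.zeros((10, 7), TensorDeviceType())
+        #   buf = buf.copy_(new_js)  # reassign: a clone is returned on shape mismatch
+        #
+        # in-place copy keeps externally held references (e.g., tensors captured
+        # in a CUDA graph) valid; the clone fallback breaks them.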
if in_joint_state.joint_names is not None: + self.joint_names = in_joint_state.joint_names + if self._same_shape(in_joint_state): + # copy data: + self.position.copy_(in_joint_state.position) + self.velocity.copy_(in_joint_state.velocity) + self.acceleration.copy_(in_joint_state.acceleration) + # if self.jerk is not None: + # self.jerk.copy_(in_joint_state.jerk) + return self + else: + log_info("Cloning JointState (breaks ref pointer)") + # print(self.position.shape, in_joint_state.position.shape) + # clone and create a new instance of JointState + return in_joint_state.clone() + + def unsqueeze(self, idx: int): + p = self.position.unsqueeze(idx) + v = a = j = None + if self.velocity is not None: + v = self.velocity.unsqueeze(idx) + if self.acceleration is not None: + a = self.acceleration.unsqueeze(idx) + if self.jerk is not None: + j = self.jerk.unsqueeze(idx) + return JointState(p, v, a, self.joint_names, jerk=j) + + def squeeze(self, dim: Optional[int] = 0): + p = torch.squeeze(self.position, dim) + v = a = j = None + if self.velocity is not None: + v = torch.squeeze(self.velocity, dim) + if self.acceleration is not None: + a = torch.squeeze(self.acceleration, dim) + if self.jerk is not None: + j = torch.squeeze(self.jerk, dim) + return JointState(p, v, a, self.joint_names, jerk=j) + + def calculate_fd_from_position(self, dt: torch.Tensor): + self.velocity = fd_tensor(self.position, dt) + self.acceleration = fd_tensor(self.velocity, dt) + self.jerk = fd_tensor(self.acceleration, dt) + return self + + @staticmethod + def zeros( + size: Tuple[int], tensor_args: TensorDeviceType, joint_names: Optional[List[str]] = None + ): + return JointState( + position=torch.zeros(size, device=tensor_args.device, dtype=tensor_args.dtype), + velocity=torch.zeros(size, device=tensor_args.device, dtype=tensor_args.dtype), + acceleration=torch.zeros(size, device=tensor_args.device, dtype=tensor_args.dtype), + jerk=torch.zeros(size, device=tensor_args.device, dtype=tensor_args.dtype), + joint_names=joint_names, + ) + + def detach(self): + self.position = self.position.detach() + self.velocity = self.velocity.detach() + self.acceleration = self.acceleration.detach() + if self.jerk is not None: + self.jerk = self.jerk.detach() + + return self + + def get_ordered_joint_state(self, ordered_joint_names: List[str]) -> JointState: + """Return joint state with a ordered joint names + Args: + ordered_joint_names (List[str]): _description_ + + Returns: + _type_: _description_ + """ + + new_js = self.clone() + new_js.inplace_reindex(ordered_joint_names) + return new_js + + @profiler.record_function("joint_state/inplace_reindex") + def inplace_reindex(self, joint_names: List[str]): + if self.joint_names is None: + raise ValueError("joint names are not specified in JointState") + # get index of joint names: + new_index_l = [self.joint_names.index(j) for j in joint_names] + + new_index = torch.as_tensor(new_index_l, device=self.position.device, dtype=torch.long) + self.position = torch.index_select(self.position, -1, new_index) + if self.velocity is not None: + self.velocity = torch.index_select(self.velocity, -1, new_index) + if self.acceleration is not None: + self.acceleration = torch.index_select(self.acceleration, -1, new_index) + if self.jerk is not None: + self.jerk = torch.index_select(self.jerk, -1, new_index) + self.joint_names = [self.joint_names[x] for x in new_index_l] + + def get_augmented_joint_state( + self, joint_names, lock_joints: Optional[JointState] = None + ) -> JointState: + if lock_joints is None: 
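+            # no locked joints to merge in: a plain reorder to the requested
+            # joint names is sufficient.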
+ return self.get_ordered_joint_state(joint_names) + if joint_names is None or self.joint_names is None: + raise ValueError("joint_names can't be None") + + # if some joints are locked, we assume that these joints are not in self.joint_names: + if any(item in self.joint_names for item in lock_joints.joint_names): + raise ValueError("lock_joints is also listed in self.joint_names") + + # append the lock_joints to existing joint state: + new_js = self.clone() + + new_js = new_js.append_joints(lock_joints) + + new_js = new_js.get_ordered_joint_state(joint_names) + return new_js + + def append_joints(self, js: JointState): + if js.joint_names is None or len(js.joint_names) == 0: + log_error("joint_names are required to append") + + current_shape = self.position.shape + extra_len = len(js.joint_names) + current_js = self + one_dim = False + # if joint state is of shape dof: + if len(current_shape) == 1: + current_js = self.unsqueeze(0) + one_dim = True + current_shape = current_js.position.shape + + if current_shape[:-1] != js.position.shape: + if len(js.position.shape) > 1 and js.position.shape[0] > 1: + raise ValueError( + "appending joints requires the new joints to have a shape matching current" + + " batch size or have a batch size of 1." + ) + current_js.joint_names.extend(js.joint_names) + + if current_shape[:-1] == js.position.shape and len(current_shape) == len(js.position.shape): + current_js.position = torch.cat((current_js.position, js.position), dim=-1) + new_js = current_js + else: + current_shape = list(current_shape) + current_shape[-1] += len(js.joint_names) + new_js = JointState.zeros( + current_shape, current_js.tensor_args, joint_names=self.joint_names + ) + new_js.position[..., :-extra_len] = current_js.position + if current_js.velocity is not None: + new_js.velocity[..., :-extra_len] = current_js.velocity + if current_js.acceleration is not None: + new_js.acceleration[..., :-extra_len] = current_js.acceleration + if current_js.jerk is not None: + new_js.jerk[..., :-extra_len] = current_js.jerk + + new_js.position[..., -extra_len:] = js.position + if js.velocity is not None: + new_js.velocity[..., -extra_len:] = js.velocity + if js.acceleration is not None: + new_js.acceleration[..., -extra_len:] = js.acceleration + if js.jerk is not None: + new_js.jerk[..., -extra_len:] = js.jerk + if one_dim: + new_js = new_js.squeeze() + return new_js + + def trim_trajectory(self, start_idx: int, end_idx: Optional[int] = None): + if end_idx is None: + end_idx = self.position.shape[-2] + if len(self.position.shape) < 2: + raise ValueError("JointState does not have horizon") + pos = self.position[..., start_idx:end_idx, :].clone() + vel = acc = jerk = None + if self.velocity is not None: + vel = self.velocity[..., start_idx:end_idx, :].clone() + if self.acceleration is not None: + acc = self.acceleration[..., start_idx:end_idx, :].clone() + if self.jerk is not None: + jerk = self.jerk[..., start_idx:end_idx, :].clone() + return JointState(pos, vel, acc, self.joint_names, jerk, self.tensor_args) + + def scale(self, dt: Union[float, torch.Tensor]): + vel = acc = jerk = None + if self.velocity is not None: + vel = self.velocity * dt + if self.acceleration is not None: + acc = self.acceleration * (dt**2) + if self.jerk is not None: + jerk = self.jerk * (dt**3) + return JointState(self.position, vel, acc, self.joint_names, jerk, self.tensor_args) + + @property + def shape(self): + return self.position.shape diff --git a/src/curobo/types/tensor.py b/src/curobo/types/tensor.py new file mode 
100644 index 00000000..ee3da600 --- /dev/null +++ b/src/curobo/types/tensor.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# CuRobo +from curobo.util.logger import log_warn + +try: + # Third Party + from torchtyping import TensorType +except ImportError: + log_warn("torchtyping could not be imported, falling back to basic types") + TensorType = None + # Third Party + import torch +b_dof = ["batch", "dof"] +b_value = ["batch", "value"] +bh_value = ["batch", "horizon", "value"] +bh_dof = ["batch", "horizon", "dof"] +h_dof = ["horizon", "dof"] + +if TensorType is not None: + T_DOF = TensorType[tuple(["dof"] + [float])] + T_BDOF = TensorType[tuple(b_dof + [float])] + T_BValue_float = TensorType[tuple(b_value + [float])] + T_BHValue_float = TensorType[tuple(bh_value + [float])] + T_BValue_bool = TensorType[tuple(b_value + [bool])] + T_BValue_int = TensorType[tuple(b_value + [int])] + + T_BPosition = TensorType["batch", "xyz":3, float] + T_BQuaternion = TensorType["batch", "wxyz":4, float] + T_BRotation = TensorType["batch", 3, 3, float] + T_Position = TensorType["xyz":3, float] + T_Quaternion = TensorType["wxyz":4, float] + T_Rotation = TensorType[3, 3, float] + + T_BHDOF_float = TensorType[tuple(bh_dof + [float])] + T_HDOF_float = TensorType[tuple(h_dof + [float])] +else: + T_DOF = torch.Tensor + T_BDOF = torch.Tensor + T_BValue_float = torch.Tensor + T_BHValue_float = torch.Tensor + T_BValue_bool = torch.Tensor + T_BValue_int = torch.Tensor + + T_BPosition = torch.Tensor + T_BQuaternion = torch.Tensor + T_BRotation = torch.Tensor + T_Position = torch.Tensor + T_Quaternion = torch.Tensor + T_Rotation = torch.Tensor + + T_BHDOF_float = torch.Tensor + T_HDOF_float = torch.Tensor diff --git a/src/curobo/util/__init__.py b/src/curobo/util/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/util/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# diff --git a/src/curobo/util/error_metrics.py b/src/curobo/util/error_metrics.py new file mode 100644 index 00000000..2e7b05ae --- /dev/null +++ b/src/curobo/util/error_metrics.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Standard Library +import math + +# Third Party +import numpy as np +import torch + + +def rotation_error_quaternion(q_des, q): + # + sum_q = torch.norm(q_des + q) + diff_q = torch.norm(q_des - q) + # err = torch.minimum(sum_q, diff_q) / math.sqrt(2) + err = np.minimum(sum_q.cpu().numpy(), diff_q.cpu().numpy()) / math.sqrt(2) + return err + + +def rotation_error_matrix(r_des, r): + # + """ + px = torch.tensor([1.0,0.0,0.0],device=r_des.device).T + py = torch.tensor([0.0,1.0,0.0],device=r_des.device).T + pz = torch.tensor([0.0,0.0,1.0],device=r_des.device).T + print(px.shape, r.shape) + + current_px = r * px + current_py = r * py + current_pz = r * pz + + des_px = r_des * px + des_py = r_des * py + des_pz = r_des * pz + + cost = torch.norm(current_px - des_px) + torch.norm(current_py - des_py) + torch.norm(current_pz - des_pz) + return cost + """ + rot_delta = r - r_des + cost = 0.5 * torch.sum(torch.square(rot_delta), dim=-2) + cost = torch.sum(cost, dim=-1) + return cost diff --git a/src/curobo/util/helpers.py b/src/curobo/util/helpers.py new file mode 100644 index 00000000..6ff31b93 --- /dev/null +++ b/src/curobo/util/helpers.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from collections import defaultdict +from typing import List + + +def default_to_regular(d): + if isinstance(d, defaultdict): + d = {k: default_to_regular(v) for k, v in d.items()} + return d + + +def list_idx_if_not_none(d_list: List, idx: int): + idx_list = [] + for x in d_list: + if x is not None: + idx_list.append(x[idx]) + else: + idx_list.append(None) + return idx_list diff --git a/src/curobo/util/logger.py b/src/curobo/util/logger.py new file mode 100644 index 00000000..9b79a45f --- /dev/null +++ b/src/curobo/util/logger.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+#
+# Standard Library
+import logging
+
+
+def setup_curobo_logger(level="info"):
+    FORMAT = "[%(levelname)s] [%(name)s] %(message)s"
+    if level == "info":
+        level = logging.INFO
+    elif level == "debug":
+        level = logging.DEBUG
+    elif level == "error":
+        level = logging.ERROR
+    elif level in ["warn", "warning"]:
+        level = logging.WARN
+    else:
+        raise ValueError("Log level should be one of [info, debug, warn, error]")
+    logging.basicConfig(format=FORMAT, level=level)
+    logger = logging.getLogger("curobo")
+    logger.setLevel(level=level)
+
+
+def log_warn(txt: str, *args, **kwargs):
+    logger = logging.getLogger("curobo")
+    logger.warning(txt, *args, **kwargs)
+
+
+def log_info(txt: str, *args, **kwargs):
+    logger = logging.getLogger("curobo")
+    logger.info(txt, *args, **kwargs)
+
+
+def log_error(txt: str, exc_info=True, stack_info=True, *args, **kwargs):
+    logger = logging.getLogger("curobo")
+    logger.error(txt, exc_info=exc_info, stack_info=stack_info, *args, **kwargs)
+    # errors are fatal: raise with the logged message
+    raise ValueError(txt)
diff --git a/src/curobo/util/sample_lib.py b/src/curobo/util/sample_lib.py
new file mode 100644
index 00000000..4c66e80b
--- /dev/null
+++ b/src/curobo/util/sample_lib.py
@@ -0,0 +1,662 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+
+# Standard Library
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+# Third Party
+import numpy as np
+import scipy.interpolate as si
+import torch
+import torch.autograd.profiler as profiler
+from scipy.stats.qmc import Halton
+from torch.distributions.multivariate_normal import MultivariateNormal
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.util.logger import log_error, log_warn
+
+# Local Folder
+from ..opt.particle.particle_opt_utils import get_stomp_cov
+
+
+@dataclass
+class SampleConfig:
+    horizon: int
+    d_action: int
+    tensor_args: TensorDeviceType
+    fixed_samples: bool = True
+    sample_ratio: Dict[str, float] = field(
+        default_factory=lambda: (
+            {"halton": 0.3, "halton-knot": 0.7, "random": 0.0, "random-knot": 0.0, "stomp": 0.0}
+        )
+    )
+    seed: int = 0
+    filter_coeffs: Optional[List[float]] = field(default_factory=lambda: [0.3, 0.3, 0.4])
+    n_knots: int = 3
+    scale_tril: Optional[torch.Tensor] = None
+    covariance_matrix: Optional[torch.Tensor] = None
+    sample_method: str = "halton"
+    cov_mode: str = "vel"  # for STOMP sampler
+    sine_period: int = 2  # for Sinewave sampler
+    degree: int = 3  # bspline
+
+
+class BaseSampleLib(SampleConfig):
+    @profiler.record_function("sample_lib/init")
+    def __init__(
+        self,
+        sample_config,
+    ):
+        super().__init__(**vars(sample_config))
+
+        self.Z = torch.zeros(
+            self.horizon * self.d_action,
+            dtype=self.tensor_args.dtype,
+            device=self.tensor_args.device,
+        )
+        if self.scale_tril is None and self.covariance_matrix is not None:
+            self.scale_tril = self.tensor_args.to_device(
+                torch.linalg.cholesky(self.covariance_matrix.to("cpu"))
+            )
+        self.samples = None
+        self.sample_shape = 0
+        self.ndims = self.horizon * self.d_action
+        self.stomp_matrix, self.stomp_scale_tril = None, None
+
+    def get_samples(self, sample_shape, base_seed, current_state=None, **kwargs):
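+        # subclasses return a [num_samples, horizon, d_action] tensor of
+        # perturbations. A minimal usage sketch (sizes are illustrative):
+        #
+        #   cfg = SampleConfig(horizon=30, d_action=7, tensor_args=TensorDeviceType())
+        #   lib = HaltonSampleLib(cfg)
+        #   eps = lib.get_samples(torch.Size([500]))  # -> [500, 30, 7]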
+ raise NotImplementedError + + def filter_samples(self, eps): + if self.filter_coeffs is not None: + beta_0, beta_1, beta_2 = self.filter_coeffs + + # This could be tensorized: + for i in range(2, eps.shape[1]): + eps[:, i, :] = ( + beta_0 * eps[:, i, :] + beta_1 * eps[:, i - 1, :] + beta_2 * eps[:, i - 2, :] + ) + return eps + + def filter_smooth(self, samples): + # scale by stomp matrix: + if samples.shape[0] == 0: + return samples + if self.stomp_matrix is None: + self.stomp_matrix, self.stomp_scale_tril = get_stomp_cov( + self.horizon, self.d_action, tensor_args=self.tensor_args + ) + + # fit bspline: + + filter_samples = self.stomp_matrix[: self.horizon, : self.horizon] @ samples + # print(filter_samples.shape) + filter_samples = filter_samples / torch.max(torch.abs(filter_samples)) + return filter_samples + + +class HaltonSampleLib(BaseSampleLib): + @profiler.record_function("sample_lib/halton") + def __init__(self, sample_config: SampleConfig): + super().__init__(sample_config) + # create halton generator: + self.halton_generator = HaltonGenerator( + self.ndims, seed=self.seed, tensor_args=self.tensor_args + ) + + def get_samples(self, sample_shape, base_seed=None, filter_smooth=False, **kwargs): + if self.sample_shape != sample_shape or not self.fixed_samples: + if len(sample_shape) > 1: + log_error("sample shape should be a single value") + raise ValueError + seed = self.seed if base_seed is None else base_seed + self.sample_shape = sample_shape + self.seed = seed + self.samples = self.halton_generator.get_gaussian_samples(sample_shape[0]) + self.samples = self.samples.view(self.samples.shape[0], self.horizon, self.d_action) + + if filter_smooth: + self.samples = self.filter_smooth(self.samples) + else: + self.samples = self.filter_samples(self.samples) + if self.samples.shape[0] != sample_shape[0]: + print(self.samples.shape, sample_shape) + log_error("sampling failed") + return self.samples + + +def bspline(c_arr, t_arr=None, n=100, degree=3): + sample_device = c_arr.device + sample_dtype = c_arr.dtype + cv = c_arr.cpu().numpy() + + if t_arr is None: + t_arr = np.linspace(0, cv.shape[0], cv.shape[0]) + else: + t_arr = t_arr.cpu().numpy() + spl = si.splrep(t_arr, cv, k=degree, s=0.5) + + xx = np.linspace(0, cv.shape[0], n) + samples = si.splev(xx, spl, ext=3) + samples = torch.as_tensor(samples, device=sample_device, dtype=sample_dtype) + + return samples + + +class KnotSampleLib(SampleConfig): + def __init__(self, sample_config: SampleConfig): + super().__init__(**vars(sample_config)) + self.sample_shape = 0 + self.ndims = self.n_knots * self.d_action + self.Z = torch.zeros( + self.ndims, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + if self.covariance_matrix is None: + self.cov_matrix = torch.eye( + self.ndims, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + self.scale_tril = torch.linalg.cholesky( + self.cov_matrix.to(dtype=torch.float32, device="cpu") + ).to(device=self.tensor_args.device, dtype=self.tensor_args.dtype) + if self.sample_method == "random": + self.mvn = MultivariateNormal( + loc=self.Z, + scale_tril=self.scale_tril, + ) + if self.sample_method == "halton": + self.halton_generator = HaltonGenerator( + self.ndims, seed=self.seed, tensor_args=self.tensor_args + ) + + def get_samples(self, sample_shape, **kwargs): + if self.sample_shape != sample_shape or not self.fixed_samples: + # sample shape is the number of particles to sample + if self.sample_method == "halton": + self.knot_points = 
self.halton_generator.get_gaussian_samples(sample_shape[0]) + elif self.sample_method == "random": + self.knot_points = self.mvn.sample(sample_shape=sample_shape) + + # Sample splines from knot points: + # iteratre over action dimension: + knot_samples = self.knot_points.view(sample_shape[0], self.d_action, self.n_knots) + self.samples = torch.zeros( + (sample_shape[0], self.horizon, self.d_action), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + for i in range(sample_shape[0]): + for j in range(self.d_action): + self.samples[i, :, j] = bspline( + knot_samples[i, j, :], n=self.horizon, degree=self.degree + ) + self.sample_shape = sample_shape + + return self.samples + + +class RandomSampleLib(BaseSampleLib): + def __init__(self, sample_config: SampleConfig): + super().__init__(sample_config) + + if self.scale_tril is None: + self.scale_tril = torch.eye( + self.ndims, device=self.tensor_args.device, dtype=self.tensor_args.dtype + ) + + self.mvn = MultivariateNormal(loc=self.Z, scale_tril=self.scale_tril) + + def get_samples(self, sample_shape, base_seed=None, filter_smooth=False, **kwargs): + if base_seed is not None and base_seed != self.seed: + self.seed = base_seed + # print(self.seed) + torch.manual_seed(self.seed) + if self.sample_shape != sample_shape or not self.fixed_samples: + self.sample_shape = sample_shape + self.samples = self.mvn.sample(sample_shape=self.sample_shape) + self.samples = self.samples.view(self.samples.shape[0], self.horizon, self.d_action) + if filter_smooth: + self.samples = self.filter_smooth(self.samples) + else: + self.samples = self.filter_samples(self.samples) + return self.samples + + +class SineSampleLib(BaseSampleLib): # pragma : no cover + def __init__(self, sample_config: SampleConfig): + super().__init__(sample_config) + + self.const_pi = torch.acos(torch.zeros(1)).item() + self.ndims = self.d_action + self.sine_wave = self.generate_sine_wave() + self.diag_sine_wave = torch.diag(self.sine_wave) + + def get_samples(self, sample_shape, base_seed=None, **kwargs): # pragma : no cover + if self.sample_shape != sample_shape or not self.fixed_samples: + if len(sample_shape) > 1: + print("sample shape should be a single value") + raise ValueError + seed = self.seed if base_seed is None else base_seed + self.sample_shape = sample_shape + self.seed = seed + + # sample only amplitudes from halton sequence: + self.amplitude_samples = generate_gaussian_halton_samples( + sample_shape[0], + self.ndims, + use_scipy_halton=True, + seed=self.seed, + tensor_args=self.tensor_args, + ) + + self.amplitude_samples = self.filter_samples(self.amplitude_samples) + self.amplitude_samples = self.amplitude_samples.unsqueeze(1).expand( + -1, self.horizon, -1 + ) + + # generate sine waves from samples for the full horizon: + self.samples = self.diag_sine_wave @ self.amplitude_samples + + return self.samples + + def generate_sine_wave(self, horizon=None): # pragma : no cover + horizon = self.horizon if horizon is None else horizon + + # generate a sine wave: + x = torch.linspace( + 0, + 4 * self.const_pi / self.sine_period, + horizon, + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + sin_out = torch.sin(x) + + return sin_out + + +class StompSampleLib(BaseSampleLib): + @profiler.record_function("stomp_sample_lib/init") + def __init__( + self, + sample_config: SampleConfig, + ): + super(StompSampleLib, self).__init__(sample_config) + + _, self.stomp_scale_tril, _ = get_stomp_cov( + self.horizon, + self.d_action, + 
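+            # get_stomp_cov with RETURN_M=True returns (cov, scale_tril, M);
+            # only the Cholesky factor is kept and used for sampling below.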
+            tensor_args=self.tensor_args,
+            cov_mode=self.cov_mode,
+            RETURN_M=True,
+        )
+
+        self.filter_coeffs = None
+        self.halton_generator = HaltonGenerator(
+            self.ndims, seed=self.seed, tensor_args=self.tensor_args
+        )
+
+    def get_samples(self, sample_shape, base_seed=None, **kwargs):
+        if self.sample_shape != sample_shape or not self.fixed_samples:
+            if len(sample_shape) > 1:
+                log_error("sample shape should be a single value")
+                raise ValueError
+            # seed = self.seed if base_seed is None else base_seed
+            self.sample_shape = sample_shape
+            # self.seed = seed
+            # torch.manual_seed(self.seed)
+            halton_samples = self.halton_generator.get_gaussian_samples(sample_shape[0])
+            halton_samples = (
+                self.stomp_scale_tril.unsqueeze(0) @ halton_samples.unsqueeze(-1)
+            ).squeeze(-1)
+
+            halton_samples = (
+                (halton_samples)
+                .view(self.sample_shape[0], self.d_action, self.horizon)
+                .transpose(-2, -1)
+            )
+            halton_samples = halton_samples / torch.max(torch.abs(halton_samples))
+            # zero the last horizon step of each sample:
+            # halton_samples[:, 0, :] = 0.0
+            halton_samples[:, -1:, :] = 0.0
+            self.samples = halton_samples
+        return self.samples
+
+
+class SampleLib(BaseSampleLib):
+    def __init__(self, sample_config: SampleConfig):
+        super().__init__(sample_config)
+        # sample from a mix of possibilities:
+        # TODO: Create instances only if the ratio is not 0.0
+        self.halton_sample_lib = HaltonSampleLib(sample_config)
+        self.knot_halton_sample_lib = KnotSampleLib(sample_config)
+        self.random_sample_lib = RandomSampleLib(sample_config)
+        self.knot_random_sample_lib = KnotSampleLib(sample_config)
+        self.stomp_sample_lib = StompSampleLib(sample_config)
+        self.sine_sample_lib = SineSampleLib(sample_config)
+
+        self.sample_fns = {
+            "halton": self.halton_sample_lib.get_samples,
+            "halton-knot": self.knot_halton_sample_lib.get_samples,
+            "random": self.random_sample_lib.get_samples,
+            "random-knot": self.knot_random_sample_lib.get_samples,
+            "stomp": self.stomp_sample_lib.get_samples,
+            "sine": self.sine_sample_lib.get_samples,
+        }
+        self.samples = None
+
+    def get_samples(self, sample_shape, base_seed=None, **kwargs):
+        # TODO: Make sure ratio * sample_shape is an integer
+
+        if (
+            (not self.fixed_samples)
+            or self.samples is None
+            or sample_shape[0] != self.samples.shape[0]
+        ):
+            cat_list = []
+            sample_shape = list(sample_shape)
+            for ki, k in enumerate(self.sample_ratio.keys()):
+                if self.sample_ratio[k] == 0.0:
+                    continue
+                n_samples = round(sample_shape[0] * self.sample_ratio[k])
+                # torch.Size is a tuple subclass and takes no device/dtype arguments:
+                s_shape = torch.Size([n_samples])
+                samples = self.sample_fns[k](sample_shape=s_shape)
+                cat_list.append(samples)
+            samples = torch.cat(cat_list, dim=0)
+            self.samples = samples
+        return self.samples
+
+
+def get_ranged_halton_samples(
+    dof,
+    up_bounds,
+    low_bounds,
+    num_particles,
+    tensor_args: TensorDeviceType = TensorDeviceType("cpu"),
+    seed=123,
+):
+    q_samples = generate_halton_samples(
+        num_particles,
+        dof,
+        use_scipy_halton=True,
+        tensor_args=tensor_args,
+        seed=seed,
+    )
+
+    # scale samples by joint range:
+    range_b = up_bounds - low_bounds
+    q_samples = q_samples * range_b + low_bounds
+
+    return q_samples
+
+
+class HaltonGenerator:
+    def __init__(
+        self,
+        ndims,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+        up_bounds=[1],
+        low_bounds=[0],
+        seed=123,
+        store_buffer: Optional[int] = 1000,
+    ):
+        self._seed = seed
+        self.tensor_args = tensor_args
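+        # a minimal usage sketch (dimension and sample counts are illustrative):
+        #
+        #   gen = HaltonGenerator(7, up_bounds=[1.0] * 7, low_bounds=[0.0] * 7)
+        #   u = gen.get_samples(1024, bounded=True)  # uniform in [low, up)
+        #   g = gen.get_gaussian_samples(1024)       # unit-variance Gaussian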
+        self.sequencer = Halton(d=ndims, seed=seed, scramble=False)
+        # scale samples by joint range:
+        up_bounds = self.tensor_args.to_device(up_bounds)
+        low_bounds = self.tensor_args.to_device(low_bounds)
+        self.range_b = up_bounds - low_bounds
+        self.low_bounds = low_bounds
+        self.ndims = ndims
+        self.proj_mat = torch.sqrt(
+            torch.tensor([2.0], device=self.tensor_args.device, dtype=self.tensor_args.dtype)
+        )
+        self.i_mat = torch.eye(
+            self.ndims, device=self.tensor_args.device, dtype=self.tensor_args.dtype
+        )
+        self._sample_buffer = None
+        self._store_buffer = store_buffer
+        if store_buffer is not None:
+            # sample some and just randomly get tensors from this buffer:
+            self._sample_buffer = torch.tensor(
+                self.sequencer.random(store_buffer),
+                device=self.tensor_args.device,
+                dtype=self.tensor_args.dtype,
+            )
+        self._int_gen = torch.Generator(device=self.tensor_args.device)
+        self._int_gen.manual_seed(seed)
+        self._index_buffer = None
+
+    def reset(self):
+        self.sequencer.reset()
+        if self._store_buffer is not None:
+            self._sample_buffer = torch.tensor(
+                self.sequencer.random(self._store_buffer),
+                device=self.tensor_args.device,
+                dtype=self.tensor_args.dtype,
+            )
+        self._int_gen = torch.Generator(device=self.tensor_args.device)
+        self._int_gen.manual_seed(self._seed)
+        # self._index_buffer = None
+
+    def fast_forward(self, steps: int):
+        """
+        Fast forward sampler by steps
+        """
+        self.sequencer.fast_forward(steps)
+        if self._sample_buffer is not None:
+            log_warn("fast forward will not affect samples drawn from the stored buffer.")
+
+    def _get_samples(self, num_samples: int):
+        if self._sample_buffer is not None:
+            out_buffer = None
+            if self._index_buffer is not None and self._index_buffer.shape[0] == num_samples:
+                out_buffer = self._index_buffer
+            index = torch.randint(
+                0,
+                self._sample_buffer.shape[0],
+                (num_samples,),
+                generator=self._int_gen,
+                device=self.tensor_args.device,
+                out=out_buffer,
+            )
+            samples = self._sample_buffer[index]
+            if self._index_buffer is None:
+                self._index_buffer = index
+        else:
+            samples = torch.tensor(
+                self.sequencer.random(num_samples),
+                device=self.tensor_args.device,
+                dtype=self.tensor_args.dtype,
+            )
+        return samples
+
+    @profiler.record_function("halton_generator/samples")
+    def get_samples(self, num_samples, bounded=False):
+        samples = self._get_samples(num_samples)
+        if bounded:
+            samples = samples * self.range_b + self.low_bounds
+        return samples
+
+    @profiler.record_function("halton_generator/gaussian_samples")
+    def get_gaussian_samples(self, num_samples, variance=1.0):
+        std_dev = np.sqrt(variance)
+        uniform_samples = self.get_samples(num_samples)
+        gaussian_halton_samples = gaussian_transform(
+            uniform_samples, self.proj_mat, self.i_mat, std_dev
+        )
+        return gaussian_halton_samples
+
+
+@torch.jit.script
+def gaussian_transform(
+    uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, std_dev: float
+):
+    # map uniform [0, 1) samples to a standard normal via the inverse CDF,
+    # then scale by the standard deviation:
+    gaussian_halton_samples = proj_mat * torch.erfinv(2 * uniform_samples - 1)
+    i_mat = i_mat * std_dev
+    gaussian_halton_samples = torch.matmul(gaussian_halton_samples, i_mat)
+    return gaussian_halton_samples
+
+
+#######################
+## Gaussian Sampling ##
+#######################
+
+
+def generate_noise(cov, shape, base_seed, filter_coeffs=None, device=torch.device("cpu")):
+    """
+    Generate correlated Gaussian samples using autoregressive process
+    """
+    torch.manual_seed(base_seed)
+    N = cov.shape[0]
+    m = MultivariateNormal(loc=torch.zeros(N).to(device), covariance_matrix=cov)
+    eps = m.sample(sample_shape=shape)
+    # eps = np.random.multivariate_normal(mean=np.zeros((N,)), cov=cov, size=shape)
+    if filter_coeffs is not None:
+        beta_0, beta_1, beta_2 = filter_coeffs
+        for i in range(2, eps.shape[1]):
+            eps[:, i, :] = (
+                beta_0 * eps[:, i, :] + beta_1 * eps[:, i - 1, :] + beta_2 * eps[:, i - 2, :]
+            )
+    return eps
+
+
+def generate_noise_np(cov, shape, base_seed, filter_coeffs=None):
+    """
+    Generate correlated noisy samples using autoregressive process
+    """
+    np.random.seed(base_seed)
+    N = cov.shape[0]
+    eps = np.random.multivariate_normal(mean=np.zeros((N,)), cov=cov, size=shape)
+    if filter_coeffs is not None:
+        beta_0, beta_1, beta_2 = filter_coeffs
+        for i in range(2, eps.shape[1]):
+            eps[:, i, :] = (
+                beta_0 * eps[:, i, :] + beta_1 * eps[:, i - 1, :] + beta_2 * eps[:, i - 2, :]
+            )
+    return eps
+
+
+###########################
+## Quasi-Random Sampling ##
+###########################
+
+
+def generate_prime_numbers(num):
+    def is_prime(n):
+        for j in range(2, ((n // 2) + 1), 1):
+            if n % j == 0:
+                return False
+        return True
+
+    primes = [0] * num  # torch.zeros(num, device=device)
+    primes[0] = 2
+    curr_num = 1
+    for i in range(1, num):
+        while True:
+            curr_num += 2
+            if is_prime(curr_num):
+                primes[i] = curr_num
+                break
+
+    return primes
+
+
+def generate_van_der_corput_sample(idx, base):
+    f, r = 1.0, 0
+    while idx > 0:
+        f /= base * 1.0
+        r += f * (idx % base)
+        idx = idx // base
+    return r
+
+
+def generate_van_der_corput_samples_batch(idx_batch, base):
+    inp_device = idx_batch.device
+    batch_size = idx_batch.shape[0]
+    f = 1.0  # torch.ones(batch_size, device=inp_device)
+    r = torch.zeros(batch_size, device=inp_device)
+    while torch.any(idx_batch > 0):
+        f /= base * 1.0
+        r += f * (idx_batch % base)  # * (idx_batch > 0)
+        idx_batch = idx_batch // base
+    return r
+
+
+def generate_halton_samples(
+    num_samples,
+    ndims,
+    bases=None,
+    use_scipy_halton=True,
+    seed=123,
+    tensor_args: TensorDeviceType = TensorDeviceType(),
+):
+    if not use_scipy_halton:
+        samples = torch.zeros(
+            num_samples, ndims, device=tensor_args.device, dtype=tensor_args.dtype
+        )
+        if not bases:
+            bases = generate_prime_numbers(ndims)
+        idx_batch = torch.arange(1, num_samples + 1, device=tensor_args.device)
+        for dim in range(ndims):
+            samples[:, dim] = generate_van_der_corput_samples_batch(idx_batch, bases[dim])
+    else:
+        sequencer = Halton(d=ndims, seed=seed, scramble=False)
+        samples = torch.tensor(
+            sequencer.random(num_samples), device=tensor_args.device, dtype=tensor_args.dtype
+        )
+    return samples
+
+
+def generate_gaussian_halton_samples(
+    num_samples,
+    ndims,
+    bases=None,
+    use_scipy_halton=True,
+    seed=123,
+    tensor_args=TensorDeviceType(),
+    variance=1.0,
+):
+    uniform_halton_samples = generate_halton_samples(
+        num_samples, ndims, bases, use_scipy_halton, seed, tensor_args=tensor_args
+    )
+
+    gaussian_halton_samples = torch.sqrt(
+        torch.tensor([2.0], device=tensor_args.device, dtype=tensor_args.dtype)
+    ) * torch.erfinv(2 * uniform_halton_samples - 1)
+
+    # project them to covariance:
+    i_mat = torch.eye(ndims, device=tensor_args.device, dtype=tensor_args.dtype)
+    gaussian_halton_samples = torch.matmul(gaussian_halton_samples, np.sqrt(variance) * i_mat)
+    return gaussian_halton_samples
+
+
+def generate_gaussian_sobol_samples(
+    num_samples,
+    ndims,
+    seed,
+    tensor_args=TensorDeviceType(),
+):
+    soboleng = torch.quasirandom.SobolEngine(dimension=ndims, scramble=True, seed=seed)
+    uniform_sobol_samples = soboleng.draw(num_samples).to(tensor_args.device)
+
+    gaussian_sobol_samples = torch.sqrt(
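+        # inverse-CDF transform: sqrt(2) * erfinv(2u - 1) maps the uniform
+        # Sobol draws to zero-mean, unit-variance Gaussian samples.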
torch.tensor([2.0], device=tensor_args.device, dtype=tensor_args.dtype) + ) * torch.erfinv(2 * uniform_sobol_samples - 1) + return gaussian_sobol_samples diff --git a/src/curobo/util/state_filter.py b/src/curobo/util/state_filter.py new file mode 100644 index 00000000..303d9761 --- /dev/null +++ b/src/curobo/util/state_filter.py @@ -0,0 +1,143 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import Optional + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.enum import StateType +from curobo.types.state import FilterCoeff, JointState +from curobo.types.tensor import T_DOF + + +@dataclass(frozen=True) +class FilterConfig: + filter_coeff: FilterCoeff + dt: float + control_space: StateType + tensor_args: TensorDeviceType = TensorDeviceType() + enable: bool = True + teleport_mode: bool = False + + @staticmethod + def from_dict( + coeff_dict, + enable=True, + dt=0.0, + control_space=StateType.ACCELERATION, + tensor_args=TensorDeviceType(), + teleport_mode=False, + ): + data = {} + data["filter_coeff"] = FilterCoeff(**coeff_dict) + data["dt"] = dt + data["control_space"] = control_space + data["enable"] = enable + data["teleport_mode"] = teleport_mode + return FilterConfig(**data, tensor_args=tensor_args) + + +class JointStateFilter(FilterConfig): + def __init__(self, filter_config: FilterConfig): + super().__init__(**vars(filter_config)) + self.cmd_joint_state = None + if self.control_space == StateType.ACCELERATION: + self.integrate_action = self.integrate_acc + elif self.control_space == StateType.VELOCITY: + self.integrate_action = self.integrate_vel + elif self.control_space == StateType.POSITION: + self.integrate_action = self.integrate_pos + + def filter_joint_state(self, raw_joint_state: JointState): + if not self.enable: + return raw_joint_state + raw_joint_state = raw_joint_state.to(self.tensor_args) + + if self.cmd_joint_state is None: + self.cmd_joint_state = raw_joint_state.clone() + return self.cmd_joint_state + self.cmd_joint_state.blend(self.filter_coeff, raw_joint_state) + return self.cmd_joint_state + + def integrate_jerk( + self, qddd_des, cmd_joint_state: Optional[JointState] = None, dt: Optional[float] = None + ): + dt = self.dt if dt is None else dt + if cmd_joint_state is not None: + if self.cmd_joint_state is None: + self.cmd_joint_state = cmd_joint_state.clone() + else: + self.cmd_joint_state.copy_(cmd_joint_state) + self.cmd_joint_state.acceleration[:] = self.cmd_joint_state.acceleration + qddd_des * dt + self.cmd_joint_state.velocity[:] = ( + self.cmd_joint_state.velocity + self.cmd_joint_state.acceleration * dt + ) + self.cmd_joint_state.position[:] = ( + self.cmd_joint_state.position + self.cmd_joint_state.velocity * dt + ) + + return self.cmd_joint_state + + def integrate_acc( + self, + qdd_des: T_DOF, + cmd_joint_state: Optional[JointState] = None, + dt: Optional[float] = None, + ): + dt = self.dt if dt is None else dt + if cmd_joint_state is not None: + if self.cmd_joint_state is None: + self.cmd_joint_state = cmd_joint_state.clone() + else: 
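+                # reuse the existing buffer; JointState.copy_ only falls back to
+                # a clone (breaking the buffer reference) when shapes differ.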
+ self.cmd_joint_state.copy_(cmd_joint_state) + self.cmd_joint_state.acceleration[:] = qdd_des + self.cmd_joint_state.velocity[:] = self.cmd_joint_state.velocity + qdd_des * dt + self.cmd_joint_state.position[:] = ( + self.cmd_joint_state.position + self.cmd_joint_state.velocity * dt + ) + # TODO: for now just have zero jerl: + if self.cmd_joint_state.jerk is None: + self.cmd_joint_state.jerk = qdd_des * 0.0 + else: + self.cmd_joint_state.jerk[:] = qdd_des * 0.0 + return self.cmd_joint_state.clone() + + def integrate_vel( + self, + qd_des: T_DOF, + cmd_joint_state: Optional[JointState] = None, + dt: Optional[float] = None, + ): + dt = self.dt if dt is None else dt + if cmd_joint_state is not None: + self.cmd_joint_state = cmd_joint_state + self.cmd_joint_state.velocity = qd_des + self.cmd_joint_state.position = ( + self.cmd_joint_state.position + self.cmd_joint_state.velocity * dt + ) + + return self.cmd_joint_state + + def integrate_pos( + self, q_des: T_DOF, cmd_joint_state: Optional[JointState] = None, dt: Optional[float] = None + ): + dt = self.dt if dt is None else dt + if cmd_joint_state is not None: + self.cmd_joint_state = cmd_joint_state + + if not self.teleport_mode: + self.cmd_joint_state.velocity = (q_des - self.cmd_joint_state.position) / dt + self.cmd_joint_state.position = q_des + return self.cmd_joint_state + + def reset(self): + self.cmd_joint_state = None diff --git a/src/curobo/util/tensor_util.py b/src/curobo/util/tensor_util.py new file mode 100644 index 00000000..efd53b50 --- /dev/null +++ b/src/curobo/util/tensor_util.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from typing import List + +# Third Party +import torch + + +def check_tensor_shapes(new_tensor: torch.Tensor, mem_tensor: torch.Tensor): + if not isinstance(mem_tensor, torch.Tensor): + return False + if len(mem_tensor.shape) != len(new_tensor.shape): + return False + if mem_tensor.shape == new_tensor.shape: + return True + + +def copy_tensor(new_tensor: torch.Tensor, mem_tensor: torch.Tensor): + if check_tensor_shapes(new_tensor, mem_tensor): + mem_tensor.copy_(new_tensor) + return True + return False + + +def copy_if_not_none(x): + """Clones x if it's not None. 
+ TODO: Rename this to clone_if_not_none + + + Args: + x (torch.Tensor): _description_ + + Returns: + _type_: _description_ + """ + if x is not None: + return x.clone() + return None + + +@torch.jit.script +def cat_sum(tensor_list: List[torch.Tensor]): + cat_tensor = torch.sum(torch.stack(tensor_list, dim=0), dim=0) + return cat_tensor + + +@torch.jit.script +def cat_max(tensor_list: List[torch.Tensor]): + cat_tensor = torch.max(torch.stack(tensor_list, dim=0), dim=0)[0] + return cat_tensor + + +def tensor_repeat_seeds(tensor, num_seeds): + return ( + tensor.view(tensor.shape[0], 1, tensor.shape[-1]) + .repeat(1, num_seeds, 1) + .reshape(tensor.shape[0] * num_seeds, tensor.shape[-1]) + ) + + +@torch.jit.script +def fd_tensor(p: torch.Tensor, dt: torch.Tensor): + out = ((torch.roll(p, -1, -2) - p) * (1 / dt).unsqueeze(-1))[..., :-1, :] + return out diff --git a/src/curobo/util/torch_utils.py b/src/curobo/util/torch_utils.py new file mode 100644 index 00000000..d81f176d --- /dev/null +++ b/src/curobo/util/torch_utils.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import torch +from packaging import version + +# CuRobo +from curobo.util.logger import log_warn + + +def find_first_idx(array, value, EQUAL=False): + if EQUAL: + f_idx = torch.nonzero(array >= value, as_tuple=False)[0].item() + else: + f_idx = torch.nonzero(array > value, as_tuple=False)[0].item() + return f_idx + + +def find_last_idx(array, value): + f_idx = torch.nonzero(array <= value, as_tuple=False)[-1].item() + return f_idx + + +def is_cuda_graph_available(): + if version.parse(torch.__version__) < version.parse("1.10"): + log_warn("WARNING: Disabling CUDA Graph as pytorch < 1.10") + return False + return True diff --git a/src/curobo/util/trajectory.py b/src/curobo/util/trajectory.py new file mode 100644 index 00000000..c6c72a7c --- /dev/null +++ b/src/curobo/util/trajectory.py @@ -0,0 +1,514 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
diff --git a/src/curobo/util/trajectory.py b/src/curobo/util/trajectory.py
new file mode 100644
index 00000000..c6c72a7c
--- /dev/null
+++ b/src/curobo/util/trajectory.py
@@ -0,0 +1,514 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+import math
+from enum import Enum
+from typing import List, Optional
+
+# Third Party
+import numpy as np
+import torch
+
+# SRL
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.types.base import TensorDeviceType
+from curobo.types.robot import JointState
+from curobo.util.logger import log_error, log_info, log_warn
+from curobo.util.sample_lib import bspline
+from curobo.util.warp_interpolation import get_cuda_linear_interpolation
+
+
+class InterpolateType(Enum):
+    #: linear interpolation using scipy
+    LINEAR = "linear"
+    #: cubic interpolation using scipy
+    CUBIC = "cubic"
+    #: quintic interpolation using scipy
+    QUINTIC = "quintic"
+    #: cuda accelerated linear interpolation using warp-lang,
+    #: custom kernel :meth:`get_cuda_linear_interpolation`
+    LINEAR_CUDA = "linear_cuda"
+    #: Uses "Time-optimal trajectory generation for path following with bounded acceleration
+    #: and velocity." Robotics: Science and Systems VIII (2012): 1-8, Kunz & Stilman.
+    KUNZ_STILMAN_OPTIMAL = "kunz_stilman_optimal"
+
+
+def get_linear_traj(
+    positions,
+    dt=0.5,
+    duration=20,
+    tensor_args={"device": "cpu", "dtype": torch.float32},
+    max_traj_pts=None,
+    compute_dynamics=True,
+):
+    with profiler.record_function("linear_traj"):
+        if max_traj_pts is not None:
+            duration = max_traj_pts * dt
+            # max_pts = max_traj_pts
+
+        p_arr = torch.as_tensor(positions)
+        # create path:
+        path = torch.zeros((p_arr.shape[0] - 1, 2, p_arr.shape[1]))
+        for i in range(p_arr.shape[0] - 1):
+            path[i, 0] = p_arr[i]
+            path[i, 1] = p_arr[i + 1]
+        max_pts = math.ceil(duration / dt)
+
+        n_pts = int(max_pts / (p_arr.shape[0] - 1))
+        pts = torch.zeros((math.ceil(duration / dt), p_arr.shape[-1]))
+
+        linear_pts = torch.zeros((n_pts, p_arr.shape[-1]))
+
+        for i in range(1, p_arr.shape[0]):
+            weight = torch.as_tensor([(i + 1) / n_pts for i in range(n_pts)])
+            # do linear interpolation between p_arr[i-1], p_arr[i]:
+            for j in range(linear_pts.shape[0]):
+                linear_pts[j] = p_arr[i - 1] + weight[j] * (p_arr[i] - p_arr[i - 1])
+            pts[(i - 1) * n_pts : (i) * n_pts] = linear_pts
+
+        # compute velocity and acceleration:
+
+        # pts[0] = path[0, 0]
+        # pts[-1] = path[-1, 1]
+        # pts = pts[: i * n_pts]
+        pts[i * n_pts - 1 :] = pts[i * n_pts - 1].clone()
+        pts = pts.to(**tensor_args)
+        vel = (pts.clone().roll(-1, dims=0) - pts) / dt
+        vel = vel.roll(1, dims=0)
+        vel[0] = 0.0
+        acc = (vel.clone().roll(-1, dims=0) - vel) / dt
+        acc = acc.roll(1, dims=0)
+        acc[0] = 0.0
+        trajectory = {
+            "position": pts,
+            "velocity": vel,
+            "acceleration": acc,
+            "traj_buffer": torch.cat((pts, vel, acc), dim=-1),
+        }
+        return trajectory
+
+
+def get_smooth_trajectory(raw_traj, degree=5):
+    cpu_traj = raw_traj.cpu()
+
+    smooth_traj = torch.zeros_like(cpu_traj)
+    for i in range(cpu_traj.shape[-1]):
+        smooth_traj[:, i] = bspline(cpu_traj[:, i], n=cpu_traj.shape[0], degree=degree)
+    return smooth_traj.to(dtype=raw_traj.dtype, device=raw_traj.device)
+
+
+def get_spline_interpolated_trajectory(raw_traj, des_horizon, degree=5):
+    retimed_traj = torch.zeros((des_horizon, raw_traj.shape[-1]))
+    tensor_args = TensorDeviceType(device=raw_traj.device, dtype=raw_traj.dtype)
+    cpu_traj = raw_traj.cpu().numpy()
+
+    for i in range(cpu_traj.shape[-1]):
+        retimed_traj[:, i] = bspline(cpu_traj[:, i], n=des_horizon, degree=degree)
+    retimed_traj = retimed_traj.to(**vars(tensor_args))
+    return retimed_traj
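`get_spline_interpolated_trajectory` retimes each joint independently to a desired horizon; the scipy-based paths further below do the same per joint and per batch element. A self-contained sketch of that per-joint retiming idea (plain scipy, illustrative data, not code from this diff):

```python
import numpy as np
from scipy import interpolate

raw = np.array([0.0, 0.1, 0.4, 0.9, 1.0])   # one joint, 5 waypoints
t_raw = np.linspace(0.0, 1.0, raw.shape[0])
t_new = np.linspace(0.0, 1.0, 20)           # retime to a 20-step horizon
f = interpolate.interp1d(t_raw, raw, kind="cubic", assume_sorted=True)
dense = f(t_new)                             # shape (20,), densified joint trajectory
```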
+
+
+def get_batch_interpolated_trajectory(
+    raw_traj: JointState,
+    interpolation_dt: float,
+    max_vel: torch.Tensor,
+    max_acc: torch.Tensor,
+    max_jerk: torch.Tensor,
+    raw_dt: float,
+    kind: InterpolateType = InterpolateType.LINEAR_CUDA,
+    out_traj_state: Optional[JointState] = None,
+    tensor_args: TensorDeviceType = TensorDeviceType(),
+    max_deviation: float = 0.1,
+    min_dt: float = 0.02,
+):
+    # compute dt across trajectory:
+    b, horizon, dof = raw_traj.position.shape  # horizon
+    # given the dt required to run trajectory at maximum velocity,
+    # we find the number of timesteps required:
+    traj_steps, steps_max, opt_dt = calculate_tsteps(
+        raw_traj.velocity,
+        raw_traj.acceleration,
+        raw_traj.jerk,
+        interpolation_dt,
+        max_vel,
+        max_acc,
+        max_jerk,
+        raw_dt,
+        min_dt,
+        horizon,
+    )
+    # traj_steps contains the tsteps for each trajectory
+    assert steps_max > 0
+    # allocate an output buffer if the caller did not pass one:
+    if out_traj_state is None:
+        out_traj_state = JointState.zeros([b, steps_max, dof], tensor_args)
+
+    if out_traj_state.position.shape[1] < steps_max:
+        log_error("Interpolation buffer shape is smaller than steps_max")
+
+    if kind in [InterpolateType.LINEAR, InterpolateType.CUBIC]:
+        # plot and save:
+        out_traj_state = get_cpu_linear_interpolation(
+            raw_traj,
+            traj_steps,
+            out_traj_state,
+            kind,
+            opt_dt=opt_dt,
+            interpolation_dt=interpolation_dt,
+        )
+
+    elif kind == InterpolateType.LINEAR_CUDA:
+        out_traj_state = get_cuda_linear_interpolation(
+            raw_traj, traj_steps, out_traj_state, opt_dt, raw_dt
+        )
+    elif kind == InterpolateType.KUNZ_STILMAN_OPTIMAL:
+        out_traj_state = get_cpu_kunz_stilman_interpolation(
+            raw_traj,
+            traj_steps,
+            out_traj_state,
+            opt_dt=opt_dt,
+            interpolation_dt=interpolation_dt,
+            max_velocity=max_vel,
+            max_acceleration=max_acc,
+            max_deviation=max_deviation,
+        )
+    else:
+        raise ValueError("Unknown interpolation type")
+    # opt_dt = (raw_dt) / opt_dt
+    return out_traj_state, traj_steps, opt_dt
+
+
+def get_cpu_linear_interpolation(
+    raw_traj, traj_steps, out_traj_state, kind: InterpolateType, opt_dt=None, interpolation_dt=None
+):
+    cpu_traj = raw_traj.position.cpu().numpy()
+    out_traj = out_traj_state.position
+    retimed_traj = out_traj.cpu()
+    for k in range(out_traj.shape[0]):
+        tstep = traj_steps[k].item()
+        opt_d = opt_dt[k].item()
+        for i in range(cpu_traj.shape[-1]):
+            retimed_traj[k, :tstep, i] = linear_smooth(
+                cpu_traj[k, :, i],
+                y=None,
+                n=tstep,
+                kind=kind,
+                last_step=tstep,
+                opt_dt=opt_d,
+                interpolation_dt=interpolation_dt,
+            )
+        retimed_traj[k, tstep:, :] = retimed_traj[k, tstep - 1 : tstep, :]
+    out_traj_state.position[:] = retimed_traj.to(device=raw_traj.position.device)
+    return out_traj_state
+
+
+def get_cpu_kunz_stilman_interpolation(
+    raw_traj: JointState,
+    traj_steps: int,
+    out_traj_state: JointState,
+    max_velocity: torch.Tensor,
+    max_acceleration: torch.Tensor,
+    opt_dt: float,
+    interpolation_dt: float,
+    max_deviation: float = 0.1,
+):
+    try:
+        # Third Party
+        from trajectory_smoothing import TrajectorySmoother
+    except ImportError:
+        log_info(
+            "trajectory_smoothing package not found, try installing curobo with "
+            + "pip install .[smooth]"
+        )
+        return get_cpu_linear_interpolation(
+            raw_traj, traj_steps, out_traj_state, InterpolateType.LINEAR, opt_dt, interpolation_dt
+        )
+
+    cpu_traj = raw_traj.position.cpu().numpy()
+    out_traj = out_traj_state.position
+    retimed_traj = out_traj.cpu()
+    out_traj_vel = out_traj_state.velocity.cpu()
+    out_traj_acc = out_traj_state.acceleration.cpu()
+    out_traj_jerk = out_traj_state.jerk.cpu()
+    dof = cpu_traj.shape[-1]
+    trajectory_sm = TrajectorySmoother(
+        dof,
+        max_velocity.cpu().view(dof).numpy(),
+        max_acceleration.cpu().view(dof).numpy() * 0.5,
+        max_deviation,
+    )
+    for k in range(out_traj.shape[0]):
+        tstep = traj_steps[k].item()
+        opt_d = opt_dt[k].item()
+        in_traj = np.copy(cpu_traj[k])
+
+        if np.sum(in_traj[-1]) != 0.0:
+            out = trajectory_sm.smooth_interpolate(
+                in_traj, traj_dt=0.001, interpolation_dt=interpolation_dt, max_tsteps=tstep
+            )
+            if out.success:
+                retimed_traj[k, : out.length, :] = torch.as_tensor(out.position)
+                out_traj_vel[k, : out.length, :] = torch.as_tensor(out.velocity)
+                out_traj_acc[k, : out.length, :] = torch.as_tensor(out.acceleration)
+                out_traj_jerk[k, : out.length, :] = torch.as_tensor(out.jerk)
+                retimed_traj[k, out.length :, :] = retimed_traj[k, out.length - 1 : out.length, :]
+
+                out_traj_vel[k, out.length :, :] = out_traj_vel[k, out.length - 1 : out.length, :]
+                out_traj_acc[k, out.length :, :] = out_traj_acc[k, out.length - 1 : out.length, :]
+                out_traj_jerk[k, out.length :, :] = out_traj_jerk[k, out.length - 1 : out.length, :]
+            else:
+                log_warn("Kunz Stilman interpolation failed, using linear")
+                for i in range(cpu_traj.shape[-1]):
+                    retimed_traj[k, :tstep, i] = linear_smooth(
+                        cpu_traj[k, :, i],
+                        y=None,
+                        n=tstep,
+                        kind=InterpolateType.LINEAR,
+                        last_step=tstep,
+                        opt_dt=opt_d,
+                        interpolation_dt=interpolation_dt,
+                    )
+                retimed_traj[k, tstep:, :] = retimed_traj[k, tstep - 1 : tstep, :]
+        else:
+            for i in range(cpu_traj.shape[-1]):
+                retimed_traj[k, :tstep, i] = linear_smooth(
+                    cpu_traj[k, :, i],
+                    y=None,
+                    n=tstep,
+                    kind=InterpolateType.LINEAR,
+                    last_step=tstep,
+                    opt_dt=opt_d,
+                    interpolation_dt=interpolation_dt,
+                )
+            retimed_traj[k, tstep:, :] = retimed_traj[k, tstep - 1 : tstep, :]
+    out_traj_state.position[:] = retimed_traj.to(device=raw_traj.position.device)
+    out_traj_state.velocity[:] = out_traj_vel.to(device=raw_traj.position.device)
+    out_traj_state.acceleration[:] = out_traj_acc.to(device=raw_traj.position.device)
+    out_traj_state.jerk[:] = out_traj_jerk.to(device=raw_traj.position.device)
+
+    return out_traj_state
+
+
+def get_interpolated_trajectory(
+    trajectory: List[torch.Tensor],
+    out_traj_state: JointState,
+    des_horizon: Optional[int] = None,
+    interpolation_dt: float = 0.02,
+    max_velocity: Optional[torch.Tensor] = None,
+    max_acceleration: Optional[torch.Tensor] = None,
+    max_jerk: Optional[torch.Tensor] = None,
+    kind=InterpolateType.CUBIC,
+    max_deviation: float = 0.05,
+    tensor_args: TensorDeviceType = TensorDeviceType(),
+) -> JointState:
+    try:
+        # Third Party
+        from trajectory_smoothing import TrajectorySmoother
+    except ImportError:
+        log_info(
+            "trajectory_smoothing package not found, InterpolateType.KUNZ_STILMAN_OPTIMAL"
+            + " is disabled. to enable, try installing curobo with"
+            + " pip install .[smooth]"
+        )
+        kind = InterpolateType.LINEAR
+    dof = trajectory[0].shape[-1]
+    last_tsteps = []
+    opt_dt = []
+    if des_horizon is None:
+        interpolation_steps = out_traj_state.position.shape[1]
+    else:
+        interpolation_steps = des_horizon
+
+    # create an empty state message to fill data:
+    if kind == InterpolateType.KUNZ_STILMAN_OPTIMAL:
+        trajectory_sm = TrajectorySmoother(
+            dof,
+            max_velocity.cpu().view(dof).numpy(),
+            max_acceleration.cpu().view(dof).numpy(),
+            max_deviation,
+        )
+
+    for b in range(len(trajectory)):
+        raw_traj = trajectory[b].cpu().view(-1, dof).numpy()
+        current_kind = kind
+
+        if current_kind == InterpolateType.KUNZ_STILMAN_OPTIMAL:
+            out = trajectory_sm.smooth_interpolate(
+                raw_traj, interpolation_dt=interpolation_dt, traj_dt=0.001, max_tsteps=des_horizon
+            )
+            if out.success:
+                out_traj_state.position[b, : out.length, :] = tensor_args.to_device(out.position)
+                out_traj_state.position[b, out.length :, :] = out_traj_state.position[
+                    b, out.length - 1 : out.length, :
+                ]
+                out_traj_state.velocity[b, : out.length, :] = tensor_args.to_device(out.velocity)
+                out_traj_state.velocity[b, out.length :, :] = out_traj_state.velocity[
+                    b, out.length - 1 : out.length, :
+                ]
+                out_traj_state.acceleration[b, : out.length, :] = tensor_args.to_device(
+                    out.acceleration
+                )
+                out_traj_state.acceleration[b, out.length :, :] = out_traj_state.acceleration[
+                    b, out.length - 1 : out.length, :
+                ]
+                out_traj_state.jerk[b, : out.length, :] = tensor_args.to_device(out.jerk)
+                out_traj_state.jerk[b, out.length :, :] = out_traj_state.jerk[
+                    b, out.length - 1 : out.length, :
+                ]
+                last_tsteps.append(out.length)
+                opt_dt.append(out.interpolation_dt)
+            else:
+                current_kind = InterpolateType.LINEAR
+        if current_kind in [InterpolateType.LINEAR, InterpolateType.CUBIC, InterpolateType.QUINTIC]:
+            retimed_traj = torch.zeros((interpolation_steps, raw_traj.shape[-1]))
+            if raw_traj.shape[0] < 5:
+                current_kind = InterpolateType.LINEAR
+            for i in range(raw_traj.shape[-1]):
+                retimed_traj[:, i] = linear_smooth(
+                    raw_traj[:, i],
+                    y=None,
+                    n=interpolation_steps,
+                    kind=current_kind,  # use the (possibly downgraded) kind for this trajectory
+                    last_step=des_horizon,
+                )
+            retimed_traj = retimed_traj.to(**vars(tensor_args))
+            out_traj_state.position[b, :interpolation_steps, :] = retimed_traj
+            out_traj_state.position[b, interpolation_steps:, :] = retimed_traj[
+                interpolation_steps - 1 : interpolation_steps, :
+            ]
+            last_tsteps.append(interpolation_steps)
+            opt_dt.append(interpolation_dt)
+    opt_dt = tensor_args.to_device(opt_dt)
+    return out_traj_state, last_tsteps, opt_dt
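A recurring idiom in the functions above: interpolate the first `tstep` samples, then hold the last valid sample to pad the buffer out to the full horizon, so downstream consumers with fixed-size buffers see a constant tail. A tiny sketch of the padding step alone (illustrative shapes, not code from this diff):

```python
import torch

horizon, valid, dof = 10, 6, 7
traj = torch.zeros(horizon, dof)
traj[:valid] = torch.rand(valid, dof)   # interpolated samples
traj[valid:] = traj[valid - 1 : valid]  # hold the last valid sample for the tail
```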
+
+
+@profiler.record_function("interpolation/1D")
+def linear_smooth(
+    x: np.array,
+    y=None,
+    n=10,
+    kind=InterpolateType.CUBIC,
+    last_step=None,
+    opt_dt=None,
+    interpolation_dt=None,
+):
+    # Third Party
+    import numpy as np
+    from scipy import interpolate
+
+    if last_step is None:
+        last_step = n  # min(x.shape[0],n)
+
+    if opt_dt is not None:
+        y = np.ravel([i * opt_dt for i in range(x.shape[0])])
+
+    if kind == InterpolateType.CUBIC and y is None:
+        y = np.linspace(0, last_step + 3, x.shape[0] + 4)
+        x = np.concatenate((x, x[-1:], x[-1:], x[-1:], x[-1:]))
+    elif y is None:
+        step = float(last_step) / float(x.shape[0] - 1)
+        y = np.ravel([float(i) * step for i in range(x.shape[0])])
+        # y[-1] = np.floor(y[-1])
+
+    if kind == InterpolateType.QUINTIC:
+        f = interpolate.make_interp_spline(y, x, k=5)
+    else:
+        f = interpolate.interp1d(y, x, kind=kind.value, assume_sorted=True)
+    if opt_dt is None:
+        x_new = np.ravel([i for i in range(last_step)])
+    else:
+        x_new = np.ravel([i * interpolation_dt for i in range(last_step)])
+    ynew = f(x_new)
+    y_new = torch.as_tensor(ynew)
+    return y_new
+
+
+@torch.jit.script
+def calculate_dt(
+    vel: torch.Tensor,
+    acc: torch.Tensor,
+    jerk: torch.Tensor,
+    max_vel: torch.Tensor,
+    max_acc: torch.Tensor,
+    max_jerk: torch.Tensor,
+    raw_dt: float,
+    interpolation_dt: float,
+):
+    # compute scaled dt:
+    max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
+
+    max_acc_arr = torch.max(torch.abs(acc), dim=-2)[0]
+    max_jerk_arr = torch.max(torch.abs(jerk), dim=-2)[0]
+
+    vel_scale_dt = (max_v_arr) / (max_vel.view(1, max_v_arr.shape[-1]))  # batch, dof
+    acc_scale_dt = max_acc_arr / (max_acc.view(1, max_acc_arr.shape[-1]))
+    jerk_scale_dt = max_jerk_arr / (max_jerk.view(1, max_jerk_arr.shape[-1]))
+
+    dt_score_vel = raw_dt * torch.max(vel_scale_dt, dim=-1)[0]  # batch, 1
+    dt_score_acc = raw_dt * torch.sqrt((torch.max(acc_scale_dt, dim=-1)[0]))
+    dt_score_jerk = raw_dt * torch.pow((torch.max(jerk_scale_dt, dim=-1)[0]), 1 / 3)
+    dt_score = torch.maximum(dt_score_vel, dt_score_acc)
+    dt_score = torch.maximum(dt_score, dt_score_jerk)
+    dt_score = torch.clamp(dt_score, interpolation_dt, raw_dt)
+    # NOTE: this dt score is not dt, rather a scaling to convert velocity, acc, jerk that was
+    # computed with raw_dt to a new dt
+    return dt_score
+
+
+@torch.jit.script
+def calculate_dt_no_clamp(
+    vel: torch.Tensor,
+    acc: torch.Tensor,
+    jerk: torch.Tensor,
+    max_vel: torch.Tensor,
+    max_acc: torch.Tensor,
+    max_jerk: torch.Tensor,
+):
+    # compute scaled dt:
+    max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
+
+    max_acc_arr = torch.max(torch.abs(acc), dim=-2)[0]
+    max_jerk_arr = torch.max(torch.abs(jerk), dim=-2)[0]
+
+    # max_v_arr = torch.clamp(max_v_arr, None, max_vel.view(1, max_v_arr.shape[-1]))
+    vel_scale_dt = (max_v_arr) / (max_vel.view(1, max_v_arr.shape[-1]))  # batch, dof
+    acc_scale_dt = max_acc_arr / (max_acc.view(1, max_acc_arr.shape[-1]))
+    jerk_scale_dt = max_jerk_arr / (max_jerk.view(1, max_jerk_arr.shape[-1]))
+    dt_score_vel = torch.max(vel_scale_dt, dim=-1)[0]  # batch, 1
+    dt_score_acc = torch.sqrt((torch.max(acc_scale_dt, dim=-1)[0]))
+    dt_score_jerk = torch.pow((torch.max(jerk_scale_dt, dim=-1)[0]), 1 / 3)
+    dt_score = torch.maximum(dt_score_vel, dt_score_acc)
+    dt_score = torch.maximum(dt_score, dt_score_jerk)
+    return dt_score
+
+
+@torch.jit.script
+def calculate_tsteps(
+    vel: torch.Tensor,
+    acc: torch.Tensor,
+    jerk: torch.Tensor,
+    interpolation_dt: float,
+    max_vel: torch.Tensor,
+    max_acc: torch.Tensor,
+    max_jerk: torch.Tensor,
+    raw_dt: float,
+    min_dt: float,
+    horizon: int,
+):
+    # compute scaled dt:
+    opt_dt = calculate_dt(vel, acc, jerk, max_vel, max_acc, max_jerk, raw_dt, interpolation_dt)
+    traj_steps = (torch.ceil((horizon - 1) * ((opt_dt) / interpolation_dt))).to(dtype=torch.int32)
+    steps_max = torch.max(traj_steps)
+    return traj_steps, steps_max, opt_dt
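`calculate_dt` follows from time scaling: stretching time by a factor s scales velocity by 1/s, acceleration by 1/s^2, and jerk by 1/s^3, so the smallest feasible slow-down is the max of the velocity ratio, the square root of the acceleration ratio, and the cube root of the jerk ratio. A worked numeric check with made-up limits (not code from this diff):

```python
import torch

raw_dt = 0.5
vel_ratio = torch.tensor(2.0)   # peak velocity is 2x its limit
acc_ratio = torch.tensor(4.0)   # peak acceleration is 4x its limit
jerk_ratio = torch.tensor(8.0)  # peak jerk is 8x its limit
opt_dt = raw_dt * torch.maximum(
    torch.maximum(vel_ratio, torch.sqrt(acc_ratio)),
    torch.pow(jerk_ratio, 1.0 / 3.0),
)
# all three ratios imply the same 2x slow-down here, so opt_dt == 1.0
```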
diff --git a/src/curobo/util/usd_helper.py b/src/curobo/util/usd_helper.py
new file mode 100644
index 00000000..ac51d617
--- /dev/null
+++ b/src/curobo/util/usd_helper.py
@@ -0,0 +1,1218 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+import math
+from typing import List, Optional, Union
+
+# Third Party
+import numpy as np
+import torch
+from pxr import Gf, Sdf, Usd, UsdGeom, UsdPhysics, UsdShade
+from tqdm import tqdm
+
+# CuRobo
+from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig
+from curobo.geom.types import (
+    Capsule,
+    Cuboid,
+    Cylinder,
+    Material,
+    Mesh,
+    Obstacle,
+    Sphere,
+    WorldConfig,
+)
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.types.state import JointState
+from curobo.util.logger import log_error, log_warn
+from curobo.util_file import (
+    file_exists,
+    get_filename,
+    get_files_from_dir,
+    get_robot_configs_path,
+    join_path,
+    load_yaml,
+)
+from curobo.wrap.reacher.motion_gen import MotionGenResult
+
+
+def set_prim_translate(prim, translation):
+    UsdGeom.Xformable(prim).AddTranslateOp().Set(Gf.Vec3d(translation))
+
+
+def set_prim_transform(
+    prim, pose: List[float], scale: List[float] = [1, 1, 1], use_float: bool = False
+):
+    if prim.GetAttribute("xformOp:orient").IsValid():
+        if isinstance(prim.GetAttribute("xformOp:orient").Get(), Gf.Quatf):
+            use_float = True
+    else:
+        UsdGeom.Xformable(prim).AddOrientOp(UsdGeom.XformOp.PrecisionFloat)
+        use_float = True
+
+    if not prim.GetAttribute("xformOp:translate").IsValid():
+        UsdGeom.Xformable(prim).AddTranslateOp(UsdGeom.XformOp.PrecisionFloat)
+    if not prim.GetAttribute("xformOp:scale").IsValid():
+        UsdGeom.Xformable(prim).AddScaleOp(UsdGeom.XformOp.PrecisionFloat)
+    quat = pose[3:]
+
+    if use_float:
+        position = Gf.Vec3f(pose[:3])
+        q = Gf.Quatf(quat[0], quat[1:])
+        dims = Gf.Vec3f(scale)
+    else:
+        position = Gf.Vec3d(pose[:3])
+        q = Gf.Quatd(quat[0], quat[1:])
+        dims = Gf.Vec3d(scale)
+
+    prim.GetAttribute("xformOp:translate").Set(position)
+    prim.GetAttribute("xformOp:orient").Set(q)
+    prim.GetAttribute("xformOp:scale").Set(dims)
+
+    # get scale:
+
+
+def get_prim_world_pose(cache: UsdGeom.XformCache, prim: Usd.Prim, inverse: bool = False):
+    world_transform: Gf.Matrix4d = cache.GetLocalToWorldTransform(prim)
+    # get scale:
+    scale: Gf.Vec3d = Gf.Vec3d(*(v.GetLength() for v in world_transform.ExtractRotationMatrix()))
+    scale = list(scale)
+    t_mat = world_transform.RemoveScaleShear()
+    if inverse:
+        t_mat = t_mat.GetInverse()
+
+    # mat = np.zeros((4,4))
+    # mat[:,:] = t_mat
+    translation: Gf.Vec3d = t_mat.ExtractTranslation()
+    rotation: Gf.Rotation = t_mat.ExtractRotation()
+    q = rotation.GetQuaternion()
+    orientation = [q.GetReal()] + list(q.GetImaginary())
+    t_mat = (
+        Pose.from_list(list(translation) + orientation, TensorDeviceType())
+        .get_matrix()
+        .view(4, 4)
+        .cpu()
+        .numpy()
+    )
+
+    return t_mat, scale
+
+
+def get_transform(pose):
+    position = Gf.Vec3d(pose[:3])
+    quat = pose[3:]
+    rotation = Gf.Rotation(Gf.Quatf(quat[0], quat[1:]))
+
+    mat_pose = Gf.Matrix4d()
+    mat_pose.SetTransform(rotation, position)
+    return mat_pose
+
+
+def get_position_quat(pose, use_float: bool = True):
+    quat = pose[3:]
+
+    if use_float:
+        position = Gf.Vec3f(pose[:3])
+        quat = Gf.Quatf(quat[0], quat[1:])
+    else:
+        position = Gf.Vec3d(pose[:3])
+        quat = Gf.Quatd(quat[0], quat[1:])
+    return position, quat
+
+
+def set_geom_mesh_attrs(mesh_geom: UsdGeom.Mesh, obs: Mesh,
timestep=None): + verts, faces = obs.get_mesh_data() + mesh_geom.CreatePointsAttr(verts) + mesh_geom.CreateFaceVertexCountsAttr([3 for _ in range(len(faces))]) + mesh_geom.CreateFaceVertexIndicesAttr(np.ravel(faces).tolist()) + mesh_geom.CreateSubdivisionSchemeAttr().Set(UsdGeom.Tokens.none) + + if obs.vertex_colors is not None: + primvarsapi = UsdGeom.PrimvarsAPI(mesh_geom) + primvar = primvarsapi.CreatePrimvar( + "displayColor", Sdf.ValueTypeNames.Color3f, interpolation="faceVarying" + ) + primvar.Set([Gf.Vec3f(x[0] / 255.0, x[1] / 255.0, x[2] / 255.0) for x in obs.vertex_colors]) + + # low = np.min(verts, axis=0) + # high = np.max(verts, axis=0) + # mesh_geom.CreateExtentAttr([low, high]) + pose = obs.pose + position = Gf.Vec3d(pose[:3]) + quat = pose[3:] + q = Gf.Quatf(quat[0], quat[1:]) + + # rotation = Gf.Rotation(Gf.Quatf(quat[0], quat[1:])) + + # mat_pose = Gf.Matrix4d() + # mat_pose.SetTransform(rotation, position) + # size = 1.0 + # mesh_geom.CreateSizeAttr(size) + if timestep is not None: + # UsdGeom.Xformable(mesh_geom).AddTransformOp().Set(time=timestep, value=mat_pose) + a = UsdGeom.Xformable(mesh_geom) # + a.AddTranslateOp().Set(time=timestep, value=position) + a.AddOrientOp().Set(time=timestep, value=q) + else: + a = UsdGeom.Xformable(mesh_geom) # + a.AddTranslateOp().Set(position) + a.AddOrientOp().Set(q) + + # UsdGeom.Xformable(mesh_geom).AddTransformOp().Set(mat_pose) + + +def set_geom_cube_attrs( + cube_geom: UsdGeom.Cube, dims: List[float], pose: List[float], timestep=None +): + dims = Gf.Vec3d(np.ravel(dims).tolist()) + position = Gf.Vec3d(pose[:3]) + quat = pose[3:] + q = Gf.Quatf(quat[0], quat[1:]) + # rotation = Gf.Rotation(q) + + # mat_pose = Gf.Matrix4d() + # mat_pose.SetTransform(rotation, position) + # mat = mat_pose + # mat_scale = Gf.Matrix4d() + # mat_scale.SetScale(dims) + # mat = mat_scale * mat_pose + size = 1.0 + cube_geom.CreateSizeAttr(size) + # create scale: + + a = UsdGeom.Xformable(cube_geom) # + a.AddTranslateOp().Set(position) + a.AddOrientOp().Set(q) + + # a.AddTransformOp().Set(mat) + # scale will set the length to the given value + a.AddScaleOp().Set(dims) + + +def set_geom_cylinder_attrs( + cube_geom: UsdGeom.Cylinder, radius, height, pose: List[float], timestep=None +): + position = Gf.Vec3d(pose[:3]) + quat = pose[3:] + q = Gf.Quatf(quat[0], quat[1:]) + + # create scale: + cube_geom.CreateRadiusAttr(radius) + cube_geom.CreateHeightAttr(height) + a = UsdGeom.Xformable(cube_geom) # + a.AddTranslateOp().Set(position) + a.AddOrientOp().Set(q) + + +def set_geom_sphere_attrs( + sphere_geom: UsdGeom.Sphere, radius: float, pose: List[float], timestep=None +): + position = Gf.Vec3d(pose[:3]) + quat = pose[3:] + q = Gf.Quatf(quat[0], quat[1:]) + + a = UsdGeom.Xformable(sphere_geom) # + a.AddTranslateOp().Set(position) + a.AddOrientOp().Set(q) + + sphere_geom.CreateRadiusAttr(float(radius)) + + +def set_cylinder_attrs(prim: UsdGeom.Cylinder, radius: float, height: float, pose, color=[]): + # set size to 1: + position = Gf.Vec3d(np.ravel(pose.xyz).tolist()) + quat = pose.so3.wxyz + rotation = Gf.Rotation(Gf.Quatf(quat[0], quat[1:])) + + mat_pose = Gf.Matrix4d() + mat_pose.SetTransform(rotation, position) + + prim.GetAttribute("height").Set(height) + prim.GetAttribute("radius").Set(radius) + + UsdGeom.Xformable(prim).AddTransformOp().Set(mat_pose) + + +def get_cylinder_attrs(prim, cache=None, transform=None) -> Cylinder: + size = prim.GetAttribute("size").Get() + if size is None: + size = 1.0 + height = prim.GetAttribute("height").Get() * size + radius = 
prim.GetAttribute("radius").Get() * size + + mat, t_scale = get_prim_world_pose(cache, prim) + + if transform is not None: + mat = transform @ mat + # compute position and orientation on cuda: + tensor_mat = torch.as_tensor(mat, device=torch.device("cuda", 0)) + pose = Pose.from_matrix(tensor_mat).tolist() + return Cylinder(name=str(prim.GetPath()), pose=pose, height=height, radius=radius) + + +def get_capsule_attrs(prim, cache=None, transform=None) -> Cylinder: + size = prim.GetAttribute("size").Get() + if size is None: + size = 1.0 + height = prim.GetAttribute("height").Get() * size + radius = prim.GetAttribute("radius").Get() * size + + mat, t_scale = get_prim_world_pose(cache, prim) + base = [0, 0, -height / 2] + tip = [0, 0, height / 2] + + if transform is not None: + mat = transform @ mat + # compute position and orientation on cuda: + tensor_mat = torch.as_tensor(mat, device=torch.device("cuda", 0)) + pose = Pose.from_matrix(tensor_mat).tolist() + return Capsule(name=str(prim.GetPath()), pose=pose, base=base, tip=tip, radius=radius) + + +def get_cube_attrs(prim, cache=None, transform=None) -> Cuboid: + # read cube size: + size = prim.GetAttribute("size").Get() + if size is None: + size = 1.0 + dims = list(prim.GetAttribute("xformOp:scale").Get()) + # scale is 0.5 -> length of 1 will become 0.5, + dims = [d * size for d in dims] + if any([x <= 0 for x in dims]): + raise ValueError("Negative or zero dimension") + # dims = [x*2 for x in dims] + mat, t_scale = get_prim_world_pose(cache, prim) + + if transform is not None: + mat = transform @ mat + # compute position and orientation on cuda: + tensor_mat = torch.as_tensor(mat, device=torch.device("cuda", 0)) + pose = Pose.from_matrix(tensor_mat).tolist() + return Cuboid(name=str(prim.GetPath()), pose=pose, dims=dims) + + +def get_sphere_attrs(prim, cache=None, transform=None) -> Sphere: + # read cube information + # scale = prim.GetAttribute("size").Get() + size = prim.GetAttribute("size").Get() + if size is None: + size = 1.0 + radius = prim.GetAttribute("radius").Get() + scale = prim.GetAttribute("xformOp:scale").Get() + print(radius, scale) + if scale is not None: + radius = radius * max(list(scale)) * size + + if radius <= 0: + raise ValueError("Negative or zero radius") + # dims = [x*2 for x in dims] + mat, t_scale = get_prim_world_pose(cache, prim) + # position = list(prim.GetAttribute("xformOp:translate").Get()) + # q = prim.GetAttribute("xformOp:orient").Get() + # orientation = [q.GetReal()] + list(q.GetImaginary()) + + if transform is not None: + mat = transform @ mat + # compute position and orientation on cuda: + tensor_mat = torch.as_tensor(mat, device=torch.device("cuda", 0)) + pose = Pose.from_matrix(tensor_mat).tolist() + + return Sphere(name=str(prim.GetPath()), pose=pose, radius=radius, position=pose[:3]) + + +def get_mesh_attrs(prim, cache=None, transform=None) -> Mesh: + # read cube information + # scale = prim.GetAttribute("size").Get() + points = list(prim.GetAttribute("points").Get()) + points = [np.ravel(x) for x in points] + # points = np.ndarray(points) + + faces = list(prim.GetAttribute("faceVertexIndices").Get()) + + face_count = list(prim.GetAttribute("faceVertexCounts").Get()) + # assume faces are 3: + if len(faces) / 3 != len(face_count): + log_warn( + "Mesh faces " + + str(len(faces) / 3) + + " are not matching faceVertexCounts " + + str(len(face_count)) + ) + return None + faces = np.array(faces).reshape(len(face_count), 3).tolist() + if prim.GetAttribute("xformOp:scale").IsValid(): + scale = 
list(prim.GetAttribute("xformOp:scale").Get()) + else: + scale = [1.0, 1.0, 1.0] + size = prim.GetAttribute("size").Get() + if size is None: + size = 1 + scale = [s * size for s in scale] + + mat, t_scale = get_prim_world_pose(cache, prim) + # also get any world scale: + scale = t_scale + # position = list(prim.GetAttribute("xformOp:translate").Get()) + # q = prim.GetAttribute("xformOp:orient").Get() + # orientation = [q.GetReal()] + list(q.GetImaginary()) + + if transform is not None: + mat = transform @ mat + # compute position and orientation on cuda: + tensor_mat = torch.as_tensor(mat, device=torch.device("cuda", 0)) + pose = Pose.from_matrix(tensor_mat).tolist() + + # + + m = Mesh( + name=str(prim.GetPath()), + pose=pose, + vertices=points, + faces=faces, + scale=scale, + ) + # print(len(m.vertices), max(m.faces)) + + return m + + +def create_stage( + name: str = "curobo_stage.usd", + base_frame: str = "/world", +): + stage = Usd.Stage.CreateNew(name) + UsdGeom.SetStageUpAxis(stage, "Z") + UsdGeom.SetStageMetersPerUnit(stage, 1) + UsdPhysics.SetStageKilogramsPerUnit(stage, 1) + xform = stage.DefinePrim(base_frame, "Xform") + stage.SetDefaultPrim(xform) + return stage + + +class UsdHelper: + def __init__(self, use_float=True) -> None: + self.stage = None + self.dt = None + self._use_float = use_float + self._xform_cache = UsdGeom.XformCache() + + def create_stage( + self, + name: str = "curobo_stage.usd", + base_frame: str = "/world", + timesteps: Optional[int] = None, + dt=0.02, + interpolation_steps: float = 1, + ): + # print("name", name) + self.stage = Usd.Stage.CreateNew(name) + UsdGeom.SetStageUpAxis(self.stage, "Z") + UsdGeom.SetStageMetersPerUnit(self.stage, 1) + UsdPhysics.SetStageKilogramsPerUnit(self.stage, 1) + xform = self.stage.DefinePrim(base_frame, "Xform") + self.stage.SetDefaultPrim(xform) + self.dt = dt + self.interpolation_steps = interpolation_steps + if timesteps is not None: + self.stage.SetStartTimeCode(1) + self.stage.SetEndTimeCode(timesteps * self.interpolation_steps) + self.stage.SetTimeCodesPerSecond((1.0 / self.dt)) + # print(1.0 / self) + + def add_subroot(self, root="/world", sub_root="/obstacles", pose: Optional[Pose] = None): + xform = self.stage.DefinePrim(join_path(root, sub_root), "Xform") + if pose is not None: + set_prim_transform(xform, pose.tolist(), use_float=self._use_float) + + def load_stage_from_file(self, file_path: str): + self.stage = Usd.Stage.Open(file_path) + + def load_stage(self, stage: Usd.Stage): + self.stage = stage + + def get_pose(self, prim_path: str, timecode: float = 0.0, inverse: bool = False) -> np.matrix: + self._xform_cache.SetTime(timecode) + reference_prim = self.stage.GetPrimAtPath(prim_path) + r_T_w, _ = get_prim_world_pose(self._xform_cache, reference_prim, inverse=inverse) + return r_T_w + + def get_obstacles_from_stage( + self, + only_paths: Optional[List[str]] = None, + ignore_paths: Optional[List[str]] = None, + only_substring: Optional[List[str]] = None, + ignore_substring: Optional[List[str]] = None, + reference_prim_path: Optional[str] = None, + timecode: float = 0, + ) -> WorldConfig: + # read obstacles from usd by iterating through all prims: + obstacles = {"cuboid": [], "sphere": None, "mesh": None, "cylinder": None, "capsule": None} + r_T_w = None + self._xform_cache.Clear() + self._xform_cache.SetTime(timecode) + if reference_prim_path is not None: + reference_prim = self.stage.GetPrimAtPath(reference_prim_path) + r_T_w, _ = get_prim_world_pose(self._xform_cache, reference_prim, inverse=True) + all_items 
= self.stage.Traverse() + for x in all_items: + if only_paths is not None: + if not any([str(x.GetPath()).startswith(k) for k in only_paths]): + continue + if ignore_paths is not None: + if any([str(x.GetPath()).startswith(k) for k in ignore_paths]): + continue + if only_substring is not None: + if not any([k in str(x.GetPath()) for k in only_substring]): + continue + if ignore_substring is not None: + if any([k in str(x.GetPath()) for k in ignore_substring]): + continue + if x.IsA(UsdGeom.Cube): + if obstacles["cuboid"] is None: + obstacles["cuboid"] = [] + cube = get_cube_attrs(x, cache=self._xform_cache, transform=r_T_w) + obstacles["cuboid"].append(cube) + elif x.IsA(UsdGeom.Sphere): + if obstacles["sphere"] is None: + obstacles["sphere"] = [] + obstacles["sphere"].append( + get_sphere_attrs(x, cache=self._xform_cache, transform=r_T_w) + ) + elif x.IsA(UsdGeom.Mesh): + if obstacles["mesh"] is None: + obstacles["mesh"] = [] + m_data = get_mesh_attrs(x, cache=self._xform_cache, transform=r_T_w) + if m_data is not None: + obstacles["mesh"].append(m_data) + elif x.IsA(UsdGeom.Cylinder): + if obstacles["cylinder"] is None: + obstacles["cylinder"] = [] + cube = get_cylinder_attrs(x, cache=self._xform_cache, transform=r_T_w) + obstacles["cylinder"].append(cube) + elif x.IsA(UsdGeom.Capsule): + if obstacles["capsule"] is None: + obstacles["capsule"] = [] + cap = get_capsule_attrs(x, cache=self._xform_cache, transform=r_T_w) + obstacles["capsule"].append(cap) + world_model = WorldConfig(**obstacles) + return world_model + + def add_world_to_stage( + self, + obstacles: WorldConfig, + base_frame: str = "/world", + obstacles_frame: str = "obstacles", + base_t_obstacle_pose: Optional[Pose] = None, + timestep: Optional[float] = None, + ): + # iterate through every obstacle type and create prims: + + self.add_subroot(base_frame, obstacles_frame, base_t_obstacle_pose) + full_path = join_path(base_frame, obstacles_frame) + prim_path = [ + self.get_prim_from_obstacle(o, full_path, timestep=timestep) for o in obstacles.objects + ] + return prim_path + + def get_prim_from_obstacle( + self, obstacle: Obstacle, base_frame: str = "/world/obstacles", timestep=None + ): + if isinstance(obstacle, Cuboid): + return self.add_cuboid_to_stage(obstacle, base_frame, timestep=timestep) + elif isinstance(obstacle, Mesh): + return self.add_mesh_to_stage(obstacle, base_frame, timestep=timestep) + elif isinstance(obstacle, Sphere): + return self.add_sphere_to_stage(obstacle, base_frame, timestep=timestep) + elif isinstance(obstacle, Cylinder): + return self.add_cylinder_to_stage(obstacle, base_frame, timestep=timestep) + + else: + raise NotImplementedError + + def add_cuboid_to_stage( + self, + obstacle: Cuboid, + base_frame: str = "/world/obstacles", + timestep=None, + enable_physics: bool = False, + ): + root_path = join_path(base_frame, obstacle.name) + obj_geom = UsdGeom.Cube.Define(self.stage, root_path) + obj_prim = self.stage.GetPrimAtPath(root_path) + + set_geom_cube_attrs(obj_geom, obstacle.dims, obstacle.pose, timestep=timestep) + obj_prim.CreateAttribute("physics:rigidBodyEnabled", Sdf.ValueTypeNames.Bool, custom=False) + obj_prim.GetAttribute("physics:rigidBodyEnabled").Set(enable_physics) + + if obstacle.color is not None: + self.add_material( + "material_" + obstacle.name, root_path, obstacle.color, obj_prim, obstacle.material + ) + return root_path + + def add_cylinder_to_stage( + self, + obstacle: Cylinder, + base_frame: str = "/world/obstacles", + timestep=None, + enable_physics: bool = False, + ): + 
root_path = join_path(base_frame, obstacle.name)
+        obj_geom = UsdGeom.Cylinder.Define(self.stage, root_path)
+        obj_prim = self.stage.GetPrimAtPath(root_path)
+
+        set_geom_cylinder_attrs(
+            obj_geom, obstacle.radius, obstacle.height, obstacle.pose, timestep=timestep
+        )
+        obj_prim.CreateAttribute("physics:rigidBodyEnabled", Sdf.ValueTypeNames.Bool, custom=False)
+        obj_prim.GetAttribute("physics:rigidBodyEnabled").Set(enable_physics)
+
+        if obstacle.color is not None:
+            self.add_material(
+                "material_" + obstacle.name, root_path, obstacle.color, obj_prim, obstacle.material
+            )
+        return root_path
+
+    def add_sphere_to_stage(
+        self,
+        obstacle: Sphere,
+        base_frame: str = "/world/obstacles",
+        timestep=None,
+        enable_physics: bool = False,
+    ):
+        root_path = join_path(base_frame, obstacle.name)
+        obj_geom = UsdGeom.Sphere.Define(self.stage, root_path)
+        obj_prim = self.stage.GetPrimAtPath(root_path)
+        if obstacle.pose is None:
+            obstacle.pose = obstacle.position + [1, 0, 0, 0]
+        set_geom_sphere_attrs(obj_geom, obstacle.radius, obstacle.pose, timestep=timestep)
+        obj_prim.CreateAttribute("physics:rigidBodyEnabled", Sdf.ValueTypeNames.Bool, custom=False)
+        obj_prim.GetAttribute("physics:rigidBodyEnabled").Set(enable_physics)
+
+        if obstacle.color is not None:
+            self.add_material(
+                "material_" + obstacle.name, root_path, obstacle.color, obj_prim, obstacle.material
+            )
+        return root_path
+
+    def add_mesh_to_stage(
+        self,
+        obstacle: Mesh,
+        base_frame: str = "/world/obstacles",
+        timestep=None,
+        enable_physics: bool = False,
+    ):
+        root_path = join_path(base_frame, obstacle.name)
+        obj_geom = UsdGeom.Mesh.Define(self.stage, root_path)
+        obj_prim = self.stage.GetPrimAtPath(root_path)
+
+        set_geom_mesh_attrs(obj_geom, obstacle, timestep=timestep)
+
+        obj_prim.CreateAttribute("physics:rigidBodyEnabled", Sdf.ValueTypeNames.Bool, custom=False)
+        obj_prim.GetAttribute("physics:rigidBodyEnabled").Set(enable_physics)
+
+        if obstacle.color is not None:
+            self.add_material(
+                "material_" + obstacle.name, root_path, obstacle.color, obj_prim, obstacle.material
+            )
+
+        return root_path
+
+    def get_obstacle_from_prim(self, prim_path: str) -> Obstacle:
+        pass
+
+    def write_stage_to_file(self, file_path: str, flatten: bool = False):
+        if flatten:
+            usd_str = self.stage.Flatten().ExportToString()
+        else:
+            usd_str = self.stage.GetRootLayer().ExportToString()
+        with open(file_path, "w") as f:
+            f.write(usd_str)
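Hedged usage sketch for the stage helpers above (assumes a working curobo + usd-core install; the obstacle values are made up for illustration):

```python
from curobo.geom.types import Cuboid, WorldConfig
from curobo.util.usd_helper import UsdHelper

world = WorldConfig(
    cuboid=[Cuboid(name="table", pose=[0.5, 0.0, -0.05, 1, 0, 0, 0], dims=[1.0, 1.0, 0.1])]
)
helper = UsdHelper()
helper.create_stage("scene.usd")
helper.add_world_to_stage(world)          # creates /world/obstacles/table
helper.write_stage_to_file("scene.usd")   # export the root layer as text
```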
+ """ + prim_names = self.add_world_to_stage( + robot_world_cfg, base_frame=base_frame, obstacles_frame=robot_frame, timestep=0 + ) + for i, i_val in enumerate(prim_names): + curr_prim = self.stage.GetPrimAtPath(i_val) + form = UsdGeom.Xformable(curr_prim).GetOrderedXformOps()[0] + for t in range(pose.batch): + c_t = get_transform(pose.get_index(t, i).tolist()) + form.Set(time=t * self.interpolation_steps, value=c_t) + + def create_obstacle_animation( + self, + obstacles: List[List[Obstacle]], + base_frame: str = "/world", + obstacles_frame: str = "robot_base", + ): + # add obstacles to stage: + prim_paths = self.add_world_to_stage( + WorldConfig(objects=obstacles[0]), + base_frame=base_frame, + obstacles_frame=obstacles_frame, + ) + + # + for t in range(len(obstacles)): + current_obs = obstacles[t] + for j in range(len(current_obs)): + obs = current_obs[j] + obs_name = join_path(join_path(base_frame, obstacles_frame), obs.name) + if obs_name not in prim_paths: + log_warn("Obstacle not found") + continue + # + prim = self.stage.GetPrimAtPath(obs_name) + form = UsdGeom.Xformable(prim).GetOrderedXformOps() + + pos_form = form[0] + c_p = Gf.Vec3d(obs.position) + pos_form.Set(time=t * self.interpolation_steps, value=c_p) + + def create_linkpose_robot_animation( + self, + robot_usd_path: str, + link_names: List[str], + joint_names: List[str], + pose: Pose, + robot_base_frame="/world/robot", + local_asset_path="assets/", + write_robot_usd_path="assets/", + robot_asset_prim_path="/panda", + ): + """Create animation, given meshes and pose + + Args: + prim_names: _description_ + pose: [ timesteps, n_meshes, pose] + dt: _description_. Defaults to 0.02. + """ + link_prims, joint_prims = self.load_robot_usd( + robot_usd_path, + link_names, + joint_names, + robot_base_frame=robot_base_frame, + local_asset_path=local_asset_path, + write_asset_path=write_robot_usd_path, + robot_asset_prim_path=robot_asset_prim_path, + ) + + for i, i_val in enumerate(link_names): + if i_val not in link_prims: + log_warn("Link not found in usd: " + i_val) + continue + form = UsdGeom.Xformable(link_prims[i_val]).GetOrderedXformOps() + if len(form) < 2: + log_warn("Pose transformation not found" + i_val) + continue + + pos_form = form[0] + quat_form = form[1] + use_float = False + if link_prims[i_val].GetAttribute("xformOp:orient").IsValid(): + if isinstance(link_prims[i_val].GetAttribute("xformOp:orient").Get(), Gf.Quatf): + use_float = True + + for t in range(pose.batch): + c_p, c_q = get_position_quat(pose.get_index(t, i).tolist(), use_float) + pos_form.Set(time=t * self.interpolation_steps, value=c_p) + quat_form.Set(time=t * self.interpolation_steps, value=c_q) + + def add_material( + self, + material_name: str, + object_path: str, + color: List[float], + obj_prim: Usd.Prim, + material: Material = Material(), + ): + mat_path = join_path(object_path, material_name) + material_usd = UsdShade.Material.Define(self.stage, mat_path) + pbrShader = UsdShade.Shader.Define(self.stage, join_path(mat_path, "PbrShader")) + pbrShader.CreateIdAttr("UsdPreviewSurface") + pbrShader.CreateInput("roughness", Sdf.ValueTypeNames.Float).Set(material.roughness) + pbrShader.CreateInput("metallic", Sdf.ValueTypeNames.Float).Set(material.metallic) + pbrShader.CreateInput("specularColor", Sdf.ValueTypeNames.Color3f).Set(Gf.Vec3f(color[:3])) + pbrShader.CreateInput("diffuseColor", Sdf.ValueTypeNames.Color3f).Set(Gf.Vec3f(color[:3])) + pbrShader.CreateInput("baseColor", Sdf.ValueTypeNames.Color3f).Set(Gf.Vec3f(color[:3])) + + 
pbrShader.CreateInput("opacity", Sdf.ValueTypeNames.Float).Set(color[3]) + material_usd.CreateSurfaceOutput().ConnectToSource(pbrShader.ConnectableAPI(), "surface") + obj_prim.GetPrim().ApplyAPI(UsdShade.MaterialBindingAPI) + UsdShade.MaterialBindingAPI(obj_prim).Bind(material_usd) + return material_usd + + def save(self): + self.stage.Save() + + @staticmethod + def write_trajectory_animation( + robot_model_file: str, + world_model: WorldConfig, + q_start: JointState, + q_traj: JointState, + dt: float = 0.02, + save_path: str = "out.usd", + tensor_args: TensorDeviceType = TensorDeviceType(), + interpolation_steps: float = 1.0, + robot_color: List[float] = [0.8, 0.8, 0.8, 1.0], + ): + config_file = load_yaml(join_path(get_robot_configs_path(), robot_model_file)) + config_file["robot_cfg"]["kinematics"]["load_link_names_with_mesh"] = True + robot_cfg = CudaRobotModelConfig.from_data_dict( + config_file["robot_cfg"]["kinematics"], tensor_args=tensor_args + ) + kin_model = CudaRobotModel(robot_cfg) + + m = kin_model.get_robot_as_mesh(q_start.position) + + robot_mesh_model = WorldConfig(mesh=m) + robot_mesh_model.add_color(robot_color) + robot_mesh_model.add_material(Material(metallic=0.4)) + usd_helper = UsdHelper() + usd_helper.create_stage( + save_path, + timesteps=q_traj.position.shape[0], + dt=dt, + interpolation_steps=interpolation_steps, + ) + usd_helper.add_world_to_stage(world_model) + + animation_links = kin_model.mesh_link_names + animation_poses = kin_model.get_link_poses(q_traj.position, animation_links) + usd_helper.create_animation(robot_mesh_model, animation_poses) + usd_helper.write_stage_to_file(save_path + ".usd") + + @staticmethod + def load_robot( + robot_model_file: str, + tensor_args: TensorDeviceType = TensorDeviceType(), + ) -> CudaRobotModel: + config_file = load_yaml(join_path(get_robot_configs_path(), robot_model_file)) + if "robot_cfg" in config_file: + config_file = config_file["robot_cfg"] + config_file["kinematics"]["load_link_names_with_mesh"] = True + # config_file["robot_cfg"]["kinematics"]["use_usd_kinematics"] = False + + robot_cfg = CudaRobotModelConfig.from_data_dict( + config_file["kinematics"], tensor_args=tensor_args + ) + + kin_model = CudaRobotModel(robot_cfg) + return kin_model + + @staticmethod + def write_trajectory_animation_with_robot_usd( + robot_model_file: str, + world_model: Union[WorldConfig, None], + q_start: JointState, + q_traj: JointState, + dt: float = 0.02, + save_path: str = "out.usd", + tensor_args: TensorDeviceType = TensorDeviceType(), + interpolation_steps: float = 1.0, + write_robot_usd_path: str = "assets/", + robot_base_frame: str = "robot", + robot_usd_local_reference: str = "assets/", + base_frame="/world", + kin_model: Optional[CudaRobotModel] = None, + visualize_robot_spheres: bool = True, + robot_asset_prim_path=None, + ): + if kin_model is None: + config_file = load_yaml(join_path(get_robot_configs_path(), robot_model_file)) + if "robot_cfg" in config_file: + config_file = config_file["robot_cfg"] + config_file["kinematics"]["load_link_names_with_mesh"] = True + config_file["kinematics"]["use_usd_kinematics"] = True + + robot_cfg = CudaRobotModelConfig.from_data_dict( + config_file["kinematics"], tensor_args=tensor_args + ) + + kin_model = CudaRobotModel(robot_cfg) + + if robot_asset_prim_path is None: + robot_asset_prim_path = kin_model.kinematics_parser.robot_prim_root + + robot_base_frame = join_path(base_frame, robot_base_frame) + + robot_usd_path = kin_model.generator_config.usd_path + + usd_helper = UsdHelper() + 
+
+    @staticmethod
+    def write_trajectory_animation_with_robot_usd(
+        robot_model_file: str,
+        world_model: Union[WorldConfig, None],
+        q_start: JointState,
+        q_traj: JointState,
+        dt: float = 0.02,
+        save_path: str = "out.usd",
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+        interpolation_steps: float = 1.0,
+        write_robot_usd_path: str = "assets/",
+        robot_base_frame: str = "robot",
+        robot_usd_local_reference: str = "assets/",
+        base_frame="/world",
+        kin_model: Optional[CudaRobotModel] = None,
+        visualize_robot_spheres: bool = True,
+        robot_asset_prim_path=None,
+    ):
+        if kin_model is None:
+            config_file = load_yaml(join_path(get_robot_configs_path(), robot_model_file))
+            if "robot_cfg" in config_file:
+                config_file = config_file["robot_cfg"]
+            config_file["kinematics"]["load_link_names_with_mesh"] = True
+            config_file["kinematics"]["use_usd_kinematics"] = True
+
+            robot_cfg = CudaRobotModelConfig.from_data_dict(
+                config_file["kinematics"], tensor_args=tensor_args
+            )
+
+            kin_model = CudaRobotModel(robot_cfg)
+
+        if robot_asset_prim_path is None:
+            robot_asset_prim_path = kin_model.kinematics_parser.robot_prim_root
+
+        robot_base_frame = join_path(base_frame, robot_base_frame)
+
+        robot_usd_path = kin_model.generator_config.usd_path
+
+        usd_helper = UsdHelper()
+        usd_helper.create_stage(
+            save_path,
+            timesteps=q_traj.position.shape[0],
+            dt=dt,
+            interpolation_steps=interpolation_steps,
+            base_frame=base_frame,
+        )
+        if world_model is not None:
+            usd_helper.add_world_to_stage(world_model, base_frame=base_frame)
+
+        animation_links = kin_model.kinematics_config.mesh_link_names
+        animation_poses = kin_model.get_link_poses(q_traj.position, animation_links)
+
+        usd_helper.create_linkpose_robot_animation(
+            robot_usd_path,
+            animation_links,
+            kin_model.joint_names,
+            animation_poses,
+            local_asset_path=robot_usd_local_reference,
+            write_robot_usd_path=write_robot_usd_path,
+            robot_base_frame=robot_base_frame,
+            robot_asset_prim_path=robot_asset_prim_path,
+        )
+        if visualize_robot_spheres:
+            # visualize robot spheres:
+            sphere_traj = kin_model.get_robot_as_spheres(q_traj.position)
+            # change color:
+            for s in sphere_traj:
+                for k in s:
+                    k.color = [0, 0.27, 0.27, 1.0]
+            usd_helper.create_obstacle_animation(
+                sphere_traj, base_frame=base_frame, obstacles_frame="curobo/robot_collision"
+            )
+        usd_helper.write_stage_to_file(save_path)
+
+    @staticmethod
+    def create_grid_usd(
+        usds_path: Union[str, List[str]],
+        save_path: str,
+        base_frame: str,
+        max_envs: int,
+        max_timecode: float,
+        x_space: float,
+        y_space: float,
+        x_per_row: int,
+        local_asset_path: str,
+        dt: float = 0.02,
+        interpolation_steps: int = 1,
+        prefix_string: Optional[str] = None,
+    ):
+        # create stage:
+        usd_helper = UsdHelper()
+        usd_helper.create_stage(
+            save_path,
+            timesteps=max_timecode,
+            dt=dt,
+            interpolation_steps=interpolation_steps,
+            base_frame=base_frame,
+        )
+
+        # read all usds:
+        if isinstance(usds_path, list):
+            files = usds_path
+        else:
+            files = get_files_from_dir(usds_path, [".usda", ".usd"], prefix_string)
+        # get count and clamp to max:
+        n_envs = min(len(files), max_envs)
+        # create grid:
+        count_x = x_per_row
+        count_y = int(np.ceil((n_envs) / x_per_row))
+        x_set = np.linspace(0, x_space * count_x, count_x)
+        y_set = np.linspace(0, y_space * count_y, count_y)
+        xv, yv = np.meshgrid(x_set, y_set)
+        xv = np.ravel(xv)
+        yv = np.ravel(yv)
+
+        # define prim + add reference:
+        for i in range(n_envs):
+            world_usd_path = files[i]
+            env_base_frame = (
+                base_frame + "/grid_" + get_filename(world_usd_path, remove_extension=True)
+            )
+            prim = usd_helper.stage.DefinePrim(env_base_frame, "Xform")
+            set_prim_transform(prim, [xv[i], yv[i], 0, 1, 0, 0, 0])
+            ref = prim.GetReferences()
+            ref.AddReference(assetPath=join_path(local_asset_path, get_filename(world_usd_path)))
+
+        # write usd to disk:
+        usd_helper.write_stage_to_file(save_path)
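`create_grid_usd` places one referenced environment per grid cell, row-major with `x_per_row` columns. The layout math in isolation (plain numpy, illustrative sizes, not code from this diff):

```python
import numpy as np

n_envs, x_per_row, x_space, y_space = 7, 3, 1.0, 1.0
count_y = int(np.ceil(n_envs / x_per_row))
x_set = np.linspace(0, x_space * x_per_row, x_per_row)
y_set = np.linspace(0, y_space * count_y, count_y)
xv, yv = np.meshgrid(x_set, y_set)
offsets = np.stack((np.ravel(xv), np.ravel(yv)), axis=-1)[:n_envs]
# offsets[i] is the (x, y) translation applied to environment i
```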
+
+    def load_robot_usd(
+        self,
+        robot_usd_path: str,
+        link_names: List[str],
+        joint_names: List[str],
+        robot_base_frame="/world/robot",
+        write_asset_path="assets/",
+        local_asset_path="assets/",
+        robot_asset_prim_path="/panda",
+    ):
+        # copy the robot prim and its derivatives to a separate usd:
+        robot_usd_name = get_filename(robot_usd_path)
+
+        out_path = join_path(write_asset_path, robot_usd_name)
+        out_local_path = join_path(local_asset_path, robot_usd_name)
+        if not file_exists(out_path) or not file_exists(out_local_path):
+            robot_stage = Usd.Stage.Open(robot_usd_path)
+            # set pose to zero for root prim:
+            prim = robot_stage.GetPrimAtPath(robot_asset_prim_path)
+            if not prim.IsValid():
+                log_error(
+                    "robot prim is not valid : " + robot_asset_prim_path + " " + robot_usd_path
+                )
+            set_prim_transform(prim, [0, 0, 0, 1, 0, 0, 0])
+            robot_stage = robot_stage.Flatten()
+            robot_stage.Export(out_path)
+            robot_stage.Export(out_local_path)
+
+        # create a base prim:
+        prim = self.stage.DefinePrim(robot_base_frame)
+        ref = prim.GetReferences()
+        ref.AddReference(
+            assetPath=join_path(local_asset_path, robot_usd_name), primPath=robot_asset_prim_path
+        )
+        link_prims, joint_prims = self.get_robot_prims(link_names, joint_names, robot_base_frame)
+        return link_prims, joint_prims
+
+    def get_robot_prims(
+        self, link_names: List[str], joint_names: List[str], robot_base_path: str = "/world/robot"
+    ):
+        all_prims = [x for x in self.stage.Traverse()]
+        joint_prims = {}
+        link_prims = {}
+        for j_idx, j in enumerate(joint_names):
+            for k in range(len(all_prims)):
+                current_prim = all_prims[k]
+                prim_path = current_prim.GetPath().pathString
+                if robot_base_path in prim_path and j in prim_path:
+                    joint_prims[j] = current_prim
+                    current_prim.GetAttribute("physics:jointEnabled").Set(False)
+        for j_idx, j in enumerate(link_names):
+            for k in range(len(all_prims)):
+                current_prim = all_prims[k]
+                prim_path = current_prim.GetPath().pathString
+                if (
+                    robot_base_path in prim_path
+                    and j in prim_path
+                    and "geometry" not in prim_path
+                    and "joint" not in prim_path
+                    and current_prim.GetTypeName() == "Xform"
+                ):
+                    link_prims[j] = current_prim
+                    current_prim.GetAttribute("physics:rigidBodyEnabled").Set(False)
+
+        return link_prims, joint_prims
+
+    def update_robot_joint_state(self, joint_prims, js: JointState, timestep: int):
+        # joint_prims is a dict mapping joint name to its Usd.Prim
+        for j_idx, j in enumerate(js.joint_names):
+            if timestep is not None:
+                joint_prims[j].GetAttribute("drive:angular:physics:targetPosition").Set(
+                    time=timestep, value=np.degrees(js.position[..., j_idx].item())
+                )
+            else:
+                joint_prims[j].GetAttribute("drive:angular:physics:targetPosition").Set(
+                    value=np.degrees(js.position[..., j_idx].item())
+                )
+
+    @staticmethod
+    def write_motion_gen_log(
+        result: MotionGenResult,
+        robot_file: Union[str, RobotConfig],
+        world_config: Union[None, WorldConfig],
+        start_state: JointState,
+        goal_pose: Pose,
+        save_prefix: str = "log",
+        write_ik: bool = False,
+        write_trajopt: bool = False,
+        write_graph: bool = False,
+        goal_object: Optional[Obstacle] = None,
+        overlay_ik: bool = False,
+        overlay_trajopt: bool = False,
+        visualize_robot_spheres: bool = True,
+        link_spheres: Optional[torch.Tensor] = None,
+        grid_space: float = 1.0,
+        write_robot_usd_path="assets/",
+        robot_asset_prim_path="/panda",
+    ):
+        # if goal_object is None:
+        #     log_warn("Using franka gripper as goal object")
+        #     goal_object =
+        if goal_object is not None:
+            goal_object.pose = np.ravel(goal_pose.tolist()).tolist()
+            world_config = world_config.clone()
+            world_config.add_obstacle(goal_object)
+        kin_model = UsdHelper.load_robot(robot_file)
+        if link_spheres is not None:
+            kin_model.kinematics_config.link_spheres = link_spheres
+        if write_graph:
+            log_error("Logging graph planner is not supported")
+        if write_ik:
+            x_space = y_space = grid_space
+            if overlay_ik:
+                x_space = y_space = 0.0
+
+            ik_iter_steps = result.debug_info["ik_result"].debug_info["solver"]["steps"]
+            # convert ik_iter to a trajectory:
+            ik_iter_steps = torch.cat(
+                [torch.cat(ik_iter_steps[i], dim=1) for i in range(len(ik_iter_steps))], dim=1
+            )
+            vis_traj = ik_iter_steps
+            num_seeds, n_iters, dof = vis_traj.shape
+            usd_paths = []
+            for j in tqdm(range(num_seeds)):
+                current_traj = vis_traj[j].view(-1, dof)[:-1, :]  # we remove last timestep
+
+                usd_paths.append(save_prefix + "_ik_seed_" + str(j) + ".usd")
UsdHelper.write_trajectory_animation_with_robot_usd( + robot_file, + world_config, + start_state, + JointState.from_position(current_traj), + dt=(1), + save_path=usd_paths[-1], + base_frame="/world_base_" + str(j), + kin_model=kin_model, + visualize_robot_spheres=visualize_robot_spheres, + write_robot_usd_path=write_robot_usd_path, + robot_asset_prim_path=robot_asset_prim_path, + ) + + UsdHelper.create_grid_usd( + usd_paths, + save_prefix + "_grid_ik.usd", + base_frame="/world", + max_envs=len(usd_paths), + max_timecode=n_iters, + x_space=x_space, + y_space=y_space, + x_per_row=int(math.floor(math.sqrt(len(usd_paths)))), + local_asset_path="", + dt=(1), + ) + if write_trajopt: + if "trajopt_result" not in result.debug_info: + log_warn("Trajopt result was not found in debug information") + return + trajectory_iter_steps = result.debug_info["trajopt_result"].debug_info["solver"][ + "steps" + ] + vis_traj = [] + for i in range(len(trajectory_iter_steps)): + vis_traj += trajectory_iter_steps[i] + + full_traj = torch.cat(vis_traj, dim=0) + num_seeds, h, dof = vis_traj[-1].shape + n, _, _ = full_traj.shape # this will have iterations + full_traj = full_traj.view(-1, num_seeds, h, dof) + n_steps = full_traj.shape[0] + + full_traj = full_traj.transpose(0, 1).contiguous() # n_seeds, n_steps, h, dof + n1, n2, _, _ = full_traj.shape + + full_traj = torch.cat( + (start_state.position.view(1, 1, 1, -1).repeat(n1, n2, 1, 1), full_traj), dim=-2 + ) + usd_paths = [] + for j in tqdm(range(num_seeds)): + current_traj = full_traj[j].view(-1, dof)[:-1, :] # we remove last timestep + # add start state to current trajectory since it's not in the optimization: + usd_paths.append(save_prefix + "_trajopt_seed_" + str(j) + ".usd") + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_file, + world_config, + start_state, + JointState.from_position(current_traj), + dt=(1 / 24), + save_path=usd_paths[-1], + base_frame="/world_base_" + str(j), + kin_model=kin_model, + visualize_robot_spheres=visualize_robot_spheres, + write_robot_usd_path=write_robot_usd_path, + robot_asset_prim_path=robot_asset_prim_path, + ) + # add finetuning step: + + if "finetune_trajopt_result" in result.debug_info and True: + finetune_iter_steps = result.debug_info["finetune_trajopt_result"].debug_info[ + "solver" + ]["steps"] + + vis_traj = [] + if finetune_iter_steps is not None: + for i in range(len(finetune_iter_steps)): + vis_traj += finetune_iter_steps[i] + full_traj = torch.cat(vis_traj, dim=0) + num_seeds, h, dof = vis_traj[-1].shape + n, _, _ = full_traj.shape # this will have iterations + full_traj = full_traj.view(-1, num_seeds, h, dof) + n1, n2, _, _ = full_traj.shape + + full_traj = torch.cat( + (start_state.position.view(1, 1, 1, -1).repeat(n1, n2, 1, 1), full_traj), dim=-2 + ) + + # n_steps = full_traj.shape[0] + + full_traj = full_traj.transpose(0, 1).contiguous() # n_seeds, n_steps, h, dof + for j in tqdm(range(num_seeds)): + current_traj = full_traj[j].view(-1, dof)[:-1, :] # we remove last timestep + + usd_paths.append(save_prefix + "_trajopt_finetune_seed_" + str(j) + ".usd") + UsdHelper.write_trajectory_animation_with_robot_usd( + robot_file, + world_config, + start_state, + JointState.from_position(current_traj), + dt=(1 / 24), + save_path=usd_paths[-1], + base_frame="/world_base_" + str(j), + kin_model=kin_model, + visualize_robot_spheres=visualize_robot_spheres, + write_robot_usd_path=write_robot_usd_path, + robot_asset_prim_path=robot_asset_prim_path, + ) + x_space = y_space = grid_space + if overlay_trajopt: + 
x_space = y_space = 0.0 + UsdHelper.create_grid_usd( + usd_paths, + save_prefix + "_grid_trajopt.usd", + base_frame="/world", + max_envs=len(usd_paths), + max_timecode=n_steps * h, + x_space=x_space, + y_space=y_space, + x_per_row=int(math.floor(math.sqrt(len(usd_paths)))), + local_asset_path="", + dt=(1 / 24), + ) diff --git a/src/curobo/util/warp.py b/src/curobo/util/warp.py new file mode 100644 index 00000000..e8a2c7be --- /dev/null +++ b/src/curobo/util/warp.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import warp as wp + +# CuRobo +from curobo.types.base import TensorDeviceType + + +def init_warp(quiet=True, tensor_args: TensorDeviceType = TensorDeviceType()): + wp.config.quiet = quiet + # wp.config.print_launches = True + # wp.config.verbose = True + # wp.config.mode = "debug" + # wp.config.enable_backward = True + wp.init() + + wp.force_load(wp.device_from_torch(tensor_args.device)) + return True diff --git a/src/curobo/util/warp_interpolation.py b/src/curobo/util/warp_interpolation.py new file mode 100644 index 00000000..259bc711 --- /dev/null +++ b/src/curobo/util/warp_interpolation.py @@ -0,0 +1,176 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
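Review note on the kernel below: each warp thread id is decomposed into a (batch, output-step, dof) triple over a flattened array. The same index arithmetic, checked in plain Python (illustrative sizes):

```python
batch_size, out_horizon, dof = 2, 5, 3
for tid in range(batch_size * out_horizon * dof):
    b_idx = tid // (out_horizon * dof)
    oh_idx = (tid - b_idx * out_horizon * dof) // dof
    d_idx = tid - b_idx * out_horizon * dof - oh_idx * dof
    # the decomposition inverts row-major flattening exactly:
    assert tid == (b_idx * out_horizon + oh_idx) * dof + d_idx
```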
+#
+
+# Third Party
+import torch
+import warp as wp
+
+# CuRobo
+from curobo.types.robot import JointState
+from curobo.util.warp import init_warp
+
+wp.set_module_options({"fast_math": False})
+
+
+@wp.kernel
+def linear_interpolate_trajectory_kernel(
+    raw_position: wp.array(dtype=wp.float32),
+    raw_velocity: wp.array(dtype=wp.float32),
+    raw_acceleration: wp.array(dtype=wp.float32),
+    raw_jerk: wp.array(dtype=wp.float32),
+    out_position: wp.array(dtype=wp.float32),
+    out_velocity: wp.array(dtype=wp.float32),
+    out_acceleration: wp.array(dtype=wp.float32),
+    out_jerk: wp.array(dtype=wp.float32),
+    opt_dt: wp.array(dtype=wp.float32),
+    traj_tsteps: wp.array(dtype=wp.int32),
+    batch_size: wp.int32,
+    raw_horizon: wp.int32,
+    dof: wp.int32,
+    out_horizon: wp.int32,
+    raw_dt: wp.float32,
+):
+    tid = wp.tid()
+
+    # map the flat thread index to (batch, output-horizon step, dof) coordinates:
+    b_idx = int(0)
+    h_idx = int(0)
+    oh_idx = int(0)
+    d_idx = int(0)
+    b_idx = tid / (out_horizon * dof)
+    oh_idx = (tid - (b_idx * (out_horizon * dof))) / dof
+    d_idx = tid - (b_idx * out_horizon * dof) - (oh_idx * dof)
+    if b_idx >= batch_size or oh_idx >= out_horizon or d_idx >= dof:
+        return
+
+    nh_idx = int(0)
+    weight = float(0)
+    n_weight = float(0)
+    max_tstep = int(0)
+    int_steps = float(0)
+    op_dt = float(0)
+    op_dt = opt_dt[b_idx]
+    max_tstep = traj_tsteps[b_idx]
+    int_steps = float(max_tstep) / float(raw_horizon - 1)
+    # derivatives were computed at raw_dt; rescale them to the optimized dt:
+    scale = float(1.0)
+    scale = raw_dt / op_dt
+    h_idx = int(wp.ceil(float(oh_idx) / int_steps))
+
+    if oh_idx >= max_tstep or h_idx >= raw_horizon:
+        # past the end of this trajectory; write the last raw timestep data:
+        h_idx = raw_horizon - 1
+        out_position[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = raw_position[
+            b_idx * raw_horizon * dof + h_idx * dof + d_idx
+        ]
+        out_velocity[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+            raw_velocity[b_idx * raw_horizon * dof + h_idx * dof + d_idx] * scale
+        )
+        out_acceleration[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+            raw_acceleration[b_idx * raw_horizon * dof + h_idx * dof + d_idx] * scale * scale
+        )
+        out_jerk[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+            raw_jerk[b_idx * raw_horizon * dof + h_idx * dof + d_idx] * scale * scale * scale
+        )
+        return
+
+    # otherwise interpolate between raw timesteps h_idx - 1 and h_idx:
+    if h_idx == 0:
+        h_idx = 1
+    nh_idx = h_idx - 1
+    weight = (float(oh_idx) / int_steps) - float(nh_idx)
+    n_weight = 1.0 - weight
+
+    # linear interpolation of position, velocity, acceleration and jerk:
+    out_position[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+        weight * raw_position[b_idx * raw_horizon * dof + h_idx * dof + d_idx]
+        + n_weight * raw_position[b_idx * raw_horizon * dof + nh_idx * dof + d_idx]
+    )
+    out_velocity[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+        weight * raw_velocity[b_idx * raw_horizon * dof + h_idx * dof + d_idx]
+        + n_weight * raw_velocity[b_idx * raw_horizon * dof + nh_idx * dof + d_idx]
+    ) * scale
+    out_acceleration[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+        (
+            weight * raw_acceleration[b_idx * raw_horizon * dof + h_idx * dof + d_idx]
+            + n_weight * raw_acceleration[b_idx * raw_horizon * dof + nh_idx * dof + d_idx]
+        )
+        * scale
+        * scale
+    )
+    out_jerk[b_idx * out_horizon * dof + oh_idx * dof + d_idx] = (
+        (
+            weight * raw_jerk[b_idx * raw_horizon * dof + h_idx * dof + d_idx]
+            + n_weight * raw_jerk[b_idx * raw_horizon * dof + nh_idx * dof + d_idx]
+        )
+        * scale
+        * scale
+        * scale
+    )
+
+
+def get_cuda_linear_interpolation(
+    raw_traj: JointState,
+    traj_tsteps: torch.Tensor,
+    out_traj: JointState,
+    opt_dt: torch.Tensor,
+    raw_dt: float = 0.5,
+):
+    """Use warp to perform linear interpolation on GPU for a batch of trajectories.
+
+    NOTE: There is a bug in the indexing that can cause the last horizon step of a
+    trajectory to be missed. This does not affect solutions solved by our arm_base class
+    as we make the last 3 timesteps the same.
+
+    Args:
+        raw_traj (JointState): Raw trajectory of shape (batch, raw_horizon, dof) to
+            interpolate.
+        traj_tsteps (torch.Tensor): Number of interpolated timesteps for each trajectory
+            in the batch.
+        out_traj (JointState): Preallocated buffer of shape (batch, out_horizon, dof)
+            that the interpolated trajectory is written into.
+        opt_dt (torch.Tensor): Optimized dt for each trajectory in the batch.
+        raw_dt (float, optional): dt used when optimizing the raw trajectory; velocity,
+            acceleration, and jerk are rescaled by raw_dt / opt_dt. Defaults to 0.5.
+
+    Returns:
+        JointState: out_traj, with the interpolated trajectory written in place.
+    """
+    init_warp()
+    batch, int_horizon, dof = out_traj.position.shape
+    horizon = raw_traj.position.shape[1]
+
+    wp.launch(
+        kernel=linear_interpolate_trajectory_kernel,
+        dim=batch * int_horizon * dof,
+        inputs=[
+            wp.from_torch(raw_traj.position.view(-1)),
+            wp.from_torch(raw_traj.velocity.view(-1)),
+            wp.from_torch(raw_traj.acceleration.view(-1)),
+            wp.from_torch(raw_traj.jerk.view(-1)),
+            wp.from_torch(out_traj.position.view(-1)),
+            wp.from_torch(out_traj.velocity.view(-1)),
+            wp.from_torch(out_traj.acceleration.view(-1)),
+            wp.from_torch(out_traj.jerk.view(-1)),
+            wp.from_torch(opt_dt.view(-1)),
+            wp.from_torch(traj_tsteps.view(-1)),
+            batch,
+            horizon,
+            dof,
+            int_horizon,
+            raw_dt,
+        ],
+        stream=wp.stream_from_torch(raw_traj.position.device),
+    )
+    return out_traj
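An illustrative usage sketch for `get_cuda_linear_interpolation`; all shapes and dt values below are hypothetical, and a real caller would pass solver output as `raw_traj` and preallocated buffers as `out_traj`:

```python
# Illustrative sketch (hypothetical shapes); assumes a CUDA device.
import torch
from curobo.types.robot import JointState
from curobo.util.warp_interpolation import get_cuda_linear_interpolation

batch, raw_horizon, out_horizon, dof = 4, 32, 128, 7
device = torch.device("cuda:0")

def make_state(h: int) -> JointState:
    # zero-filled buffers; a real caller would fill raw_traj from the solver
    return JointState(
        position=torch.zeros(batch, h, dof, device=device),
        velocity=torch.zeros(batch, h, dof, device=device),
        acceleration=torch.zeros(batch, h, dof, device=device),
        jerk=torch.zeros(batch, h, dof, device=device),
    )

raw_traj = make_state(raw_horizon)   # trajectory optimized at raw_dt spacing
out_traj = make_state(out_horizon)   # preallocated output buffers
opt_dt = torch.full((batch,), 0.02, device=device)  # per-trajectory dt
traj_tsteps = torch.full((batch,), 100, dtype=torch.int32, device=device)

out_traj = get_cuda_linear_interpolation(raw_traj, traj_tsteps, out_traj, opt_dt, raw_dt=0.5)
```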
diff --git a/src/curobo/util_file.py b/src/curobo/util_file.py
new file mode 100644
index 00000000..aac662ea
--- /dev/null
+++ b/src/curobo/util_file.py
@@ -0,0 +1,192 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+# Standard Library
+import os
+import shutil
+from typing import Dict, List
+
+# Third Party
+import yaml
+from yaml import Loader
+
+
+# get paths
+def get_module_path():
+    path = os.path.dirname(__file__)
+    return path
+
+
+def get_root_path():
+    path = os.path.dirname(get_module_path())
+    return path
+
+
+def get_content_path():
+    module_path = get_module_path()
+    path = os.path.join(module_path, "content")
+    return path
+
+
+def get_configs_path():
+    content_path = get_content_path()
+    path = os.path.join(content_path, "configs")
+    return path
+
+
+def get_assets_path():
+    content_path = get_content_path()
+    path = os.path.join(content_path, "assets")
+    return path
+
+
+def get_weights_path():
+    content_path = get_content_path()
+    path = os.path.join(content_path, "weights")
+    return path
+
+
+def join_path(path1, path2):
+    # if path2 is already a resolved path (not a string), return it unchanged:
+    if isinstance(path2, str):
+        return os.path.join(path1, path2)
+    else:
+        return path2
+
+
+def load_yaml(file_path):
+    # a dict can be passed instead of a file path; it is returned as-is:
+    if isinstance(file_path, str):
+        with open(file_path) as file_p:
+            yaml_params = yaml.load(file_p, Loader=Loader)
+    else:
+        yaml_params = file_path
+    return yaml_params
+
+
+def write_yaml(data: Dict, file_path):
+    with open(file_path, "w") as file:
+        yaml.dump(data, file)
+
+
+def get_robot_path():
+    config_path = get_configs_path()
+    path = os.path.join(config_path, "robot")
+    return path
+
+
+def get_task_configs_path():
+    config_path = get_configs_path()
+    path = os.path.join(config_path, "task")
+    return path
+
+
+def get_robot_configs_path():
+    config_path = get_configs_path()
+    path = os.path.join(config_path, "robot")
+    return path
+
+
+def get_world_configs_path():
+    config_path = get_configs_path()
+    path = os.path.join(config_path, "world")
+    return path
+
+
+def get_debug_path():
+    asset_path = get_assets_path()
+    path = join_path(asset_path, "debug")
+    return path
+
+
+def get_cpp_path():
+    path = os.path.dirname(__file__)
+    return os.path.join(path, "curobolib/cpp")
+
+
+def add_cpp_path(sources):
+    cpp_path = get_cpp_path()
+    new_list = []
+    for s in sources:
+        s = join_path(cpp_path, s)
+        new_list.append(s)
+    return new_list
+
+
+def copy_file_to_path(source_file, destination_path):
+    exists = os.path.exists(destination_path)
+    if not exists:
+        os.makedirs(destination_path)
+    _, file_name = os.path.split(source_file)
+    new_path = join_path(destination_path, file_name)
+    exists = os.path.exists(new_path)
+    if not exists:
+        shutil.copyfile(source_file, new_path)
+    return new_path
+
+
+def get_filename(file_path, remove_extension: bool = False):
+    _, file_name = os.path.split(file_path)
+    if remove_extension:
+        file_name = os.path.splitext(file_name)[0]
+    return file_name
+
+
+def get_path_of_dir(file_path):
+    dir_path, _ = os.path.split(file_path)
+    return dir_path
+
+
+def get_files_from_dir(dir_path, extension: List[str], contains: str):
+    file_names = [
+        fn
+        for fn in os.listdir(dir_path)
+        if (any(fn.endswith(ext) for ext in extension) and contains in fn)
+    ]
+    file_names.sort()
+    return file_names
+
+
+def file_exists(path):
+    exists = os.path.exists(path)
+    return exists
+
+
+def get_motion_gen_robot_list() -> List[str]:
+    """Returns a list of robot configuration files available in CuRobo for motion generation.
+
+    Returns:
+        List of robot configuration file names (yaml).
+    """
+    robot_list = [
+        "franka.yml",
+        "ur5e.yml",
+        "ur10e.yml",
+        "dual_ur10e.yml",
+        "tm12.yml",
+        "jaco7.yml",
+        "kinova_gen3.yml",
+        "iiwa.yml",
+        "iiwa_allegro.yml",
+        # "franka_mobile.yml",
+    ]
+    return robot_list
+
+
+def get_robot_list() -> List[str]:
+    return get_motion_gen_robot_list()
+
+
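The path helpers above are typically composed as in the sketch below, which mirrors the pattern used by `robot_world.py` and `ik_solver.py` later in this diff:

```python
# Illustrative: resolve and load a packaged robot configuration file.
from curobo.util_file import get_robot_configs_path, join_path, load_yaml

robot_cfg_dict = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"]
```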
+def get_multi_arm_robot_list() -> List[str]: + robot_list = [ + "dual_ur10e.yml", + "tri_ur10e.yml", + "quad_ur10e.yml", + ] + return robot_list diff --git a/src/curobo/wrap/__init__.py b/src/curobo/wrap/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/wrap/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# diff --git a/src/curobo/wrap/model/__init__.py b/src/curobo/wrap/model/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/wrap/model/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# diff --git a/src/curobo/wrap/model/curobo_robot_world.py b/src/curobo/wrap/model/curobo_robot_world.py new file mode 100644 index 00000000..cc75c829 --- /dev/null +++ b/src/curobo/wrap/model/curobo_robot_world.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +"""This module has differentiable layers built from CuRobo's core features for use in Pytorch. """ + +# Standard Library +from dataclasses import dataclass + +# CuRobo +from curobo.util.logger import log_warn +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + + +@dataclass +class CuroboRobotWorldConfig(RobotWorldConfig): + def __post_init__(self): + log_warn("CuroboRobotWorldConfig is deprecated, use RobotWorldConfig instead") + return super().__post_init__() + + +class CuroboRobotWorld(RobotWorld): + def __init__(self, config: CuroboRobotWorldConfig): + log_warn("CuroboRobotWorld is deprecated, use RobotWorld instead") + return super().__init__(config) diff --git a/src/curobo/wrap/model/robot_world.py b/src/curobo/wrap/model/robot_world.py new file mode 100644 index 00000000..69bbfa08 --- /dev/null +++ b/src/curobo/wrap/model/robot_world.py @@ -0,0 +1,359 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +"""This module has differentiable layers built from CuRobo's core features for use in Pytorch. """ + +# Standard Library +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +# Third Party +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.cuda_robot_model.types import CudaRobotModelState +from curobo.geom.sdf.utils import create_collision_checker +from curobo.geom.sdf.world import CollisionCheckerType, WorldCollision, WorldCollisionConfig +from curobo.geom.types import WorldConfig +from curobo.rollout.cost.bound_cost import BoundCost, BoundCostConfig, BoundCostType +from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig, PoseErrorType +from curobo.rollout.cost.primitive_collision_cost import ( + PrimitiveCollisionCost, + PrimitiveCollisionCostConfig, +) +from curobo.rollout.cost.self_collision_cost import SelfCollisionCost, SelfCollisionCostConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util.sample_lib import HaltonGenerator +from curobo.util.warp import init_warp +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml + + +@dataclass +class RobotWorldConfig: + kinematics: CudaRobotModel + sampler: HaltonGenerator + bound_scale: torch.Tensor + bound_cost: BoundCost + pose_cost: PoseCost + self_collision_cost: Optional[SelfCollisionCost] = None + collision_cost: Optional[PrimitiveCollisionCost] = None + collision_constraint: Optional[PrimitiveCollisionCost] = None + world_model: Optional[WorldCollision] = None + rejection_ratio: int = 10 + tensor_args: TensorDeviceType = TensorDeviceType() + contact_distance: float = 0.0 + + @staticmethod + def load_from_config( + robot_config: Union[RobotConfig, str] = "franka.yml", + world_model: Union[None, str, Dict, WorldConfig, List[WorldConfig], List[str]] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + n_envs: int = 1, + n_meshes: int = 10, + n_cuboids: int = 10, + collision_activation_distance: float = 0.2, + self_collision_activation_distance: float = 0.0, + max_collision_distance: float = 1.0, + collision_checker_type: CollisionCheckerType = CollisionCheckerType.MESH, + world_collision_checker: Optional[WorldCollision] = None, + pose_weight: List[float] = [1, 1, 1, 1], + ): + init_warp(tensor_args=tensor_args) + world_collision_cost = self_collision_cost = world_collision_constraint = None + if isinstance(robot_config, str): + robot_config = load_yaml(join_path(get_robot_configs_path(), robot_config))["robot_cfg"] + if isinstance(robot_config, Dict): + robot_config = RobotConfig.from_dict(robot_config, tensor_args) + kinematics = CudaRobotModel(robot_config.kinematics) + + if isinstance(world_model, str): + world_model = load_yaml(join_path(get_world_configs_path(), world_model)) + if isinstance(world_model, List): + if isinstance(world_model[0], str): + world_model = [ + load_yaml(join_path(get_world_configs_path(), x)) for x in world_model + ] + if world_collision_checker is None and world_model is not None: + world_cfg = WorldCollisionConfig.load_from_dict( + { + "checker_type": collision_checker_type, + "cache": {"mesh": n_meshes, 
"obb": n_cuboids}, + "max_distance": max_collision_distance, + "n_envs": n_envs, + }, + world_model, + tensor_args, + ) + world_collision_checker = create_collision_checker(world_cfg) + + if world_collision_checker is not None: + collision_cost_config = PrimitiveCollisionCostConfig( + tensor_args.to_device([1.0]), + tensor_args, + return_loss=True, + world_coll_checker=world_collision_checker, + activation_distance=collision_activation_distance, + ) + world_collision_cost = PrimitiveCollisionCost(collision_cost_config) + collision_constraint_config = PrimitiveCollisionCostConfig( + tensor_args.to_device([1.0]), + tensor_args, + return_loss=True, + world_coll_checker=world_collision_checker, + activation_distance=0.0, + ) + world_collision_constraint = PrimitiveCollisionCost(collision_constraint_config) + + self_collision_config = SelfCollisionCostConfig( + tensor_args.to_device([1.0]), + tensor_args, + return_loss=True, + self_collision_kin_config=kinematics.get_self_collision_config(), + distance_threshold=self_collision_activation_distance, + ) + + self_collision_cost = SelfCollisionCost(self_collision_config) + bound_config = BoundCostConfig( + tensor_args.to_device([1.0]), + tensor_args, + return_loss=True, + cost_type=BoundCostType.POSITION, + activation_distance=[0.0], + ) + bound_config.set_bounds(kinematics.get_joint_limits(), teleport_mode=True) + + bound_cost = BoundCost(bound_config) + sample_gen = HaltonGenerator( + kinematics.get_dof(), + tensor_args, + up_bounds=kinematics.get_joint_limits().position[1], + low_bounds=kinematics.get_joint_limits().position[0], + ) + pose_cost_config = PoseCostConfig( + tensor_args.to_device(pose_weight), + tensor_args=tensor_args, + terminal=False, + return_loss=True, + cost_type=PoseErrorType.BATCH_GOAL, + ) + pose_cost = PoseCost(pose_cost_config) + bound_scale = ( + kinematics.get_joint_limits().position[1] - kinematics.get_joint_limits().position[0] + ).unsqueeze(0) / 2.0 + dist_threshold = 0.0 + if collision_activation_distance > 0.0: + dist_threshold = ( + (0.5 / collision_activation_distance) + * collision_activation_distance + * collision_activation_distance + ) + + return RobotWorldConfig( + kinematics, + sample_gen, + bound_scale, + bound_cost, + pose_cost, + self_collision_cost, + world_collision_cost, + world_collision_constraint, + world_collision_checker, + tensor_args=tensor_args, + contact_distance=dist_threshold, + ) + + +class RobotWorld(RobotWorldConfig): + def __init__(self, config: RobotWorldConfig) -> None: + RobotWorldConfig.__init__(self, **vars(config)) + self._batch_pose_idx = None + + def get_kinematics(self, q: torch.Tensor) -> CudaRobotModelState: + state = self.kinematics.get_state(q) + return state + + def update_world(self, world_config: WorldConfig): + self.world_model.load_collision_model(world_config) + + def get_collision_distance( + self, x_sph: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Get collision distance + + Args: + x_sph (torch.Tensor): batch, horizon, n_spheres, 4 + + Returns: + torch.Tensor: _description_ + """ + d = self.collision_cost.forward(x_sph, env_query_idx=env_query_idx) + return d + + def get_collision_constraint( + self, x_sph: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """Get collision distance + + Args: + x_sph (torch.Tensor): batch, horizon, n_spheres, 4 + + Returns: + torch.Tensor: _description_ + """ + d = self.collision_constraint.forward(x_sph, env_query_idx=env_query_idx) + return d + + def 
get_self_collision_distance(self, x_sph: torch.Tensor) -> torch.Tensor: + return self.get_self_collision(x_sph) + + def get_self_collision(self, x_sph: torch.Tensor) -> torch.Tensor: + d = self.self_collision_cost.forward(x_sph) + return d + + def get_collision_vector( + self, x_sph: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None + ): + """ + NOTE: This function is not differentiable + """ + x_sph.requires_grad = True + d = self.collision_cost.forward(x_sph, env_query_idx=env_query_idx) + vec = self.collision_cost.get_gradient_buffer() + d = d.detach() + x_sph.requires_grad = False + x_sph.grad = None + return d, vec + + def get_world_self_collision_distance_from_joints( + self, + q: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + q : batch, dof + """ + state = self.get_kinematics(q) + d_world = self.get_collision_distance( + state.link_spheres_tensor.unsqueeze(1), env_query_idx=env_query_idx + ).squeeze(1) + d_self = self.get_self_collision_distance(state.link_spheres_tensor.unsqueeze(1)).squeeze(1) + return d_world, d_self + + def get_world_self_collision_distance_from_joint_trajectory( + self, + q: torch.Tensor, + env_query_idx: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + q : batch, horizon, dof + """ + b, h, dof = q.shape + state = self.get_kinematics(q.view(b * h, dof)) + x_sph = state.link_spheres_tensor.view(b, h, -1, 4) + d_world = self.get_collision_distance(x_sph, env_query_idx=env_query_idx) + d_self = self.get_self_collision_distance(x_sph) + return d_world, d_self + + def get_bound(self, q: torch.Tensor) -> torch.Tensor: + d = self.bound_cost.forward(JointState(position=q)) + return d + + def sample(self, n: int, mask_valid: bool = True, env_query_idx: Optional[torch.Tensor] = None): + """ + This does not support batched environments, use sample_trajectory instead. 
+ """ + n_samples = n + if mask_valid: + n_samples = n * self.rejection_ratio + q = self.sampler.get_samples(n_samples, bounded=True) + if mask_valid: + q_mask = self.validate(q, env_query_idx) + q = q[q_mask][:n] + return q + + def validate(self, q: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None): + """ + This does not support batched environments, use validate_trajectory instead + """ + # run collision, self collision, bounds + b, dof = q.shape + kin_state = self.get_kinematics(q) + spheres = kin_state.link_spheres_tensor.view(b, 1, -1, 4) + d_self = self.get_self_collision(spheres) + d_world = self.get_collision_constraint(spheres, env_query_idx) + d_bound = self.get_bound(q.view(b, 1, dof)) + d_mask = sum_mask(d_self, d_world, d_bound) + return d_mask + + def sample_trajectory( + self, + batch: int, + horizon: int, + mask_valid: bool = True, + env_query_idx: Optional[torch.Tensor] = None, + ): + n_samples = batch * horizon + if mask_valid: + n_samples = batch * horizon * self.rejection_ratio + q = self.sampler.get_samples(n_samples, bounded=True) + q = q.reshape(batch, horizon * self.rejection_ratio, -1) + if mask_valid: + q_mask = self.validate_trajectory(q, env_query_idx) + q = [q[i][q_mask[i, :], :][:horizon, :].unsqueeze(0) for i in range(batch)] + q = torch.cat(q) + return q + + def validate_trajectory(self, q: torch.Tensor, env_query_idx: Optional[torch.Tensor] = None): + """ + q: batch , horizon, dof + env_query_idx: batch, 1 + """ + # run collision, self collision, bounds + b, h, dof = q.shape + q = q.view(b * h, dof) + kin_state = self.get_kinematics(q) + spheres = kin_state.link_spheres_tensor.view(b, h, -1, 4) + d_self = self.get_self_collision(spheres) + d_world = self.get_collision_constraint(spheres, env_query_idx) + d_bound = self.get_bound(q.view(b, h, dof)) + d_mask = mask(d_self, d_world, d_bound) + return d_mask + + def pose_distance(self, x_des: Pose, x_current: Pose): + if len(x_current.position.shape) == 2: + x_current = x_current.unsqueeze(1) + # calculate pose loss: + if ( + self._batch_pose_idx is None + or self._batch_pose_idx.shape[0] != x_current.position.shape[0] + ): + self._batch_pose_idx = torch.arange( + 0, x_current.position.shape[0], 1, device=self.tensor_args.device, dtype=torch.int32 + ) + distance = self.pose_cost.forward_pose(x_des, x_current, self._batch_pose_idx) + return distance + + +@torch.jit.script +def sum_mask(d1, d2, d3): + d_total = d1 + d2 + d3 + d_mask = d_total == 0.0 + return d_mask.view(-1) + + +@torch.jit.script +def mask(d1, d2, d3): + d_total = d1 + d2 + d3 + d_mask = d_total == 0.0 + return d_mask diff --git a/src/curobo/wrap/reacher/__init__.py b/src/curobo/wrap/reacher/__init__.py new file mode 100644 index 00000000..a08745f9 --- /dev/null +++ b/src/curobo/wrap/reacher/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# diff --git a/src/curobo/wrap/reacher/evaluator.py b/src/curobo/wrap/reacher/evaluator.py new file mode 100644 index 00000000..64e958f3 --- /dev/null +++ b/src/curobo/wrap/reacher/evaluator.py @@ -0,0 +1,196 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Standard Library +from dataclasses import dataclass +from typing import Optional + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.types.robot import JointState +from curobo.types.tensor import T_DOF +from curobo.util.trajectory import calculate_dt + + +@dataclass +class TrajEvaluatorConfig: + max_acc: float = 15.0 + max_jerk: float = 500.0 + cost_weight: float = 0.01 + min_dt: float = 0.001 + max_dt: float = 0.1 + + +@torch.jit.script +def compute_path_length(vel, traj_dt, cspace_distance_weight): + pl = torch.sum( + torch.sum(torch.abs(vel) * traj_dt.unsqueeze(-1) * cspace_distance_weight, dim=-1), dim=-1 + ) + return pl + + +@torch.jit.script +def compute_path_length_cost(vel, cspace_distance_weight): + pl = torch.sum(torch.sum(torch.abs(vel) * cspace_distance_weight, dim=-1), dim=-1) + return pl + + +@torch.jit.script +def smooth_cost(abs_acc, abs_jerk, opt_dt): + # acc = torch.max(torch.max(abs_acc, dim=-1)[0], dim=-1)[0] + # jerk = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0] + jerk = torch.mean(torch.max(abs_jerk, dim=-1)[0], dim=-1) + mean_acc = torch.mean(torch.max(abs_acc, dim=-1)[0], dim=-1) # [0] + a = (jerk * 0.001) + opt_dt + (mean_acc * 0.01) + return a + + +@torch.jit.script +def compute_smoothness( + vel: torch.Tensor, + acc: torch.Tensor, + jerk: torch.Tensor, + max_vel: torch.Tensor, + max_acc: float, + max_jerk: float, + traj_dt: torch.Tensor, + min_dt: float, + max_dt: float, +): + # compute scaled dt: + # h = int(vel.shape[-2] / 2) + + max_v_arr = torch.max(torch.abs(vel), dim=-2)[0] # output is batch, dof + scale_dt = (max_v_arr) / (max_vel.view(1, max_v_arr.shape[-1])) # batch,dof + + max_acc_arr = torch.max(torch.abs(acc), dim=-2)[0] # batch, dof + + scale_dt_acc = torch.sqrt(torch.max(max_acc_arr / max_acc, dim=-1)[0]) # batch, 1 + + max_jerk_arr = torch.max(torch.abs(jerk), dim=-2)[0] # batch, dof + + scale_dt_jerk = torch.pow(torch.max(max_jerk_arr / max_jerk, dim=-1)[0], 1.0 / 3.0) # batch, 1 + + dt_score_vel = torch.max(scale_dt, dim=-1)[0] # batch, 1 + + dt_score = torch.maximum(dt_score_vel, scale_dt_acc) + dt_score = torch.maximum(dt_score, scale_dt_jerk) + + # clamp dt score: + + dt_score = torch.clamp(dt_score, min_dt, max_dt) + scale_dt = (1 / dt_score).view(-1, 1, 1) + abs_acc = torch.abs(acc) * (scale_dt**2) + # mean_acc_val = torch.max(torch.mean(abs_acc, dim=-1), dim=-1)[0] + max_acc_val = torch.max(torch.max(abs_acc, dim=-1)[0], dim=-1)[0] + abs_jerk = torch.abs(jerk) * scale_dt**3 + # calculate max mean jerk: + # mean_jerk_val = torch.max(torch.mean(abs_jerk, dim=-1), dim=-1)[0] + max_jerk_val = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0] + acc_label = torch.logical_and(max_acc_val <= max_acc, max_jerk_val <= max_jerk) + # print(max_acc_val, max_jerk_val, dt_score) + return (acc_label, 
smooth_cost(abs_acc, abs_jerk, dt_score)) + + +@torch.jit.script +def compute_smoothness_opt_dt( + vel, acc, jerk, max_vel: torch.Tensor, max_acc: float, max_jerk: float, opt_dt: torch.Tensor +): + abs_acc = torch.abs(acc) + max_acc_val = torch.max(torch.max(abs_acc, dim=-1)[0], dim=-1)[0] + abs_jerk = torch.abs(jerk) + max_jerk_val = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0] + + acc_label = torch.logical_and(max_acc_val <= max_acc, max_jerk_val <= max_jerk) + return acc_label, smooth_cost(abs_acc, abs_jerk, opt_dt) + + +class TrajEvaluator(TrajEvaluatorConfig): + def __init__(self, config: Optional[TrajEvaluatorConfig] = None): + if config is None: + config = TrajEvaluatorConfig() + super().__init__(**vars(config)) + + def _compute_path_length( + self, js: JointState, traj_dt: torch.Tensor, cspace_distance_weight: T_DOF + ): + path_length = compute_path_length(js.velocity, traj_dt, cspace_distance_weight) + return path_length + + def _check_smoothness(self, js: JointState, traj_dt: torch.Tensor, max_vel: torch.Tensor): + if js.jerk is None: + js.jerk = ( + (torch.roll(js.acceleration, -1, -2) - js.acceleration) + * (1 / traj_dt).unsqueeze(-1) + )[..., :-1, :] + + acc_label, max_acc = compute_smoothness( + js.velocity, + js.acceleration, + js.jerk, + max_vel, + self.max_acc, + self.max_jerk, + traj_dt, + self.min_dt, + self.max_dt, + ) + return acc_label, max_acc + + @profiler.record_function("traj_evaluate_smoothness") + def evaluate( + self, + js: JointState, + traj_dt: torch.Tensor, + cspace_distance_weight: T_DOF, + max_vel: torch.Tensor, + ): + label, cost = self._check_smoothness(js, traj_dt, max_vel) + pl_cost = self._compute_path_length(js, traj_dt, cspace_distance_weight) + return label, pl_cost + self.cost_weight * cost + + @profiler.record_function("traj_evaluate_interpolated_smoothness") + def evaluate_interpolated_smootheness( + self, + js: JointState, + opt_dt: torch.Tensor, + cspace_distance_weight: T_DOF, + max_vel: torch.Tensor, + ): + label, cost = compute_smoothness_opt_dt( + js.velocity, js.acceleration, js.jerk, max_vel, self.max_acc, self.max_jerk, opt_dt + ) + label = torch.logical_and(label, opt_dt <= self.max_dt) + pl_cost = compute_path_length_cost(js.velocity, cspace_distance_weight) + return label, pl_cost + self.cost_weight * cost + + def evaluate_from_position( + self, + js: JointState, + traj_dt: torch.Tensor, + cspace_distance_weight: T_DOF, + max_vel: torch.Tensor, + skip_last_tstep: bool = False, + ): + js = js.calculate_fd_from_position(traj_dt) + if skip_last_tstep: + js.position = js.position[..., :-1, :] + js.velocity = js.velocity[..., :-1, :] + js.acceleration = js.acceleration[..., :-1, :] + js.jerk = js.jerk[..., :-1, :] + + return self.evaluate(js, traj_dt, cspace_distance_weight, max_vel) + + def calculate_dt(self, js: JointState, max_vel: torch.Tensor, raw_dt: float): + return calculate_dt(js.velocity, max_vel, raw_dt) diff --git a/src/curobo/wrap/reacher/ik_solver.py b/src/curobo/wrap/reacher/ik_solver.py new file mode 100644 index 00000000..b8f57a04 --- /dev/null +++ b/src/curobo/wrap/reacher/ik_solver.py @@ -0,0 +1,1021 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. 
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Standard Library +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Union + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelState +from curobo.geom.sdf.utils import create_collision_checker +from curobo.geom.sdf.world import CollisionCheckerType, WorldCollision, WorldCollisionConfig +from curobo.geom.types import WorldConfig +from curobo.opt.newton.lbfgs import LBFGSOpt, LBFGSOptConfig +from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig +from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig +from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig +from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig +from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.types.tensor import T_BDOF, T_BValue_bool, T_BValue_float +from curobo.util.logger import log_error, log_warn +from curobo.util.sample_lib import HaltonGenerator +from curobo.util_file import ( + get_robot_configs_path, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, +) +from curobo.wrap.reacher.types import ReacherSolveState, ReacherSolveType +from curobo.wrap.wrap_base import WrapBase, WrapConfig, WrapResult + + +@dataclass +class IKSolverConfig: + robot_config: RobotConfig + solver: WrapBase + num_seeds: int + position_threshold: float + rotation_threshold: float + rollout_fn: ArmReacher + ik_nn_seeder: Optional[str] = None + world_coll_checker: Optional[WorldCollision] = None + sample_rejection_ratio: int = 50 + tensor_args: TensorDeviceType = TensorDeviceType() + + @staticmethod + @profiler.record_function("ik_solver/load_from_robot_config") + def load_from_robot_config( + robot_cfg: Union[str, Dict, RobotConfig], + world_model: Optional[ + Union[Union[List[Dict], List[WorldConfig]], Union[Dict, WorldConfig]] + ] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + num_seeds: int = 100, + position_threshold: float = 0.005, + rotation_threshold: float = 0.05, + world_coll_checker=None, + base_cfg_file: str = "base_cfg.yml", + particle_file: str = "particle_ik.yml", + gradient_file: str = "gradient_ik.yml", + use_cuda_graph: Optional[bool] = None, + self_collision_check: bool = True, + self_collision_opt: bool = True, + grad_iters: Optional[int] = None, + use_particle_opt: bool = True, + collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE, + sync_cuda_time: Optional[bool] = None, + use_gradient_descent: bool = False, + collision_cache: Optional[Dict[str, int]] = None, + n_collision_envs: Optional[int] = None, + ee_link_name: Optional[str] = None, + use_es: Optional[bool] = None, + es_learning_rate: Optional[float] = 0.1, + use_fixed_samples: Optional[bool] = None, + store_debug: bool = False, + regularization: bool = True, + collision_activation_distance: Optional[float] = None, + high_precision: bool = False, + ): + if position_threshold <= 0.001: + high_precision = True + if high_precision: + if grad_iters is None: + grad_iters = 200 + # use 
default values, disable environment collision checking + base_config_data = load_yaml(join_path(get_task_configs_path(), base_cfg_file)) + config_data = load_yaml(join_path(get_task_configs_path(), particle_file)) + grad_config_data = load_yaml(join_path(get_task_configs_path(), gradient_file)) + + if collision_cache is not None: + base_config_data["world_collision_checker_cfg"]["cache"] = collision_cache + if n_collision_envs is not None: + base_config_data["world_collision_checker_cfg"]["n_envs"] = n_collision_envs + + if collision_checker_type is not None: + base_config_data["world_collision_checker_cfg"]["checker_type"] = collision_checker_type + if not self_collision_check: + base_config_data["constraint"]["self_collision_cfg"]["weight"] = 0.0 + self_collision_opt = False + if not regularization: + base_config_data["convergence"]["null_space_cfg"]["weight"] = 0.0 + base_config_data["convergence"]["cspace_cfg"]["weight"] = 0.0 + config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0 + grad_config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0 + if ee_link_name is not None: + if isinstance(robot_cfg, RobotConfig): + raise NotImplementedError("ee link cannot be changed after creating RobotConfig") + + robot_cfg.kinematics.ee_link = ee_link_name + else: + robot_cfg["kinematics"]["ee_link"] = ee_link_name + if isinstance(robot_cfg, str): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + if isinstance(world_model, str): + world_model = load_yaml(join_path(get_world_configs_path(), world_model)) + if world_coll_checker is None and world_model is not None: + world_cfg = WorldCollisionConfig.load_from_dict( + base_config_data["world_collision_checker_cfg"], world_model, tensor_args + ) + world_coll_checker = create_collision_checker(world_cfg) + + if collision_activation_distance is not None: + config_data["cost"]["primitive_collision_cfg"][ + "activation_distance" + ] = collision_activation_distance + grad_config_data["cost"]["primitive_collision_cfg"][ + "activation_distance" + ] = collision_activation_distance + + if store_debug: + use_cuda_graph = False + grad_config_data["lbfgs"]["store_debug"] = store_debug + config_data["mppi"]["store_debug"] = store_debug + grad_config_data["lbfgs"]["inner_iters"] = 1 + if use_cuda_graph is not None: + config_data["mppi"]["use_cuda_graph"] = use_cuda_graph + grad_config_data["lbfgs"]["use_cuda_graph"] = use_cuda_graph + if use_fixed_samples is not None: + config_data["mppi"]["sample_params"]["fixed_samples"] = use_fixed_samples + + if not self_collision_opt: + config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 + grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 + if grad_iters is not None: + grad_config_data["lbfgs"]["n_iters"] = grad_iters + config_data["mppi"]["n_envs"] = 1 + grad_config_data["lbfgs"]["n_envs"] = 1 + grad_cfg = ArmReacherConfig.from_dict( + robot_cfg, + grad_config_data["model"], + grad_config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + cfg = ArmReacherConfig.from_dict( + robot_cfg, + config_data["model"], + config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + 
world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + arm_rollout_mppi = ArmReacher(cfg) + arm_rollout_grad = ArmReacher(grad_cfg) + arm_rollout_safety = ArmReacher(grad_cfg) + aux_rollout = ArmReacher(grad_cfg) + + config_dict = ParallelMPPIConfig.create_data_dict( + config_data["mppi"], arm_rollout_mppi, tensor_args + ) + if use_es is not None and use_es: + mppi_cfg = ParallelESConfig(**config_dict) + if es_learning_rate is not None: + mppi_cfg.learning_rate = es_learning_rate + parallel_mppi = ParallelES(mppi_cfg) + else: + mppi_cfg = ParallelMPPIConfig(**config_dict) + parallel_mppi = ParallelMPPI(mppi_cfg) + config_dict = LBFGSOptConfig.create_data_dict( + grad_config_data["lbfgs"], arm_rollout_grad, tensor_args + ) + lbfgs_cfg = LBFGSOptConfig(**config_dict) + + if use_gradient_descent: + newton_keys = NewtonOptConfig.__dataclass_fields__.keys() + newton_d = {} + lbfgs_k = vars(lbfgs_cfg) + for k in newton_keys: + newton_d[k] = lbfgs_k[k] + newton_d["step_scale"] = 0.9 + + newton_cfg = NewtonOptConfig(**newton_d) + lbfgs = NewtonOptBase(newton_cfg) + else: + lbfgs = LBFGSOpt(lbfgs_cfg) + + if use_particle_opt: + opts = [parallel_mppi] + else: + opts = [] + opts.append(lbfgs) + cfg = WrapConfig( + safety_rollout=arm_rollout_safety, + optimizers=opts, + compute_metrics=True, + use_cuda_graph_metrics=grad_config_data["lbfgs"]["use_cuda_graph"], + sync_cuda_time=sync_cuda_time, + ) + ik = WrapBase(cfg) + ik_cfg = IKSolverConfig( + robot_config=robot_cfg, + solver=ik, + num_seeds=num_seeds, + position_threshold=position_threshold, + rotation_threshold=rotation_threshold, + world_coll_checker=world_coll_checker, + rollout_fn=aux_rollout, + tensor_args=tensor_args, + ) + return ik_cfg + + +@dataclass +class IKResult(Sequence): + js_solution: JointState + goal_pose: Pose + solution: T_BDOF + seed: T_BDOF + success: T_BValue_bool + position_error: T_BValue_float + + #: rotation error is computed as \sqrt(q_des^T * q) + rotation_error: T_BValue_float + error: T_BValue_float + solve_time: float + debug_info: Optional[Any] = None + + def __getitem__(self, idx): + success = self.success[idx] + + return IKResult( + js_solution=self.js_solution[idx], + goal_pose=self.goal_pose[idx], + solution=self.solution[idx], + success=success, + seed=self.seed[idx], + position_error=self.position_error[idx], + rotation_error=self.rotation_error[idx], + debug_info=self.debug_info, + ) + + def __len__(self): + return self.seed.shape[0] + + def get_unique_solution(self, roundoff_decimals: int = 2) -> torch.Tensor: + in_solution = self.solution[self.success] + r_sol = torch.round(in_solution, decimals=roundoff_decimals) + + if not (len(in_solution.shape) == 2): + log_error("Solution shape is not of length 2") + + s, i = torch.unique(r_sol, dim=-2, return_inverse=True) + sol = in_solution[i[: s.shape[0]]] + + return sol + + def get_batch_unique_solution(self, roundoff_decimals: int = 2) -> List[torch.Tensor]: + in_solution = self.solution + r_sol = torch.round(in_solution, decimals=roundoff_decimals) + if not (len(in_solution.shape) == 3): + log_error("Solution shape is not of length 3") + + # do a for loop and return list of tensors + sol = [] + for k in range(in_solution.shape[0]): + # filter by success: + in_succ = in_solution[k][self.success[k]] + r_k = r_sol[k][self.success[k]] + s, i = torch.unique(r_k, dim=-2, return_inverse=True) + sol.append(in_succ[i[: s.shape[0]]]) + # sol.append(s) + return sol + + +class IKSolver(IKSolverConfig): + def __init__(self, config: IKSolverConfig) -> None: 
+ super().__init__(**vars(config)) + # self._solve_ + self.batch_size = -1 + self._num_seeds = self.num_seeds + self.init_state = JointState.from_position( + self.solver.rollout_fn.retract_state.unsqueeze(0) + ) + self.dof = self.solver.safety_rollout.d_action + self._col = torch.arange(0, 1, device=self.tensor_args.device, dtype=torch.long) + + # self.fixed_seeds = self.solver.safety_rollout.sample_random_actions(100 * 200) + # create random seeder: + self.q_sample_gen = HaltonGenerator( + self.dof, + self.tensor_args, + up_bounds=self.solver.safety_rollout.action_bound_highs, + low_bounds=self.solver.safety_rollout.action_bound_lows, + seed=1531, + # store_buffer=1000, + ) + + # load ik nn: + + # store og outer iters: + self.og_newton_iters = self.solver.newton_optimizer.outer_iters + self._goal_buffer = Goal() + self._solve_state = None + self._kin_list = None + self._rollout_list = None + + def update_goal_buffer( + self, + solve_state: ReacherSolveState, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> Goal: + self._solve_state, self._goal_buffer, update_reference = solve_state.update_goal_buffer( + goal_pose, + None, + retract_config, + link_poses, + self._solve_state, + self._goal_buffer, + self.tensor_args, + ) + # print("Goal:", self._goal_buffer.links_goal_pose) + if update_reference: + self.solver.update_nenvs(self._solve_state.get_ik_batch_size()) + self.reset_cuda_graph() + self._goal_buffer.current_state = self.init_state.repeat_seeds(goal_pose.batch) + self._col = torch.arange( + 0, + self._goal_buffer.goal_pose.batch, + device=self.tensor_args.device, + dtype=torch.long, + ) + + return self._goal_buffer + + def solve_any( + self, + solve_type: ReacherSolveType, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if solve_type == ReacherSolveType.SINGLE: + return self.solve_single( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + link_poses, + ) + elif solve_type == ReacherSolveType.GOALSET: + return self.solve_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + elif solve_type == ReacherSolveType.BATCH: + return self.solve_batch( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + link_poses, + ) + elif solve_type == ReacherSolveType.BATCH_GOALSET: + return self.solve_batch_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + elif solve_type == ReacherSolveType.BATCH_ENV: + return self.solve_batch_env( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + elif solve_type == ReacherSolveType.BATCH_ENV_GOALSET: + return self.solve_batch_env_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + + def solve_single( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> 
IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.SINGLE, num_ik_seeds=num_seeds, batch_size=1, n_envs=1, n_goalset=1 + ) + + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + newton_iters, + link_poses=link_poses, + ) + + def solve_goalset( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.GOALSET, + num_ik_seeds=num_seeds, + batch_size=1, + n_envs=1, + n_goalset=goal_pose.n_goalset, + ) + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + newton_iters, + link_poses=link_poses, + ) + + def solve_batch( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.BATCH, + num_ik_seeds=num_seeds, + batch_size=goal_pose.batch, + n_envs=1, + n_goalset=1, + ) + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + newton_iters, + link_poses=link_poses, + ) + + def solve_batch_goalset( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.BATCH_GOALSET, + num_ik_seeds=num_seeds, + batch_size=goal_pose.batch, + n_envs=1, + n_goalset=goal_pose.n_goalset, + ) + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + newton_iters, + link_poses=link_poses, + ) + + def solve_batch_env( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.BATCH_ENV, + num_ik_seeds=num_seeds, + batch_size=goal_pose.batch, + n_envs=goal_pose.batch, + n_goalset=1, + ) + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + 
newton_iters, + link_poses=link_poses, + ) + + def solve_batch_env_goalset( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + if num_seeds is None: + num_seeds = self.num_seeds + if return_seeds > num_seeds: + num_seeds = return_seeds + + solve_state = ReacherSolveState( + ReacherSolveType.BATCH_ENV_GOALSET, + num_ik_seeds=num_seeds, + batch_size=goal_pose.batch, + n_envs=goal_pose.batch, + n_goalset=goal_pose.n_goalset, + ) + return self.solve_from_solve_state( + solve_state, + goal_pose, + num_seeds, + retract_config, + seed_config, + return_seeds, + use_nn_seed, + newton_iters, + link_poses=link_poses, + ) + + def solve_from_solve_state( + self, + solve_state: ReacherSolveState, + goal_pose: Pose, + num_seeds: int, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + # create goal buffer: + goal_buffer = self.update_goal_buffer(solve_state, goal_pose, retract_config, link_poses) + coord_position_seed = self.get_seed( + num_seeds, goal_buffer.goal_pose, use_nn_seed, seed_config + ) + + if newton_iters is not None: + self.solver.newton_optimizer.outer_iters = newton_iters + self.solver.reset() + result = self.solver.solve(goal_buffer, coord_position_seed) + if newton_iters is not None: + self.solver.newton_optimizer.outer_iters = self.og_newton_iters + ik_result = self.get_result(num_seeds, result, goal_buffer.goal_pose, return_seeds) + + return ik_result + + @profiler.record_function("ik/get_result") + def get_result( + self, num_seeds: int, result: WrapResult, goal_pose: Pose, return_seeds: int + ) -> IKResult: + success = self.get_success(result.metrics, num_seeds=num_seeds) + if result.metrics.cost is not None: + result.metrics.pose_error += result.metrics.cost + if result.metrics.cspace_error is not None: + result.metrics.pose_error += result.metrics.cspace_error + + q_sol, success, position_error, rotation_error, total_error = get_result( + result.metrics.pose_error, + result.metrics.position_error, + result.metrics.rotation_error, + success, + result.action.position, + self._col, + goal_pose.batch, + return_seeds, + num_seeds, + ) + # check if locked joints exist and create js solution: + + new_js = JointState(q_sol, joint_names=self.rollout_fn.kinematics.joint_names) + sol_js = self.rollout_fn.get_full_dof_from_solution(new_js) + # reindex success to get successful poses? 
+ ik_result = IKResult( + success=success, + goal_pose=goal_pose, + solution=q_sol, + seed=None, + js_solution=sol_js, + # seed=coord_position_seed[idx].view(goal_pose.batch, return_seeds, -1).detach(), + position_error=position_error, + rotation_error=rotation_error, + solve_time=result.solve_time, + error=total_error, + debug_info={"solver": result.debug}, + ) + return ik_result + + @profiler.record_function("ik/get_seed") + def get_seed( + self, num_seeds: int, goal_pose: Pose, use_nn_seed, seed_config: Optional[T_BDOF] = None + ) -> torch.Tensor: + if seed_config is None: + coord_position_seed = self.generate_seed( + num_seeds=num_seeds, + batch=goal_pose.batch, + use_nn_seed=use_nn_seed, + pose=goal_pose, + ) + elif seed_config.shape[1] < num_seeds: + coord_position_seed = self.generate_seed( + num_seeds=num_seeds - seed_config.shape[1], + batch=goal_pose.batch, + use_nn_seed=use_nn_seed, + pose=goal_pose, + ) + coord_position_seed = torch.cat((seed_config, coord_position_seed), dim=1) + else: + coord_position_seed = seed_config + coord_position_seed = coord_position_seed.view(-1, 1, self.dof) + return coord_position_seed + + def solve( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + ) -> IKResult: # pragma : no cover + """Ik solver + + Args: + goal_pose (Pose): _description_ + retract_config (Optional[T_BDOF], optional): _description_. Defaults to None. + seed_config (Optional[T_BDOF], optional): _description_. Defaults to None. + return_seeds (int, optional): _description_. Defaults to 1. + num_seeds (Optional[int], optional): _description_. Defaults to None. + use_nn_seed (bool, optional): _description_. Defaults to True. + newton_iters (Optional[int], optional): _description_. Defaults to None. + + Returns: + IKResult: _description_ + """ + log_warn("IKSolver.solve() is deprecated, use solve_single() or others instead") + if goal_pose.batch == 1 and goal_pose.n_goalset == 1: + return self.solve_single( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + if goal_pose.batch > 1 and goal_pose.n_goalset == 1: + return self.solve_batch( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + if goal_pose.batch > 1 and goal_pose.n_goalset > 1: + return self.solve_batch_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + if goal_pose.batch == 1 and goal_pose.n_goalset > 1: + return self.solve_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + + def batch_env_solve( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + ) -> IKResult: # pragma : no cover + """Ik solver + + Args: + goal_pose (Pose): _description_ + retract_config (Optional[T_BDOF], optional): _description_. Defaults to None. + seed_config (Optional[T_BDOF], optional): _description_. Defaults to None. + return_seeds (int, optional): _description_. Defaults to 1. + num_seeds (Optional[int], optional): _description_. Defaults to None. + use_nn_seed (bool, optional): _description_. Defaults to True. 
+ newton_iters (Optional[int], optional): _description_. Defaults to None. + + Returns: + IKResult: _description_ + """ + log_warn( + "IKSolver.batch_env_solve() is deprecated, use solve_batch_env() or others instead" + ) + if goal_pose.n_goalset == 1: + return self.solve_batch_env( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + if goal_pose.n_goalset > 1: + return self.solve_batch_env_goalset( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + + @torch.no_grad() + @profiler.record_function("ik/get_success") + def get_success(self, metrics: RolloutMetrics, num_seeds: int) -> torch.Tensor: + success = get_success( + metrics.feasible, + metrics.position_error, + metrics.rotation_error, + num_seeds, + self.position_threshold, + self.rotation_threshold, + ) + + return success + + @torch.no_grad() + @profiler.record_function("ik/generate_seed") + def generate_seed( + self, + num_seeds: int, + batch: int, + use_nn_seed: bool = False, + pose: Optional[Pose] = None, + ) -> torch.Tensor: + """Generate seeds for a batch. Given a pose set, this will create all + the seeds: [batch + batch*random_restarts] + + Args: + batch (int, optional): [description]. Defaults to 1. + num_seeds (Optional[int], optional): [description]. Defaults to None. + + Returns: + [type]: [description] + """ + num_random_seeds = num_seeds + seed_list = [] + if use_nn_seed and self.ik_nn_seeder is not None: + num_random_seeds = num_seeds - 1 + in_data = torch.cat( + (pose.position, pose.quaternion), dim=-1 + ) # .to(dtype=torch.float32) + nn_seed = self.ik_nn_seeder(in_data).view( + batch, 1, self.dof + ) # .to(dtype=self.tensor_args.dtype) + seed_list.append(nn_seed) + # print("using nn seed") + if num_random_seeds > 0: + random_seed = self.q_sample_gen.get_gaussian_samples(num_random_seeds * batch).view( + batch, num_random_seeds, self.dof + ) + + # random_seed = self.fixed_seeds[:num_random_seeds*batch].view(batch, num_random_seeds, + # self.solver.safety_rollout.d_action) + + seed_list.append(random_seed) + coord_position_seed = torch.cat(seed_list, dim=1) + return coord_position_seed + + def update_world(self, world: WorldConfig) -> bool: + self.world_coll_checker.load_collision_model(world) + return True + + def reset_seed(self) -> None: + self.q_sample_gen.reset() + + def check_constraints(self, q: JointState) -> RolloutMetrics: + metrics = self.rollout_fn.rollout_constraint(q.position.unsqueeze(1)) + return metrics + + def sample_configs(self, n: int, use_batch_env=False) -> torch.Tensor: + """ + Only works for environment=0 + """ + samples = self.rollout_fn.sample_random_actions(n * self.sample_rejection_ratio) + metrics = self.rollout_fn.rollout_constraint( + samples.unsqueeze(1), use_batch_env=use_batch_env + ) + return samples[metrics.feasible.squeeze()][:n] + + @property + def kinematics(self) -> CudaRobotModel: + return self.rollout_fn.dynamics_model.robot_model + + def get_all_rollout_instances(self) -> List[RolloutBase]: + if self._rollout_list is None: + self._rollout_list = [self.rollout_fn] + self.solver.get_all_rollout_instances() + return self._rollout_list + + def get_all_kinematics_instances(self) -> List[CudaRobotModel]: + if self._kin_list is None: + self._kin_list = [ + i.dynamics_model.robot_model for i in self.get_all_rollout_instances() + ] + return self._kin_list + + def fk(self, q: torch.Tensor) -> CudaRobotModelState: + return self.kinematics.get_state(q) + + def 
diff --git a/src/curobo/wrap/reacher/motion_gen.py b/src/curobo/wrap/reacher/motion_gen.py
new file mode 100644
index 00000000..835819a7
--- /dev/null
+++ b/src/curobo/wrap/reacher/motion_gen.py
@@ -0,0 +1,2391 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+
+from __future__ import annotations
+
+# Standard Library
+import math
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+# Third Party
+import numpy as np
+import torch
+import torch.autograd.profiler as profiler
+import warp as wp
+
+# CuRobo
+from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel
+from curobo.geom.sdf.utils import create_collision_checker
+from curobo.geom.sdf.world import CollisionCheckerType, WorldCollision, WorldCollisionConfig
+from curobo.geom.sphere_fit import SphereFitType
+from curobo.geom.types import Cuboid, Obstacle, WorldConfig
+from curobo.graph.graph_base import GraphConfig, GraphPlanBase, GraphResult
+from curobo.graph.prm import PRMStar
+from curobo.rollout.dynamics_model.kinematic_model import KinematicModelState
+from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics
+from curobo.types.base import TensorDeviceType
+from curobo.types.camera import CameraObservation
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.types.state import JointState
+from curobo.types.tensor import T_BDOF, T_BValue_bool, T_BValue_float
+from curobo.util.logger import log_error, log_info, log_warn
+from curobo.util.tensor_util import tensor_repeat_seeds
+from curobo.util.trajectory import InterpolateType
+from curobo.util.warp import init_warp
+from curobo.util_file import (
+    get_robot_configs_path,
+    get_task_configs_path,
+    get_world_configs_path,
+    join_path,
+    load_yaml,
+)
+from curobo.wrap.reacher.evaluator import TrajEvaluator, TrajEvaluatorConfig
+from curobo.wrap.reacher.ik_solver import IKResult, IKSolver, IKSolverConfig
+from curobo.wrap.reacher.trajopt import TrajOptResult, TrajOptSolver, TrajOptSolverConfig
+from curobo.wrap.reacher.types import ReacherSolveState, ReacherSolveType
+
+
+@dataclass
+class MotionGenConfig:
+    """Configuration for running motion generation."""
+
+    #: number of IK seeds to run per query problem.
+    ik_seeds: int
+
+    #: number of graph planning seeds to use per query problem.
+    graph_seeds: int
+
+    #: number of trajectory optimization seeds to use per query problem.
+    trajopt_seeds: int
+
+    #: number of noisy seeds to run per trajectory optimization seed.
+    noisy_trajopt_seeds: int
+
+    #: number of IK seeds to use for batched queries.
+    batch_ik_seeds: int
+
+    #: number of trajectory optimization seeds to use for batched queries.
+    batch_trajopt_seeds: int
+
+    #: instance of robot configuration shared across all solvers.
+    robot_cfg: RobotConfig
+
+    #: instance of IK solver to use for motion generation.
+    ik_solver: IKSolver
+
+    #: instance of graph planner to use.
+    graph_planner: GraphPlanBase
+
+    #: instance of trajectory optimization solver to use for reaching Cartesian poses.
+    trajopt_solver: TrajOptSolver
+
+    #: instance of trajectory optimization solver to use for reaching joint space targets.
+    js_trajopt_solver: TrajOptSolver
+
+    #: instance of trajectory optimization solver for final fine tuning.
+    finetune_trajopt_solver: TrajOptSolver
+
+    #: interpolation to use for getting dense waypoints from optimized solution.
+    interpolation_type: InterpolateType
+
+    #: maximum number of steps to interpolate.
+    interpolation_steps: int
+
+    #: instance of world collision checker.
+    world_coll_checker: WorldCollision
+
+    #: device to load motion generation.
+ tensor_args: TensorDeviceType + + #: number of IK iterations to run for initializing trajectory optimization + partial_ik_iters: int + + #: number of iterations to run trajectory optimization when seeded from a graph plan. + graph_trajopt_iters: Optional[int] = None + + #: store debugging information in MotionGenResult + store_debug_in_result: bool = False + + #: interpolation dt to use for output trajectory. + interpolation_dt: float = 0.01 + + #: scale initial dt by this value to finetune trajectory optimization. + finetune_dt_scale: float = 1.05 + + @staticmethod + @profiler.record_function("motion_gen_config/load_from_robot_config") + def load_from_robot_config( + robot_cfg: Union[Union[str, Dict], RobotConfig], + world_model: Optional[Union[Union[str, Dict], WorldConfig]] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + num_ik_seeds: int = 30, + num_graph_seeds: int = 1, + num_trajopt_seeds: int = 12, + num_batch_ik_seeds: int = 30, + num_batch_trajopt_seeds: int = 1, + num_trajopt_noisy_seeds: int = 1, + position_threshold: float = 0.005, + rotation_threshold: float = 0.05, + cspace_threshold: float = 0.05, + world_coll_checker=None, + base_cfg_file: str = "base_cfg.yml", + particle_ik_file: str = "particle_ik.yml", + gradient_ik_file: str = "gradient_ik.yml", + graph_file: str = "graph.yml", + particle_trajopt_file: str = "particle_trajopt.yml", + gradient_trajopt_file: str = "gradient_trajopt.yml", + finetune_trajopt_file: str = "finetune_trajopt.yml", + trajopt_tsteps: int = 32, + interpolation_steps: int = 5000, + interpolation_dt: float = 0.02, + interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA, + use_cuda_graph: Optional[bool] = None, + self_collision_check: bool = True, + self_collision_opt: bool = True, + grad_trajopt_iters: Optional[int] = None, + trajopt_seed_ratio: Dict[str, int] = {"linear": 1.0, "bias": 0.0}, + ik_opt_iters: Optional[int] = None, + ik_particle_opt: bool = True, + collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE, + sync_cuda_time: Optional[bool] = None, + trajopt_particle_opt: bool = True, + traj_evaluator_config: Optional[TrajEvaluatorConfig] = None, + traj_evaluator: Optional[TrajEvaluator] = None, + minimize_jerk: bool = True, + filter_robot_command: bool = True, + use_gradient_descent: bool = False, + collision_cache: Optional[Dict[str, int]] = None, + n_collision_envs: Optional[int] = None, + ee_link_name: Optional[str] = None, + use_es_ik: Optional[bool] = None, + use_es_trajopt: Optional[bool] = None, + es_ik_learning_rate: float = 1.0, + es_trajopt_learning_rate: float = 1.0, + use_ik_fixed_samples: Optional[bool] = None, + use_trajopt_fixed_samples: Optional[bool] = None, + evaluate_interpolated_trajectory: bool = True, + partial_ik_iters: int = 2, + fixed_iters_trajopt: Optional[bool] = None, + store_ik_debug: bool = False, + store_trajopt_debug: bool = False, + graph_trajopt_iters: Optional[int] = None, + collision_max_outside_distance: Optional[float] = None, + collision_activation_distance: Optional[float] = None, + trajopt_dt: Optional[float] = None, + js_trajopt_dt: Optional[float] = None, + js_trajopt_tsteps: Optional[int] = None, + trim_steps: Optional[List[int]] = None, + store_debug_in_result: bool = False, + finetune_trajopt_iters: Optional[int] = None, + smooth_weight: List[float] = None, + finetune_smooth_weight: Optional[List[float]] = None, + state_finite_difference_mode: Optional[str] = None, + finetune_dt_scale: float = 1.05, + maximum_trajectory_time: 
Optional[float] = None,
+        maximum_trajectory_dt: float = 0.1,
+        velocity_scale: Optional[Union[List[float], float]] = None,
+        acceleration_scale: Optional[Union[List[float], float]] = None,
+        jerk_scale: Optional[Union[List[float], float]] = None,
+    ):
+        """Load motion generation configuration from robot and world configurations.
+
+        Args:
+            robot_cfg (Union[Union[str, Dict], RobotConfig]): robot configuration to load.
+            world_model (Optional[Union[Union[str, Dict], WorldConfig]], optional): world configuration to load. Defaults to None.
+            tensor_args (TensorDeviceType, optional): device and precision to run motion generation on. Defaults to TensorDeviceType().
+            num_ik_seeds (int, optional): number of IK seeds to run per query problem. Defaults to 30.
+            num_graph_seeds (int, optional): number of graph planning seeds to use per query problem. Defaults to 1.
+            num_trajopt_seeds (int, optional): number of trajectory optimization seeds per query problem. Defaults to 12.
+            num_batch_ik_seeds (int, optional): number of IK seeds to use for batched queries. Defaults to 30.
+            num_batch_trajopt_seeds (int, optional): number of trajectory optimization seeds for batched queries. Defaults to 1.
+            num_trajopt_noisy_seeds (int, optional): number of noisy copies to run per trajectory optimization seed. Defaults to 1.
+            position_threshold (float, optional): position error in meters below which a plan is a success. Defaults to 0.005.
+            rotation_threshold (float, optional): rotation error below which a plan is a success. Defaults to 0.05.
+            cspace_threshold (float, optional): joint space error below which a joint target plan is a success. Defaults to 0.05.
+            world_coll_checker (WorldCollision, optional): pre-built world collision checker to share; created from world_model when None. Defaults to None.
+            base_cfg_file (str, optional): base configuration file shared by all solvers. Defaults to "base_cfg.yml".
+            particle_ik_file (str, optional): configuration file for particle-based IK optimization. Defaults to "particle_ik.yml".
+            gradient_ik_file (str, optional): configuration file for gradient-based IK optimization. Defaults to "gradient_ik.yml".
+            graph_file (str, optional): configuration file for the graph planner. Defaults to "graph.yml".
+            particle_trajopt_file (str, optional): configuration file for particle-based trajectory optimization. Defaults to "particle_trajopt.yml".
+            gradient_trajopt_file (str, optional): configuration file for gradient-based trajectory optimization. Defaults to "gradient_trajopt.yml".
+            finetune_trajopt_file (str, optional): configuration file for the fine tuning pass of trajectory optimization. Defaults to "finetune_trajopt.yml".
+            trajopt_tsteps (int, optional): number of timesteps to use in trajectory optimization. Defaults to 32.
+            interpolation_steps (int, optional): maximum number of interpolated waypoints in the output trajectory. Defaults to 5000.
+            interpolation_dt (float, optional): dt between interpolated waypoints. Defaults to 0.02.
+            interpolation_type (InterpolateType, optional): interpolation scheme for densifying the optimized trajectory. Defaults to InterpolateType.LINEAR_CUDA.
+            use_cuda_graph (Optional[bool], optional): capture solver iterations in a CUDA graph for faster solves. Defaults to None.
+            self_collision_check (bool, optional): enable self collision checking. Defaults to True.
+            self_collision_opt (bool, optional): use a self collision cost during optimization. Defaults to True.
+            grad_trajopt_iters (Optional[int], optional): override iterations for gradient trajectory optimization. Defaults to None.
+            trajopt_seed_ratio (Dict[str, float], optional): ratio of linear and bias seeds for trajectory optimization. Defaults to {"linear": 1.0, "bias": 0.0}.
+            ik_opt_iters (Optional[int], optional): override iterations for gradient IK. Defaults to None.
+            ik_particle_opt (bool, optional): run particle optimization before gradient IK. Defaults to True.
+            collision_checker_type (Optional[CollisionCheckerType], optional): world collision checker implementation to use. Defaults to CollisionCheckerType.PRIMITIVE.
+            sync_cuda_time (Optional[bool], optional): synchronize CUDA before measuring solve time. Defaults to None.
+            trajopt_particle_opt (bool, optional): run particle optimization before gradient trajectory optimization. Defaults to True.
+            traj_evaluator_config (Optional[TrajEvaluatorConfig], optional): configuration for the trajectory evaluator. Defaults to None.
+            traj_evaluator (Optional[TrajEvaluator], optional): custom trajectory evaluator instance. Defaults to None.
+            minimize_jerk (bool, optional): include jerk minimization in optimization. Defaults to True.
+            filter_robot_command (bool, optional): filter the output command trajectory. Defaults to True.
+            use_gradient_descent (bool, optional): use plain gradient descent instead of the default gradient optimizer. Defaults to False.
+            collision_cache (Optional[Dict[str, int]], optional): number of obstacles to cache per type in the collision checker. Defaults to None.
+            n_collision_envs (Optional[int], optional): number of world environments for batched environment planning. Defaults to None.
+            ee_link_name (Optional[str], optional): override the end effector link name in the robot configuration. Defaults to None.
+            use_es_ik (Optional[bool], optional): use an evolution strategy optimizer for IK. Defaults to None.
+            use_es_trajopt (Optional[bool], optional): use an evolution strategy optimizer for trajectory optimization. Defaults to None.
+            es_ik_learning_rate (float, optional): learning rate for the IK evolution strategy. Defaults to 1.0.
+            es_trajopt_learning_rate (float, optional): learning rate for the trajectory optimization evolution strategy. Defaults to 1.0.
+            use_ik_fixed_samples (Optional[bool], optional): use a fixed sample set in the IK particle optimizer. Defaults to None.
+            use_trajopt_fixed_samples (Optional[bool], optional): use a fixed sample set in the trajectory optimization particle optimizer. Defaults to None.
+            evaluate_interpolated_trajectory (bool, optional): evaluate constraints on the interpolated trajectory instead of only the optimized waypoints. Defaults to True.
+            partial_ik_iters (int, optional): number of IK iterations when only a partial solve is needed to seed trajectory optimization. Defaults to 2.
+            fixed_iters_trajopt (Optional[bool], optional): run trajectory optimization for a fixed iteration count instead of exiting early. Defaults to None.
+            store_ik_debug (bool, optional): store IK debug information in the result. Defaults to False.
+            store_trajopt_debug (bool, optional): store trajectory optimization debug information in the result. Defaults to False.
+            graph_trajopt_iters (Optional[int], optional): trajectory optimization iterations to use when seeded by a graph plan. Defaults to None.
+            collision_max_outside_distance (Optional[float], optional): maximum distance tracked outside obstacle surfaces by the collision checker. Defaults to None.
+            collision_activation_distance (Optional[float], optional): distance from obstacle surfaces at which the collision cost activates. Defaults to None.
+            trajopt_dt (Optional[float], optional): timestep to use during trajectory optimization. Defaults to None.
+            js_trajopt_dt (Optional[float], optional): timestep for joint space trajectory optimization. Defaults to None.
+            js_trajopt_tsteps (Optional[int], optional): number of timesteps for joint space trajectory optimization. Defaults to None.
+            trim_steps (Optional[List[int]], optional): trim the optimized trajectory to this [start, end] step range. Defaults to None.
+            store_debug_in_result (bool, optional): store debugging information in MotionGenResult. Defaults to False.
+            finetune_trajopt_iters (Optional[int], optional): iterations for the fine tuning pass of trajectory optimization. Defaults to None.
+            smooth_weight (List[float], optional): override the smoothness cost weights. Defaults to None.
+            finetune_smooth_weight (Optional[List[float]], optional): override the smoothness cost weights in the fine tuning pass. Defaults to None.
+            state_finite_difference_mode (Optional[str], optional): finite difference scheme used to compute velocity, acceleration, and jerk from positions. Defaults to None.
+            finetune_dt_scale (float, optional): scale the optimized trajectory dt by this value before fine tuning. Defaults to 1.05.
+            maximum_trajectory_time (Optional[float], optional): upper bound on the total trajectory time. Defaults to None.
+            maximum_trajectory_dt (float, optional): upper bound on the trajectory dt. Defaults to 0.1.
+            velocity_scale (Optional[Union[List[float], float]], optional): scale the robot's velocity limits by this value. Defaults to None.
+            acceleration_scale (Optional[Union[List[float], float]], optional): scale the robot's acceleration limits by this value. Defaults to None.
+            jerk_scale (Optional[Union[List[float], float]], optional): scale the robot's jerk limits by this value. Defaults to None.
+
+        Returns:
+            MotionGenConfig: motion generation configuration with all solvers initialized.
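+
+        Example (an illustrative sketch; "franka.yml" and "collision_table.yml" are
+        configuration files packaged with the library, and the values shown are not
+        prescriptive)::
+
+            motion_gen_config = MotionGenConfig.load_from_robot_config(
+                "franka.yml",
+                "collision_table.yml",
+                interpolation_dt=0.02,
+            )
+            motion_gen = MotionGen(motion_gen_config)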
+        """
+
+        init_warp(tensor_args=tensor_args)
+
+        if velocity_scale is not None and isinstance(velocity_scale, float):
+            velocity_scale = [velocity_scale]
+
+        if acceleration_scale is not None and isinstance(acceleration_scale, float):
+            acceleration_scale = [acceleration_scale]
+        if jerk_scale is not None and isinstance(jerk_scale, float):
+            jerk_scale = [jerk_scale]
+
+        if store_ik_debug or store_trajopt_debug:
+            store_debug_in_result = True
+        if (
+            acceleration_scale is not None
+            and min(acceleration_scale) < 1.0
+            and maximum_trajectory_dt <= 0.1
+        ):
+            maximum_trajectory_dt = np.sqrt(1.0 / min(acceleration_scale)) * maximum_trajectory_dt
+        elif (
+            velocity_scale is not None
+            and min(velocity_scale) < 1.0
+            and maximum_trajectory_dt <= 0.1
+        ):
+            maximum_trajectory_dt = np.sqrt(1.0 / min(velocity_scale)) * maximum_trajectory_dt
+
+        if traj_evaluator_config is None:
+            if maximum_trajectory_dt is not None:
+                max_dt = maximum_trajectory_dt
+            if maximum_trajectory_time is not None:
+                max_dt = maximum_trajectory_time / trajopt_tsteps
+            if acceleration_scale is not None:
+                max_dt = max_dt * (1.0 / np.sqrt(min(acceleration_scale)))
+            traj_evaluator_config = TrajEvaluatorConfig(min_dt=interpolation_dt, max_dt=max_dt)
+        if acceleration_scale is not None and min(acceleration_scale) < 0.5:
+            fixed_iters_trajopt = True
+        if velocity_scale is not None and min(velocity_scale) < 0.5:
+            fixed_iters_trajopt = True
+
+        if isinstance(robot_cfg, str):
+            robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"]
+        if isinstance(robot_cfg, RobotConfig):
+            if ee_link_name is not None:
+                log_error("ee link cannot be changed after creating RobotConfig")
+            if acceleration_scale is not None:
+                log_error("acceleration_scale cannot be changed after creating RobotConfig")
+            if velocity_scale is not None:
+                log_error("velocity_scale cannot be changed after creating RobotConfig")
+        else:
+            if ee_link_name is not None:
+                robot_cfg["kinematics"]["ee_link"] = ee_link_name
+            if jerk_scale is not None:
+                robot_cfg["kinematics"]["cspace"]["jerk_scale"] = jerk_scale
+            if acceleration_scale is not None:
+                robot_cfg["kinematics"]["cspace"]["acceleration_scale"] = acceleration_scale
+            if velocity_scale is not None:
+                robot_cfg["kinematics"]["cspace"]["velocity_scale"] = velocity_scale
+
+            robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)
+        traj_evaluator_config.max_acc = (
+            robot_cfg.kinematics.get_joint_limits().acceleration[1, 0].item()
+        )
+        traj_evaluator_config.max_jerk = robot_cfg.kinematics.get_joint_limits().jerk[1, 0].item()
+
+        if isinstance(world_model, str):
+            world_model = load_yaml(join_path(get_world_configs_path(), world_model))
+
+        base_config_data = load_yaml(join_path(get_task_configs_path(), base_cfg_file))
+        if collision_cache is not None:
+            base_config_data["world_collision_checker_cfg"]["cache"] = collision_cache
+        if n_collision_envs is not None:
+            base_config_data["world_collision_checker_cfg"]["n_envs"] = n_collision_envs
+        if collision_max_outside_distance is not None:
+            assert collision_max_outside_distance >= 0.0
+            base_config_data["world_collision_checker_cfg"][
+                "max_distance"
+            ] = collision_max_outside_distance
+        if collision_checker_type is not None:
+            # log_info("updating collision checker type to ",collision_checker_type)
+            base_config_data["world_collision_checker_cfg"]["checker_type"] = collision_checker_type
+        if not self_collision_check:
+            base_config_data["constraint"]["self_collision_cfg"]["weight"] = 0.0
+
+        if 
world_coll_checker is None and world_model is not None: + world_cfg = WorldCollisionConfig.load_from_dict( + base_config_data["world_collision_checker_cfg"], world_model, tensor_args + ) + world_coll_checker = create_collision_checker(world_cfg) + ik_solver_cfg = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_model, + tensor_args, + num_ik_seeds, + position_threshold, + rotation_threshold, + world_coll_checker, + base_config_data, + particle_ik_file, + gradient_ik_file, + use_cuda_graph=use_cuda_graph, + self_collision_check=self_collision_check, + self_collision_opt=self_collision_opt, + grad_iters=ik_opt_iters, + use_particle_opt=ik_particle_opt, + sync_cuda_time=sync_cuda_time, + # use_gradient_descent=use_gradient_descent, + use_es=use_es_ik, + es_learning_rate=es_ik_learning_rate, + use_fixed_samples=use_ik_fixed_samples, + store_debug=store_ik_debug, + collision_activation_distance=collision_activation_distance, + ) + + ik_solver = IKSolver(ik_solver_cfg) + + graph_cfg = GraphConfig.load_from_robot_config( + robot_cfg, + world_model, + tensor_args, + world_coll_checker, + base_config_data, + graph_file, + use_cuda_graph=use_cuda_graph, + ) + graph_cfg.interpolation_dt = interpolation_dt + graph_cfg.interpolation_steps = interpolation_steps + + graph_planner = PRMStar(graph_cfg) + + trajopt_cfg = TrajOptSolverConfig.load_from_robot_config( + robot_cfg=robot_cfg, + world_model=world_model, + tensor_args=tensor_args, + position_threshold=position_threshold, + rotation_threshold=rotation_threshold, + world_coll_checker=world_coll_checker, + base_cfg_file=base_config_data, + particle_file=particle_trajopt_file, + gradient_file=gradient_trajopt_file, + traj_tsteps=trajopt_tsteps, + interpolation_type=interpolation_type, + interpolation_steps=interpolation_steps, + use_cuda_graph=use_cuda_graph, + self_collision_check=self_collision_check, + self_collision_opt=self_collision_opt, + grad_trajopt_iters=grad_trajopt_iters, + num_seeds=num_trajopt_noisy_seeds, + seed_ratio=trajopt_seed_ratio, + interpolation_dt=interpolation_dt, + use_particle_opt=trajopt_particle_opt, + traj_evaluator_config=traj_evaluator_config, + traj_evaluator=traj_evaluator, + use_gradient_descent=use_gradient_descent, + use_es=use_es_trajopt, + es_learning_rate=es_trajopt_learning_rate, + use_fixed_samples=use_trajopt_fixed_samples, + evaluate_interpolated_trajectory=evaluate_interpolated_trajectory, + fixed_iters=fixed_iters_trajopt, + store_debug=store_trajopt_debug, + collision_activation_distance=collision_activation_distance, + trajopt_dt=trajopt_dt, + # trim_steps=trim_steps, + store_debug_in_result=store_debug_in_result, + smooth_weight=smooth_weight, + cspace_threshold=cspace_threshold, + state_finite_difference_mode=state_finite_difference_mode, + filter_robot_command=filter_robot_command, + minimize_jerk=minimize_jerk, + ) + trajopt_solver = TrajOptSolver(trajopt_cfg) + + js_trajopt_cfg = TrajOptSolverConfig.load_from_robot_config( + robot_cfg=robot_cfg, + world_model=world_model, + tensor_args=tensor_args, + position_threshold=position_threshold, + rotation_threshold=rotation_threshold, + world_coll_checker=world_coll_checker, + base_cfg_file=base_config_data, + particle_file=particle_trajopt_file, + gradient_file=gradient_trajopt_file, + traj_tsteps=js_trajopt_tsteps, + interpolation_type=interpolation_type, + interpolation_steps=interpolation_steps, + use_cuda_graph=use_cuda_graph, + self_collision_check=self_collision_check, + self_collision_opt=self_collision_opt, + 
grad_trajopt_iters=grad_trajopt_iters, + # num_seeds=num_trajopt_noisy_seeds, + # seed_ratio=trajopt_seed_ratio, + interpolation_dt=interpolation_dt, + use_particle_opt=trajopt_particle_opt, + traj_evaluator_config=traj_evaluator_config, + traj_evaluator=traj_evaluator, + use_gradient_descent=use_gradient_descent, + use_es=use_es_trajopt, + es_learning_rate=es_trajopt_learning_rate, + use_fixed_samples=use_trajopt_fixed_samples, + evaluate_interpolated_trajectory=evaluate_interpolated_trajectory, + fixed_iters=fixed_iters_trajopt, + store_debug=store_trajopt_debug, + collision_activation_distance=collision_activation_distance, + trajopt_dt=js_trajopt_dt, + store_debug_in_result=store_debug_in_result, + smooth_weight=smooth_weight, + cspace_threshold=cspace_threshold, + state_finite_difference_mode=state_finite_difference_mode, + minimize_jerk=minimize_jerk, + filter_robot_command=filter_robot_command, + ) + js_trajopt_solver = TrajOptSolver(js_trajopt_cfg) + + finetune_trajopt_cfg = TrajOptSolverConfig.load_from_robot_config( + robot_cfg=robot_cfg, + world_model=world_model, + tensor_args=tensor_args, + position_threshold=position_threshold, + rotation_threshold=rotation_threshold, + world_coll_checker=world_coll_checker, + base_cfg_file=base_config_data, + particle_file=particle_trajopt_file, + gradient_file=finetune_trajopt_file, + traj_tsteps=trajopt_tsteps, + interpolation_type=interpolation_type, + interpolation_steps=interpolation_steps, + use_cuda_graph=use_cuda_graph, + self_collision_check=self_collision_check, + self_collision_opt=self_collision_opt, + grad_trajopt_iters=finetune_trajopt_iters, + interpolation_dt=interpolation_dt, + use_particle_opt=False, + traj_evaluator_config=traj_evaluator_config, + traj_evaluator=traj_evaluator, + evaluate_interpolated_trajectory=True, + fixed_iters=fixed_iters_trajopt, + store_debug=store_trajopt_debug, + collision_activation_distance=collision_activation_distance, + trajopt_dt=trajopt_dt, + store_debug_in_result=store_debug_in_result, + smooth_weight=finetune_smooth_weight, + cspace_threshold=cspace_threshold, + state_finite_difference_mode=state_finite_difference_mode, + minimize_jerk=minimize_jerk, + trim_steps=trim_steps, + use_gradient_descent=use_gradient_descent, + filter_robot_command=filter_robot_command, + ) + + finetune_trajopt_solver = TrajOptSolver(finetune_trajopt_cfg) + if graph_trajopt_iters is not None: + graph_trajopt_iters = math.ceil( + graph_trajopt_iters / finetune_trajopt_solver.solver.newton_optimizer.inner_iters + ) + else: + graph_trajopt_iters = finetune_trajopt_solver.solver.newton_optimizer.outer_iters + 2 + return MotionGenConfig( + num_ik_seeds, + num_graph_seeds, + num_trajopt_seeds, + num_trajopt_noisy_seeds, + num_batch_ik_seeds, + num_batch_trajopt_seeds, + robot_cfg, + ik_solver, + graph_planner, + trajopt_solver=trajopt_solver, + js_trajopt_solver=js_trajopt_solver, + finetune_trajopt_solver=finetune_trajopt_solver, + interpolation_type=interpolation_type, + interpolation_steps=interpolation_steps, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + partial_ik_iters=partial_ik_iters, + graph_trajopt_iters=graph_trajopt_iters, + store_debug_in_result=store_debug_in_result, + interpolation_dt=interpolation_dt, + finetune_dt_scale=finetune_dt_scale, + ) + + +@dataclass +class MotionGenResult: + """Result obtained from motion generation.""" + + #: success tensor with index referring to the batch index. 
+    success: Optional[T_BValue_bool] = None
+
+    #: set to False when the query is invalid, e.g., the start state does not satisfy
+    #: constraints (in collision).
+    valid_query: bool = True
+
+    #: optimized trajectory.
+    optimized_plan: Optional[JointState] = None
+
+    #: dt between steps in the optimized plan.
+    optimized_dt: Optional[T_BValue_float] = None
+
+    #: Cartesian position error at final timestep, returning l2 distance.
+    position_error: Optional[T_BValue_float] = None
+
+    #: Cartesian rotation error at final timestep, computed as q^(-1) * q_g.
+    rotation_error: Optional[T_BValue_float] = None
+
+    #: Error in joint configuration, when planning in joint space, returning l2 distance.
+    cspace_error: Optional[T_BValue_float] = None
+
+    #: seconds taken in the optimizer for solving the motion generation problem.
+    solve_time: float = 0.0
+
+    #: seconds taken to solve IK.
+    ik_time: float = 0.0
+
+    #: seconds taken to find graph plan.
+    graph_time: float = 0.0
+
+    #: seconds taken in trajectory optimization.
+    trajopt_time: float = 0.0
+
+    #: seconds to run finetune trajectory optimization.
+    finetune_time: float = 0.0
+
+    #: sum of ik_time, graph_time, and trajopt_time. This also includes any processing between
+    #: calling the different modules.
+    total_time: float = 0.0
+
+    #: interpolated solution, useful for visualization.
+    interpolated_plan: Optional[JointState] = None
+
+    #: dt between steps in interpolated plan.
+    interpolation_dt: float = 0.02
+
+    #: last timestep in interpolated plan per batch index. Only used for batched queries.
+    path_buffer_last_tstep: Optional[List[int]] = None
+
+    #: Debug information.
+    debug_info: Any = None
+
+    #: status of motion generation query; set to "IK Fail", "Graph Fail", or "Opt Fail"
+    #: when the corresponding stage fails.
+    status: Optional[str] = None
+
+    #: number of attempts used before returning a solution.
+    attempts: int = 1
+
+    #: number of trajectory optimization attempts used across retries.
+    trajopt_attempts: int = 0
+
+    #: returns true when a graph plan was used to seed trajectory optimization.
+    used_graph: bool = False
+
+    #: stores graph plan.
+ graph_plan: Optional[JointState] = None + + def clone(self): + m = MotionGenResult( + self.success.clone(), + valid_query=self.valid_query, + optimized_plan=self.optimized_plan.clone() if self.optimized_plan is not None else None, + optimized_dt=self.optimized_dt.clone() if self.optimized_dt is not None else None, + position_error=self.position_error.clone() if self.position_error is not None else None, + rotation_error=self.rotation_error.clone() if self.rotation_error is not None else None, + cspace_error=self.cspace_error.clone() if self.cspace_error is not None else None, + solve_time=self.solve_time, + ik_time=self.ik_time, + graph_time=self.graph_time, + trajopt_time=self.trajopt_time, + total_time=self.total_time, + graph_plan=self.graph_plan.clone() if self.graph_plan is not None else None, + debug_info=self.debug_info, + status=self.status, + attempts=self.attempts, + trajopt_attempts=self.trajopt_attempts, + used_graph=self.used_graph, + path_buffer_last_tstep=self.path_buffer_last_tstep, + interpolated_plan=self.interpolated_plan.clone() + if self.interpolated_plan is not None + else None, + interpolation_dt=self.interpolation_dt, + ) + return m + + @staticmethod + def _check_none_and_copy_idx( + current_tensor: Union[torch.Tensor, JointState, None], + source_tensor: Union[torch.Tensor, JointState, None], + idx: int, + ): + if source_tensor is not None: + if current_tensor is None: + current_tensor = source_tensor.clone() + else: + if isinstance(current_tensor, torch.Tensor) and isinstance( + source_tensor, torch.Tensor + ): + current_tensor[idx] = source_tensor[idx] + elif isinstance(current_tensor, JointState) and isinstance( + source_tensor, JointState + ): + source_state = source_tensor[idx] + current_tensor.copy_at_index(source_state, idx) + + return current_tensor + + def copy_idx(self, idx: torch.Tensor, source_result: MotionGenResult): + idx = idx.to(dtype=torch.long) + # copy data from source result: + self.success[idx] = source_result.success[idx] + + self.optimized_plan = self._check_none_and_copy_idx( + self.optimized_plan, source_result.optimized_plan, idx + ) + self.interpolated_plan = self._check_none_and_copy_idx( + self.interpolated_plan, source_result.interpolated_plan, idx + ) + + self.position_error = self._check_none_and_copy_idx( + self.position_error, source_result.position_error, idx + ) + + self.rotation_error = self._check_none_and_copy_idx( + self.rotation_error, source_result.rotation_error, idx + ) + self.cspace_error = self._check_none_and_copy_idx( + self.cspace_error, source_result.cspace_error, idx + ) + + # NOTE: graph plan will have different shape based on success. 
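+        # (each successful problem can yield a graph plan with a different horizon,
+        # so a per-index tensor copy is skipped; see the commented-out merge below)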
+ # self.graph_plan = self._check_none_and_copy_idx( + # self.graph_plan, source_result.graph_plan, idx + # ) + + idx_list = idx.cpu().tolist() + if source_result.path_buffer_last_tstep is not None: + if self.path_buffer_last_tstep is None: + self.path_buffer_last_tstep = [0 for i in range(len(self.success))] + + for i in idx_list: + self.path_buffer_last_tstep[i] = source_result.path_buffer_last_tstep[i] + + return self + + def get_paths(self) -> List[JointState]: + path = [ + self.interpolated_plan[x].trim_trajectory(0, self.path_buffer_last_tstep[x]) + for x in range(len(self.interpolated_plan)) + ] + return path + + def get_successful_paths(self) -> List[torch.Tensor]: + path = [] + nz_i = torch.nonzero(self.success.view(-1)).view(-1) + path = self.interpolated_plan[nz_i] + path_list = [] + + if self.path_buffer_last_tstep is not None: + for i in range(len(path)): + last_tstep = self.path_buffer_last_tstep[nz_i[i]] + path_list.append(path[i].trim_trajectory(0, last_tstep)) + else: + path_list = [path[i, :, :] for i in range(path.shape[0])] + + return path_list + + def get_interpolated_plan(self) -> JointState: + if self.path_buffer_last_tstep is None: + return self.interpolated_plan + if len(self.path_buffer_last_tstep) > 1: + raise ValueError("only single result is supported") + + return self.interpolated_plan.trim_trajectory(0, self.path_buffer_last_tstep[0]) + + @property + def motion_time(self): + return self.optimized_dt * (self.optimized_plan.position.shape[-2] - 1) + + +@dataclass +class MotionGenPlanConfig: + enable_graph: bool = False + enable_opt: bool = True + use_nn_ik_seed: bool = False + need_graph_success: bool = False + max_attempts: int = 60 + timeout: float = 10.0 + enable_graph_attempt: Optional[int] = 3 + disable_graph_attempt: Optional[int] = None + ik_fail_return: Optional[int] = None + partial_ik_opt: bool = True + num_ik_seeds: Optional[int] = None + num_graph_seeds: Optional[int] = None + num_trajopt_seeds: Optional[int] = None + success_ratio: float = 1 + fail_on_invalid_query: bool = True + #: enables retiming trajectory optimization, useful for getting low jerk trajectories. 
+ enable_finetune_trajopt: bool = True + + def __post_init__(self): + if not self.enable_opt and not self.enable_graph: + log_error("Graph search and Optimization are both disabled, enable one") + + def clone(self) -> MotionGenPlanConfig: + return MotionGenPlanConfig( + enable_graph=self.enable_graph, + enable_opt=self.enable_opt, + use_nn_ik_seed=self.use_nn_ik_seed, + need_graph_success=self.need_graph_success, + max_attempts=self.max_attempts, + timeout=self.timeout, + enable_graph_attempt=self.enable_graph_attempt, + disable_graph_attempt=self.disable_graph_attempt, + ik_fail_return=self.ik_fail_return, + partial_ik_opt=self.partial_ik_opt, + num_ik_seeds=self.num_ik_seeds, + num_graph_seeds=self.num_graph_seeds, + num_trajopt_seeds=self.num_trajopt_seeds, + success_ratio=self.success_ratio, + fail_on_invalid_query=self.fail_on_invalid_query, + enable_finetune_trajopt=self.enable_finetune_trajopt, + ) + + +class MotionGen(MotionGenConfig): + def __init__(self, config: MotionGenConfig): + super().__init__(**vars(config)) + self.rollout_fn = ( + self.graph_planner.safety_rollout_fn + ) # TODO: compute_kinematics fn in rolloutbase + self._trajopt_goal_config = None + self._dof = self.rollout_fn.d_action + self._batch_graph_search_buffer = None + self._batch_path_buffer_last_tstep = None + self._rollout_list = None + self._kin_list = None + self.update_batch_size(seeds=self.trajopt_seeds) + + def update_batch_size(self, seeds=10, batch=1): + if ( + self._trajopt_goal_config is None + or self._trajopt_goal_config.shape[0] != batch + or self._trajopt_goal_config.shape[1] != seeds + ): + self._trajopt_goal_config = torch.zeros( + (batch, seeds, self.rollout_fn.d_action), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + self._batch_col = ( + torch.arange(0, batch, device=self.tensor_args.device, dtype=torch.long) * seeds + ) + if ( + self._batch_graph_search_buffer is None + or self._batch_graph_search_buffer.shape[0] != batch + ): + self._batch_graph_search_buffer = JointState.zeros( + (batch, self.interpolation_steps, self.kinematics.get_dof()), + tensor_args=self.tensor_args, + joint_names=self.rollout_fn.joint_names, + ) + self._batch_path_buffer_last_tstep = [0 for i in range(batch)] + + def solve_ik( + self, + goal_pose: Pose, + retract_config: Optional[T_BDOF] = None, + seed_config: Optional[T_BDOF] = None, + return_seeds: int = 1, + num_seeds: Optional[int] = None, + use_nn_seed: bool = True, + newton_iters: Optional[int] = None, + ) -> IKResult: + return self.ik_solver.solve( + goal_pose, + retract_config, + seed_config, + return_seeds, + num_seeds, + use_nn_seed, + newton_iters, + ) + + @profiler.record_function("motion_gen/ik") + def _solve_ik_from_solve_state( + self, + goal_pose: Pose, + solve_state: ReacherSolveState, + start_state: JointState, + use_nn_seed: bool, + partial_ik_opt: bool, + link_poses: Optional[Dict[str, Pose]] = None, + ) -> IKResult: + newton_iters = None + if partial_ik_opt: + newton_iters = self.partial_ik_iters + ik_result = self.ik_solver.solve_any( + solve_state.solve_type, + goal_pose, + start_state.position.view(-1, self._dof), + start_state.position.view(-1, 1, self._dof), + solve_state.num_trajopt_seeds, + solve_state.num_ik_seeds, + use_nn_seed, + newton_iters, + link_poses, + ) + return ik_result + + @profiler.record_function("motion_gen/trajopt_solve") + def _solve_trajopt_from_solve_state( + self, + goal: Goal, + solve_state: ReacherSolveState, + act_seed: Optional[JointState] = None, + use_nn_seed: bool = False, + 
return_all_solutions: bool = False, + seed_success: Optional[torch.Tensor] = None, + newton_iters: Optional[int] = None, + trajopt_instance: Optional[TrajOptSolver] = None, + num_seeds_override: Optional[int] = None, + ) -> TrajOptResult: + if trajopt_instance is None: + trajopt_instance = self.trajopt_solver + if num_seeds_override is None: + num_seeds_override = solve_state.num_trajopt_seeds + traj_result = trajopt_instance.solve_any( + solve_state.solve_type, + goal, + act_seed, + use_nn_seed, + return_all_solutions, + num_seeds_override, + seed_success, + newton_iters=newton_iters, + ) + return traj_result + + @profiler.record_function("motion_gen/graph_search") + def graph_search( + self, start_config: T_BDOF, goal_config: T_BDOF, interpolation_steps: Optional[int] = None + ) -> GraphResult: + return self.graph_planner.find_paths(start_config, goal_config, interpolation_steps) + + def _get_solve_state( + self, + solve_type: ReacherSolveType, + plan_config: MotionGenPlanConfig, + goal_pose: Pose, + start_state: JointState, + ): + # TODO: for batch seeds + num_ik_seeds = ( + self.ik_seeds if plan_config.num_ik_seeds is None else plan_config.num_ik_seeds + ) + num_trajopt_seeds = ( + self.trajopt_seeds + if plan_config.num_trajopt_seeds is None + else plan_config.num_trajopt_seeds + ) + + num_graph_seeds = ( + self.trajopt_seeds if plan_config.num_graph_seeds is None else num_trajopt_seeds + ) + solve_state = None + if solve_type == ReacherSolveType.SINGLE: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=1, + n_envs=1, + n_goalset=1, + ) + elif solve_type == ReacherSolveType.GOALSET: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=1, + n_envs=1, + n_goalset=goal_pose.n_goalset, + ) + elif solve_type == ReacherSolveType.BATCH: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=goal_pose.batch, + n_envs=1, + n_goalset=1, + ) + elif solve_type == ReacherSolveType.BATCH_GOALSET: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=goal_pose.batch, + n_envs=1, + n_goalset=goal_pose.n_goalset, + ) + elif solve_type == ReacherSolveType.BATCH_ENV: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=goal_pose.batch, + n_envs=goal_pose.batch, + n_goalset=1, + ) + elif solve_type == ReacherSolveType.BATCH_ENV_GOALSET: + solve_state = ReacherSolveState( + solve_type, + num_ik_seeds=num_ik_seeds, + num_trajopt_seeds=num_trajopt_seeds, + num_graph_seeds=num_graph_seeds, + batch_size=goal_pose.batch, + n_envs=goal_pose.batch, + n_goalset=goal_pose.n_goalset, + ) + else: + raise ValueError("Unsupported Solve type") + return solve_state + + def _plan_attempts( + self, + solve_state: ReacherSolveState, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + link_poses: List[Pose] = None, + ): + start_time = time.time() + # if plan_config.enable_opt: + self.update_batch_size(seeds=solve_state.num_trajopt_seeds, batch=solve_state.batch_size) + if solve_state.batch_env: + if 
solve_state.batch_size > self.world_coll_checker.n_envs:
+                raise ValueError("Batch size is larger than the number of collision environments")
+        force_graph = plan_config.enable_graph
+        partial_ik = plan_config.partial_ik_opt
+        ik_fail_count = 0
+        time_dict = {
+            "solve_time": 0,
+            "ik_time": 0,
+            "graph_time": 0,
+            "trajopt_time": 0,
+            "trajopt_attempts": 0,
+        }
+        best_status = 0  # 0: None, 1: IK Fail, 2: Graph Fail, 3: Opt Fail
+
+        for n in range(plan_config.max_attempts):
+            log_info("MG Iter: " + str(n))
+            result = self._plan_from_solve_state(
+                solve_state,
+                start_state,
+                goal_pose,
+                plan_config,
+                link_poses,
+            )
+            time_dict["solve_time"] += result.solve_time
+            time_dict["ik_time"] += result.ik_time
+            time_dict["graph_time"] += result.graph_time
+            time_dict["trajopt_time"] += result.trajopt_time
+            time_dict["trajopt_attempts"] += result.trajopt_attempts
+            if (
+                result.status == "IK Fail" and plan_config.ik_fail_return is not None
+            ):  # if IK keeps failing, we exit early assuming the goal is not reachable
+                ik_fail_count += 1
+                best_status = max(best_status, 1)
+
+                if ik_fail_count > plan_config.ik_fail_return:
+                    break
+            if result.success[0].item():
+                break
+            if plan_config.enable_graph_attempt is not None and (
+                n >= plan_config.enable_graph_attempt - 1
+                and result.status == "Opt Fail"
+                and not plan_config.enable_graph
+            ):
+                plan_config.enable_graph = True
+                plan_config.partial_ik_opt = False
+            if plan_config.disable_graph_attempt is not None and (
+                n >= plan_config.disable_graph_attempt - 1
+                and result.status in ["Opt Fail", "Graph Fail"]
+                and not force_graph
+            ):
+                plan_config.enable_graph = False
+                plan_config.partial_ik_opt = partial_ik
+            if result.status in ["Opt Fail"]:
+                best_status = 3
+            elif result.status in ["Graph Fail"]:
+                best_status = 2
+            if time.time() - start_time > plan_config.timeout:
+                break
+            if not result.valid_query:
+                result.status = "Invalid Problem"
+                break
+            if n == 10:
+                self.reset_seed()
+                log_warn("Couldn't find solution with 10 attempts, resetting seeds")
+
+        result.solve_time = time_dict["solve_time"]
+        result.ik_time = time_dict["ik_time"]
+        result.graph_time = time_dict["graph_time"]
+        result.trajopt_time = time_dict["trajopt_time"]
+        result.trajopt_attempts = time_dict["trajopt_attempts"]
+        result.attempts = n + 1
+        torch.cuda.synchronize()
+        result.total_time = time.time() - start_time
+        return result
+
+    def _plan_batch_attempts(
+        self,
+        solve_state: ReacherSolveState,
+        start_state: JointState,
+        goal_pose: Pose,
+        plan_config: MotionGenPlanConfig = MotionGenPlanConfig(),
+    ):
+        start_time = time.time()
+
+        if solve_state.batch_env:
+            if solve_state.batch_size > self.world_coll_checker.n_envs:
+                raise ValueError("Batch size is larger than the number of collision environments")
+            if plan_config.enable_graph:
+                raise ValueError("Graph Search / Geometric Planner not supported in batch_env mode")
+
+        if plan_config.enable_graph:
+            log_info("Batch mode enable graph is only supported with num_graph_seeds==1")
+            plan_config.num_trajopt_seeds = 1
+            plan_config.num_graph_seeds = 1
+            solve_state.num_trajopt_seeds = 1
+            solve_state.num_graph_seeds = 1
+        self.update_batch_size(seeds=solve_state.num_trajopt_seeds, batch=solve_state.batch_size)
+
+        ik_fail_count = 0
+        force_graph = plan_config.enable_graph
+        partial_ik = plan_config.partial_ik_opt
+
+        time_dict = {
+            "solve_time": 0,
+            "ik_time": 0,
+            "graph_time": 0,
+            "trajopt_time": 0,
+            "trajopt_attempts": 0,
+        }
+        best_result = None
+
+        for n in range(plan_config.max_attempts):
+            result = self._plan_from_solve_state_batch(
+                solve_state,
start_state, goal_pose, plan_config + ) + + # torch.cuda.synchronize() + # print("Success: ",torch.count_nonzero(result.success)) + # print(result.success) + time_dict["solve_time"] += result.solve_time + time_dict["ik_time"] += result.ik_time + + time_dict["graph_time"] += result.graph_time + time_dict["trajopt_time"] += result.trajopt_time + time_dict["trajopt_attempts"] += result.trajopt_attempts + + # if not all have succeeded, store the successful ones and re attempt: + # TODO: update stored based on error + if best_result is None: + best_result = result.clone() + else: + # get success idx: + idx = torch.nonzero(result.success).reshape(-1) + # print(idx, result.optimized_plan.shape, best_result.optimized_plan.shape) + if len(idx) > 0: + best_result.copy_idx(idx, result) + + if ( + result.status == "IK Fail" and plan_config.ik_fail_return is not None + ): # IF IK fails the first time, we exit assuming the goal is not reachable + ik_fail_count += 1 + if ik_fail_count > plan_config.ik_fail_return: + break + + if ( + torch.count_nonzero(best_result.success) + >= goal_pose.batch * plan_config.success_ratio + ): # we want 90% targets to succeed + best_result.status = None + break + if plan_config.enable_graph_attempt is not None and ( + n >= plan_config.enable_graph_attempt - 1 + and result.status != "IK Fail" + and not plan_config.enable_graph + ): + plan_config.enable_graph = True + plan_config.partial_ik_opt = False + + if plan_config.disable_graph_attempt is not None and ( + n >= plan_config.disable_graph_attempt - 1 + and result.status in ["Opt Fail", "Graph Fail"] + and not force_graph + ): + plan_config.enable_graph = False + plan_config.partial_ik_opt = partial_ik + + if plan_config.fail_on_invalid_query: + if not result.valid_query: + best_result.valid_query = False + best_result.status = "Invalid Problem" + break + if time.time() - start_time > plan_config.timeout: + break + best_result.solve_time = time_dict["solve_time"] + best_result.ik_time = time_dict["ik_time"] + best_result.graph_time = time_dict["graph_time"] + best_result.trajopt_time = time_dict["trajopt_time"] + best_result.trajopt_attempts = time_dict["trajopt_attempts"] + best_result.attempts = n + 1 + torch.cuda.synchronize() + best_result.total_time = time.time() - start_time + return best_result + + def plan_single( + self, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + link_poses: List[Pose] = None, + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.SINGLE, plan_config, goal_pose, start_state + ) + + result = self._plan_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + link_poses=link_poses, + ) + return result + + def plan_goalset( + self, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.GOALSET, plan_config, goal_pose, start_state + ) + + result = self._plan_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + ) + return result + + def plan_batch( + self, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.BATCH, plan_config, goal_pose, start_state + ) + + result = self._plan_batch_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + ) + return result + + def plan_batch_goalset( + self, + 
start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.BATCH_GOALSET, plan_config, goal_pose, start_state + ) + + result = self._plan_batch_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + ) + return result + + def plan_batch_env_goalset( + self, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.BATCH_ENV_GOALSET, plan_config, goal_pose, start_state + ) + + result = self._plan_batch_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + ) + return result + + def plan_batch_env( + self, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + solve_state = self._get_solve_state( + ReacherSolveType.BATCH_ENV, plan_config, goal_pose, start_state + ) + result = self._plan_batch_attempts( + solve_state, + start_state, + goal_pose, + plan_config, + ) + return result + + def _plan_from_solve_state( + self, + solve_state: ReacherSolveState, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + link_poses: Optional[Dict[str, Pose]] = None, + ) -> MotionGenResult: + trajopt_seed_traj = None + trajopt_seed_success = None + trajopt_newton_iters = None + + graph_success = 0 + if len(start_state.shape) == 1: + log_error("Joint state should be not a vector (dof) should be (bxdof)") + # plan ik: + + ik_result = self._solve_ik_from_solve_state( + goal_pose, + solve_state, + start_state, + plan_config.use_nn_ik_seed, + plan_config.partial_ik_opt, + link_poses, + ) + if not plan_config.enable_graph and plan_config.partial_ik_opt: + ik_result.success[:] = True + + # check for success: + result = MotionGenResult( + ik_result.success.view(-1)[0:1], # This is not true for batch mode + ik_time=ik_result.solve_time, + solve_time=ik_result.solve_time, + ) + + if self.store_debug_in_result: + result.debug_info = {"ik_result": ik_result} + ik_success = torch.count_nonzero(ik_result.success) + if ik_success == 0: + result.status = "IK Fail" + return result + + # do graph search: + with profiler.record_function("motion_gen/post_ik"): + ik_out_seeds = solve_state.num_trajopt_seeds + if plan_config.enable_graph: + ik_out_seeds = min(solve_state.num_trajopt_seeds, ik_success) + + goal_config = ik_result.solution[ik_result.success].view(-1, self.ik_solver.dof)[ + :ik_success + ] + start_config = tensor_repeat_seeds(start_state.position, ik_out_seeds) + if plan_config.enable_opt: + self._trajopt_goal_config[:] = ik_result.solution + + # do graph search: + if plan_config.enable_graph: + interpolation_steps = None + if plan_config.enable_opt: + interpolation_steps = self.trajopt_solver.traj_tsteps - 4 + log_info("MG: running GP") + graph_result = self.graph_search(start_config, goal_config, interpolation_steps) + trajopt_seed_success = graph_result.success + + graph_success = torch.count_nonzero(graph_result.success).item() + result.graph_time = graph_result.solve_time + result.solve_time += graph_result.solve_time + if graph_success > 0: + result.graph_plan = graph_result.interpolated_plan + result.interpolated_plan = graph_result.interpolated_plan + + result.used_graph = True + if plan_config.enable_opt: + trajopt_seed = ( + result.graph_plan.position.view( + 1, # solve_state.batch_size, + 
graph_success, # solve_state.num_trajopt_seeds, + interpolation_steps, + self._dof, + ) + .transpose(0, 1) + .contiguous() + ) + trajopt_seed_traj = torch.zeros( + (trajopt_seed.shape[0], 1, self.trajopt_solver.traj_tsteps, self._dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + trajopt_seed_traj[:, :, :-4, :] = trajopt_seed + trajopt_seed_traj[:, :, -4:, :] = trajopt_seed_traj[:, :, -5:-4, :] + trajopt_seed_success = ik_result.success.clone() + trajopt_seed_success[ik_result.success] = graph_result.success + + trajopt_seed_success = trajopt_seed_success.view( + solve_state.batch_size, solve_state.num_trajopt_seeds + ) + trajopt_newton_iters = self.graph_trajopt_iters + else: + _, idx = torch.topk( + graph_result.path_length[graph_result.success], k=1, largest=False + ) + result.interpolated_plan = result.interpolated_plan[idx].squeeze(0) + result.optimized_dt = self.tensor_args.to_device(self.interpolation_dt) + result.optimized_plan = result.interpolated_plan[ + : graph_result.path_buffer_last_tstep[idx.item()] + ] + idx = idx.view(-1) + self._batch_col + result.position_error = ik_result.position_error[ik_result.success][ + graph_result.success + ][idx] + result.rotation_error = ik_result.rotation_error[ik_result.success][ + graph_result.success + ][idx] + result.path_buffer_last_tstep = graph_result.path_buffer_last_tstep[ + idx.item() : idx.item() + 1 + ] + result.success = result.success.view(-1)[0:1] + result.success[:] = True + return result + else: + result.success[:] = False + result.status = "Graph Fail" + if not graph_result.valid_query: + result.valid_query = False + if self.store_debug_in_result: + result.debug_info["graph_debug"] = graph_result.debug_info + return result + if plan_config.need_graph_success: + return result + + # do trajopt: + + if plan_config.enable_opt: + with profiler.record_function("motion_gen/setup_trajopt_seeds"): + self._trajopt_goal_config[:, :ik_success] = goal_config + + goal = Goal( + goal_pose=goal_pose, + current_state=start_state, + links_goal_pose=link_poses, + ) + + if ( + trajopt_seed_traj is None + or graph_success < solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds + ): + goal_config = self._trajopt_goal_config[0] # batch index == 0 + + goal_state = JointState.from_position( + goal_config, + ) + seed_link_poses = None + if link_poses is not None: + seed_link_poses = {} + + for k in link_poses.keys(): + seed_link_poses[k] = link_poses[k].repeat_seeds( + solve_state.num_trajopt_seeds + ) + seed_goal = Goal( + goal_pose=goal_pose.repeat_seeds(solve_state.num_trajopt_seeds), + current_state=start_state.repeat_seeds(solve_state.num_trajopt_seeds), + goal_state=goal_state, + links_goal_pose=seed_link_poses, + ) + if trajopt_seed_traj is not None: + trajopt_seed_traj = trajopt_seed_traj.transpose(0, 1).contiguous() + # batch, num_seeds, h, dof + if ( + trajopt_seed_success.shape[1] + < solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds + ): + trajopt_seed_success_new = torch.zeros( + (1, solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds), + device=self.tensor_args.device, + dtype=torch.bool, + ) + trajopt_seed_success_new[ + 0, : trajopt_seed_success.shape[1] + ] = trajopt_seed_success + trajopt_seed_success = trajopt_seed_success_new + # create seeds here: + trajopt_seed_traj = self.trajopt_solver.get_seed_set( + seed_goal, + trajopt_seed_traj, # batch, num_seeds, h, dof + num_seeds=self.noisy_trajopt_seeds, + batch_mode=solve_state.batch_mode, + seed_success=trajopt_seed_success, + ) + 
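+                    # reshape the flat seed buffer from get_seed_set into the layout
+                    # (num_trajopt_seeds * noisy_trajopt_seeds, batch, horizon, dof)
+                    # consumed by the trajectory optimizer (inferred from this view call)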
trajopt_seed_traj = trajopt_seed_traj.view( + solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds, + solve_state.batch_size, + self.trajopt_solver.traj_tsteps, + self._dof, + ).contiguous() + if plan_config.enable_finetune_trajopt: + og_value = self.trajopt_solver.interpolation_type + self.trajopt_solver.interpolation_type = InterpolateType.LINEAR_CUDA + with profiler.record_function("motion_gen/trajopt"): + log_info("MG: running TO") + traj_result = self._solve_trajopt_from_solve_state( + goal, + solve_state, + trajopt_seed_traj, + num_seeds_override=solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds, + newton_iters=trajopt_newton_iters, + ) + if False and not traj_result.success.item(): + # pose_convergence = traj_result.position_error < self. + print( + traj_result.position_error.item(), + traj_result.rotation_error.item(), + torch.count_nonzero(~traj_result.metrics.feasible[0]).item(), + torch.count_nonzero(~traj_result.metrics.feasible[1]).item(), + traj_result.optimized_dt.item(), + ) + if plan_config.enable_finetune_trajopt: + self.trajopt_solver.interpolation_type = og_value + # self.trajopt_solver.compute_metrics(not og_evaluate, og_evaluate) + if self.store_debug_in_result: + result.debug_info["trajopt_result"] = traj_result + # run finetune + if plan_config.enable_finetune_trajopt and traj_result.success[0].item(): + with profiler.record_function("motion_gen/finetune_trajopt"): + seed_traj = traj_result.solution.position.clone() + seed_traj = torch.roll(seed_traj, -2, dims=-2) + # seed_traj[..., -2:, :] = seed_traj[..., -3, :] + seed_traj = seed_traj.contiguous() + og_solve_time = traj_result.solve_time + scaled_dt = torch.clamp( + traj_result.optimized_dt * self.finetune_dt_scale, + self.trajopt_solver.interpolation_dt, + ) + self.finetune_trajopt_solver.update_solver_dt(scaled_dt.item()) + + traj_result = self._solve_trajopt_from_solve_state( + goal, + solve_state, + seed_traj, + trajopt_instance=self.finetune_trajopt_solver, + num_seeds_override=1, + ) + if False and not traj_result.success.item(): + print( + traj_result.position_error.item(), + traj_result.rotation_error.item(), + torch.count_nonzero(~traj_result.metrics.feasible).item(), + traj_result.optimized_dt.item(), + ) + # if not traj_result.success.item(): + # #print(traj_result.metrics.constraint) + # print(traj_result.position_error.item() * 100.0, + # traj_result.rotation_error.item() * 100.0) + + result.finetune_time = traj_result.solve_time + + traj_result.solve_time = og_solve_time + if self.store_debug_in_result: + result.debug_info["finetune_trajopt_result"] = traj_result + + result.solve_time += traj_result.solve_time + result.finetune_time + result.trajopt_time = traj_result.solve_time + result.trajopt_attempts = 1 + result.success = traj_result.success + + if torch.count_nonzero(result.success) == 0: + result.status = "Opt Fail" + + result.interpolated_plan = traj_result.interpolated_solution.trim_trajectory( + 0, traj_result.path_buffer_last_tstep[0] + ) + result.interpolation_dt = self.trajopt_solver.interpolation_dt + result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep + result.position_error = traj_result.position_error + result.rotation_error = traj_result.rotation_error + result.optimized_dt = traj_result.optimized_dt + result.optimized_plan = traj_result.solution + + return result + + def _plan_from_solve_state_batch( + self, + solve_state: ReacherSolveState, + start_state: JointState, + goal_pose: Pose, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> 
MotionGenResult: + self._trajopt_goal_config[:] = self.get_retract_config().view(1, 1, self._dof) + trajopt_seed_traj = None + trajopt_seed_success = None + trajopt_newton_iters = None + graph_success = 0 + + # plan ik: + + ik_result = self._solve_ik_from_solve_state( + goal_pose, + solve_state, + start_state, + plan_config.use_nn_ik_seed, + plan_config.partial_ik_opt, + ) + + if not plan_config.enable_graph and plan_config.partial_ik_opt: + ik_result.success[:] = True + + # check for success: + result = MotionGenResult( + ik_result.success, + position_error=ik_result.position_error, + rotation_error=ik_result.rotation_error, + ik_time=ik_result.solve_time, + solve_time=ik_result.solve_time, + debug_info={}, + ) + + ik_success = torch.count_nonzero(ik_result.success) + if ik_success == 0: + result.status = "IK Fail" + result.success = result.success[:, 0] + return result + + # do graph search: + ik_out_seeds = solve_state.num_trajopt_seeds + if plan_config.enable_graph: + ik_out_seeds = min(solve_state.num_trajopt_seeds, ik_success) + + # if not plan_config.enable_opt and plan_config.enable_graph: + # self.graph_planner.interpolation_steps = self.interpolation_steps + # self.graph_planner.interpolation_type = self.interpolation_type + # elif plan_config.enable_graph: + # self.graph_planner.interpolation_steps = self.trajopt_solver.traj_tsteps + # self.graph_planner.interpolation_type = InterpolateType.LINEAR + goal_config = ik_result.solution[ik_result.success].view(-1, self.ik_solver.dof) + + # get shortest path + if plan_config.enable_graph: + interpolation_steps = None + if plan_config.enable_opt: + interpolation_steps = self.trajopt_solver.traj_tsteps - 4 + + start_graph_state = start_state.repeat_seeds(ik_out_seeds) + start_config = start_graph_state.position[ik_result.success.view(-1)].view( + -1, self.ik_solver.dof + ) + graph_result = self.graph_search(start_config, goal_config, interpolation_steps) + graph_success = torch.count_nonzero(graph_result.success).item() + + result.graph_time = graph_result.solve_time + result.solve_time += graph_result.solve_time + if graph_success > 0: + # path = graph_result.interpolated_plan + result.graph_plan = graph_result.interpolated_plan + result.interpolated_plan = graph_result.interpolated_plan + result.used_graph = True + + if plan_config.enable_opt: + trajopt_seed = ( + result.graph_plan.position.view( + 1, # solve_state.batch_size, + graph_success, # solve_state.num_trajopt_seeds, + interpolation_steps, + self._dof, + ) + .transpose(0, 1) + .contiguous() + ) + trajopt_seed_traj = torch.zeros( + (trajopt_seed.shape[0], 1, self.trajopt_solver.traj_tsteps, self._dof), + device=self.tensor_args.device, + dtype=self.tensor_args.dtype, + ) + trajopt_seed_traj[:, :, :-4, :] = trajopt_seed + trajopt_seed_traj[:, :, -4:, :] = trajopt_seed_traj[:, :, -5:-4, :] + + trajopt_seed_success = ik_result.success.clone() + trajopt_seed_success[ik_result.success] = graph_result.success + + trajopt_seed_success = trajopt_seed_success.view( + solve_state.num_trajopt_seeds, solve_state.batch_size + ) + trajopt_newton_iters = self.graph_trajopt_iters + + else: + ik_success = ik_result.success.view(-1).clone() + + # only some might be successful: + + g_dim = torch.nonzero(ik_success).view(-1)[graph_result.success] + + self._batch_graph_search_buffer.copy_at_index( + graph_result.interpolated_plan, g_dim + ) + + # result.graph_plan = JointState.from_position( + # self._batch_graph_search_buffer, + # joint_names=graph_result.interpolated_plan.joint_names, + # ) + 
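+                    # Descriptive note (added): graph-only result path; the successful
+                    # interpolated graph paths were copied into the persistent batch
+                    # buffer above and are reported directly as the plan below.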
+                    result.interpolated_plan = self._batch_graph_search_buffer
+                    # keep g_dim as a flat list of batch indices; squeeze() would
+                    # collapse a single success to a scalar and break the loop below:
+                    g_dim = g_dim.cpu().view(-1).tolist()
+                    for x, x_val in enumerate(g_dim):
+                        self._batch_path_buffer_last_tstep[
+                            x_val
+                        ] = graph_result.path_buffer_last_tstep[x]
+                    result.path_buffer_last_tstep = self._batch_path_buffer_last_tstep
+                    result.optimized_plan = result.interpolated_plan
+                    result.optimized_dt = torch.as_tensor(
+                        [
+                            self.interpolation_dt
+                            for i in range(result.interpolated_plan.position.shape[0])
+                        ],
+                        device=self.tensor_args.device,
+                        dtype=self.tensor_args.dtype,
+                    )
+                    result.success = result.success.view(-1).clone()
+                    # index with g_dim directly; chained boolean-mask assignment
+                    # writes into a temporary copy and is lost:
+                    result.success[g_dim] = True
+                    return result
+
+            else:
+                result.success[:] = False
+                result.success = result.success[:, 0]
+                result.status = "Graph Fail"
+                if not graph_result.valid_query:
+                    result.valid_query = False
+                if self.store_debug_in_result:
+                    result.debug_info = {"graph_debug": graph_result.debug_info}
+                return result
+
+        if plan_config.enable_opt:
+            # get goal configs based on ik success:
+            self._trajopt_goal_config[ik_result.success] = goal_config
+
+            goal_config = self._trajopt_goal_config  # batch index == 0
+
+            goal = Goal(
+                goal_pose=goal_pose,
+                current_state=start_state,
+            )
+            # generate seeds:
+            if trajopt_seed_traj is None or graph_success < solve_state.num_trajopt_seeds:
+                seed_goal = Goal(
+                    goal_pose=goal_pose.repeat_seeds(solve_state.num_trajopt_seeds),
+                    current_state=start_state.repeat_seeds(solve_state.num_trajopt_seeds),
+                    goal_state=JointState.from_position(goal_config.view(-1, self._dof)),
+                )
+                if trajopt_seed_traj is not None:
+                    # batch, num_seeds, h, dof (a stray trailing comma here would
+                    # wrap the tensor in a tuple and break get_seed_set):
+                    trajopt_seed_traj = trajopt_seed_traj.transpose(0, 1).contiguous()
+
+                # create seeds here:
+                trajopt_seed_traj = self.trajopt_solver.get_seed_set(
+                    seed_goal,
+                    trajopt_seed_traj,  # batch, num_seeds, h, dof
+                    num_seeds=1,
+                    batch_mode=solve_state.batch_mode,
+                    seed_success=trajopt_seed_success,
+                )
+                trajopt_seed_traj = trajopt_seed_traj.view(
+                    solve_state.num_trajopt_seeds,
+                    solve_state.batch_size,
+                    self.trajopt_solver.traj_tsteps,
+                    self._dof,
+                ).contiguous()
+            traj_result = self._solve_trajopt_from_solve_state(
+                goal, solve_state, trajopt_seed_traj, newton_iters=trajopt_newton_iters
+            )
+
+            # output of traj result will have 1 solution per batch
+            result.success = traj_result.success
+
+            result.interpolated_plan = traj_result.interpolated_solution
+            result.solve_time += traj_result.solve_time
+            result.trajopt_time = traj_result.solve_time
+            result.position_error = traj_result.position_error
+            result.rotation_error = traj_result.rotation_error
+            result.cspace_error = traj_result.cspace_error
+            result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep
+            result.optimized_plan = traj_result.solution
+            result.optimized_dt = traj_result.optimized_dt
+            if torch.count_nonzero(traj_result.success) == 0:
+                result.status = "Opt Fail"
+                result.success[:] = False
+            if self.store_debug_in_result:
+                result.debug_info = {"trajopt_result": traj_result}
+
+        return result
+
+    def compute_kinematics(self, state: JointState) -> KinematicModelState:
+        out = self.rollout_fn.compute_kinematics(state)
+        return out
+
+    @property
+    def kinematics(self):
+        return self.rollout_fn.kinematics
+
+    @property
+    def dof(self):
+        return self.kinematics.get_dof()
+
+    def check_constraints(self, state: JointState) -> RolloutMetrics:
+        metrics = self.ik_solver.check_constraints(state)
+        return metrics
+
+    def update_world(self, world: WorldConfig):
self.world_coll_checker.load_collision_model(world) + self.graph_planner.reset_graph() + return True + + def clear_world_cache(self): + self.world_coll_checker.clear_cache() + + def reset(self, reset_seed=True): + self.graph_planner.reset_buffer() + if reset_seed: + self.reset_seed() + + def reset_seed(self): + self.rollout_fn.reset_seed() + self.ik_solver.reset_seed() + self.graph_planner.reset_seed() + self.trajopt_solver.reset_seed() + + def get_retract_config(self): + return self.rollout_fn.dynamics_model.retract_config + + def warmup( + self, + enable_graph: bool = True, + batch: Optional[int] = None, + warmup_js_trajopt: bool = True, + batch_env_mode: bool = False, + ): + log_info("Warmup") + + if batch is None: + start_state = JointState.from_position( + self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), + joint_names=self.rollout_fn.joint_names, + ) + state = self.rollout_fn.compute_kinematics(start_state) + link_poses = state.link_pose + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + start_state.position += 0.1 + + for _ in range(2): + self.plan_single( + start_state, + retract_pose, + MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True), + link_poses=link_poses, + ) + if enable_graph: + self.graph_planner.warmup( + self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(), + start_state.position, + ) + + self.plan_single( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, enable_finetune_trajopt=True, enable_graph=enable_graph + ), + link_poses=link_poses, + ) + if warmup_js_trajopt: + # warm up js_trajopt: + goal_state = start_state.clone() + goal_state.position += 0.2 + for _ in range(2): + self.plan_single_js( + start_state, goal_state, MotionGenPlanConfig(max_attempts=1) + ) + + else: + start_state = JointState.from_position( + self.get_retract_config().view(1, -1).clone(), + joint_names=self.rollout_fn.joint_names, + ).repeat_seeds(batch) + state = self.rollout_fn.compute_kinematics(start_state) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + start_state.position += 0.1 + + for _ in range(3): + if batch_env_mode: + self.plan_batch_env( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, enable_finetune_trajopt=True, enable_graph=enable_graph + ), + ) + else: + self.plan_batch( + start_state, + retract_pose, + MotionGenPlanConfig( + max_attempts=1, enable_finetune_trajopt=True, enable_graph=enable_graph + ), + ) + log_info("Warmup complete") + return True + + def plan_single_js( + self, + start_state: JointState, + goal_state: JointState, + plan_config: MotionGenPlanConfig = MotionGenPlanConfig(), + ) -> MotionGenResult: + # NOTE: currently runs only one seed + time_dict = { + "solve_time": 0, + "ik_time": 0, + "graph_time": 0, + "trajopt_time": 0, + "trajopt_attempts": 0, + } + result = None + goal = Goal(goal_state=goal_state, current_state=start_state) + + for n in range(plan_config.max_attempts): + traj_result = self.js_trajopt_solver.solve_single(goal) + time_dict["trajopt_time"] += traj_result.solve_time + time_dict["trajopt_attempts"] = n + + if result is None: + result = MotionGenResult(success=traj_result.success) + + if traj_result.success.item(): + break + + result.solve_time = time_dict["trajopt_time"] + if self.store_debug_in_result: + result.debug_info = {"trajopt_result": traj_result} + status = None + if not traj_result.success.item(): + # print(traj_result.cspace_error, traj_result.success) + status = "" + if 
traj_result.cspace_error.item() >= self.js_trajopt_solver.cspace_threshold: + status += " Fail: C-SPACE Convergence" + if torch.count_nonzero(~traj_result.metrics.feasible).item() > 0: + status += " Fail: Constraints" + # print(traj_result.metrics.feasible) + + result.status = status + result.position_error = traj_result.position_error + result.rotation_error = traj_result.rotation_error + result.cspace_error = traj_result.cspace_error + result.optimized_dt = traj_result.optimized_dt + result.interpolated_plan = traj_result.interpolated_solution + result.optimized_plan = traj_result.solution + result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep + result.success = traj_result.success + return result + + def plan( + self, + start_state: JointState, + goal_pose: Pose, + enable_graph: bool = True, + enable_opt: bool = True, + use_nn_ik_seed: bool = False, + need_graph_success: bool = False, + max_attempts: int = 100, + timeout: float = 10.0, + enable_graph_attempt: Optional[int] = None, + disable_graph_attempt: Optional[int] = None, + trajopt_attempts: int = 1, + ik_fail_return: Optional[int] = None, + partial_ik_opt: bool = False, + num_ik_seeds: Optional[int] = None, + num_graph_seeds: Optional[int] = None, + num_trajopt_seeds: Optional[int] = None, + ): + plan_config = MotionGenPlanConfig( + enable_graph, + enable_opt, + use_nn_ik_seed, + need_graph_success, + max_attempts, + timeout, + enable_graph_attempt, + disable_graph_attempt, + ik_fail_return, + partial_ik_opt, + num_ik_seeds, + num_graph_seeds, + num_trajopt_seeds, + ) + result = self.plan_single(start_state, goal_pose, plan_config) + return result + + def batch_plan( + self, + start_state: JointState, + goal_pose: Pose, + enable_graph: bool = True, + enable_opt: bool = True, + use_nn_ik_seed: bool = False, + need_graph_success: bool = False, + max_attempts: int = 1, + timeout: float = 10.0, + enable_graph_attempt: Optional[int] = None, + disable_graph_attempt: Optional[int] = None, + success_ratio: float = 1.0, + ik_fail_return: Optional[int] = None, + fail_on_invalid_query: bool = False, + partial_ik_opt: bool = False, + num_ik_seeds: Optional[int] = None, + num_graph_seeds: Optional[int] = None, + num_trajopt_seeds: Optional[int] = None, + ): + plan_config = MotionGenPlanConfig( + enable_graph, + enable_opt, + use_nn_ik_seed, + need_graph_success, + max_attempts, + timeout, + enable_graph_attempt, + disable_graph_attempt, + ik_fail_return, + partial_ik_opt, + num_ik_seeds, + num_graph_seeds, + num_trajopt_seeds, + success_ratio=success_ratio, + fail_on_invalid_query=fail_on_invalid_query, + ) + result = self.plan_batch(start_state, goal_pose, plan_config) + return result + + def solve_trajopt( + self, + goal: Goal, + act_seed, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + ): + result = self.trajopt_solver.solve( + goal, act_seed, return_all_solutions=return_all_solutions, num_seeds=num_seeds + ) + return result + + def get_active_js( + self, + in_js: JointState, + ): + opt_jnames = self.rollout_fn.joint_names + opt_js = in_js.get_ordered_joint_state(opt_jnames) + return opt_js + + def get_all_rollout_instances(self) -> List[RolloutBase]: + if self._rollout_list is None: + self._rollout_list = ( + self.ik_solver.get_all_rollout_instances() + + self.graph_planner.get_all_rollout_instances() + + self.trajopt_solver.get_all_rollout_instances() + ) + return self._rollout_list + + def get_all_kinematics_instances(self) -> List[CudaRobotModel]: + if self._kin_list is None: + self._kin_list = [ 
+                i.dynamics_model.robot_model for i in self.get_all_rollout_instances()
+            ]
+        return self._kin_list
+
+    def attach_objects_to_robot(
+        self,
+        joint_state: JointState,
+        object_names: List[str],
+        surface_sphere_radius: float = 0.001,
+        link_name: str = "attached_object",
+        sphere_fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        world_objects_pose_offset: Optional[Pose] = None,
+        remove_obstacles_from_world_config: bool = False,
+    ) -> None:
+        """Attach objects from the world model to a robot link as collision spheres.
+
+        Args:
+            joint_state: current joint state, used to compute the pose of the link.
+            object_names: names of obstacles in the world model to attach.
+            surface_sphere_radius: radius of spheres sampled on the obstacle surface.
+                Defaults to 0.001.
+            link_name: link to attach the spheres to. Defaults to "attached_object".
+            sphere_fit_type: method used to fit spheres to the obstacle geometry.
+            voxelize_method: voxelization method used when sampling sphere centers.
+                Defaults to "ray".
+            world_objects_pose_offset: optional pose offset to apply to the objects
+                before attaching. Defaults to None.
+            remove_obstacles_from_world_config: also remove the obstacles from the
+                world model (they are always disabled in the collision checker).
+        """
+        log_info("MG: Attach objects to robot")
+        kin_state = self.compute_kinematics(joint_state)
+        ee_pose = kin_state.ee_pose  # w_T_ee
+        if world_objects_pose_offset is not None:
+            # add offset from ee:
+            ee_pose = world_objects_pose_offset.inverse().multiply(ee_pose)
+            # new ee_pose:
+            # w_T_ee = offset_T_w * w_T_ee
+        # ee_T_w
+        ee_pose = ee_pose.inverse()  # ee_T_w to multiply all objects later
+        max_spheres = self.robot_cfg.kinematics.kinematics_config.get_number_of_spheres(link_name)
+        n_spheres = int(max_spheres / len(object_names))
+        sphere_tensor = torch.zeros((max_spheres, 4))
+        sphere_tensor[:, 3] = -10.0
+        sph_list = []
+        if n_spheres == 0:
+            log_error("MG: No spheres found")
+        for i, x in enumerate(object_names):
+            obs = self.world_model.get_obstacle(x)
+            if obs is None:
+                log_error(
+                    "Object not found in world. Object name: "
+                    + x
+                    + " Name of objects in world: "
+                    + " ".join([i.name for i in self.world_model.objects])
+                )
+            sph = obs.get_bounding_spheres(
+                n_spheres,
+                surface_sphere_radius,
+                pre_transform_pose=ee_pose,
+                tensor_args=self.tensor_args,
+                fit_type=sphere_fit_type,
+                voxelize_method=voxelize_method,
+            )
+            sph_list += [s.position + [s.radius] for s in sph]
+
+            self.world_coll_checker.enable_obstacle(enable=False, name=x)
+            if remove_obstacles_from_world_config:
+                self.world_model.remove_obstacle(x)
+        log_info("MG: Computed spheres for attach objects to robot")
+
+        spheres = self.tensor_args.to_device(torch.as_tensor(sph_list))
+
+        if spheres.shape[0] > max_spheres:
+            # keep only as many spheres as the link's sphere buffer can hold:
+            spheres = spheres[:max_spheres]
+        sphere_tensor[: spheres.shape[0], :] = spheres.contiguous()
+
+        self.attach_spheres_to_robot(sphere_tensor=sphere_tensor, link_name=link_name)
+
+    def add_camera_frame(self, camera_observation: CameraObservation, obstacle_name: str):
+        self.world_coll_checker.add_camera_frame(camera_observation, obstacle_name)
+
+    def process_camera_frames(self, obstacle_name: Optional[str] = None, process_aux: bool = False):
+        self.world_coll_checker.process_camera_frames(obstacle_name, process_aux=process_aux)
+
+    def attach_bounding_box_from_blox_to_robot(
+        self,
+        joint_state: JointState,
+        bounding_box: Cuboid,
+        blox_layer_name: Optional[str] = None,
+        surface_sphere_radius: float = 0.001,
+        link_name: str = "attached_object",
+        sphere_fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        world_objects_pose_offset: Optional[Pose] = None,
+    ):
+        """Attach the voxels in a blox layer to robot's link.
+
+        NOTE: This is not currently implemented.
+
+        Args:
+            joint_state: current joint state, used to compute the pose of the link.
+            bounding_box: region of the blox layer to convert into attached spheres.
+            blox_layer_name: name of the blox layer to read voxels from. Defaults to None.
+            surface_sphere_radius: radius of spheres sampled on the surface.
+                Defaults to 0.001.
+            link_name: link to attach the spheres to. Defaults to "attached_object".
+            sphere_fit_type: method used to fit spheres to the voxel geometry.
+            voxelize_method: voxelization method used when sampling sphere centers.
+                Defaults to "ray".
+            world_objects_pose_offset: optional pose offset to apply before attaching.
+                Defaults to None.
+        """
+        kin_state = self.compute_kinematics(joint_state)
+        ee_pose = kin_state.ee_pose  # w_T_ee
+        if world_objects_pose_offset is not None:
+            # add offset from ee:
+            ee_pose = world_objects_pose_offset.inverse().multiply(ee_pose)
+            # new ee_pose:
+            # w_T_ee = offset_T_w * w_T_ee
+        # ee_T_w
+        ee_pose = ee_pose.inverse()  # ee_T_w to multiply all objects later
+        max_spheres = self.robot_cfg.kinematics.kinematics_config.get_number_of_spheres(link_name)
+        n_spheres = max_spheres
+        sphere_tensor = torch.zeros((max_spheres, 4))
+        sphere_tensor[:, 3] = -10.0
+        sph_list = []
+        if n_spheres == 0:
+            log_error("MG: No spheres found")
+        sph = self.world_coll_checker.get_bounding_spheres(
+            bounding_box,
+            blox_layer_name,
+            n_spheres,
+            surface_sphere_radius,
+            sphere_fit_type,
+            voxelize_method,
+            pre_transform_pose=ee_pose,
+            clear_region=True,
+        )
+        sph_list += [s.position + [s.radius] for s in sph]
+
+        log_info("MG: Computed spheres for attach objects to robot")
+
+        spheres = self.tensor_args.to_device(torch.as_tensor(sph_list))
+
+        if spheres.shape[0] > max_spheres:
+            # keep only as many spheres as the link's sphere buffer can hold:
+            spheres = spheres[:max_spheres]
+        sphere_tensor[: spheres.shape[0], :] = spheres.contiguous()
+
+        self.attach_spheres_to_robot(sphere_tensor=sphere_tensor, link_name=link_name)
+
+    def attach_new_object_to_robot(
+        self,
+        joint_state: JointState,
+        obstacle: Obstacle,
+        surface_sphere_radius: float = 0.001,
+        link_name: str = "attached_object",
+        sphere_fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        world_objects_pose_offset: Optional[Pose] = None,
+    ):
+        """Attach an object to robot's link. The object to be attached is not in the world model.
+
+        Args:
+            joint_state: current joint state, used to compute the pose of the link.
+            obstacle: obstacle to attach to the robot link.
+            surface_sphere_radius: radius of spheres sampled on the obstacle surface.
+                Defaults to 0.001.
+            link_name: link to attach the spheres to. Defaults to "attached_object".
+            sphere_fit_type: method used to fit spheres to the obstacle geometry.
+            voxelize_method: voxelization method used when sampling sphere centers.
+                Defaults to "ray".
+            world_objects_pose_offset: optional pose offset to apply before attaching.
+                Defaults to None.
+ """ + log_info("MG: Attach objects to robot") + kin_state = self.compute_kinematics(joint_state) + ee_pose = kin_state.ee_pose # w_T_ee + if world_objects_pose_offset is not None: + # add offset from ee: + ee_pose = world_objects_pose_offset.inverse().multiply(ee_pose) + # new ee_pose: + # w_T_ee = offset_T_w * w_T_ee + # ee_T_w + ee_pose = ee_pose.inverse() # ee_T_w to multiply all objects later + max_spheres = self.robot_cfg.kinematics.kinematics_config.get_number_of_spheres(link_name) + n_spheres = max_spheres + sphere_tensor = torch.zeros((max_spheres, 4)) + sphere_tensor[:, 3] = -10.0 + sph_list = [] + if n_spheres == 0: + log_error("MG: No spheres found") + sph = obstacle.get_bounding_spheres( + n_spheres, + surface_sphere_radius, + pre_transform_pose=ee_pose, + tensor_args=self.tensor_args, + fit_type=sphere_fit_type, + voxelize_method=voxelize_method, + ) + sph_list += [s.position + [s.radius] for s in sph] + + log_info("MG: Computed spheres for attach objects to robot") + + spheres = self.tensor_args.to_device(torch.as_tensor(sph_list)) + + if spheres.shape[0] > max_spheres: + spheres = spheres[: spheres.shape[0]] + sphere_tensor[: spheres.shape[0], :] = spheres.contiguous() + + self.attach_spheres_to_robot(sphere_tensor=sphere_tensor, link_name=link_name) + + def detach_object_from_robot(self, link_name: str = "attached_object") -> None: + return self.detach_spheres_from_robot(link_name) + + def attach_spheres_to_robot( + self, + sphere_radius: Optional[float] = None, + sphere_tensor: Optional[torch.Tensor] = None, + link_name: str = "attached_object", + ) -> None: + """Attach spheres to robot's link. + + Args: + sphere_radius: _description_. Defaults to None. + sphere_tensor: _description_. Defaults to None. + link_name: _description_. Defaults to "attached_object". + """ + self.robot_cfg.kinematics.kinematics_config.attach_object( + sphere_radius=sphere_radius, sphere_tensor=sphere_tensor, link_name=link_name + ) + + def detach_spheres_from_robot(self, link_name: str = "attached_object") -> None: + self.robot_cfg.kinematics.kinematics_config.detach_object(link_name) + + def get_full_js(self, active_js: JointState) -> JointState: + return self.rollout_fn.get_full_dof_from_solution(active_js) + + def add_running_pose_constraint( + self, + lock_x: bool = False, + lock_y: bool = False, + lock_z: bool = False, + lock_rx: bool = False, + lock_ry: bool = False, + lock_rz: bool = False, + ): + raise NotImplementedError() + + def remove_running_pose_constraint(self): + raise NotImplementedError() + + def run_finetune_trajopt( + self, + start_state: JointState, + goal_pose: Pose, + traj_solution: JointState, + traj_dt: Union[float, torch.Tensor, None], + max_attempts: int = 1, + ): + # NOTE: Currently only works for single environment. 
+        # finetunes the provided trajectory solution
+        if traj_dt is not None:
+            self.finetune_trajopt_solver.update_solver_dt(
+                traj_dt.item() if isinstance(traj_dt, torch.Tensor) else traj_dt
+            )
+
+        # call trajopt with seed:
+
+        # NOTE: currently runs only one seed
+        time_dict = {
+            "solve_time": 0,
+            "ik_time": 0,
+            "graph_time": 0,
+            "trajopt_time": 0,
+            "trajopt_attempts": 0,
+        }
+        result = None
+        # goal_state = JointState.from_position(
+        #     traj_solution.position[..., -2:-1, :].clone(), joint_names=start_state.joint_names
+        # )
+        goal = Goal(
+            goal_pose=goal_pose,
+            # goal_state=goal_state,
+            current_state=start_state,
+        )
+
+        for n in range(max_attempts):
+            traj_result = self.finetune_trajopt_solver.solve_single(goal, traj_solution)
+            time_dict["trajopt_time"] += traj_result.solve_time
+            time_dict["trajopt_attempts"] = n
+            if result is None:
+                result = MotionGenResult(success=traj_result.success)
+
+            if traj_result.success.item():
+                break
+
+        if self.store_debug_in_result:
+            result.debug_info = {"trajopt_result": traj_result}
+
+        result.position_error = traj_result.position_error
+        result.rotation_error = traj_result.rotation_error
+        result.cspace_error = traj_result.cspace_error
+        result.optimized_dt = traj_result.optimized_dt
+        result.interpolated_plan = traj_result.interpolated_solution
+        result.optimized_plan = traj_result.solution
+        result.path_buffer_last_tstep = traj_result.path_buffer_last_tstep
+        result.trajopt_time = time_dict["trajopt_time"]
+        return result
+
+    @property
+    def world_model(self) -> WorldConfig:
+        return self.world_coll_checker.world_model
+
+    @property
+    def world_collision(self) -> WorldCollision:
+        return self.world_coll_checker
+
+    def update_interpolation_type(
+        self,
+        interpolation_type: InterpolateType,
+        update_graph: bool = True,
+        update_trajopt: bool = True,
+    ):
+        if update_graph:
+            self.graph_planner.interpolation_type = interpolation_type
+        if update_trajopt:
+            self.trajopt_solver.interpolation_type = interpolation_type
+            self.finetune_trajopt_solver.interpolation_type = interpolation_type
+            self.js_trajopt_solver.interpolation_type = interpolation_type
+
+    def update_locked_state(self, js: JointState):
+        # update the joint values of locked joints:
+
+        pass
diff --git a/src/curobo/wrap/reacher/mpc.py b/src/curobo/wrap/reacher/mpc.py
new file mode 100644
index 00000000..1a767c91
--- /dev/null
+++ b/src/curobo/wrap/reacher/mpc.py
@@ -0,0 +1,491 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+# + + +# Standard Library +import time +from dataclasses import dataclass +from typing import Dict, Optional, Union + +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.utils import create_collision_checker +from curobo.geom.sdf.world import CollisionCheckerType, WorldCollision, WorldCollisionConfig +from curobo.geom.types import WorldConfig +from curobo.opt.newton.lbfgs import LBFGSOpt, LBFGSOptConfig +from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig +from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig +from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.robot import JointState, RobotConfig +from curobo.util.logger import log_error, log_info, log_warn +from curobo.util_file import ( + get_robot_configs_path, + get_task_configs_path, + get_world_configs_path, + join_path, + load_yaml, +) +from curobo.wrap.reacher.types import ReacherSolveState, ReacherSolveType +from curobo.wrap.wrap_base import WrapResult +from curobo.wrap.wrap_mpc import WrapConfig, WrapMpc + + +@dataclass +class MpcSolverConfig: + solver: WrapMpc + world_coll_checker: Optional[WorldCollision] = None + tensor_args: TensorDeviceType = TensorDeviceType() + use_cuda_graph_full_step: bool = False + + @staticmethod + def load_from_robot_config( + robot_cfg: Union[Union[str, dict], RobotConfig], + world_cfg: Union[Union[str, dict], WorldConfig], + base_cfg: Optional[dict] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + compute_metrics: bool = True, + use_cuda_graph: Optional[bool] = None, + particle_opt_iters: Optional[int] = None, + self_collision_check: bool = True, + collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE, + use_es: Optional[bool] = None, + es_learning_rate: Optional[float] = 0.01, + use_cuda_graph_metrics: bool = False, + store_rollouts: bool = True, + use_cuda_graph_full_step: bool = False, + sync_cuda_time: bool = True, + collision_cache: Optional[Dict[str, int]] = None, + n_collision_envs: Optional[int] = None, + collision_activation_distance: Optional[float] = None, + world_coll_checker=None, + step_dt: Optional[float] = None, + use_lbfgs: bool = False, + use_mppi: bool = True, + ): + if use_cuda_graph_full_step: + log_error("use_cuda_graph_full_step currently is not supported") + raise ValueError("use_cuda_graph_full_step currently is not supported") + + task_file = "particle_mpc.yml" + config_data = load_yaml(join_path(get_task_configs_path(), task_file)) + config_data["mppi"]["n_envs"] = 1 + if step_dt is not None: + config_data["model"]["dt_traj_params"]["base_dt"] = step_dt + if particle_opt_iters is not None: + config_data["mppi"]["n_iters"] = particle_opt_iters + + if base_cfg is None: + base_cfg = load_yaml(join_path(get_task_configs_path(), "base_cfg.yml")) + if collision_cache is not None: + base_cfg["world_collision_checker_cfg"]["cache"] = collision_cache + if n_collision_envs is not None: + base_cfg["world_collision_checker_cfg"]["n_envs"] = n_collision_envs + + if collision_activation_distance is not None: + config_data["cost"]["primitive_collision_cfg"][ + "activation_distance" + ] = collision_activation_distance + + if not self_collision_check: + base_cfg["constraint"]["self_collision_cfg"]["weight"] = 0.0 + config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 + if collision_checker_type is not None: + 
base_cfg["world_collision_checker_cfg"]["checker_type"] = collision_checker_type + if isinstance(robot_cfg, str): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg)) + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + if isinstance(world_cfg, str): + world_cfg = load_yaml(join_path(get_world_configs_path(), world_cfg)) + + if world_coll_checker is None and world_cfg is not None: + world_cfg = WorldCollisionConfig.load_from_dict( + base_cfg["world_collision_checker_cfg"], world_cfg, tensor_args + ) + world_coll_checker = create_collision_checker(world_cfg) + grad_config_data = None + if use_lbfgs: + grad_config_data = load_yaml(join_path(get_task_configs_path(), "gradient_mpc.yml")) + if step_dt is not None: + grad_config_data["model"]["dt_traj_params"]["base_dt"] = step_dt + grad_config_data["model"]["dt_traj_params"]["max_dt"] = step_dt + + config_data["model"] = grad_config_data["model"] + if use_cuda_graph is not None: + grad_config_data["lbfgs"]["use_cuda_graph"] = use_cuda_graph + + cfg = ArmReacherConfig.from_dict( + robot_cfg, + config_data["model"], + config_data["cost"], + base_cfg["constraint"], + base_cfg["convergence"], + base_cfg["world_collision_checker_cfg"], + world_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + arm_rollout_mppi = ArmReacher(cfg) + arm_rollout_safety = ArmReacher(cfg) + config_data["mppi"]["store_rollouts"] = store_rollouts + if use_cuda_graph is not None: + config_data["mppi"]["use_cuda_graph"] = use_cuda_graph + if use_cuda_graph_full_step: + config_data["mppi"]["sync_cuda_time"] = False + config_dict = ParallelMPPIConfig.create_data_dict( + config_data["mppi"], arm_rollout_mppi, tensor_args + ) + solvers = [] + parallel_mppi = None + if use_es is not None and use_es: + log_warn("ES solver for MPC is highly experimental, not safe to run on real robots") + + mppi_cfg = ParallelESConfig(**config_dict) + if es_learning_rate is not None: + mppi_cfg.learning_rate = es_learning_rate + parallel_mppi = ParallelES(mppi_cfg) + elif use_mppi: + mppi_cfg = ParallelMPPIConfig(**config_dict) + parallel_mppi = ParallelMPPI(mppi_cfg) + if parallel_mppi is not None: + solvers.append(parallel_mppi) + if use_lbfgs: + log_warn("LBFGS solver for MPC is highly experimental, not safe to run on real robots") + grad_cfg = ArmReacherConfig.from_dict( + robot_cfg, + grad_config_data["model"], + grad_config_data["cost"], + base_cfg["constraint"], + base_cfg["convergence"], + base_cfg["world_collision_checker_cfg"], + world_cfg, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + arm_rollout_grad = ArmReacher(grad_cfg) + lbfgs_cfg_dict = LBFGSOptConfig.create_data_dict( + grad_config_data["lbfgs"], arm_rollout_grad, tensor_args + ) + lbfgs = LBFGSOpt(LBFGSOptConfig(**lbfgs_cfg_dict)) + solvers.append(lbfgs) + + mpc_cfg = WrapConfig( + safety_rollout=arm_rollout_safety, + optimizers=solvers, + compute_metrics=compute_metrics, + use_cuda_graph_metrics=use_cuda_graph_metrics, + sync_cuda_time=sync_cuda_time, + ) + solver = WrapMpc(mpc_cfg) + return MpcSolverConfig( + solver, + tensor_args=tensor_args, + use_cuda_graph_full_step=use_cuda_graph_full_step, + world_coll_checker=world_coll_checker, + ) + + +class MpcSolver(MpcSolverConfig): + """Model Predictive Control Solver for Arm Reacher task. 
+
+    Args:
+        config: MpcSolverConfig with a wrapped solver and collision checker.
+    """
+
+    def __init__(self, config: MpcSolverConfig) -> None:
+        super().__init__(**vars(config))
+        self.tensor_args = self.solver.rollout_fn.tensor_args
+        self.batch_size = -1
+        self._goal_buffer = None
+        self._solve_state = None
+        self._col = None
+        self._step_goal_buffer = None
+        self._cu_state_in = None
+        self._cu_seed = None
+        self._cu_step_init = None
+        self._cu_step_graph = None
+        self._cu_result = None
+
+    def _update_batch_size(self, batch_size):
+        if self.batch_size != batch_size:
+            self.batch_size = batch_size
+
+    def update_goal_buffer(
+        self,
+        solve_state: ReacherSolveState,
+        goal: Goal,
+    ) -> Goal:
+        self._solve_state, self._goal_buffer, update_reference = solve_state.update_goal(
+            goal,
+            self._solve_state,
+            self._goal_buffer,
+            self.tensor_args,
+        )
+        if update_reference:
+            self.solver.update_nenvs(self._solve_state.get_batch_size())
+            self.reset()
+            self.reset_cuda_graph()
+            self._col = torch.arange(
+                0, goal.batch, device=self.tensor_args.device, dtype=torch.long
+            )
+            self._step_goal_buffer = Goal(
+                current_state=self._goal_buffer.current_state.clone(),
+                batch_current_state_idx=self._goal_buffer.batch_current_state_idx.clone(),
+            )
+        return self._goal_buffer
+
+    def step(
+        self,
+        current_state: JointState,
+        shift_steps: int = 1,
+        seed_traj: Optional[JointState] = None,
+        max_attempts: int = 1,
+    ):
+        # start from not-converged; set True only when an attempt passes the checks:
+        converged = False
+
+        for _ in range(max_attempts):
+            result = self.step_once(current_state.clone(), shift_steps, seed_traj)
+            if (
+                torch.count_nonzero(torch.isnan(result.action.position)) == 0
+                and torch.max(torch.abs(result.action.position)) < 10.0
+                and torch.count_nonzero(~result.metrics.feasible) == 0
+            ):
+                converged = True
+                break
+            self.reset()
+        if not converged:
+            result.action.copy_(current_state)
+            log_warn("NOT CONVERGED")
+
+        return result
+
+    def step_once(
+        self,
+        current_state: JointState,
+        shift_steps: int = 1,
+        seed_traj: Optional[JointState] = None,
+    ) -> WrapResult:
+        # Create a cuda graph for the whole solve step, including computation of
+        # metrics and updating of the goal buffers.
+
+        if self._solve_state is None:
+            log_error("Need to first setup solve state before calling solve()")
+
+        if self.use_cuda_graph_full_step:
+            st_time = time.time()
+            if not self._cu_step_init:
+                self._initialize_cuda_graph_step(current_state, shift_steps, seed_traj)
+            self._cu_state_in.copy_(current_state)
+            if seed_traj is not None:
+                self._cu_seed.copy_(seed_traj)
+            self._cu_step_graph.replay()
+            result = self._cu_result.clone()
+            torch.cuda.synchronize()
+            result.solve_time = time.time() - st_time
+        else:
+            self._step_goal_buffer.current_state.copy_(current_state)
+            result = self._solve_from_solve_state(
+                self._solve_state,
+                self._step_goal_buffer,
+                shift_steps,
+                seed_traj,
+            )
+
+        return result
+
+    def _step(
+        self,
+        current_state: JointState,
+        shift_steps: int = 1,
+        seed_traj: Optional[JointState] = None,
+    ):
+        self._step_goal_buffer.current_state.copy_(current_state)
+        result = self._solve_from_solve_state(
+            self._solve_state,
+            self._step_goal_buffer,
+            shift_steps,
+            seed_traj,
+        )
+
+        return result
+
+    def _initialize_cuda_graph_step(
+        self,
+        current_state: JointState,
+        shift_steps: int = 1,
+        seed_traj: Optional[JointState] = None,
+    ):
+        log_info("MpcSolver: Creating Cuda Graph")
+        self._cu_state_in = current_state.clone()
+        if seed_traj is not None:
+            self._cu_seed = seed_traj.clone()
+        s = torch.cuda.Stream(device=self.tensor_args.device)
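+        # Descriptive note (added): warm up on a side stream before capture so
+        # allocations and kernels are initialized, then capture one solve step
+        # into a CUDA graph that step_once() replays with only buffer copies.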
+        s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device))
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                self._cu_result = self._step(
+                    self._cu_state_in, shift_steps=shift_steps, seed_traj=self._cu_seed
+                )
+        torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
+        self.reset()
+        self._cu_step_graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self._cu_step_graph, stream=s):
+            self._cu_result = self._step(
+                self._cu_state_in, shift_steps=shift_steps, seed_traj=self._cu_seed
+            )
+        self._cu_step_init = True
+
+    def setup_solve_single(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.SINGLE, num_mpc_seeds=num_seeds, batch_size=1, n_envs=1, n_goalset=1
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def setup_solve_goalset(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.GOALSET,
+            num_mpc_seeds=num_seeds,
+            batch_size=1,
+            n_envs=1,
+            n_goalset=goal.n_goalset,
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def setup_solve_batch(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.BATCH,
+            num_mpc_seeds=num_seeds,
+            batch_size=goal.batch,
+            n_envs=1,
+            n_goalset=1,
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def setup_solve_batch_goalset(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.BATCH_GOALSET,
+            num_mpc_seeds=num_seeds,
+            batch_size=goal.batch,
+            n_envs=1,
+            n_goalset=goal.n_goalset,
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def setup_solve_batch_env(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.BATCH_ENV,
+            num_mpc_seeds=num_seeds,
+            batch_size=goal.batch,
+            n_envs=1,
+            n_goalset=1,
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def setup_solve_batch_env_goalset(self, goal: Goal, num_seeds: Optional[int] = None) -> Goal:
+        solve_state = ReacherSolveState(
+            ReacherSolveType.BATCH_ENV_GOALSET,
+            num_mpc_seeds=num_seeds,
+            batch_size=goal.batch,
+            n_envs=1,
+            n_goalset=goal.n_goalset,
+        )
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        return goal_buffer
+
+    def _solve_from_solve_state(
+        self,
+        solve_state: ReacherSolveState,
+        goal: Goal,
+        shift_steps: int = 1,
+        seed_traj: Optional[JointState] = None,
+    ) -> WrapResult:
+        if solve_state.batch_env:
+            if solve_state.batch_size > self.world_coll_checker.n_envs:
+                raise ValueError("Number of environments is less than the goal batch size")
+
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        # NOTE: implement initialization from seed set here:
+        if seed_traj is not None:
+            self.solver.update_init_seed(seed_traj)
+
+        result = self.solver.solve(goal_buffer, seed_traj, shift_steps)
+        result.js_action = self.rollout_fn.get_full_dof_from_solution(result.action)
+        return result
+
+    def fn(self):
+        # this will run one step of optimization and get new command
+        pass
+
+    def update_goal(self, goal: Goal):
+        self.solver.update_params(goal)
+        return True
+
+    def reset(self):
+        # reset warm start
+        self.solver.reset()
+
+    def reset_cuda_graph(self):
+        self.solver.reset_cuda_graph()
+
+    @property
+    def rollout_fn(self):
+        return self.solver.safety_rollout
+
+    def
enable_cspace_cost(self, enable=True): + self.solver.safety_rollout.enable_cspace_cost(enable) + for opt in self.solver.optimizers: + opt.rollout_fn.enable_cspace_cost(enable) + + def enable_pose_cost(self, enable=True): + self.solver.safety_rollout.enable_pose_cost(enable) + for opt in self.solver.optimizers: + opt.rollout_fn.enable_pose_cost(enable) + + def get_active_js( + self, + in_js: JointState, + ): + opt_jnames = self.rollout_fn.joint_names + opt_js = in_js.get_ordered_joint_state(opt_jnames) + return opt_js + + @property + def joint_names(self): + return self.rollout_fn.joint_names + + def update_world(self, world: WorldConfig): + self.world_coll_checker.load_collision_model(world) + return True + + def get_visual_rollouts(self): + return self.solver.optimizers[0].get_rollouts() + + @property + def kinematics(self): + return self.solver.safety_rollout.dynamics_model.robot_model + + @property + def world_collision(self): + return self.world_coll_checker diff --git a/src/curobo/wrap/reacher/trajopt.py b/src/curobo/wrap/reacher/trajopt.py new file mode 100644 index 00000000..b607aa80 --- /dev/null +++ b/src/curobo/wrap/reacher/trajopt.py @@ -0,0 +1,1198 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + + +# Standard Library +import math +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence, Union + +# Third Party +import torch +import torch.autograd.profiler as profiler + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelState +from curobo.geom.sdf.utils import create_collision_checker +from curobo.geom.sdf.world import WorldCollision, WorldCollisionConfig +from curobo.geom.types import WorldConfig +from curobo.opt.newton.lbfgs import LBFGSOpt, LBFGSOptConfig +from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig +from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig +from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig +from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig +from curobo.rollout.dynamics_model.integration_utils import ( + action_interpolate_kernel, + interpolate_kernel, +) +from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics +from curobo.types.base import TensorDeviceType +from curobo.types.robot import JointState, RobotConfig +from curobo.types.tensor import T_BDOF, T_DOF, T_BValue_bool, T_BValue_float +from curobo.util.helpers import list_idx_if_not_none +from curobo.util.logger import log_info, log_warn +from curobo.util.trajectory import ( + InterpolateType, + calculate_dt_no_clamp, + get_batch_interpolated_trajectory, +) +from curobo.util_file import get_robot_configs_path, get_task_configs_path, join_path, load_yaml +from curobo.wrap.reacher.evaluator import TrajEvaluator, TrajEvaluatorConfig +from curobo.wrap.reacher.types import ReacherSolveState, ReacherSolveType +from curobo.wrap.wrap_base import WrapBase, WrapConfig, WrapResult + + +@dataclass +class TrajOptSolverConfig: + robot_config: RobotConfig + solver: WrapBase + rollout_fn: 
ArmReacher + position_threshold: float + rotation_threshold: float + traj_tsteps: int + use_cspace_seed: bool = True + interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA + interpolation_steps: int = 1000 + world_coll_checker: Optional[WorldCollision] = None + seed_ratio: Optional[Dict[str, int]] = None + num_seeds: int = 1 + bias_node: Optional[T_DOF] = None + interpolation_dt: float = 0.01 + traj_evaluator_config: TrajEvaluatorConfig = TrajEvaluatorConfig() + traj_evaluator: Optional[TrajEvaluator] = None + evaluate_interpolated_trajectory: bool = True + cspace_threshold: float = 0.1 + tensor_args: TensorDeviceType = TensorDeviceType() + sync_cuda_time: bool = True + interpolate_rollout: Optional[ArmReacher] = None + use_cuda_graph_metrics: bool = False + trim_steps: Optional[List[int]] = None + store_debug_in_result: bool = False + + @staticmethod + @profiler.record_function("trajopt_config/load_from_robot_config") + def load_from_robot_config( + robot_cfg: Union[str, Dict, RobotConfig], + world_model: Optional[ + Union[Union[List[Dict], List[WorldConfig]], Union[Dict, WorldConfig]] + ] = None, + tensor_args: TensorDeviceType = TensorDeviceType(), + position_threshold: float = 0.005, + rotation_threshold: float = 0.05, + cspace_threshold: float = 0.05, + world_coll_checker=None, + base_cfg_file: str = "base_cfg.yml", + particle_file: str = "particle_trajopt.yml", + gradient_file: str = "gradient_trajopt.yml", + traj_tsteps: Optional[int] = None, + interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA, + interpolation_steps: int = 10000, + interpolation_dt: float = 0.01, + use_cuda_graph: Optional[bool] = None, + self_collision_check: bool = False, + self_collision_opt: bool = True, + grad_trajopt_iters: Optional[int] = None, + num_seeds: int = 2, + seed_ratio: Dict[str, int] = {"linear": 1.0, "bias": 0.0, "start": 0.0, "end": 0.0}, + use_particle_opt: bool = True, + traj_evaluator_config: TrajEvaluatorConfig = TrajEvaluatorConfig(), + traj_evaluator: Optional[TrajEvaluator] = None, + minimize_jerk: Optional[bool] = None, + use_gradient_descent: bool = False, + collision_cache: Optional[Dict[str, int]] = None, + n_collision_envs: Optional[int] = None, + use_es: Optional[bool] = None, + es_learning_rate: Optional[float] = 0.1, + use_fixed_samples: Optional[bool] = None, + aux_rollout: Optional[ArmReacher] = None, + evaluate_interpolated_trajectory: bool = True, + fixed_iters: Optional[bool] = None, + store_debug: bool = False, + sync_cuda_time: bool = True, + collision_activation_distance: Optional[float] = None, + trajopt_dt: Optional[float] = None, + trim_steps: Optional[List[int]] = None, + store_debug_in_result: bool = False, + smooth_weight: Optional[List[float]] = None, + state_finite_difference_mode: Optional[str] = None, + filter_robot_command: bool = False, + ): + # use default values, disable environment collision checking + if isinstance(robot_cfg, str): + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"] + + if isinstance(robot_cfg, dict): + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + base_config_data = load_yaml(join_path(get_task_configs_path(), base_cfg_file)) + if collision_cache is not None: + base_config_data["world_collision_checker_cfg"]["cache"] = collision_cache + if n_collision_envs is not None: + base_config_data["world_collision_checker_cfg"]["n_envs"] = n_collision_envs + if not self_collision_check: + base_config_data["constraint"]["self_collision_cfg"]["weight"] = 0.0 + 
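+            # Descriptive note (added): with the self-collision constraint disabled,
+            # the matching optimization cost is also skipped (self_collision_opt below).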
self_collision_opt = False + if not minimize_jerk: + filter_robot_command = False + + if world_coll_checker is None and world_model is not None: + world_cfg = WorldCollisionConfig.load_from_dict( + base_config_data["world_collision_checker_cfg"], world_model, tensor_args + ) + world_coll_checker = create_collision_checker(world_cfg) + + config_data = load_yaml(join_path(get_task_configs_path(), particle_file)) + grad_config_data = load_yaml(join_path(get_task_configs_path(), gradient_file)) + + if traj_tsteps is None: + traj_tsteps = grad_config_data["model"]["horizon"] + + config_data["model"]["horizon"] = traj_tsteps + grad_config_data["model"]["horizon"] = traj_tsteps + if minimize_jerk is not None: + if not minimize_jerk: + grad_config_data["cost"]["bound_cfg"]["smooth_weight"][2] = 0.0 + grad_config_data["cost"]["bound_cfg"]["smooth_weight"][1] *= 2.0 + grad_config_data["lbfgs"]["cost_delta_threshold"] = 0.1 + if minimize_jerk and grad_config_data["cost"]["bound_cfg"]["smooth_weight"][2] == 0.0: + log_warn("minimize_jerk flag is enabled but weight term is zero") + + if state_finite_difference_mode is not None: + config_data["model"]["state_finite_difference_mode"] = state_finite_difference_mode + grad_config_data["model"]["state_finite_difference_mode"] = state_finite_difference_mode + config_data["model"]["filter_robot_command"] = filter_robot_command + grad_config_data["model"]["filter_robot_command"] = filter_robot_command + + if collision_activation_distance is not None: + config_data["cost"]["primitive_collision_cfg"][ + "activation_distance" + ] = collision_activation_distance + grad_config_data["cost"]["primitive_collision_cfg"][ + "activation_distance" + ] = collision_activation_distance + + if grad_trajopt_iters is not None: + grad_config_data["lbfgs"]["n_iters"] = grad_trajopt_iters + grad_config_data["lbfgs"]["cold_start_n_iters"] = grad_trajopt_iters + if use_fixed_samples is not None: + config_data["mppi"]["sample_params"]["fixed_samples"] = use_fixed_samples + if smooth_weight is not None: + grad_config_data["cost"]["bound_cfg"]["smooth_weight"][ + : len(smooth_weight) + ] = smooth_weight # velocity + + if store_debug: + use_cuda_graph = False + fixed_iters = True + grad_config_data["lbfgs"]["store_debug"] = store_debug + config_data["mppi"]["store_debug"] = store_debug + + if use_cuda_graph is not None: + config_data["mppi"]["use_cuda_graph"] = use_cuda_graph + grad_config_data["lbfgs"]["use_cuda_graph"] = use_cuda_graph + else: + use_cuda_graph = grad_config_data["lbfgs"]["use_cuda_graph"] + if not self_collision_opt: + config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 + grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0 + config_data["mppi"]["n_envs"] = 1 + grad_config_data["lbfgs"]["n_envs"] = 1 + + if fixed_iters is not None: + grad_config_data["lbfgs"]["fixed_iters"] = fixed_iters + + grad_cfg = ArmReacherConfig.from_dict( + robot_cfg, + grad_config_data["model"], + grad_config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + cfg = ArmReacherConfig.from_dict( + robot_cfg, + config_data["model"], + config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + safety_robot_model = robot_cfg.kinematics + 
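+        # Descriptive note (added): the safety rollout gets a copy of the robot config
+        # that shares the same kinematics model, so its constraint checks are
+        # consistent with the optimizer rollouts.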
safety_robot_cfg = RobotConfig(**vars(robot_cfg)) + safety_robot_cfg.kinematics = safety_robot_model + safety_cfg = ArmReacherConfig.from_dict( + safety_robot_cfg, + config_data["model"], + config_data["cost"], + base_config_data["constraint"], + base_config_data["convergence"], + base_config_data["world_collision_checker_cfg"], + world_model, + world_coll_checker=world_coll_checker, + tensor_args=tensor_args, + ) + + with profiler.record_function("trajopt_config/create_rollouts"): + arm_rollout_mppi = ArmReacher(cfg) + arm_rollout_grad = ArmReacher(grad_cfg) + + arm_rollout_safety = ArmReacher(safety_cfg) + if aux_rollout is None: + aux_rollout = ArmReacher(safety_cfg) + interpolate_rollout = ArmReacher(safety_cfg) + if trajopt_dt is not None: + arm_rollout_mppi.update_traj_dt(dt=trajopt_dt) + aux_rollout.update_traj_dt(dt=trajopt_dt) + arm_rollout_grad.update_traj_dt(dt=trajopt_dt) + arm_rollout_safety.update_traj_dt(dt=trajopt_dt) + + config_dict = ParallelMPPIConfig.create_data_dict( + config_data["mppi"], arm_rollout_mppi, tensor_args + ) + if use_es is not None and use_es: + mppi_cfg = ParallelESConfig(**config_dict) + if es_learning_rate is not None: + mppi_cfg.learning_rate = es_learning_rate + parallel_mppi = ParallelES(mppi_cfg) + else: + mppi_cfg = ParallelMPPIConfig(**config_dict) + parallel_mppi = ParallelMPPI(mppi_cfg) + + config_dict = LBFGSOptConfig.create_data_dict( + grad_config_data["lbfgs"], arm_rollout_grad, tensor_args + ) + lbfgs_cfg = LBFGSOptConfig(**config_dict) + + if use_gradient_descent: + newton_keys = NewtonOptConfig.__dataclass_fields__.keys() + newton_d = {} + lbfgs_k = vars(lbfgs_cfg) + for k in newton_keys: + newton_d[k] = lbfgs_k[k] + newton_d["step_scale"] = 0.9 + newton_cfg = NewtonOptConfig(**newton_d) + lbfgs = NewtonOptBase(newton_cfg) + else: + lbfgs = LBFGSOpt(lbfgs_cfg) + if use_particle_opt: + opt_list = [parallel_mppi] + else: + opt_list = [] + opt_list.append(lbfgs) + cfg = WrapConfig( + safety_rollout=arm_rollout_safety, + optimizers=opt_list, + compute_metrics=not evaluate_interpolated_trajectory, + use_cuda_graph_metrics=grad_config_data["lbfgs"]["use_cuda_graph"], + sync_cuda_time=sync_cuda_time, + ) + trajopt = WrapBase(cfg) + trajopt_cfg = TrajOptSolverConfig( + robot_config=robot_cfg, + rollout_fn=aux_rollout, + solver=trajopt, + position_threshold=position_threshold, + rotation_threshold=rotation_threshold, + cspace_threshold=cspace_threshold, + traj_tsteps=traj_tsteps, + interpolation_steps=interpolation_steps, + interpolation_dt=interpolation_dt, + interpolation_type=interpolation_type, + world_coll_checker=world_coll_checker, + bias_node=robot_cfg.kinematics.cspace.retract_config, + seed_ratio=seed_ratio, + num_seeds=num_seeds, + traj_evaluator_config=traj_evaluator_config, + traj_evaluator=traj_evaluator, + evaluate_interpolated_trajectory=evaluate_interpolated_trajectory, + tensor_args=tensor_args, + sync_cuda_time=sync_cuda_time, + interpolate_rollout=interpolate_rollout, + use_cuda_graph_metrics=use_cuda_graph, + trim_steps=trim_steps, + store_debug_in_result=store_debug_in_result, + ) + return trajopt_cfg + + +@dataclass +class TrajResult(Sequence): + success: T_BValue_bool + goal: Goal + solution: JointState + seed: T_BDOF + solve_time: float + debug_info: Optional[Any] = None + metrics: Optional[RolloutMetrics] = None + interpolated_solution: Optional[JointState] = None + path_buffer_last_tstep: Optional[List[int]] = None + position_error: Optional[T_BValue_float] = None + rotation_error: Optional[T_BValue_float] = None + 
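+    # c-space (joint-space) error to the goal state; populated for joint-target solves: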
cspace_error: Optional[T_BValue_float] = None + smooth_error: Optional[T_BValue_float] = None + smooth_label: Optional[T_BValue_bool] = None + optimized_dt: Optional[torch.Tensor] = None + raw_solution: Optional[JointState] = None + + def __getitem__(self, idx): + # position_error = rotation_error = cspace_error = path_buffer_last_tstep = None + # metrics = interpolated_solution = None + + d_list = [ + self.interpolated_solution, + self.metrics, + self.path_buffer_last_tstep, + self.position_error, + self.rotation_error, + self.cspace_error, + ] + idx_vals = list_idx_if_not_none(d_list, idx) + + return TrajResult( + goal=self.goal[idx], + solution=self.solution[idx], + success=self.success[idx], + seed=self.seed[idx], + debug_info=self.debug_info, + solve_time=self.solve_time, + interpolated_solution=idx_vals[0], + metrics=idx_vals[1], + path_buffer_last_tstep=idx_vals[2], + position_error=idx_vals[3], + rotation_error=idx_vals[4], + cspace_error=idx_vals[5], + ) + + def __len__(self): + return self.success.shape[0] + + +class TrajOptSolver(TrajOptSolverConfig): + def __init__(self, config: TrajOptSolverConfig) -> None: + super().__init__(**vars(config)) + self.dof = self.rollout_fn.d_action + self.delta_vec = action_interpolate_kernel(2, self.traj_tsteps, self.tensor_args).unsqueeze( + 0 + ) + self.waypoint_delta_vec = interpolate_kernel( + 3, int(self.traj_tsteps / 2), self.tensor_args + ).unsqueeze(0) + self.waypoint_delta_vec = torch.roll(self.waypoint_delta_vec, -1, dims=1) + self.waypoint_delta_vec[:, -1, :] = self.waypoint_delta_vec[:, -2, :] + assert self.traj_tsteps / 2 != 0.0 + self.solver.update_nenvs(self.num_seeds) + + self._max_joint_vel = ( + self.solver.safety_rollout.state_bounds.velocity.view(2, self.dof)[1, :].reshape( + 1, 1, self.dof + ) + ) - 0.02 + self._max_joint_acc = self.rollout_fn.state_bounds.acceleration[1, :] - 0.02 + self._max_joint_jerk = self.rollout_fn.state_bounds.jerk[1, :] - 0.02 + # self._max_joint_jerk = self._max_joint_jerk * 0.0 + 10.0 + self._num_seeds = -1 + self._col = None + if self.traj_evaluator is None: + self.traj_evaluator = TrajEvaluator(self.traj_evaluator_config) + self._interpolation_dt_tensor = self.tensor_args.to_device([self.interpolation_dt]) + self._n_seeds = self._get_seed_numbers(self.num_seeds) + self._goal_buffer = None + self._solve_state = None + self._velocity_bounds = self.solver.rollout_fn.state_bounds.velocity[1] + self._og_newton_iters = self.solver.optimizers[-1].outer_iters + self._og_newton_fixed_iters = self.solver.optimizers[-1].fixed_iters + self._interpolated_traj_buffer = None + self._kin_list = None + self._rollout_list = None + + def get_all_rollout_instances(self) -> List[RolloutBase]: + if self._rollout_list is None: + self._rollout_list = [ + self.rollout_fn, + self.interpolate_rollout, + ] + self.solver.get_all_rollout_instances() + return self._rollout_list + + def get_all_kinematics_instances(self) -> List[CudaRobotModel]: + if self._kin_list is None: + self._kin_list = [ + i.dynamics_model.robot_model for i in self.get_all_rollout_instances() + ] + return self._kin_list + + def attach_object_to_robot( + self, + sphere_radius: float, + sphere_tensor: Optional[torch.Tensor] = None, + link_name: str = "attached_object", + ) -> None: + for k in self.get_all_kinematics_instances(): + k.attach_object( + sphere_radius=sphere_radius, sphere_tensor=sphere_tensor, link_name=link_name + ) + + def detach_object_from_robot(self, link_name: str = "attached_object") -> None: + for k in 
self.get_all_kinematics_instances():
+            k.detach_object(link_name)
+
+    def update_goal_buffer(
+        self,
+        solve_state: ReacherSolveState,
+        goal: Goal,
+    ):
+        self._solve_state, self._goal_buffer, update_reference = solve_state.update_goal(
+            goal,
+            self._solve_state,
+            self._goal_buffer,
+            self.tensor_args,
+        )
+        if update_reference:
+            self.solver.update_nenvs(self._solve_state.get_batch_size())
+            self.reset_cuda_graph()
+            self._col = torch.arange(
+                0, goal.batch, device=self.tensor_args.device, dtype=torch.long
+            )
+        return self._goal_buffer
+
+    def solve_any(
+        self,
+        solve_type: ReacherSolveType,
+        goal: Goal,
+        seed_traj: Optional[JointState] = None,
+        use_nn_seed: bool = False,
+        return_all_solutions: bool = False,
+        num_seeds: Optional[int] = None,
+        seed_success: Optional[torch.Tensor] = None,
+        newton_iters: Optional[int] = None,
+    ) -> TrajResult:
+        if solve_type == ReacherSolveType.SINGLE:
+            return self.solve_single(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                newton_iters=newton_iters,
+            )
+        elif solve_type == ReacherSolveType.GOALSET:
+            return self.solve_goalset(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                newton_iters=newton_iters,
+            )
+        elif solve_type == ReacherSolveType.BATCH:
+            return self.solve_batch(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                seed_success,
+                newton_iters=newton_iters,
+            )
+        elif solve_type == ReacherSolveType.BATCH_GOALSET:
+            return self.solve_batch_goalset(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                seed_success,
+                newton_iters=newton_iters,
+            )
+        elif solve_type == ReacherSolveType.BATCH_ENV:
+            return self.solve_batch_env(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                seed_success,
+                newton_iters=newton_iters,
+            )
+        elif solve_type == ReacherSolveType.BATCH_ENV_GOALSET:
+            return self.solve_batch_env_goalset(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                seed_success,
+                newton_iters=newton_iters,
+            )
+
+    def solve_from_solve_state(
+        self,
+        solve_state: ReacherSolveState,
+        goal: Goal,
+        seed_traj: Optional[JointState] = None,
+        use_nn_seed: bool = False,
+        return_all_solutions: bool = False,
+        num_seeds: Optional[int] = None,
+        seed_success: Optional[torch.Tensor] = None,
+        newton_iters: Optional[int] = None,
+    ):
+        if solve_state.batch_env:
+            if solve_state.batch_size > self.world_coll_checker.n_envs:
+                raise ValueError(
+                    "Goal batch size is larger than the number of collision environments"
+                )
+        if newton_iters is not None:
+            self.solver.newton_optimizer.outer_iters = newton_iters
+            self.solver.newton_optimizer.fixed_iters = True
+        goal_buffer = self.update_goal_buffer(solve_state, goal)
+        # if self.evaluate_interpolated_trajectory:
+        self.interpolate_rollout.update_params(goal_buffer)
+        # get seeds:
+        seed_traj = self.get_seed_set(
+            goal_buffer, seed_traj, seed_success, num_seeds, solve_state.batch_mode
+        )
+
+        # remove goal state if goal pose is not None:
+        if goal_buffer.goal_pose.position is not None:
+            goal_buffer.goal_state = None
+        self.solver.reset()
+        result = self.solver.solve(goal_buffer, seed_traj)
+        log_info("Ran TO")
+        traj_result = self._get_result(
+            result,
+            return_all_solutions,
+            goal_buffer,
+            seed_traj,
+            num_seeds,
+            solve_state.batch_mode,
+        )
+        if newton_iters is not None:
+            self.solver.newton_optimizer.outer_iters = self._og_newton_iters
+            self.solver.newton_optimizer.fixed_iters = self._og_newton_fixed_iters
+        return traj_result
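+
+    # Example: solve_any() gives callers a single entry point. With `solver`
+    # a TrajOptSolver instance (illustrative name),
+    #
+    #   result = solver.solve_any(ReacherSolveType.GOALSET, goal, num_seeds=8)
+    #
+    # is equivalent to solver.solve_goalset(goal, num_seeds=8).
+
+    def solve_single(
+        self,
+        goal: Goal,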
seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + solve_state = ReacherSolveState( + ReacherSolveType.SINGLE, + num_trajopt_seeds=num_seeds, + batch_size=1, + n_envs=1, + n_goalset=1, + ) + + return self.solve_from_solve_state( + solve_state, + goal, + seed_traj, + use_nn_seed, + return_all_solutions, + num_seeds, + newton_iters=newton_iters, + ) + + def solve_goalset( + self, + goal: Goal, + seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + solve_state = ReacherSolveState( + ReacherSolveType.GOALSET, + num_trajopt_seeds=num_seeds, + batch_size=1, + n_envs=1, + n_goalset=goal.n_goalset, + ) + return self.solve_from_solve_state( + solve_state, + goal, + seed_traj, + use_nn_seed, + return_all_solutions, + num_seeds, + newton_iters=newton_iters, + ) + + def solve_batch( + self, + goal: Goal, + seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + seed_success: Optional[torch.Tensor] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + solve_state = ReacherSolveState( + ReacherSolveType.BATCH, + num_trajopt_seeds=num_seeds, + batch_size=goal.batch, + n_envs=1, + n_goalset=1, + ) + return self.solve_from_solve_state( + solve_state, + goal, + seed_traj, + use_nn_seed, + return_all_solutions, + num_seeds, + seed_success, + newton_iters=newton_iters, + ) + + def solve_batch_goalset( + self, + goal: Goal, + seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + seed_success: Optional[torch.Tensor] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + solve_state = ReacherSolveState( + ReacherSolveType.BATCH_GOALSET, + num_trajopt_seeds=num_seeds, + batch_size=goal.batch, + n_envs=1, + n_goalset=goal.n_goalset, + ) + return self.solve_from_solve_state( + solve_state, + goal, + seed_traj, + use_nn_seed, + return_all_solutions, + num_seeds, + newton_iters=newton_iters, + ) + + def solve_batch_env( + self, + goal: Goal, + seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + seed_success: Optional[torch.Tensor] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + solve_state = ReacherSolveState( + ReacherSolveType.BATCH_ENV, + num_trajopt_seeds=num_seeds, + batch_size=goal.batch, + n_envs=goal.batch, + n_goalset=1, + ) + return self.solve_from_solve_state( + solve_state, + goal, + seed_traj, + use_nn_seed, + return_all_solutions, + num_seeds, + seed_success, + newton_iters=newton_iters, + ) + + def solve_batch_env_goalset( + self, + goal: Goal, + seed_traj: Optional[JointState] = None, + use_nn_seed: bool = False, + return_all_solutions: bool = False, + num_seeds: Optional[int] = None, + seed_success: Optional[torch.Tensor] = None, + newton_iters: Optional[int] = None, + ) -> TrajResult: + if num_seeds is None: + num_seeds = self.num_seeds + 
solve_state = ReacherSolveState(
+            ReacherSolveType.BATCH_ENV_GOALSET,
+            num_trajopt_seeds=num_seeds,
+            batch_size=goal.batch,
+            n_envs=goal.batch,
+            n_goalset=goal.n_goalset,
+        )
+        return self.solve_from_solve_state(
+            solve_state,
+            goal,
+            seed_traj,
+            use_nn_seed,
+            return_all_solutions,
+            num_seeds,
+            newton_iters=newton_iters,
+        )
+
+    def solve(
+        self,
+        goal: Goal,
+        seed_traj: Optional[JointState] = None,
+        use_nn_seed: bool = False,
+        return_all_solutions: bool = False,
+        num_seeds: Optional[int] = None,
+        newton_iters: Optional[int] = None,
+    ) -> TrajResult:
+        """Deprecated single-goal entry point; dispatches on the goal's layout.
+
+        Args:
+            goal (Goal): Goal with target pose(s) and the current robot state.
+            seed_traj (Optional[JointState], optional): Seed trajectories to
+                warm-start optimization. Defaults to None (seeds are generated).
+            use_nn_seed (bool, optional): Reserved flag for seeding from a
+                learned model; currently unused. Defaults to False.
+            return_all_solutions (bool, optional): Return every seed's solution
+                instead of only the best one. Defaults to False.
+            num_seeds (Optional[int], optional): Number of parallel seeds.
+                Defaults to None, which uses the configured value.
+            newton_iters (Optional[int], optional): Override for the Newton
+                outer iterations. Defaults to None.
+
+        Raises:
+            NotImplementedError: If the goal has batch > 1 and n_goalset > 1.
+
+        Returns:
+            TrajResult: Result of trajectory optimization.
+        """
+        log_warn("TrajOpt.solve() is deprecated, use TrajOpt.solve_single or others instead")
+        if goal.goal_pose.batch == 1 and goal.goal_pose.n_goalset == 1:
+            return self.solve_single(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                newton_iters=newton_iters,
+            )
+        if goal.goal_pose.batch == 1 and goal.goal_pose.n_goalset > 1:
+            return self.solve_goalset(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                newton_iters=newton_iters,
+            )
+        if goal.goal_pose.batch > 1 and goal.goal_pose.n_goalset == 1:
+            return self.solve_batch(
+                goal,
+                seed_traj,
+                use_nn_seed,
+                return_all_solutions,
+                num_seeds,
+                newton_iters=newton_iters,
+            )
+
+        raise NotImplementedError()
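+
+    # Migration example: code that previously called the deprecated
+    #   result = solver.solve(goal)
+    # should now call the explicit variant matching the goal's layout, e.g.
+    #   result = solver.solve_single(goal)
+    # for a single pose target.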
+
+    @profiler.record_function("trajopt/get_result")
+    def _get_result(
+        self,
+        result: WrapResult,
+        return_all_solutions: bool,
+        goal: Goal,
+        seed_traj: JointState,
+        num_seeds: int,
+        batch_mode: bool = False,
+    ):
+        st_time = time.time()
+        if self.trim_steps is not None:
+            result.action = result.action.trim_trajectory(self.trim_steps[0], self.trim_steps[1])
+        interpolated_trajs, last_tstep, opt_dt = self.get_interpolated_trajectory(result.action)
+        if self.sync_cuda_time:
+            torch.cuda.synchronize()
+        interpolation_time = time.time() - st_time
+        if self.evaluate_interpolated_trajectory:
+            with profiler.record_function("trajopt/evaluate_interpolated"):
+                if self.use_cuda_graph_metrics:
+                    result.metrics = self.interpolate_rollout.get_metrics_cuda_graph(
+                        interpolated_trajs
+                    )
+                else:
+                    result.metrics = self.interpolate_rollout.get_metrics(interpolated_trajs)
+
+        st_time = time.time()
+
+        feasible = torch.all(result.metrics.feasible, dim=-1)
+
+        if result.metrics.position_error is not None:
+            converge = torch.logical_and(
+                result.metrics.position_error[..., -1] <= self.position_threshold,
+                result.metrics.rotation_error[..., -1] <= self.rotation_threshold,
+            )
+        elif result.metrics.cspace_error is not None:
+            converge = result.metrics.cspace_error[..., -1] <= self.cspace_threshold
+        else:
+            raise ValueError("convergence check requires either goal_pose or goal_state")
+
+        success = torch.logical_and(feasible, converge)
+        if return_all_solutions:
+            traj_result = TrajResult(
+                success=success,
+                goal=goal,
+                solution=result.action.scale(self.solver_dt / opt_dt.view(-1, 1, 1)),
+                seed=seed_traj,
+                position_error=result.metrics.position_error,
+                rotation_error=result.metrics.rotation_error,
+                solve_time=result.solve_time,
+                metrics=result.metrics,  # TODO: index this also
+                interpolated_solution=interpolated_trajs,
+                debug_info={"solver": result.debug, "interpolation_time": interpolation_time},
+                path_buffer_last_tstep=last_tstep,
+                cspace_error=result.metrics.cspace_error,
+                optimized_dt=opt_dt,
+                raw_solution=result.action,
+            )
+        else:
+            # get path length:
+            if self.evaluate_interpolated_trajectory:
+                smooth_label, smooth_cost = self.traj_evaluator.evaluate_interpolated_smootheness(
+                    interpolated_trajs,
+                    opt_dt,
+                    self.solver.rollout_fn.dynamics_model.cspace_distance_weight,
+                    self._velocity_bounds,
+                )
+
+            else:
+                smooth_label, smooth_cost = self.traj_evaluator.evaluate(
+                    result.action,
+                    self.solver.rollout_fn.traj_dt,
+                    self.solver.rollout_fn.dynamics_model.cspace_distance_weight,
+                    self._velocity_bounds,
+                )
+            # print(smooth_label, success, self._velocity_bounds.shape, self.solver.rollout_fn.dynamics_model.cspace_distance_weight)
+
+            with profiler.record_function("trajopt/best_select"):
+                success[~smooth_label] = False
+                # get the best solution:
+                if result.metrics.pose_error is not None:
+                    convergence_error = result.metrics.pose_error[..., -1]
+                elif result.metrics.cspace_error is not None:
+                    convergence_error = result.metrics.cspace_error[..., -1]
+                else:
+                    raise ValueError("convergence check requires either goal_pose or goal_state")
+
+                error = convergence_error + smooth_cost
+                error[~success] += 10000.0
+                if batch_mode:
+                    idx = torch.argmin(error.view(goal.batch, num_seeds), dim=-1)
+                    idx = idx + num_seeds * self._col
+                    last_tstep = [last_tstep[i] for i in idx]
+                    success = success[idx]
+                else:
+                    idx = torch.argmin(error, dim=0)
+
+                    last_tstep = [last_tstep[idx.item()]]
+                    success = success[idx : idx + 1]
+
+                best_act_seq = result.action[idx]
+                interpolated_traj = interpolated_trajs[idx]
+                position_error = rotation_error = cspace_error = None
+                if result.metrics.position_error is not None:
+                    position_error = result.metrics.position_error[idx, -1]
+                if result.metrics.rotation_error is not None:
+                    rotation_error = result.metrics.rotation_error[idx, -1]
+                if result.metrics.cspace_error is not None:
+                    cspace_error = result.metrics.cspace_error[idx, -1]
+                opt_dt = opt_dt[idx]
+            if self.sync_cuda_time:
+                torch.cuda.synchronize()
+            if len(best_act_seq.shape) == 3:
+                opt_dt_v = opt_dt.view(-1, 1, 1)
+            else:
+                opt_dt_v = opt_dt.view(1, 1)
+            opt_solution = best_act_seq.scale(self.solver_dt / opt_dt_v)
+            select_time = time.time() - st_time
+            debug_info = None
+            if self.store_debug_in_result:
+                debug_info = {
+                    "solver": result.debug,
+                    "interpolation_time": interpolation_time,
+                    "select_time": select_time,
+                }
+
+            traj_result = TrajResult(
+                success=success,
+                goal=goal,
+                solution=opt_solution,
+                seed=seed_traj,
+                position_error=position_error,
+                rotation_error=rotation_error,
+                cspace_error=cspace_error,
+                solve_time=result.solve_time,
+                metrics=result.metrics,  # TODO: index this also
+                interpolated_solution=interpolated_traj,
+                debug_info=debug_info,
+                path_buffer_last_tstep=last_tstep,
+                smooth_error=smooth_cost,
+                smooth_label=smooth_label,
+                optimized_dt=opt_dt,
+                raw_solution=best_act_seq,
+            )
+        return traj_result
+
+    def batch_solve(
+        self,
+        goal: Goal,
+        seed_traj: Optional[JointState] = None,
+        seed_success: Optional[torch.Tensor] = None,
+        use_nn_seed: bool = False,
+        return_all_solutions: bool = False,
+        num_seeds: Optional[int] = None,
+    ) -> TrajResult:
+        """Deprecated batch entry point; dispatches on the goal's goalset size.
+
+        Args:
+            goal (Goal): Batched goal with target poses and current robot states.
+            seed_traj (Optional[JointState], optional): Seed trajectories to
+                warm-start optimization. Defaults to None (seeds are generated).
+            seed_success (Optional[torch.Tensor], optional): Success mask for
+                the provided seeds. Defaults to None.
+            use_nn_seed (bool, optional): Reserved flag for seeding from a
+                learned model; currently unused. Defaults to False.
+            return_all_solutions (bool, optional): Return every seed's solution
+                instead of only the best one. Defaults to False.
+            num_seeds (Optional[int], optional): Number of parallel seeds.
+                Defaults to None, which uses the configured value.
+
+        Raises:
+            NotImplementedError: If the goal layout is not supported.
+
+        Returns:
+            TrajResult: Result of batched trajectory optimization.
+        """
+        log_warn(
+            "TrajOpt.batch_solve() is deprecated, use TrajOpt.solve_batch or others instead"
+        )
+        if goal.n_goalset == 1:
+            return self.solve_batch(
+                goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success
+            )
+        if goal.n_goalset > 1:
+            return self.solve_batch_goalset(
+                goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success
+            )
+
+        raise NotImplementedError()
+
+    def get_linear_seed(self, start_state, goal_state):
+        start_q = start_state.position.view(-1, 1, self.dof)
+        end_q = goal_state.position.view(-1, 1, self.dof)
+        edges = torch.cat((start_q, end_q), dim=1)
+
+        seed = self.delta_vec @ edges
+        return seed
+
+    def get_start_seed(self, start_state):
+        start_q = start_state.position.view(-1, 1, self.dof)
+        edges = torch.cat((start_q, start_q), dim=1)
+        seed = self.delta_vec @ edges
+        return seed
+
+    def _get_seed_numbers(self, num_seeds):
+        n_seeds = {"linear": 0, "bias": 0, "start": 0, "goal": 0}
+        t_seeds = 0
+        for k in n_seeds:
+            if k not in self.seed_ratio:
+                continue
+            if self.seed_ratio[k] > 0.0:
+                n_seeds[k] = math.floor(self.seed_ratio[k] * num_seeds)
+                t_seeds += n_seeds[k]
+        if t_seeds < num_seeds:
+            n_seeds["linear"] += num_seeds - t_seeds
+        return n_seeds
+
+    def get_seed_set(
+        self,
+        goal: Goal,
+        seed_traj: Union[JointState, torch.Tensor, None] = None,  # n_seeds, batch, h, dof
+        seed_success: Optional[torch.Tensor] = None,  # batch, n_seeds
+        num_seeds: Optional[int] = None,
+        batch_mode: bool = False,
+    ):
+        # if batch_mode:
+        total_seeds = goal.batch * num_seeds
+        # else:
+        #     total_seeds = num_seeds
+
+        if isinstance(seed_traj, JointState):
+            seed_traj = seed_traj.position
+        if seed_traj is None:
+            if goal.goal_state is not None and self.use_cspace_seed:
+                # get linear seed
+                seed_traj = self.get_seeds(goal.current_state, goal.goal_state, num_seeds=num_seeds)
+                # .view(batch_size, self.num_seeds, self.traj_tsteps, self.dof)
+            else:
+                # get start repeat seed:
+                log_info("No goal state found, using current config to seed")
+                seed_traj = self.get_seeds(
+                    goal.current_state, goal.current_state, num_seeds=num_seeds
+                )
+        elif seed_success is not None:
+            lin_seed_traj = self.get_seeds(goal.current_state, goal.goal_state, num_seeds=num_seeds)
+            lin_seed_traj[seed_success] = seed_traj  # [seed_success]
+            seed_traj = lin_seed_traj
+            total_seeds = goal.batch * num_seeds
+        elif num_seeds > seed_traj.shape[0]:
+            new_seeds = self.get_seeds(
+                goal.current_state, goal.goal_state, num_seeds - seed_traj.shape[0]
+            )
+            seed_traj = torch.cat((seed_traj, new_seeds), dim=0)  # n_seed, batch, h, dof
+
+        seed_traj = seed_traj.view(
+            total_seeds, self.traj_tsteps, self.dof
+        )  # n_seeds, batch, h, dof
+        return seed_traj
+
+    def get_seeds(self, start_state, goal_state, num_seeds=None):
+        # repeat seeds:
+        if num_seeds is None:
+            num_seeds = self.num_seeds
+            n_seeds = self._n_seeds
+        else:
+            n_seeds = self._get_seed_numbers(num_seeds)
+        # linear seed: batch x dof -> batch x n_seeds x dof
+        seed_set = []
+        if n_seeds["linear"] > 0:
+            linear_seed = self.get_linear_seed(start_state, goal_state)
+
+            linear_seeds = linear_seed.view(1, -1, self.traj_tsteps, self.dof).repeat(
+                1, n_seeds["linear"], 1, 1
+            )
+            seed_set.append(linear_seeds)
+        if n_seeds["bias"] > 0:
+            bias_seed = self.get_bias_seed(start_state, goal_state)
+            bias_seeds = bias_seed.view(1, -1, self.traj_tsteps, self.dof).repeat(
+                1, n_seeds["bias"], 1, 1
+            )
+            seed_set.append(bias_seeds)
n_seeds["start"] > 0: + bias_seed = self.get_start_seed(start_state) + + bias_seeds = bias_seed.view(1, -1, self.traj_tsteps, self.dof).repeat( + 1, n_seeds["start"], 1, 1 + ) + seed_set.append(bias_seeds) + if n_seeds["goal"] > 0: + bias_seed = self.get_start_seed(goal_state) + + bias_seeds = bias_seed.view(1, -1, self.traj_tsteps, self.dof).repeat( + 1, n_seeds["goal"], 1, 1 + ) + seed_set.append(bias_seeds) + all_seeds = torch.cat( + seed_set, dim=1 + ) # .#transpose(0,1).contiguous() # n_seed, batch, h, dof + + return all_seeds + + def get_bias_seed(self, start_state, goal_state): + start_q = start_state.position.view(-1, 1, self.dof) + end_q = goal_state.position.view(-1, 1, self.dof) + + bias_q = self.bias_node.view(-1, 1, self.dof).repeat(start_q.shape[0], 1, 1) + edges = torch.cat((start_q, bias_q, end_q), dim=1) + seed = self.waypoint_delta_vec @ edges + # print(seed) + return seed + + @profiler.record_function("trajopt/interpolation") + def get_interpolated_trajectory(self, traj_state: JointState): + # do interpolation: + if ( + self._interpolated_traj_buffer is None + or traj_state.position.shape[0] != self._interpolated_traj_buffer.position.shape[0] + ): + b, _, dof = traj_state.position.shape + self._interpolated_traj_buffer = JointState.zeros( + (b, self.interpolation_steps, dof), self.tensor_args + ) + self._interpolated_traj_buffer.joint_names = self.rollout_fn.joint_names + state, last_tstep, opt_dt = get_batch_interpolated_trajectory( + traj_state, + self.interpolation_dt, + self._max_joint_vel, + self._max_joint_acc, + self._max_joint_jerk, + self.solver_dt, + kind=self.interpolation_type, + tensor_args=self.tensor_args, + out_traj_state=self._interpolated_traj_buffer, + min_dt=self.traj_evaluator_config.min_dt, + ) + + return state, last_tstep, opt_dt + + def calculate_trajectory_dt( + self, + trajectory: JointState, + ) -> torch.Tensor: + opt_dt = calculate_dt_no_clamp( + trajectory.velocity, + trajectory.acceleration, + trajectory.jerk, + self._max_joint_vel, + self._max_joint_acc, + self._max_joint_jerk, + ) + return opt_dt + + def reset_seed(self): + self.solver.reset_seed() + + def reset_cuda_graph(self): + self.solver.reset_cuda_graph() + self.interpolate_rollout.reset_cuda_graph() + self.rollout_fn.reset_cuda_graph() + + @property + def kinematics(self) -> CudaRobotModel: + return self.rollout_fn.dynamics_model.robot_model + + @property + def retract_config(self): + return self.rollout_fn.dynamics_model.retract_config.view(1, -1) + + def fk(self, q: torch.Tensor) -> CudaRobotModelState: + return self.kinematics.get_state(q) + + @property + def solver_dt(self): + return self.solver.safety_rollout.dynamics_model.dt_traj_params.base_dt + + def update_solver_dt( + self, + dt: Union[float, torch.Tensor], + base_dt: Optional[float] = None, + max_dt: Optional[float] = None, + base_ratio: Optional[float] = None, + ): + all_rollouts = self.get_all_rollout_instances() + for rollout in all_rollouts: + rollout.update_traj_dt(dt, base_dt, max_dt, base_ratio) + + def compute_metrics(self, opt_trajectory: bool, interpolated_trajectory: bool): + self.solver.compute_metrics = opt_trajectory + self.evaluate_interpolated_trajectory = interpolated_trajectory + + def get_full_js(self, active_js: JointState) -> JointState: + return self.rollout_fn.get_full_dof_from_solution(active_js) diff --git a/src/curobo/wrap/reacher/types.py b/src/curobo/wrap/reacher/types.py new file mode 100644 index 00000000..e3f1fa26 --- /dev/null +++ b/src/curobo/wrap/reacher/types.py @@ -0,0 +1,171 @@ +# +# 
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+from __future__ import annotations
+
+# Standard Library
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional
+
+# CuRobo
+from curobo.rollout.rollout_base import Goal
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import JointState
+from curobo.types.tensor import T_BDOF
+
+
+class ReacherSolveType(Enum):
+    # TODO: how to differentiate between goal pose and goal config?
+    SINGLE = 0
+    GOALSET = 1
+    BATCH = 2
+    BATCH_GOALSET = 3
+    BATCH_ENV = 4
+    BATCH_ENV_GOALSET = 5
+
+
+@dataclass
+class ReacherSolveState:
+    solve_type: ReacherSolveType
+    batch_size: int
+    n_envs: int
+    n_goalset: int = 1
+    batch_env: bool = False
+    batch_retract: bool = False
+    batch_mode: bool = False
+    num_seeds: Optional[int] = None
+    num_ik_seeds: Optional[int] = None
+    num_graph_seeds: Optional[int] = None
+    num_trajopt_seeds: Optional[int] = None
+    num_mpc_seeds: Optional[int] = None
+
+    def __post_init__(self):
+        if self.n_envs == 1:
+            self.batch_env = False
+        else:
+            self.batch_env = True
+        if self.batch_size > 1:
+            self.batch_mode = True
+        if self.num_seeds is None:
+            self.num_seeds = self.num_ik_seeds
+        if self.num_seeds is None:
+            self.num_seeds = self.num_trajopt_seeds
+        if self.num_seeds is None:
+            self.num_seeds = self.num_graph_seeds
+        if self.num_seeds is None:
+            self.num_seeds = self.num_mpc_seeds
+
+    def get_batch_size(self):
+        return self.num_seeds * self.batch_size
+
+    def get_ik_batch_size(self):
+        return self.num_ik_seeds * self.batch_size
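+
+    # Example: a single-goal solve state with 12 trajopt seeds; __post_init__
+    # resolves num_seeds from the first seed count that is not None:
+    #
+    #   state = ReacherSolveState(
+    #       ReacherSolveType.SINGLE, batch_size=1, n_envs=1, num_trajopt_seeds=12
+    #   )
+    #   assert state.num_seeds == 12 and state.get_batch_size() == 12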
+
+    def create_goal_buffer(
+        self,
+        goal_pose: Pose,
+        goal_state: Optional[JointState] = None,
+        retract_config: Optional[T_BDOF] = None,
+        link_poses: Optional[Dict[str, Pose]] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        # TODO: Refactor to account for num_ik_seeds or num_trajopt_seeds
+        batch_retract = True
+        if retract_config is None or retract_config.shape[0] != goal_pose.batch:
+            batch_retract = False
+        goal_buffer = Goal.create_idx(
+            pose_batch_size=self.batch_size,
+            batch_env=self.batch_env,
+            batch_retract=batch_retract,
+            num_seeds=self.num_seeds,
+            tensor_args=tensor_args,
+        )
+        goal_buffer.goal_pose = goal_pose
+        goal_buffer.retract_state = retract_config
+        goal_buffer.goal_state = goal_state
+        goal_buffer.links_goal_pose = link_poses
+        return goal_buffer
+
+    def update_goal_buffer(
+        self,
+        goal_pose: Pose,
+        goal_state: Optional[JointState] = None,
+        retract_config: Optional[T_BDOF] = None,
+        link_poses: Optional[Dict[str, Pose]] = None,
+        current_solve_state: Optional[ReacherSolveState] = None,
+        current_goal_buffer: Optional[Goal] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        solve_state = self
+        # create goal buffer by comparing to existing solve type
+        update_reference = False
+        if (
+            current_solve_state is None
+            or current_goal_buffer is None
+            or current_solve_state != solve_state
+            or (current_goal_buffer.retract_state is None and retract_config is not None)
+            or (current_goal_buffer.goal_state is None and goal_state is not None)
+            or (current_goal_buffer.links_goal_pose is None and link_poses is not None)
+        ):
+            current_solve_state = solve_state
+            current_goal_buffer = solve_state.create_goal_buffer(
+                goal_pose, goal_state, retract_config, link_poses, tensor_args
+            )
+            update_reference = True
+        else:
+            current_goal_buffer.goal_pose.copy_(goal_pose)
+            if retract_config is not None:
+                current_goal_buffer.retract_state.copy_(retract_config)
+            if goal_state is not None:
+                current_goal_buffer.goal_state.copy_(goal_state)
+            if link_poses is not None:
+                for k in link_poses.keys():
+                    current_goal_buffer.links_goal_pose[k].copy_(link_poses[k])
+        return current_solve_state, current_goal_buffer, update_reference
+
+    def update_goal(
+        self,
+        goal: Goal,
+        current_solve_state: Optional[ReacherSolveState] = None,
+        current_goal_buffer: Optional[Goal] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        solve_state = self
+
+        update_reference = False
+        if (
+            current_solve_state is None
+            or current_goal_buffer is None
+            or current_solve_state != solve_state
+            or (current_goal_buffer.goal_state is None and goal.goal_state is not None)
+            or (current_goal_buffer.goal_state is not None and goal.goal_state is None)
+        ):
+            # TODO: Check for change in update idx buffers, currently we assume
+            # that solve_state captures difference in idx buffers
+            current_solve_state = solve_state
+            current_goal_buffer = goal.create_index_buffers(
+                solve_state.batch_size,
+                solve_state.batch_env,
+                solve_state.batch_retract,
+                solve_state.num_seeds,
+                tensor_args,
+            )
+            update_reference = True
+        else:
+            current_goal_buffer.copy_(goal, update_idx_buffers=False)
+        return current_solve_state, current_goal_buffer, update_reference
+
+
+@dataclass
+class MotionGenSolverState:
+    solve_type: ReacherSolveType
+    ik_solve_state: ReacherSolveState
+    trajopt_solve_state: ReacherSolveState
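The update_goal* methods reallocate the goal buffer only when the solve layout changes; otherwise they copy new values into the existing tensors so that previously captured CUDA graphs remain valid. A usage sketch (illustrative; `prev_state`, `prev_buffer`, and `solver` are hypothetical names):

    solve_state, buffer, update_reference = solve_state.update_goal(
        goal, prev_state, prev_buffer, tensor_args
    )
    if update_reference:
        # buffer tensors were reallocated, so any CUDA graph that captured
        # the old buffers must be re-created:
        solver.reset_cuda_graph()

diff --git a/src/curobo/wrap/wrap_base.py b/src/curobo/wrap/wrap_base.py
new file mode 100644
index 00000000..05499d2f
--- /dev/null
+++ b/src/curobo/wrap/wrap_base.py
@@ -0,0 +1,185 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.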
+#
+# Standard Library
+import time
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.opt.newton.newton_base import NewtonOptBase
+from curobo.opt.opt_base import Optimizer
+from curobo.opt.particle.particle_opt_base import ParticleOptBase
+from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics
+from curobo.types.robot import State
+from curobo.util.logger import log_info
+
+
+@dataclass
+class WrapConfig:
+    safety_rollout: RolloutBase
+    optimizers: List[Optimizer]
+    compute_metrics: bool
+    use_cuda_graph_metrics: bool
+    sync_cuda_time: bool
+
+
+@dataclass
+class WrapResult:
+    action: State
+    solve_time: float
+    metrics: Optional[RolloutMetrics] = None
+    debug: Any = None
+    js_action: Optional[State] = None
+
+    def clone(self):
+        return WrapResult(
+            self.action.clone(),
+            self.solve_time,
+            self.metrics.clone() if self.metrics is not None else None,
+            debug=self.debug,
+        )
+
+
+class WrapBase(WrapConfig):
+    def __init__(self, config: Optional[WrapConfig] = None):
+        if config is not None:
+            WrapConfig.__init__(self, **vars(config))
+        self.n_envs = 1
+        self.opt_dt = 0
+        self._rollout_list = None
+        self._opt_rollouts = None
+        self._init_solver = False
+
+    def get_metrics(self, state: State, use_cuda_graph: bool = False) -> RolloutMetrics:
+        if use_cuda_graph:
+            log_info("Using cuda graph")
+            return self.safety_rollout.get_metrics_cuda_graph(state)
+        return self.safety_rollout.get_metrics(state)
+
+    def optimize(self, act_seq: torch.Tensor, shift_steps: int = 0) -> torch.Tensor:
+        for opt in self.optimizers:
+            act_seq = opt.optimize(act_seq, shift_steps)
+        return act_seq
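+
+    # The optimizers run sequentially: each stage receives the previous
+    # stage's best action sequence as its seed, so a typical configuration
+    # of [particle optimizer, L-BFGS] explores broadly first and then
+    # refines the best candidate with gradient steps.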
+
+    def get_debug_data(self):
+        debug_list = []
+        for opt in self.optimizers:
+            debug_list.append(opt.debug)
+        return debug_list
+
+    def get_debug_cost(self):
+        debug_list = []
+        for opt in self.optimizers:
+            debug_list.append(opt.debug_cost)
+        return debug_list
+
+    def update_nenvs(self, n_envs):
+        if n_envs != self.n_envs:
+            self.n_envs = n_envs
+            for opt in self.optimizers:
+                opt.update_nenvs(self.n_envs)
+
+    def update_params(self, goal: Goal):
+        with profiler.record_function("wrap_base/safety/update_params"):
+            log_info("Updating safety params")
+            self.safety_rollout.update_params(goal)
+        log_info("Updating optimizer params")
+        for opt in self.optimizers:
+            opt.update_params(goal)
+
+    def get_init_act(self):
+        init_act = self.safety_rollout.get_init_action_seq()
+        return init_act
+
+    def reset(self):
+        for opt in self.optimizers:
+            opt.reset()
+        self.safety_rollout.reset()
+
+    def reset_seed(self):
+        self.safety_rollout.reset_seed()
+        for opt in self.optimizers:
+            opt.reset_seed()
+
+    def reset_cuda_graph(self):
+        self.safety_rollout.reset_cuda_graph()
+        for opt in self.optimizers:
+            opt.reset_cuda_graph()
+        self._init_solver = False
+
+    @property
+    def rollout_fn(self):
+        return self.safety_rollout
+
+    def solve(self, goal: Goal, seed: Optional[torch.Tensor] = None):
+        metrics = None
+
+        filtered_state = self.safety_rollout.filter_robot_state(goal.current_state)
+        goal.current_state = filtered_state
+        self.update_params(goal)
+        if seed is None:
+            seed = self.get_init_act()
+            log_info("getting random seed")
+        else:
+            seed = seed.detach().clone()
+        start_time = time.time()
+        if not self._init_solver:
+            log_info("Solver was not initialized, warming up solver")
+            for _ in range(2):
+                act_seq = self.optimize(seed, shift_steps=0)
+            self._init_solver = True
+        act_seq = self.optimize(seed, shift_steps=0)
+        self.opt_dt = time.time() - start_time
+
+        act = self.safety_rollout.get_robot_command(
+            filtered_state, act_seq, state_idx=goal.batch_current_state_idx
+        )
+
+        if self.compute_metrics:
+            with profiler.record_function("wrap_base/compute_metrics"):
+                metrics = self.get_metrics(act, self.use_cuda_graph_metrics)
+
+        result = WrapResult(
+            action=act,
+            solve_time=self.opt_dt,
+            metrics=metrics,
+            debug={"steps": self.get_debug_data(), "cost": self.get_debug_cost()},
+        )
+        return result
+
+    @property
+    def newton_optimizer(self) -> NewtonOptBase:
+        return self.optimizers[-1]
+
+    @property
+    def particle_optimizer(self) -> ParticleOptBase:
+        return self.optimizers[0]
+
+    @property
+    def joint_names(self):
+        return self.rollout_fn.cspace_config.joint_names
+
+    def _get_rollout_instances_from_optimizers(self) -> List[RolloutBase]:
+        if self._opt_rollouts is None:
+            self._opt_rollouts = []
+            for i in self.optimizers:
+                self._opt_rollouts.extend(i.get_all_rollout_instances())
+        return self._opt_rollouts
+
+    def get_all_rollout_instances(self) -> List[RolloutBase]:
+        if self._rollout_list is None:
+            self._rollout_list = [
+                self.safety_rollout
+            ] + self._get_rollout_instances_from_optimizers()
+        return self._rollout_list
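WrapBase.solve runs the full pipeline once: filter the current state, push the goal into every optimizer, optimize from a seed, and convert the best action sequence into a robot command. A minimal driving sketch (illustrative; `wrap_cfg` and `goal` are assumed to be constructed as in the trajopt wrapper above):

    solver = WrapBase(wrap_cfg)
    result = solver.solve(goal)  # seed=None falls back to the rollout's init action
    print(result.solve_time)
    command = result.action

diff --git a/src/curobo/wrap/wrap_mpc.py b/src/curobo/wrap/wrap_mpc.py
new file mode 100644
index 00000000..a2f2229a
--- /dev/null
+++ b/src/curobo/wrap/wrap_mpc.py
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.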
+#
+from __future__ import annotations

+# Standard Library
+import time
+from typing import Optional
+
+# Third Party
+import torch
+import torch.autograd.profiler as profiler
+
+# CuRobo
+from curobo.rollout.rollout_base import Goal
+from curobo.types.robot import State
+from curobo.wrap.wrap_base import WrapBase, WrapConfig, WrapResult
+
+
+class WrapMpc(WrapBase):
+    def __init__(self, config: Optional[WrapConfig] = None):
+        self._init_act_seq = None
+        super().__init__(config)
+
+    def update_init_seed(self, seed) -> bool:
+        if self._init_act_seq is None:
+            self._init_act_seq = seed.detach().clone()
+        else:
+            self._init_act_seq.copy_(seed)
+        return True
+
+    def solve(self, goal: Goal, seed: Optional[State] = None, shift_steps=1):
+        if seed is None and self._init_act_seq is None:
+            seed = self.get_init_act()
+        elif self._init_act_seq is not None:
+            seed = self._init_act_seq
+        else:
+            seed = seed.detach().clone()
+        metrics = None
+
+        start_time = time.time()
+        # print("i:", goal.current_state.position)
+        filtered_state = self.safety_rollout.filter_robot_state(goal.current_state)
+        # print("f:", filtered_state.position)
+        goal.current_state.copy_(filtered_state)
+
+        self.update_params(goal)
+        if self.sync_cuda_time:
+            torch.cuda.synchronize()
+        # print("In: ", seed[0,:,0])
+        start_time = time.time()
+        with profiler.record_function("mpc/opt"):
+            act_seq = self.optimize(seed, shift_steps=shift_steps)
+        if self.sync_cuda_time:
+            torch.cuda.synchronize()
+        self.opt_dt = time.time() - start_time
+        with profiler.record_function("mpc/filter"):
+            act = self.safety_rollout.get_robot_command(
+                filtered_state, act_seq, shift_steps=shift_steps
+            )
+        # print("Out: ", act_seq[0,:,0])
+        self._init_act_seq = self._shift(act_seq, shift_steps=shift_steps)
+        if self.compute_metrics:
+            with profiler.record_function("mpc/get_metrics"):
+                metrics = self.get_metrics(act)
+        result = WrapResult(action=act, metrics=metrics, solve_time=self.opt_dt)
+        return result
+
+    def _shift(self, act_seq, shift_steps=1):
+        # roll the horizon forward and hold the last command to seed the next cycle:
+        act_seq = act_seq.roll(-shift_steps, 1)
+        act_seq[:, -shift_steps:, :] = act_seq[:, -shift_steps - 1 : -shift_steps, :].clone()
+        return act_seq
+
+    def reset(self):
+        self._init_act_seq = None
+        return super().reset()
+
+    def get_rollouts(self):
+        return self.particle_optimizer.top_trajs
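WrapMpc turns the same machinery into a receding-horizon controller: each solve() optimizes from the previous (shifted) action sequence, and _shift() rolls that sequence forward for the next cycle. A control-loop sketch (illustrative; `mpc_cfg`, `read_robot_state()`, and `send_command()` are hypothetical stand-ins for an application's I/O):

    mpc = WrapMpc(mpc_cfg)
    for _ in range(1000):
        goal.current_state.copy_(read_robot_state())
        result = mpc.solve(goal, shift_steps=1)
        send_command(result.action)

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..a7287489
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,12 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+"""CuRobo test package."""
diff --git a/tests/cost_test.py b/tests/cost_test.py
new file mode 100644
index 00000000..5a38b937
--- /dev/null
+++ b/tests/cost_test.py
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.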
+#
+# Third Party
+import torch
+
+# CuRobo
+from curobo.geom.sdf.world import WorldCollisionConfig, WorldPrimitiveCollision
+from curobo.geom.types import WorldConfig
+from curobo.rollout.cost.primitive_collision_cost import (
+    PrimitiveCollisionCost,
+    PrimitiveCollisionCostConfig,
+)
+from curobo.types.base import TensorDeviceType
+from curobo.util_file import get_world_configs_path, join_path, load_yaml
+
+
+def test_primitive_collision_cost():
+    tensor_args = TensorDeviceType()
+    world_file = "collision_test.yml"
+    data_dict = load_yaml(join_path(get_world_configs_path(), world_file))
+
+    world_cfg = WorldConfig.from_dict(data_dict)
+    coll_cfg = WorldPrimitiveCollision(
+        WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType())
+    )
+
+    cost_cfg = PrimitiveCollisionCostConfig(
+        weight=1.0,
+        tensor_args=tensor_args,
+        world_coll_checker=coll_cfg,
+        use_sweep=False,
+        classify=False,
+    )
+    cost = PrimitiveCollisionCost(cost_cfg)
+    q_spheres = torch.as_tensor(
+        [[0.1, 0.0, 0.0, 0.2], [10.0, 0.0, 0.0, 0.1]], **vars(tensor_args)
+    ).view(-1, 1, 1, 4)
+    c = cost.forward(q_spheres).flatten()
+    assert c[0] > 0.0 and c[1] == 0.0
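Each query sphere above is a 4-vector (x, y, z, radius): the first sphere overlaps an obstacle in collision_test.yml and yields a positive cost, while the second sits 10 m away and yields zero. The tests are plain pytest functions and can be run per file, e.g.:

    pytest tests/cost_test.py -q

diff --git a/tests/cuda_robot_generator_test.py b/tests/cuda_robot_generator_test.py
new file mode 100644
index 00000000..cf226df4
--- /dev/null
+++ b/tests/cuda_robot_generator_test.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.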
+# + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_generator import ( + CudaRobotGenerator, + CudaRobotGeneratorConfig, +) +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModelConfig +from curobo.util_file import get_robot_configs_path, join_path, load_yaml + + +def test_cuda_robot_generator_config(): + robot_file = "franka.yml" + robot_params = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"][ + "kinematics" + ] + config = CudaRobotGeneratorConfig(**robot_params) + assert config.ee_link == robot_params["ee_link"] + + +def test_cuda_robot_generator(): + robot_file = "franka.yml" + robot_params = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"][ + "kinematics" + ] + config = CudaRobotGeneratorConfig(**robot_params) + robot_generator = CudaRobotGenerator(config) + assert robot_generator.kinematics_config.n_dof == 7 + + +def test_cuda_robot_config(): + robot_file = "franka.yml" + config = CudaRobotModelConfig.from_robot_yaml_file(robot_file) + assert config.kinematics_config.n_dof == 7 + + +def test_cuda_robot_generator_config_cspace(): + robot_file = "franka.yml" + robot_params = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"][ + "kinematics" + ] + config = CudaRobotGeneratorConfig(**robot_params) + assert len(config.cspace.max_jerk) == len(config.cspace.joint_names) + robot_generator = CudaRobotGenerator(config) + + assert len(robot_generator.cspace.max_jerk) == 7 diff --git a/tests/curobo_robot_world_model_test.py b/tests/curobo_robot_world_model_test.py new file mode 100644 index 00000000..0778308e --- /dev/null +++ b/tests/curobo_robot_world_model_test.py @@ -0,0 +1,136 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +# CuRobo +from curobo.wrap.model.curobo_robot_world import CuroboRobotWorld, CuroboRobotWorldConfig + + +def load_robot_world(): + robot_file = "franka.yml" + world_file = "collision_table.yml" + config = CuroboRobotWorldConfig.load_from_config(robot_file, world_file) + model = CuroboRobotWorld(config) + return model + + +def load_robot_batch_world(): + robot_file = "franka.yml" + world_file = ["collision_table.yml", "collision_test.yml"] + config = CuroboRobotWorldConfig.load_from_config( + robot_file, world_file, collision_activation_distance=0.0 + ) + model = CuroboRobotWorld(config) + return model + + +def test_cu_robot_world_config_load(): + # test if curobo robot world can be loaded + load_robot_world() + assert True + + +def test_cu_robot_batch_world_config_load(): + # test if curobo robot world can be loaded + load_robot_batch_world() + assert True + + +def test_cu_robot_world_kinematics(): + # test if kinematics can be queried + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + assert state.ee_position.shape[-1] == 3 + + +def test_cu_robot_world_sample(): + # test if joint configurations can be sampled + model = load_robot_world() + n = 10 + q = model.sample(n) + assert q.shape[0] == n + + +def test_cu_robot_world_collision(): + # test computing collisions given robot joint configurations + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + d_world = model.get_collision_constraint(state.link_spheres_tensor.view(n, 1, -1, 4)) + assert d_world.shape[0] == n + assert torch.sum(d_world) == 0.0 + + +def test_cu_robot_world_self_collision(): + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + d_self = model.get_self_collision(state.link_spheres_tensor.view(n, 1, -1, 4)) + assert torch.sum(d_self) == 0.0 + + +def test_cu_robot_world_pose(): + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + pose = state.ee_pose + des_pose = state.ee_pose + d = model.pose_distance(des_pose, pose) + assert torch.sum(d) < 1e-3 + + +def test_cu_trajectory_sample(): + model = load_robot_world() + b = 10 + horizon = 21 + q = model.sample_trajectory(b, horizon) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_cu_batch_trajectory_sample(): + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = torch.zeros((b), dtype=torch.int32, device=torch.device("cuda", 0)) + q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_cu_batch_trajectory_1env_sample(): + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = None + q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_cu_robot_batch_world_collision(): + # test computing collisions given robot joint configurations + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = torch.zeros((b), dtype=torch.int32, device=torch.device("cuda", 0)) + + q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + + d_world, d_self = model.get_world_self_collision_distance_from_joint_trajectory( + q, env_query_idx + ) + assert d_world.shape[0] == b + assert torch.sum(d_world) == 0.0 diff --git a/tests/curobo_version_test.py b/tests/curobo_version_test.py new file mode 100644 index 00000000..b42d5f06 --- /dev/null +++ 
b/tests/curobo_version_test.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +"""Unit tests for `curobo` package version.""" + +# CuRobo +import curobo + + +def test_curobo_version(): + """Test `curobo` package version is set.""" + assert curobo.__version__ is not None + assert curobo.__version__ != "" diff --git a/tests/geom_test.py b/tests/geom_test.py new file mode 100644 index 00000000..0d08e69e --- /dev/null +++ b/tests/geom_test.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionQueryBuffer, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import get_world_configs_path, join_path, load_yaml + + +def test_world_primitive(): + # load a world: + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldPrimitiveCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + + d_sph = coll_check.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + +def test_batch_world_primitive(): + """This tests collision checking across different environments""" + tensor_args = TensorDeviceType() + + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldPrimitiveCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(-1, 1, 1, 4) + env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + 
x_sph.shape, tensor_args, coll_check.collision_types + ) + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1]) + d_sph = coll_check.get_sphere_distance( + x_sph, query_buffer, weight, act_distance, env_query_idx + ).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + +def test_swept_world_primitive(): + """This tests collision checking across different environments""" + tensor_args = TensorDeviceType() + + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_cfg.cache = {"obb": 5} + coll_check = WorldPrimitiveCollision(coll_cfg) + # add an obstacle: + new_cube = Cuboid("cube_1", [0, 0, 1, 1, 0, 0, 0], None, [0.1, 0.2, 0.2]) + coll_check.add_obb(new_cube, 0) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + env_query_idx = None + env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + dt = act_distance.clone() + d_sph_swept = coll_check.get_swept_sphere_distance( + x_sph, query_buffer, weight, act_distance, dt, 10, env_query_idx + ).view(-1) + d_sph = coll_check.get_sphere_distance( + x_sph, query_buffer.clone() * 0.0, weight, act_distance, env_query_idx + ).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + assert abs(d_sph_swept[0].item() - 0.1) < 1e-3 + assert abs(d_sph_swept[1].item() - 0.0) < 1e-9 + assert abs(d_sph_swept[2].item() - 0.1) < 1e-3 + + +def test_world_primitive_mesh_instance(): + # load a world: + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldMeshCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1]) + d_sph = coll_check.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + +def test_batch_world_primitive_mesh_instance(): + """This tests collision checking across different environments""" + tensor_args = TensorDeviceType() + + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldMeshCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 
0.1]], **vars(tensor_args) + ).view(-1, 1, 1, 4) + env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + + d_sph = coll_check.get_sphere_distance( + x_sph, query_buffer, weight, act_distance, env_query_idx + ).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + +def test_swept_world_primitive_mesh_instance(): + """This tests collision checking across different environments""" + tensor_args = TensorDeviceType() + + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_cfg.cache = {"obb": 5} + coll_check = WorldMeshCollision(coll_cfg) + # add an obstacle: + dims = tensor_args.to_device([0.1, 0.2, 0.2]) + w_obj_pose = Pose.from_list([0, 0, 1, 1, 0, 0, 0], tensor_args) + coll_check.add_obb_from_raw("cube_1", dims, 0, w_obj_pose=w_obj_pose) + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + env_query_idx = None + env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + dt = act_distance.clone() + 0.01 + + d_sph_swept = coll_check.get_swept_sphere_distance( + x_sph, query_buffer, weight, act_distance, dt, 10, env_query_idx + ).view(-1) + d_sph = coll_check.get_sphere_distance( + x_sph, query_buffer.clone() * 0.0, weight, act_distance, env_query_idx + ).view(-1) + assert abs(d_sph[0].item() - 0.1) < 1e-3 + assert abs(d_sph[1].item() - 0.0) < 1e-9 + assert abs(d_sph[2].item() - 0.1) < 1e-3 + + assert abs(d_sph_swept[0].item() - 0.1) < 1e-3 + assert abs(d_sph_swept[1].item() - 0.0) < 1e-9 + assert abs(d_sph_swept[2].item() - 0.1) < 1e-3 diff --git a/tests/geom_types_test.py b/tests/geom_types_test.py new file mode 100644 index 00000000..f1a2d3d7 --- /dev/null +++ b/tests/geom_types_test.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import pytest + +# CuRobo +from curobo.geom.sphere_fit import SphereFitType +from curobo.geom.types import WorldConfig +from curobo.util_file import get_world_configs_path, join_path, load_yaml + + +def test_mesh_world(): + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + mesh_world_cfg = world_cfg.get_mesh_world() + assert len(mesh_world_cfg.mesh) == len(world_cfg.cuboid) + obb_world_cfg = mesh_world_cfg.get_obb_world() + assert len(obb_world_cfg.cuboid) == len(mesh_world_cfg.mesh) + + +@pytest.mark.parametrize( + "sphere_fit_type", + [e.value for e in SphereFitType], +) +def test_bounding_volume(sphere_fit_type): + world_file = "collision_test.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + obs = world_cfg.objects[-1] + spheres = obs.get_bounding_spheres(100, 0.01, sphere_fit_type) + assert len(spheres) > 0 diff --git a/tests/goal_test.py b/tests/goal_test.py new file mode 100644 index 00000000..788e557f --- /dev/null +++ b/tests/goal_test.py @@ -0,0 +1,144 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +# CuRobo +from curobo.curobolib.geom import get_pose_distance +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState + + +def test_repeat_seeds(): + tensor_args = TensorDeviceType() + b = 10 + dof = 7 + position = torch.randn((b, 3), device=tensor_args.device, dtype=tensor_args.dtype) + + quaternion = torch.zeros((b, 4), device=tensor_args.device, dtype=tensor_args.dtype) + + quaternion[:, 0] = 1.0 + goal_pose = Pose(position, quaternion) + + current_state = JointState.from_position( + torch.randn((b, dof), device=tensor_args.device, dtype=tensor_args.dtype) + ) + batch_pose_idx = torch.arange(0, b, 1, device=tensor_args.device, dtype=torch.int32).unsqueeze( + -1 + ) + goal = Goal(goal_pose=goal_pose, batch_pose_idx=batch_pose_idx, current_state=current_state) + g = goal.repeat_seeds(4) + + start_pose = goal_pose.repeat_seeds(4) + b = start_pose.position.shape[0] + out_d = torch.zeros((b, 1), device=tensor_args.device, dtype=tensor_args.dtype) + out_p_v = torch.zeros((b, 3), device=tensor_args.device, dtype=tensor_args.dtype) + out_r_v = torch.zeros((b, 4), device=tensor_args.device, dtype=tensor_args.dtype) + out_idx = torch.zeros((b, 1), device=tensor_args.device, dtype=torch.int32) + vec_weight = torch.ones((6), device=tensor_args.device, dtype=tensor_args.dtype) + weight = tensor_args.to_device([1, 1, 1, 1]) + vec_convergence = tensor_args.to_device([0, 0]) + run_weight = tensor_args.to_device([1]) + r = get_pose_distance( + out_d, + out_d.clone(), + out_d.clone(), + out_p_v, + out_r_v, + out_idx, + start_pose.position, + g.goal_pose.position, + start_pose.quaternion, + g.goal_pose.quaternion, + vec_weight, + weight, + vec_convergence, + run_weight, + vec_weight.clone() * 0.0, + g.batch_pose_idx, + 
start_pose.position.shape[0], + 1, + 1, + 1, + False, + False, + True, + ) + + assert torch.sum(r[0]).item() == 0.0 + + +def test_horizon_repeat_seeds(): + tensor_args = TensorDeviceType() + b = 2 + dof = 7 + h = 30 + position = torch.randn((b, h, 3), device=tensor_args.device, dtype=tensor_args.dtype) + # position[:,:,1] = 1.0 + quaternion = torch.zeros((b, h, 4), device=tensor_args.device, dtype=tensor_args.dtype) + + quaternion[:, 0] = 1.0 + quaternion[1, 1] = 1.0 + quaternion[1, 0] = 0.0 + goal_pose = Pose(position[:, 0, :], quaternion[:, 0, :]).clone() + + current_state = JointState.from_position( + torch.randn((b, dof), device=tensor_args.device, dtype=tensor_args.dtype) + ) + batch_pose_idx = torch.arange(0, b, 1, device=tensor_args.device, dtype=torch.int32).unsqueeze( + -1 + ) + goal = Goal(goal_pose=goal_pose, batch_pose_idx=batch_pose_idx, current_state=current_state) + g = goal # .repeat_seeds(4) + + start_pose = Pose( + goal_pose.position.view(-1, 1, 3).repeat(1, h, 1), + quaternion=goal_pose.quaternion.view(-1, 1, 4).repeat(1, h, 1), + ) + b = start_pose.position.shape[0] + out_d = torch.zeros((b, h, 1), device=tensor_args.device, dtype=tensor_args.dtype) + out_p_v = torch.zeros((b, h, 3), device=tensor_args.device, dtype=tensor_args.dtype) + out_r_v = torch.zeros((b, h, 4), device=tensor_args.device, dtype=tensor_args.dtype) + out_idx = torch.zeros((b, h, 1), device=tensor_args.device, dtype=torch.int32) + vec_weight = torch.ones((6), device=tensor_args.device, dtype=tensor_args.dtype) + weight = tensor_args.to_device([1, 1, 1, 1]) + vec_convergence = tensor_args.to_device([0, 0]) + run_weight = torch.zeros((1, h), device=tensor_args.device) + run_weight[-1] = 1 + r = get_pose_distance( + out_d, + out_d.clone(), + out_d.clone(), + out_p_v, + out_r_v, + out_idx, + start_pose.position, + g.goal_pose.position, + start_pose.quaternion, + g.goal_pose.quaternion, + vec_weight, + weight, + vec_convergence, + run_weight, + vec_weight.clone() * 0.0, + g.batch_pose_idx, + start_pose.position.shape[0], + h, + 1, + 1, + True, + False, + False, + ) + assert torch.sum(r[0]).item() == 0.0 diff --git a/tests/ik_config_test.py b/tests/ik_config_test.py new file mode 100644 index 00000000..841be948 --- /dev/null +++ b/tests/ik_config_test.py @@ -0,0 +1,149 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionCheckerType, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig + + +def ik_base_config(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + n_collision_envs=1, + collision_cache={"obb": 10}, + ) + return ik_config + + +def ik_gd_config(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + use_gradient_descent=True, + grad_iters=100, + ) + return ik_config + + +def ik_es_config(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + use_es=True, + es_learning_rate=0.01, + use_fixed_samples=True, + ) + return ik_config + + +def ik_no_particle_opt_config(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + use_particle_opt=False, + grad_iters=100, + ) + return ik_config + + +@pytest.mark.parametrize( + "config, expected", + [ + (ik_base_config(), True), + (ik_es_config(), True), + (ik_gd_config(), True), + (ik_no_particle_opt_config(), True), + ], +) +def test_eval(config, expected): + ik_solver = IKSolver(config) + q_sample = 
ik_solver.sample_configs(1) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + result = ik_solver.solve_single(goal) + result = ik_solver.solve_single(goal) + + success = result.success + assert success.item() == expected diff --git a/tests/ik_module_test.py b/tests/ik_module_test.py new file mode 100644 index 00000000..6a69244b --- /dev/null +++ b/tests/ik_module_test.py @@ -0,0 +1,131 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionCheckerType, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig + + +@pytest.fixture(scope="module") +def ik_solver(): + tensor_args = TensorDeviceType() + world_file = "collision_cubby.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + ) + ik_solver = IKSolver(ik_config) + return ik_solver + + +@pytest.fixture(scope="module") +def ik_solver_batch_env(): + tensor_args = TensorDeviceType() + world_files = ["collision_table.yml", "collision_cubby.yml", "collision_test.yml"] + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + for world_file in world_files + ] + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=100, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + use_cuda_graph=False, + ) + ik_solver = IKSolver(ik_config) + return ik_solver + + +def test_ik_single(ik_solver): + q_sample = ik_solver.sample_configs(1) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + result = ik_solver.solve_single(goal) + + success = result.success + assert success.item() + + +def test_ik_goalset(ik_solver): + q_sample = ik_solver.sample_configs(10) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position.unsqueeze(0), kin_state.ee_quaternion.unsqueeze(0)) + result = ik_solver.solve_goalset(goal) + + assert result.success.item() + + +def test_ik_batch(ik_solver): + q_sample = 
ik_solver.sample_configs(10) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + result = ik_solver.solve_batch(goal) + assert torch.count_nonzero(result.success) > 5 + + +def test_ik_batch_goalset(ik_solver): + q_sample = ik_solver.sample_configs(100) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position.view(10, 10, 3), kin_state.ee_quaternion.view(10, 10, 4)) + result = ik_solver.solve_batch_goalset(goal) + assert torch.count_nonzero(result.success) > 5 + + +def test_ik_batch_env(ik_solver_batch_env): + q_sample = ik_solver_batch_env.sample_configs(3) + kin_state = ik_solver_batch_env.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + result = ik_solver_batch_env.solve_batch_env(goal) + + assert torch.count_nonzero(result.success) >= 1 + + +def test_ik_batch_env_goalset(ik_solver_batch_env): + q_sample = ik_solver_batch_env.sample_configs(3 * 3) + kin_state = ik_solver_batch_env.fk(q_sample) + goal = Pose(kin_state.ee_position.view(3, 3, 3), kin_state.ee_quaternion.view(3, 3, 4)) + result = ik_solver_batch_env.solve_batch_env_goalset(goal) + assert torch.count_nonzero(result.success) >= 2 diff --git a/tests/ik_test.py b/tests/ik_test.py new file mode 100644 index 00000000..e9242ad2 --- /dev/null +++ b/tests/ik_test.py @@ -0,0 +1,211 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
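+#
+# Illustrative sketch (comment only) of the IK flow these tests follow: build a
+# RobotConfig, load an IKSolverConfig, sample reachable goals with forward
+# kinematics, then solve. All names match the imports and calls used below.
+#
+#   robot_cfg = RobotConfig.from_basic(urdf_file, base_link, ee_link, tensor_args)
+#   ik_solver = IKSolver(IKSolverConfig.load_from_robot_config(
+#       robot_cfg, None, num_seeds=30, self_collision_check=False,
+#       self_collision_opt=False, tensor_args=tensor_args))
+#   kin_state = ik_solver.fk(ik_solver.sample_configs(10))
+#   result = ik_solver.solve_batch(Pose(kin_state.ee_position, kin_state.ee_quaternion))
+#   # result.success holds one flag per goal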
+#
+# Third Party
+import torch
+
+# CuRobo
+from curobo.geom.sdf.world import (
+    CollisionCheckerType,
+    WorldCollisionConfig,
+    WorldPrimitiveCollision,
+)
+from curobo.geom.sdf.world_mesh import WorldMeshCollision
+from curobo.geom.types import WorldConfig
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
+from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig
+
+
+def test_basic_ik():
+    tensor_args = TensorDeviceType()
+
+    config_file = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))
+    urdf_file = config_file["robot_cfg"]["kinematics"][
+        "urdf_path"
+    ]  # Send global path starting with "/"
+    base_link = config_file["robot_cfg"]["kinematics"]["base_link"]
+    ee_link = config_file["robot_cfg"]["kinematics"]["ee_link"]
+    robot_cfg = RobotConfig.from_basic(urdf_file, base_link, ee_link, tensor_args)
+
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        None,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=30,
+        self_collision_check=False,
+        self_collision_opt=False,
+        tensor_args=tensor_args,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 10
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve_batch(goal)
+    success = result.success
+
+    assert torch.count_nonzero(success).item() >= 1.0  # we check if at least 1 is successful
+
+
+def test_full_config_collision_free_ik():
+    tensor_args = TensorDeviceType()
+    world_file = "collision_cubby.yml"
+
+    robot_file = "franka.yml"
+    robot_cfg = RobotConfig.from_dict(
+        load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+    )
+    world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file)))
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_cfg,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=30,
+        self_collision_check=True,
+        self_collision_opt=True,
+        tensor_args=tensor_args,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 10
+
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve(goal)
+
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 9.0  # we check if at least 90% are successful
+
+
+def test_attach_object_full_config_collision_free_ik():
+    tensor_args = TensorDeviceType()
+    world_file = "collision_cubby.yml"
+
+    robot_file = "franka.yml"
+    robot_cfg = RobotConfig.from_dict(
+        load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+    )
+    world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file)))
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_cfg,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=30,
+        self_collision_check=True,
+        self_collision_opt=True,
+        tensor_args=tensor_args,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 10
+
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve(goal)
+
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 9.0  # we check if at least 90% are successful
+
+    q_sample = 
ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+
+    # ik_solver.attach_object_to_robot(sphere_radius=0.02)
+
+    result = ik_solver.solve(goal)
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 9.0  # we check if at least 90% are successful
+
+
+def test_batch_env_ik():
+    tensor_args = TensorDeviceType()
+    world_files = ["collision_cubby.yml", "collision_test.yml"]
+
+    robot_file = "franka.yml"
+    robot_cfg = RobotConfig.from_dict(
+        load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+    )
+    w_list = [
+        WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file)))
+        for world_file in world_files
+    ]
+    world_ccheck = WorldPrimitiveCollision(WorldCollisionConfig(tensor_args, n_envs=2))
+    # create a batched world collision checker:
+    world_ccheck.load_batch_collision_model(w_list)
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_coll_checker=world_ccheck,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=100,
+        self_collision_check=True,
+        self_collision_opt=True,
+        tensor_args=tensor_args,
+        use_cuda_graph=True,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 2
+
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve_batch_env(goal)
+
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 1.0  # we check if at least one env succeeds
+
+
+def test_batch_env_mesh_ik():
+    tensor_args = TensorDeviceType()
+    world_files = ["collision_table.yml", "collision_table.yml"]
+
+    robot_file = "franka.yml"
+    robot_cfg = RobotConfig.from_dict(
+        load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+    )
+    w_list = [
+        WorldConfig.from_dict(
+            load_yaml(join_path(get_world_configs_path(), world_file))
+        ).get_mesh_world()
+        for world_file in world_files
+    ]
+    world_ccheck = WorldMeshCollision(
+        WorldCollisionConfig(tensor_args, checker_type=CollisionCheckerType.MESH, n_envs=2)
+    )
+    # create a batched world collision checker:
+    world_ccheck.load_batch_collision_model(w_list)
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_coll_checker=world_ccheck,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=100,
+        self_collision_check=True,
+        self_collision_opt=True,
+        tensor_args=tensor_args,
+        use_cuda_graph=True,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 2
+
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve_batch_env(goal)
+
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 1.0  # we check if at least one env succeeds
diff --git a/tests/interpolation_test.py b/tests/interpolation_test.py
new file mode 100644
index 00000000..7f387094
--- /dev/null
+++ b/tests/interpolation_test.py
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
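+#
+# Illustrative sketch (comment only): get_batch_interpolated_trajectory
+# resamples a raw trajectory of shape (batch, horizon, dof) from raw_dt to
+# int_dt while respecting velocity, acceleration, and jerk limits; it returns
+# the interpolated trajectory plus two auxiliary values that the test below
+# ignores.
+#
+#   out_traj, _, _ = get_batch_interpolated_trajectory(
+#       in_traj, int_dt, max_vel, max_acc=max_acc, max_jerk=max_jerk,
+#       raw_dt=raw_dt, kind=InterpolateType.LINEAR,
+#   )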
+# + +# Third Party +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.robot import JointState +from curobo.util.trajectory import InterpolateType, get_batch_interpolated_trajectory + + +def test_linear_interpolation(): + b, h, dof = 1, 24, 1 + raw_dt = 0.4 + int_dt = 0.01 + tensor_args = TensorDeviceType() + # initialize raw trajectory: + in_traj = JointState.zeros((b, h, dof), tensor_args) + in_traj.position = torch.zeros((b, h, dof), device=tensor_args.device) + in_traj.position[:, 1, :] = 0.1 + + in_traj.position[:, -2, :] = -0.01 + in_traj.position[:, 10, :] = -0.01 + + in_traj.position[:, -1, :] = 0.01 + in_traj.velocity = in_traj.position - torch.roll(in_traj.position, -1, dims=1) + in_traj.velocity[:, 0, :] = 0.0 + in_traj.velocity[:, -1, :] = 0.0 + + max_vel = torch.ones((1, 1, dof), device=tensor_args.device, dtype=tensor_args.dtype) + max_acc = torch.ones((1, 1, dof), device=tensor_args.device, dtype=tensor_args.dtype) * 25 + max_jerk = torch.ones((1, 1, dof), device=tensor_args.device, dtype=tensor_args.dtype) * 500 + + # create max_velocity buffer: + out_traj_gpu, _, _ = get_batch_interpolated_trajectory( + in_traj, int_dt, max_vel, max_acc=max_acc, max_jerk=max_jerk, raw_dt=raw_dt + ) + # + out_traj_gpu = out_traj_gpu.clone() + + out_traj_cpu, _, _ = get_batch_interpolated_trajectory( + in_traj, + int_dt, + max_vel, + raw_dt=raw_dt, + kind=InterpolateType.LINEAR, + max_acc=max_acc, + max_jerk=max_jerk, + ) + assert ( + torch.max( + torch.abs(out_traj_gpu.position[:, -5:, :] - out_traj_cpu.position[:, -5:, :]) + ).item() + < 0.05 + ) + + +# test_linear_interpolation() diff --git a/tests/kinematics_parser_test.py b/tests/kinematics_parser_test.py new file mode 100644 index 00000000..78bdb11d --- /dev/null +++ b/tests/kinematics_parser_test.py @@ -0,0 +1,142 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
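+#
+# Illustrative sketch (comment only): both parsers share the get_chain()
+# interface, so a USD and a URDF import of the same robot can be compared
+# link-by-link. Link names below follow the Franka fixtures in this file.
+#
+#   parser = UrdfKinematicsParser(robot_urdf)
+#   chain = parser.get_chain("panda_link0", "panda_link7")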
+# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig +from curobo.cuda_robot_model.urdf_kinematics_parser import UrdfKinematicsParser +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import ( + file_exists, + get_assets_path, + get_robot_configs_path, + join_path, + load_yaml, +) + +try: + # CuRobo + from curobo.cuda_robot_model.usd_kinematics_parser import UsdKinematicsParser +except ImportError: + pytest.skip("usd-core is not available, skipping USD tests", allow_module_level=True) + + +def check_usd_file_exists(): + robot_file = "franka.yml" + robot_params = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_usd = join_path(get_assets_path(), robot_params["kinematics"]["usd_path"]) + return file_exists(robot_usd) + + +if not check_usd_file_exists(): + pytest.skip("Franka Panda USD is not available, skipping USD tests", allow_module_level=True) + + +@pytest.fixture(scope="module") +def robot_params_all(): + robot_file = "franka.yml" + robot_params = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_params["kinematics"]["ee_link"] = "panda_link7" + robot_params["kinematics"]["base_link"] = "panda_link0" + robot_params["kinematics"]["collision_link_names"] = [] + robot_params["kinematics"]["lock_joints"] = None + robot_params["kinematics"]["extra_links"] = None + + return robot_params + + +@pytest.fixture(scope="module") +def usd_parser(robot_params_all): + robot_params = robot_params_all["kinematics"] + robot_usd = join_path(get_assets_path(), robot_params["usd_path"]) + kinematics_parser = UsdKinematicsParser( + usd_path=robot_usd, + flip_joints=robot_params["usd_flip_joints"], + usd_robot_root=robot_params["usd_robot_root"], + ) + return kinematics_parser + + +@pytest.fixture(scope="module") +def urdf_parser(robot_params_all): + robot_params = robot_params_all["kinematics"] + robot_urdf = join_path(get_assets_path(), robot_params["urdf_path"]) + kinematics_parser = UrdfKinematicsParser(robot_urdf) + return kinematics_parser + + +@pytest.fixture(scope="module") +def retract_state(robot_params_all): + tensor_args = TensorDeviceType() + q = tensor_args.to_device(robot_params_all["kinematics"]["cspace"]["retract_config"]).view( + 1, -1 + ) + return q + + +@pytest.fixture(scope="module") +def usd_cuda_robot(robot_params_all): + robot_params = robot_params_all["kinematics"] + robot_params["use_usd_kinematics"] = True + + usd_cuda_config = CudaRobotModelConfig.from_data_dict(robot_params) + + usd_cuda_robot = CudaRobotModel(usd_cuda_config) + return usd_cuda_robot + + +@pytest.fixture(scope="module") +def urdf_cuda_robot(robot_params_all): + robot_params = robot_params_all["kinematics"] + robot_params["use_usd_kinematics"] = False + urdf_cuda_config = CudaRobotModelConfig.from_data_dict(robot_params) + + urdf_cuda_robot = CudaRobotModel(urdf_cuda_config) + return urdf_cuda_robot + + +def test_chain_parse(urdf_parser, usd_parser, robot_params_all): + robot_params = robot_params_all + + urdf_chain = urdf_parser.get_chain( + robot_params["kinematics"]["base_link"], robot_params["kinematics"]["ee_link"] + ) + + usd_chain = usd_parser.get_chain( + robot_params["kinematics"]["base_link"], robot_params["kinematics"]["ee_link"] + ) + assert usd_chain == urdf_chain + + +def test_joint_transform_parse(usd_cuda_robot, urdf_cuda_robot): + usd_pose = usd_cuda_robot.get_all_link_transforms() + 
urdf_pose = urdf_cuda_robot.get_all_link_transforms()
+    p, q = usd_pose.distance(urdf_pose)
+    p = torch.linalg.norm(p)
+    q = torch.linalg.norm(q)
+    assert p < 1e-5 and q < 1e-5
+
+
+def test_basic_ee_pose(usd_cuda_robot, urdf_cuda_robot, retract_state):
+    q = retract_state[:, : usd_cuda_robot.get_dof()]
+
+    usd_state = usd_cuda_robot.get_state(q)
+    usd_pose = Pose(usd_state.ee_position, usd_state.ee_quaternion)
+
+    urdf_state = urdf_cuda_robot.get_state(q)
+    urdf_pose = Pose(urdf_state.ee_position, urdf_state.ee_quaternion)
+    p_d, q_d = usd_pose.distance(urdf_pose)
+    assert p_d < 1e-5
+    assert q_d < 1e-5
diff --git a/tests/kinematics_test.py b/tests/kinematics_test.py
new file mode 100644
index 00000000..b6e585f7
--- /dev/null
+++ b/tests/kinematics_test.py
@@ -0,0 +1,171 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Third Party
+import pytest
+import torch
+
+# CuRobo
+from curobo.cuda_robot_model.cuda_robot_generator import CudaRobotGeneratorConfig
+from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel, CudaRobotModelConfig
+from curobo.cuda_robot_model.types import CSpaceConfig
+from curobo.geom.transform import matrix_to_quaternion, quaternion_to_matrix
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import quat_multiply
+from curobo.util_file import get_robot_configs_path, join_path, load_yaml
+
+
+@pytest.fixture(scope="module")
+def cfg():
+    cfg = CudaRobotModelConfig.from_robot_yaml_file("franka.yml", "panda_hand")
+    return cfg
+
+
+def test_quaternion():
+    tensor_args = TensorDeviceType()
+
+    def test_q(in_quat):
+        out_rot = quaternion_to_matrix(in_quat)
+        out_quat = matrix_to_quaternion(out_rot.clone())
+        out_quat[..., 1:] *= -1.0
+        q_res = quat_multiply(in_quat, out_quat, in_quat.clone())
+        q_res[..., 0] = 0.0
+        assert torch.sum(q_res).item() <= 1e-5
+
+    in_quat = tensor_args.to_device([1.0, 0.0, 0.0, 0.0]).view(1, 4)
+    test_q(in_quat)
+    test_q(-1.0 * in_quat)
+    in_quat = tensor_args.to_device([0.0, 1.0, 0.0, 0.0]).view(1, 4)
+    test_q(in_quat)
+    test_q(-1.0 * in_quat)
+    in_quat = tensor_args.to_device([0.0, 0.0, 1.0, 0.0]).view(1, 4)
+    test_q(in_quat)
+    test_q(-1.0 * in_quat)
+    in_quat = tensor_args.to_device([0.0, 0.0, 0.0, 1.0]).view(1, 4)
+    test_q(in_quat)
+    test_q(-1.0 * in_quat)
+    in_quat = tensor_args.to_device([0.7071068, 0.0, 0.0, 0.7071068]).view(1, 4)
+    test_q(in_quat)
+    test_q(-1.0 * in_quat)
+
+
+def test_franka_kinematics(cfg):
+    tensor_args = TensorDeviceType()
+
+    robot_model = CudaRobotModel(cfg)
+    q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1)
+    ee_position = torch.as_tensor([6.0860e-02, -4.7547e-12, 7.6373e-01], **vars(tensor_args)).view(
+        1, -1
+    )
+    ee_quat = torch.as_tensor([0.0382, 0.9193, 0.3808, 0.0922], **vars(tensor_args)).view(1, -1)
+    b_list = [1, 10, 100, 5000]
+    for b in b_list:
+        state = robot_model.get_state(q_test.repeat(b, 1).clone())
+        pos_err = torch.linalg.norm(state.ee_position - ee_position)
+        q_err = torch.linalg.norm(state.ee_quaternion - ee_quat)
+        # check if all values are equal to position and quaternion
+        assert 
pos_err < 1e-3
+        assert q_err < 1e-1
+
+
+def test_franka_attached_object_kinematics(cfg):
+    tensor_args = TensorDeviceType()
+
+    robot_model = CudaRobotModel(cfg)
+    q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1)
+    sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("attached_object")
+    state = robot_model.get_state(q_test.clone())
+
+    attached_spheres = state.link_spheres_tensor[:, sph_idx, :]
+    # check that the attached spheres are at their default location
+
+    assert torch.norm(attached_spheres[:, :, 0] - 0.061) < 0.01
+    assert torch.norm(attached_spheres[:, :, 1]) < 0.0001
+
+    # attach an object:
+    new_object = torch.zeros((2, 4), **vars(tensor_args))
+    new_object[:, 3] = 0.01
+    new_object[:, 1] = 0.1
+    robot_model.kinematics_config.update_link_spheres("attached_object", new_object)
+    state = robot_model.get_state(q_test.clone())
+    attached_spheres = state.link_spheres_tensor[:, sph_idx, :]
+    robot_model.kinematics_config.detach_object()
+    # assert torch.norm(attached_spheres[:, :2, 0] - 0.0829) < 0.01
+    # assert torch.norm(attached_spheres[:, 2:4, 0] - 0.0829) < 0.01
+
+
+def test_franka_attach_object_kinematics(cfg):
+    tensor_args = TensorDeviceType()
+
+    robot_model = CudaRobotModel(cfg)
+    q_test = torch.as_tensor([0.0, -1.2, 0.0, -2.0, 0.0, 1.0, 0.0], **vars(tensor_args)).view(1, -1)
+    sph_idx = robot_model.kinematics_config.get_sphere_index_from_link_name("attached_object")
+    state = robot_model.get_state(q_test.clone())
+
+    attached_spheres = state.link_spheres_tensor[:, sph_idx, :]
+    # check that the attached spheres are at their default location
+
+    assert torch.norm(attached_spheres[:, :, 0] - 0.0829) < 0.1
+    assert torch.norm(attached_spheres[:, :, 1]) < 0.0001
+
+    # attach an object:
+    new_object = torch.zeros((4, 4), **vars(tensor_args))
+    new_object[:2, 3] = 0.01
+    new_object[:2, 1] = 0.1
+
+    robot_model.kinematics_config.attach_object(sphere_tensor=new_object)
+    state = robot_model.get_state(q_test.clone())
+    attached_spheres = state.link_spheres_tensor[:, sph_idx, :]
+    assert (torch.norm(attached_spheres[:, :2, 0] - 0.13)) < 0.01
+    assert torch.norm(attached_spheres[:, 2:, 0] - 0.061) < 0.01
+
+    robot_model.kinematics_config.detach_object()
+    state = robot_model.get_state(q_test.clone())
+    attached_spheres = state.link_spheres_tensor[:, sph_idx, :]
+
+    assert torch.norm(attached_spheres[:, :, 0] - 0.061) < 0.01
+    assert torch.norm(attached_spheres[:, :, 1]) < 0.0001
+    robot_model.kinematics_config.detach_object()
+
+
+def test_locked_joints_kinematics():
+    tensor_args = TensorDeviceType()
+
+    config_file = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))
+
+    cfg = CudaRobotModelConfig.from_config(
+        CudaRobotGeneratorConfig(**config_file["robot_cfg"]["kinematics"], tensor_args=tensor_args)
+    )
+    robot_model = CudaRobotModel(cfg)
+
+    cspace = CSpaceConfig(**config_file["robot_cfg"]["kinematics"]["cspace"])
+    cspace.inplace_reindex(robot_model.joint_names)
+    q = cspace.retract_config.unsqueeze(0)
+
+    out = robot_model.get_state(q)
+    j_idx = 2
+    lock_joint_name = cspace.joint_names[j_idx]
+    # lock a joint by rebuilding the model config with lock_joints set:
+    config_file["robot_cfg"]["kinematics"]["lock_joints"] = {
+        lock_joint_name: cspace.retract_config[j_idx].item()
+    }
+    cfg = CudaRobotModelConfig.from_config(
+        CudaRobotGeneratorConfig(**config_file["robot_cfg"]["kinematics"], tensor_args=tensor_args)
+    )
+
+    robot_model = CudaRobotModel(cfg)
+    cspace = CSpaceConfig(**config_file["robot_cfg"]["kinematics"]["cspace"])
+
+    
cspace.inplace_reindex(robot_model.joint_names) + q = cspace.retract_config.unsqueeze(0).clone() + out_locked = robot_model.get_state(q) + assert torch.linalg.norm(out.ee_position - out_locked.ee_position) < 1e-5 + assert torch.linalg.norm(out.ee_quaternion - out_locked.ee_quaternion) < 1e-5 + assert torch.linalg.norm(out.link_spheres_tensor - out_locked.link_spheres_tensor) < 1e-5 diff --git a/tests/motion_gen_api_test.py b/tests/motion_gen_api_test.py new file mode 100644 index 00000000..e067ec50 --- /dev/null +++ b/tests/motion_gen_api_test.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +@pytest.fixture(scope="module") +def motion_gen(): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=26, + use_cuda_graph=False, + num_trajopt_seeds=50, + fixed_iters_trajopt=True, + evaluate_interpolated_trajectory=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + return motion_gen_instance + + +def test_motion_gen_attach_obstacle(motion_gen): + obstacle = motion_gen.world_model.objects[-1].name + retract_cfg = motion_gen.get_retract_config() + + start_state = JointState.from_position(retract_cfg.view(1, -1)) + motion_gen.attach_objects_to_robot(start_state, [obstacle]) + assert True diff --git a/tests/motion_gen_module_test.py b/tests/motion_gen_module_test.py new file mode 100644 index 00000000..a8fbd7ea --- /dev/null +++ b/tests/motion_gen_module_test.py @@ -0,0 +1,284 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
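+#
+# Illustrative sketch (comment only) of the planning flow exercised below:
+#
+#   motion_gen = MotionGen(MotionGenConfig.load_from_robot_config(
+#       "franka.yml", "collision_table.yml", TensorDeviceType(),
+#       trajopt_tsteps=32, use_cuda_graph=False))
+#   retract_cfg = motion_gen.get_retract_config()
+#   start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3)
+#   result = motion_gen.plan_single(start_state, goal_pose,
+#                                   MotionGenPlanConfig(False, True))
+#   # when result.success is set, result.optimized_plan holds the trajectory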
+# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util.trajectory import InterpolateType +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig + + +@pytest.fixture(scope="function") +def motion_gen(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=32, + use_cuda_graph=False, + num_trajopt_seeds=50, + fixed_iters_trajopt=True, + evaluate_interpolated_trajectory=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + return motion_gen_instance + + +@pytest.fixture(scope="function") +def motion_gen_batch_env(): + tensor_args = TensorDeviceType() + world_files = ["collision_table.yml", "collision_test.yml"] + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + for world_file in world_files + ] + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_cfg, + tensor_args, + trajopt_tsteps=32, + use_cuda_graph=False, + num_trajopt_seeds=10, + fixed_iters_trajopt=True, + evaluate_interpolated_trajectory=True, + ) + motion_gen_instance = MotionGen(motion_gen_config) + + return motion_gen_instance + + +@pytest.mark.parametrize( + "motion_gen_str,interpolation", + [ + ("motion_gen", InterpolateType.LINEAR), + ("motion_gen", InterpolateType.CUBIC), + # ("motion_gen", InterpolateType.KUNZ_STILMAN_OPTIMAL), + ("motion_gen", InterpolateType.LINEAR_CUDA), + ], +) +def test_motion_gen_single(motion_gen_str, interpolation, request): + motion_gen = request.getfixturevalue(motion_gen_str) + motion_gen.update_interpolation_type(interpolation) + motion_gen.reset() + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen.plan_single(start_state, goal_pose, m_config) + + # get final solutions: + assert torch.count_nonzero(result.success) == 1 + reached_state = motion_gen.compute_kinematics(result.optimized_plan[-1]) + assert torch.norm(goal_pose.position - reached_state.ee_pos_seq) < 0.005 + + +def test_motion_gen_goalset(motion_gen): + motion_gen.reset() + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(2, 1).view(1, -1, 3), + quaternion=state.ee_quat_seq.repeat(2, 1).view(1, -1, 4), + ) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3) + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen.plan_goalset(start_state, goal_pose, m_config) + + # get final solutions: + assert torch.count_nonzero(result.success) == 1 + + reached_state = motion_gen.compute_kinematics(result.optimized_plan[-1]) + + assert ( + torch.min( + torch.norm(goal_pose.position[:, 0, :] - 
reached_state.ee_pos_seq), + torch.norm(goal_pose.position[:, 1, :] - reached_state.ee_pos_seq), + ) + < 0.005 + ) + + +def test_motion_gen_batch_goalset(motion_gen): + motion_gen.reset() + + retract_cfg = motion_gen.get_retract_config() + + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.repeat(4, 1).view(2, -1, 3), + quaternion=state.ee_quat_seq.repeat(4, 1).view(2, -1, 4), + ) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(2) + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen.plan_batch_goalset(start_state, goal_pose, m_config) + + # get final solutions: + assert torch.count_nonzero(result.success) > 0 + + reached_state = motion_gen.compute_kinematics(result.optimized_plan.trim_trajectory(-1)) + + assert ( + torch.min( + torch.norm(goal_pose.position[:, 0, :] - reached_state.ee_pos_seq), + torch.norm(goal_pose.position[:, 1, :] - reached_state.ee_pos_seq), + ) + < 0.005 + ) + + +def test_motion_gen_batch(motion_gen): + motion_gen.reset() + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze() + ).repeat_seeds(2) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(2) + + goal_pose.position[1, 0] -= 0.2 + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen.plan_batch(start_state, goal_pose.clone(), m_config) + assert torch.count_nonzero(result.success) > 0 + + # get final solutions: + q = result.optimized_plan.trim_trajectory(-1).squeeze(1) + reached_state = motion_gen.compute_kinematics(q) + assert torch.norm(goal_pose.position - reached_state.ee_pos_seq) < 0.005 + + +@pytest.mark.parametrize( + "motion_gen_str,interpolation", + [ + ("motion_gen", InterpolateType.LINEAR), + ("motion_gen", InterpolateType.CUBIC), + # ("motion_gen", InterpolateType.KUNZ_STILMAN_OPTIMAL), + ("motion_gen", InterpolateType.LINEAR_CUDA), + ], +) +def test_motion_gen_batch_graph(motion_gen_str: str, interpolation: InterpolateType, request): + # return + motion_gen = request.getfixturevalue(motion_gen_str) + + motion_gen.graph_planner.interpolation_type = interpolation + motion_gen.reset() + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.compute_kinematics(JointState.from_position(retract_cfg.view(1, -1))) + + goal_pose = Pose( + state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze() + ).repeat_seeds(5) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(5) + + goal_pose.position[1, 0] -= 0.05 + + m_config = MotionGenPlanConfig(True, False) + + result = motion_gen.plan_batch(start_state, goal_pose, m_config) + assert torch.count_nonzero(result.success) > 0 + + # get final solutions: + q = result.interpolated_plan.trim_trajectory(-1).squeeze(1) + reached_state = motion_gen.compute_kinematics(q) + assert torch.norm(goal_pose.position - reached_state.ee_pos_seq) < 0.005 + + +def test_motion_gen_batch_env(motion_gen_batch_env): + motion_gen_batch_env.reset() + retract_cfg = motion_gen_batch_env.get_retract_config() + state = motion_gen_batch_env.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + goal_pose = Pose( + state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze() + ).repeat_seeds(2) + + start_state 
= JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(2) + + goal_pose.position[1, 0] -= 0.2 + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen_batch_env.plan_batch_env(start_state, goal_pose, m_config) + assert torch.count_nonzero(result.success) > 0 + + # get final solutions: + reached_state = motion_gen_batch_env.compute_kinematics( + result.optimized_plan.trim_trajectory(-1).squeeze(1) + ) + assert torch.norm(goal_pose.position - reached_state.ee_pos_seq) < 0.005 + + +def test_motion_gen_batch_env_goalset(motion_gen_batch_env): + motion_gen_batch_env.reset() + retract_cfg = motion_gen_batch_env.get_retract_config() + state = motion_gen_batch_env.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + goal_pose = Pose( + state.ee_pos_seq.repeat(4, 1).view(2, -1, 3), + quaternion=state.ee_quat_seq.repeat(4, 1).view(2, -1, 4), + ) + + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.3).repeat_seeds(2) + + goal_pose.position[1, 0] -= 0.2 + + m_config = MotionGenPlanConfig(False, True, num_trajopt_seeds=10) + + result = motion_gen_batch_env.plan_batch_env_goalset(start_state, goal_pose, m_config) + assert torch.count_nonzero(result.success) > 0 + + # get final solutions: + reached_state = motion_gen_batch_env.compute_kinematics( + result.optimized_plan.trim_trajectory(-1).squeeze(1) + ) + assert ( + torch.min( + torch.norm(goal_pose.position[:, 0, :] - reached_state.ee_pos_seq), + torch.norm(goal_pose.position[:, 1, :] - reached_state.ee_pos_seq), + ) + < 0.005 + ) diff --git a/tests/motion_gen_test.py b/tests/motion_gen_test.py new file mode 100644 index 00000000..e72ea0ac --- /dev/null +++ b/tests/motion_gen_test.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
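+#
+# Illustrative sketch (comment only): the plan() entry point used in this file
+# can fall back to a graph planner, optionally skipping trajectory
+# optimization:
+#
+#   result = motion_gen.plan(start_state, goal_pose, enable_graph=True,
+#                            partial_ik_opt=False, enable_opt=False)
+#   assert result.success.item()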
+# + + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, join_path, load_yaml +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + + +def test_motion_gen(): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=40, + use_cuda_graph=True, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.5) + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + assert result.success.item() + + +def test_motion_gen_attach_object(): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + world_file, + tensor_args, + trajopt_tsteps=40, + use_cuda_graph=True, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.5) + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + assert result.success.item() + + motion_gen.attach_spheres_to_robot(sphere_radius=0.03) + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + assert result.success.item() + + +def test_motion_gen_graph(): + tensor_args = TensorDeviceType() + world_file = "collision_test.yml" + robot_file = "franka.yml" + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, world_file, tensor_args, trajopt_tsteps=40 + ) + motion_gen = MotionGen(motion_gen_config) + motion_gen.reset() + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1) + 0.1) + start_state.position[:, 0] = 2.0 + result = motion_gen.plan( + start_state, retract_pose, enable_graph=True, partial_ik_opt=False, enable_opt=False + ) + assert result.success.item() diff --git a/tests/mpc_test.py b/tests/mpc_test.py new file mode 100644 index 00000000..eab68ee1 --- /dev/null +++ b/tests/mpc_test.py @@ -0,0 +1,235 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import numpy as np +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.mpc import MpcSolver, MpcSolverConfig + + +@pytest.fixture(scope="module") +def mpc_single_env(): + tensor_args = TensorDeviceType() + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + world_file = "collision_test.yml" + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_file, + use_cuda_graph=False, + use_cuda_graph_metrics=False, + use_cuda_graph_full_step=False, + ) + mpc = MpcSolver(mpc_config) + retract_cfg = robot_cfg.cspace.retract_config.view(1, -1) + + return [mpc, retract_cfg] + + +@pytest.fixture(scope="function") +def mpc_single_env_lbfgs(): + tensor_args = TensorDeviceType() + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + world_file = "collision_test.yml" + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_file, + use_cuda_graph=True, + use_cuda_graph_metrics=True, + use_cuda_graph_full_step=False, + use_lbfgs=True, + use_mppi=False, + step_dt=0.5, + ) + mpc = MpcSolver(mpc_config) + retract_cfg = robot_cfg.cspace.retract_config.view(1, -1) + + return [mpc, retract_cfg] + + +@pytest.fixture(scope="module") +def mpc_batch_env(): + tensor_args = TensorDeviceType() + + world_files = ["collision_table.yml", "collision_test.yml"] + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + for world_file in world_files + ] + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + mpc_config = MpcSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + use_cuda_graph=False, + use_cuda_graph_metrics=False, + use_cuda_graph_full_step=False, + ) + mpc = MpcSolver(mpc_config) + retract_cfg = robot_cfg.cspace.retract_config.view(1, -1) + + return [mpc, retract_cfg] + + +@pytest.mark.parametrize( + "mpc_str, expected", + [ + ("mpc_single_env", True), + ("mpc_single_env_lbfgs", False), + ], +) +def test_mpc_single(mpc_str, expected, request): + mpc_val = request.getfixturevalue(mpc_str) + mpc = mpc_val[0] + retract_cfg = mpc_val[1] + start_state = retract_cfg + state = mpc.rollout_fn.compute_kinematics(JointState.from_position(retract_cfg)) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + goal = Goal( + current_state=JointState.from_position(retract_cfg + 0.5), + goal_state=JointState.from_position(retract_cfg), + goal_pose=retract_pose, + ) + goal_buffer = mpc.setup_solve_single(goal, 1) + + start_state = JointState.from_position(retract_cfg + 0.5, 
joint_names=mpc.joint_names) + + converged = False + tstep = 0 + mpc.update_goal(goal_buffer) + current_state = start_state.clone() + while not converged: + result = mpc.step(current_state, max_attempts=1) + torch.cuda.synchronize() + current_state.copy_(result.action) + tstep += 1 + if result.metrics.pose_error.item() < 0.05: + converged = True + break + if tstep > 100: + break + assert converged == expected + + +def test_mpc_goalset(mpc_single_env): + mpc = mpc_single_env[0] + retract_cfg = mpc_single_env[1] + start_state = retract_cfg + state = mpc.rollout_fn.compute_kinematics(JointState.from_position(retract_cfg)) + retract_pose = Pose( + state.ee_pos_seq.repeat(2, 1).unsqueeze(0), + quaternion=state.ee_quat_seq.repeat(2, 1).unsqueeze(0), + ) + goal = Goal( + current_state=JointState.from_position(retract_cfg + 0.5), + goal_state=JointState.from_position(retract_cfg), + goal_pose=retract_pose, + ) + goal_buffer = mpc.setup_solve_goalset(goal, 1) + + start_state = JointState.from_position(retract_cfg + 0.5, joint_names=mpc.joint_names) + + converged = False + tstep = 0 + mpc.update_goal(goal_buffer) + current_state = start_state.clone() + while not converged: + result = mpc.step(current_state) + torch.cuda.synchronize() + current_state.copy_(result.action) + tstep += 1 + if result.metrics.pose_error.item() < 0.01: + converged = True + break + if tstep > 1000: + break + assert converged + + +def test_mpc_batch(mpc_single_env): + mpc = mpc_single_env[0] + retract_cfg = mpc_single_env[1].repeat(2, 1) + start_state = retract_cfg + state = mpc.rollout_fn.compute_kinematics(JointState.from_position(retract_cfg)) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + retract_pose.position[0, 0] -= 0.02 + goal = Goal( + current_state=JointState.from_position(retract_cfg + 0.5), + goal_state=JointState.from_position(retract_cfg), + goal_pose=retract_pose, + ) + goal_buffer = mpc.setup_solve_batch(goal, 1) + + start_state = JointState.from_position(retract_cfg + 0.5, joint_names=mpc.joint_names) + + converged = False + tstep = 0 + mpc.update_goal(goal_buffer) + current_state = start_state.clone() + while not converged: + result = mpc.step(current_state) + torch.cuda.synchronize() + current_state.copy_(result.action) + tstep += 1 + if torch.max(result.metrics.pose_error) < 0.05: + converged = True + break + if tstep > 1000: + break + assert converged + + +def test_mpc_batch_env(mpc_batch_env): + mpc = mpc_batch_env[0] + retract_cfg = mpc_batch_env[1].repeat(2, 1) + start_state = retract_cfg + state = mpc.rollout_fn.compute_kinematics(JointState.from_position(retract_cfg)) + retract_pose = Pose(state.ee_pos_seq, quaternion=state.ee_quat_seq) + goal = Goal( + current_state=JointState.from_position(retract_cfg + 0.5, joint_names=mpc.joint_names), + goal_state=JointState.from_position(retract_cfg, joint_names=mpc.joint_names), + goal_pose=retract_pose, + ) + goal_buffer = mpc.setup_solve_batch_env(goal, 1) + + start_state = JointState.from_position(retract_cfg + 0.5, joint_names=mpc.joint_names) + + converged = False + tstep = 0 + mpc.update_goal(goal_buffer) + current_state = start_state.clone() + while not converged: + result = mpc.step(current_state) + torch.cuda.synchronize() + current_state.copy_(result.action) + tstep += 1 + if torch.max(result.metrics.pose_error) < 0.05: + converged = True + break + if tstep > 1000: + break + assert converged diff --git a/tests/multi_pose_test.py b/tests/multi_pose_test.py new file mode 100644 index 00000000..17570317 --- /dev/null +++ 
b/tests/multi_pose_test.py
@@ -0,0 +1,56 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Third Party
+import torch
+
+# CuRobo
+from curobo.geom.types import WorldConfig
+from curobo.types.base import TensorDeviceType
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
+from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig
+
+
+def test_multi_pose_franka():
+    tensor_args = TensorDeviceType()
+    world_file = "collision_cubby.yml"
+
+    robot_file = "franka.yml"
+    robot_data = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+
+    robot_data["kinematics"]["link_names"] = robot_data["kinematics"]["collision_link_names"]
+    robot_cfg = RobotConfig.from_dict(robot_data)
+
+    world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file)))
+    ik_config = IKSolverConfig.load_from_robot_config(
+        robot_cfg,
+        world_cfg,
+        rotation_threshold=0.05,
+        position_threshold=0.005,
+        num_seeds=30,
+        self_collision_check=True,
+        self_collision_opt=True,
+        use_cuda_graph=True,
+        tensor_args=tensor_args,
+    )
+    ik_solver = IKSolver(ik_config)
+    b_size = 1
+
+    q_sample = ik_solver.sample_configs(b_size)
+    kin_state = ik_solver.fk(q_sample)
+    link_poses = kin_state.link_pose
+    goal = Pose(kin_state.ee_position, kin_state.ee_quaternion)
+    result = ik_solver.solve_single(goal, link_poses=link_poses)
+
+    success = result.success
+    assert torch.count_nonzero(success).item() >= 1.0  # check that the single solve succeeds
diff --git a/tests/nvblox_test.py b/tests/nvblox_test.py
new file mode 100644
index 00000000..69c64ff9
--- /dev/null
+++ b/tests/nvblox_test.py
@@ -0,0 +1,161 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
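+#
+# Illustrative sketch (comment only): WorldBloxCollision answers the same
+# sphere queries as the primitive and mesh checkers. Query spheres are packed
+# as (batch, horizon, n_spheres, 4), with the last column holding the radius.
+#
+#   d = coll_check.get_sphere_distance(
+#       x_sph, query_buffer, weight, act_distance
+#   ).view(-1)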
+# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollisionConfig +from curobo.geom.types import Cuboid, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.util_file import get_world_configs_path, join_path, load_yaml + +try: + # Third Party + from nvblox_torch.mapper import Mapper + + # CuRobo + from curobo.geom.sdf.world_blox import WorldBloxCollision + +except ImportError: + pytest.skip("Nvblox Torch not available", allow_module_level=True) + + +def test_world_blox_trajectory(): + tensor_args = TensorDeviceType() + world_file = "collision_nvblox.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldBloxCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, -1, 1, 4) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + d_sph = coll_check.get_swept_sphere_distance( + x_sph, query_buffer, weight, act_distance, act_distance + 0.01, 4, True + ).view(-1) + + assert d_sph[0] > 10.0 + assert d_sph[1] == 0.0 + assert d_sph[2] == 0.0 + + +def test_world_blox(): + tensor_args = TensorDeviceType() + world_file = "collision_nvblox.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldBloxCollision(coll_cfg) + x_sph = torch.as_tensor( + [[0.1, 0.2, 0.0, 10.5], [10.0, 0.0, 0.0, 0.0], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, coll_check.collision_types + ) + weight = tensor_args.to_device([1]) + act_distance = tensor_args.to_device([0.0]) + d_sph = coll_check.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + + assert d_sph[0] > 10.0 + assert d_sph[1] == 0.0 + assert d_sph[2] == 0.0 + + +def test_world_blox_bounding(): + tensor_args = TensorDeviceType() + world_file = "collision_nvblox.yml" + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldBloxCollision(coll_cfg) + bounding_cube = Cuboid("test", pose=[0, 0, 0, 1, 0, 0, 0], dims=[1.0, 1.0, 1.0]) + mesh = coll_check.get_mesh_in_bounding_box( + bounding_cube, + voxel_size=0.05, + ) + assert len(mesh.vertices) > 0 + + +def test_world_blox_get_mesh(): + world_file = "collision_nvblox.yml" + tensor_args = TensorDeviceType() + + data_dict = load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict) + coll_cfg = WorldCollisionConfig(world_model=world_cfg, tensor_args=TensorDeviceType()) + coll_check = WorldBloxCollision(coll_cfg) + + world_mesh = coll_check.get_mesh_from_blox_layer("world") + assert len(world_mesh.vertices) > 10 + + +def test_nvblox_world_from_primitive_world(): + world_file = "collision_cubby.yml" + tensor_args = TensorDeviceType() + data_dict = 
load_yaml(join_path(get_world_configs_path(), world_file)) + world_cfg = WorldConfig.from_dict(data_dict).get_mesh_world(True) + mesh = world_cfg.mesh[0].get_trimesh_mesh() + world_cfg.mesh[0].save_as_mesh("world.obj") + # Third Party + from nvblox_torch.datasets.mesh_dataset import MeshDataset + + # CuRobo + from curobo.geom.sdf.world import CollisionCheckerType + from curobo.geom.types import BloxMap + from curobo.types.camera import CameraObservation + from curobo.types.math import Pose + + # create an nvblox collision checker: + world_config = WorldConfig( + blox=[ + BloxMap( + name="world", + pose=[0, 0, 0, 1, 0, 0, 0], + voxel_size=0.02, + integrator_type="tsdf", + ) + ] + ) + config = WorldCollisionConfig( + tensor_args=tensor_args, world_model=world_config, checker_type=CollisionCheckerType.BLOX + ) + world_coll = WorldBloxCollision(config) + m_dataset = MeshDataset( + None, n_frames=100, image_size=1000, save_data_dir=None, trimesh_mesh=mesh + ) + for i in range(len(m_dataset)): + data = m_dataset[i] + cam_obs = CameraObservation( + rgb_image=data["rgba"], + depth_image=data["depth"], + intrinsics=data["intrinsics"], + pose=Pose.from_matrix(data["pose"].to(device=tensor_args.device)), + ) + world_coll.add_camera_frame(cam_obs, "world") + + world_coll.process_camera_frames("world") + world_coll.update_blox_mesh("world") + integrated_mesh = world_coll.get_mesh_from_blox_layer("world") + # integration depends on the rendered views, so the mesh may be empty; + # only check that a mesh object was returned: + assert integrated_mesh is not None + # integrated_mesh.save_as_mesh("collision_test.obj") diff --git a/tests/pose_test.py b/tests/pose_test.py new file mode 100644 index 00000000..8278ad33 --- /dev/null +++ b/tests/pose_test.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +# CuRobo +from curobo.geom.transform import batch_transform_points, transform_points +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose + + +def test_pose_transform_point(): + tensor_args = TensorDeviceType() + new_pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0], tensor_args) + + new_pose.position.requires_grad = True + new_pose.quaternion.requires_grad = True + + points = torch.zeros((3, 3), device=tensor_args.device, dtype=tensor_args.dtype) + points[:, 0] = 0.1 + points[2, 0] = -0.5 + + out_pt = new_pose.transform_point(points) + + loss = torch.sum(out_pt) + loss.backward() + assert torch.norm(new_pose.position.grad) > 0.0 + assert torch.norm(new_pose.quaternion.grad) > 0.0 + + +def test_pose_transform_point_grad(): + tensor_args = TensorDeviceType() + new_pose = Pose.from_list([10.0, 0, 0.1, 1.0, 0, 0, 0], tensor_args) + new_pose.position.requires_grad = True + new_pose.quaternion.requires_grad = True + + points = torch.zeros((1, 1, 3), device=tensor_args.device, dtype=tensor_args.dtype) + 10.0 + + # buffers: + out_points = torch.zeros( + (points.shape[0], points.shape[1], 3), device=points.device, dtype=points.dtype + ) + out_gp = torch.zeros((new_pose.position.shape[0], 3), device=tensor_args.device) + out_gq = torch.zeros((new_pose.position.shape[0], 4), device=tensor_args.device) + out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=tensor_args.device) + + torch.autograd.gradcheck( + batch_transform_points, + (new_pose.position, new_pose.quaternion, points, out_points, out_gp, out_gq, out_gpt), + eps=1e-6, + atol=1.0, + # nondet_tol=100.0, + ) + + +# test_pose_transform_point() +# test_pose_transform_point_grad() diff --git a/tests/robot_assets_test.py b/tests/robot_assets_test.py new file mode 100644 index 00000000..4c07c3ba --- /dev/null +++ b/tests/robot_assets_test.py @@ -0,0 +1,114 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.geom.types import WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import RobotConfig +from curobo.types.state import JointState +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig +from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig + + +@pytest.mark.parametrize( + "robot_file", + [ + "kinova_gen3.yml", + "iiwa.yml", + "iiwa_allegro.yml", + "franka.yml", + "ur10e.yml", + ], +) +class TestRobots: + def test_robot_config(self, robot_file): + # loading and constructing the robot config should not raise + tensor_args = TensorDeviceType() + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + def test_kinematics(self, robot_file): + # forward kinematics at the retract configuration should not raise + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + robot_model = CudaRobotModel(robot_cfg.kinematics) + robot_cfg.cspace.inplace_reindex(robot_model.joint_names) + robot_model.get_state(robot_cfg.cspace.retract_config) + + def test_ik(self, robot_file): + world_file = "collision_table.yml" + tensor_args = TensorDeviceType() + + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict( + load_yaml(join_path(get_world_configs_path(), world_file)) + ) + ik_config = IKSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + rotation_threshold=0.05, + position_threshold=0.005, + num_seeds=50, + self_collision_check=True, + self_collision_opt=True, + tensor_args=tensor_args, + ) + ik_solver = IKSolver(ik_config) + b_size = 10 + q_sample = ik_solver.sample_configs(b_size) + kin_state = ik_solver.fk(q_sample) + goal = Pose(kin_state.ee_position, kin_state.ee_quaternion) + result = ik_solver.solve(goal) + # solve a second time to check that repeated queries on the same solver succeed: + result = ik_solver.solve(goal) + + success = result.success + assert torch.count_nonzero(success).item() >= 9.0 # check that at least 90% of the 10 queries are successful + + def notest_motion_gen(self, robot_file): + """This test causes pytest to crash when running on many robot configurations. + + Args: + robot_file: robot configuration file to load. + """ + tensor_args = TensorDeviceType() + + motion_gen_config = MotionGenConfig.load_from_robot_config( + robot_file, + "collision_table.yml", + tensor_args, + trajopt_tsteps=40, + use_cuda_graph=True, + ) + motion_gen = MotionGen(motion_gen_config) + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + + retract_cfg = motion_gen.get_retract_config() + state = motion_gen.rollout_fn.compute_kinematics( + JointState.from_position(retract_cfg.view(1, -1)) + ) + + retract_pose = Pose(state.ee_pos_seq.squeeze(), quaternion=state.ee_quat_seq.squeeze()) + start_state = JointState.from_position(retract_cfg.view(1, -1)) + start_state.position[0, -1] += 0.2 + result = motion_gen.plan(start_state, retract_pose, enable_graph=False) + assert result.success.item() diff --git a/tests/robot_config_test.py b/tests/robot_config_test.py new file mode 100644 index 00000000..ed8aae35 --- /dev/null +++ b/tests/robot_config_test.py @@ -0,0 +1,82 @@ +# +# Copyright (c) 2023 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +# CuRobo +from curobo.types.base import TensorDeviceType +from curobo.types.robot import CSpaceConfig, JointState + + +def test_cspace_config(): + cspace_config = CSpaceConfig( + ["j0", "j1", "j2", "j3"], + retract_config=[i for i in range(4)], + cspace_distance_weight=[i for i in range(4)], + null_space_weight=[i for i in range(4)], + ) + + new_order = ["j3", "j1"] + cspace_config.inplace_reindex(new_order) + assert cspace_config.retract_config[0] == 3 and cspace_config.retract_config[1] == 1 + assert cspace_config.null_space_weight[0] == 3 and cspace_config.null_space_weight[1] == 1 + assert ( + cspace_config.cspace_distance_weight[0] == 3 + and cspace_config.cspace_distance_weight[1] == 1 + ) + + +def test_joint_state(): + tensor_args = TensorDeviceType() + j_names = ["j0", "j1", "j2", "j3"] + loc_j = ["j4", "jb"] + final_j = ["jb", "j0", "j1", "j2", "j3", "j4"] + + position = tensor_args.to_device([i for i in range(len(j_names))]) + + loc_position = tensor_args.to_device([i + len(j_names) for i in range(len(loc_j))]) + + js_1 = JointState.from_position(position, joint_names=j_names) + js_lock = JointState.from_position(loc_position, loc_j) + + final_js = js_1.get_augmented_joint_state(final_j, js_lock) + assert final_js.joint_names == final_j + assert ( + torch.linalg.norm(final_js.position - tensor_args.to_device([5, 0, 1, 2, 3, 4])).item() + < 1e-8 + ) + + +def test_batch_joint_state(): + tensor_args = TensorDeviceType() + j_names = ["j0", "j1", "j2", "j3"] + loc_j = ["j4", "jb"] + final_j = ["jb", "j0", "j1", "j2", "j3", "j4"] + + # build a batched position tensor (batch of 10) instead of a single configuration: + position = torch.zeros((10, len(j_names)), device=tensor_args.device, dtype=tensor_args.dtype) + for i in range(len(j_names)): + position[:, i] = i + + loc_position = tensor_args.to_device([i + len(j_names) for i in range(len(loc_j))]) + + js_1 = JointState.from_position(position, joint_names=j_names) + js_lock = JointState.from_position(loc_position, loc_j) + + final_js = js_1.get_augmented_joint_state(final_j, js_lock) + assert final_js.joint_names == final_j + assert ( + torch.linalg.norm( + final_js.position - tensor_args.to_device([5, 0, 1, 2, 3, 4]).unsqueeze(0) + ).item() + < 1e-8 + ) diff --git a/tests/robot_world_model_test.py b/tests/robot_world_model_test.py new file mode 100644 index 00000000..6e4be114 --- /dev/null +++ b/tests/robot_world_model_test.py @@ -0,0 +1,151 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +# CuRobo +from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig + + +def load_robot_world(): + robot_file = "franka.yml" + world_file = "collision_table.yml" + config = RobotWorldConfig.load_from_config( + robot_file, world_file, collision_activation_distance=0.2 + ) + model = RobotWorld(config) + return model + + +def load_robot_batch_world(): + robot_file = "franka.yml" + world_file = ["collision_table.yml", "collision_test.yml"] + config = RobotWorldConfig.load_from_config( + robot_file, world_file, collision_activation_distance=0.0 + ) + model = RobotWorld(config) + return model + + +def test_robot_world_config_load(): + # test if curobo robot world can be loaded + load_robot_world() + assert True + + +def test_robot_batch_world_config_load(): + # test if curobo robot world can be loaded + load_robot_batch_world() + assert True + + +def test_robot_world_kinematics(): + # test if kinematics can be queried + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + assert state.ee_position.shape[-1] == 3 + + +def test_robot_world_sample(): + # test if joint configurations can be sampled + model = load_robot_world() + n = 10 + q = model.sample(n) + assert q.shape[0] == n + + +def test_robot_world_collision(): + # test computing collisions given robot joint configurations + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + d_world = model.get_collision_constraint(state.link_spheres_tensor.view(n, 1, -1, 4)) + assert d_world.shape[0] == n + assert torch.sum(d_world) == 0.0 + + +def test_robot_world_collision_vector(): + # test computing collisions given robot joint configurations + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + d_world, d_vec = model.get_collision_vector(state.link_spheres_tensor.view(n, 1, -1, 4)) + + assert d_world.shape[0] == n + assert torch.norm(d_world[0] - 0.1385) < 0.002 + assert torch.abs(d_vec[0, 0, 0, 2] + 0.7350) > 0.002 + + +def test_robot_world_self_collision(): + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + d_self = model.get_self_collision(state.link_spheres_tensor.view(n, 1, -1, 4)) + assert torch.sum(d_self) == 0.0 + + +def test_robot_world_pose(): + model = load_robot_world() + n = 10 + q = model.sample(n) + state = model.get_kinematics(q) + pose = state.ee_pose + des_pose = state.ee_pose + d = model.pose_distance(des_pose, pose) + assert torch.sum(d) < 1e-3 + + +def test_trajectory_sample(): + model = load_robot_world() + b = 10 + horizon = 21 + q = model.sample_trajectory(b, horizon) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_batch_trajectory_sample(): + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = torch.zeros((b), dtype=torch.int32, device=torch.device("cuda", 0)) + q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_batch_trajectory_1env_sample(): + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = None + q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + assert q.shape[0] == b and q.shape[1] == horizon + + +def test_robot_batch_world_collision(): + # test computing collisions given robot joint configurations + model = load_robot_batch_world() + b = 2 + horizon = 21 + env_query_idx = torch.zeros((b), dtype=torch.int32, device=torch.device("cuda", 0)) + 
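+ # env_query_idx assigns a world to each batch item; zeros here mean both sampled trajectories are checked against the first environment (collision_table.yml)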
+ q = model.sample_trajectory(b, horizon, env_query_idx=env_query_idx) + + d_world, d_self = model.get_world_self_collision_distance_from_joint_trajectory( + q, env_query_idx + ) + assert d_world.shape[0] == b + assert torch.sum(d_world) == 0.0 diff --git a/tests/self_collision_test.py b/tests/self_collision_test.py new file mode 100644 index 00000000..2a7a6ee2 --- /dev/null +++ b/tests/self_collision_test.py @@ -0,0 +1,106 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import pytest +import torch + +# CuRobo +from curobo.cuda_robot_model.cuda_robot_model import CudaRobotModel +from curobo.rollout.cost.self_collision_cost import SelfCollisionCost, SelfCollisionCostConfig +from curobo.types.base import TensorDeviceType +from curobo.types.robot import RobotConfig +from curobo.util_file import get_robot_configs_path, join_path, load_yaml + +
+@pytest.mark.parametrize( + "batch_size, horizon", + [ + pytest.param(1, 1, id="1"), + pytest.param(10, 1, id="10"), + pytest.param(100000, 1, id="100k"), + pytest.param(100, 70, id="horizon"), + ], +) +def test_self_collision_experimental(batch_size, horizon): + robot_file = "franka.yml" + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + robot_cfg["kinematics"]["debug"] = {"self_collision_experimental": True} + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + kinematics = CudaRobotModel(robot_cfg.kinematics) + self_collision_data = kinematics.get_self_collision_config() + self_collision_config = SelfCollisionCostConfig( + weight=1.0, classify=True, self_collision_kin_config=self_collision_data, + tensor_args=tensor_args + ) + cost_fn = SelfCollisionCost(self_collision_config) + + b = batch_size + h = horizon + + q = ( + torch.rand( + (b * h, kinematics.get_dof()), device=tensor_args.device, dtype=tensor_args.dtype + ) + * 10 + ) + kin_state = kinematics.get_state(q) + + in_spheres = kin_state.link_spheres_tensor + in_spheres = in_spheres.view(b, h, -1, 4).contiguous() + + # evaluate with the experimental kernel, then with the reference kernel: + out = cost_fn.forward(in_spheres) + k = out.clone() + cost_fn.self_collision_kin_config.experimental_kernel = False + cost_fn._out_distance[:] = 0.0 + out = cost_fn.forward(in_spheres) + + # both kernels must report the same self-collision cost: + assert torch.norm(k - out).item() < 1e-8 + + +def test_self_collision_franka(): + tensor_args = TensorDeviceType() + + robot_cfg = load_yaml(join_path(get_robot_configs_path(), "franka.yml"))["robot_cfg"] + robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args) + robot_cfg.kinematics.self_collision_config.experimental_kernel = True + kinematics = CudaRobotModel(robot_cfg.kinematics) + self_collision_data = kinematics.get_self_collision_config() + self_collision_config = SelfCollisionCostConfig( + weight=5000.0, classify=False, self_collision_kin_config=self_collision_data, + tensor_args=tensor_args + ) + cost_fn = SelfCollisionCost(self_collision_config) + + b = 10 + h = 1 + + q = torch.rand( + (b * h, kinematics.get_dof()), device=tensor_args.device, dtype=tensor_args.dtype + ) + + test_q = 
tensor_args.to_device([2.7735, -1.6737, 0.4998, -2.9865, 0.3386, 0.8413, 0.4371]) + q[:] = test_q + kin_state = kinematics.get_state(q) + + in_spheres = kin_state.link_spheres_tensor + in_spheres = in_spheres.view(b, h, -1, 4).contiguous() + + out = cost_fn.forward(in_spheres) + assert out.sum().item() > 0.0 + cost_fn.self_collision_kin_config.experimental_kernel = False + cost_fn._out_distance[:] = 0.0 + out = cost_fn.forward(in_spheres) + assert out.sum().item() > 0.0 diff --git a/tests/trajopt_config_test.py b/tests/trajopt_config_test.py new file mode 100644 index 00000000..3ee4bf0d --- /dev/null +++ b/tests/trajopt_config_test.py @@ -0,0 +1,132 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.trajopt import TrajOptSolver, TrajOptSolverConfig + + +def trajopt_base_config(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + use_fixed_samples=True, + n_collision_envs=1, + collision_cache={"obb": 10}, + seed_ratio={"linear": 0.5, "start": 0.25, "goal": 0.25}, + num_seeds=10, + ) + + return trajopt_config + + +def trajopt_es_config(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + use_es=True, + es_learning_rate=0.01, + ) + return trajopt_config + + +def trajopt_gd_config(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + use_gradient_descent=True, + grad_trajopt_iters=500, + ) + return trajopt_config + + +def trajopt_no_particle_opt_config(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + 
load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + use_particle_opt=False, + ) + return trajopt_config + + +@pytest.mark.parametrize( + "config,expected", + [ + (trajopt_base_config(), True), + (trajopt_es_config(), True), + (trajopt_gd_config(), True), + (trajopt_no_particle_opt_config(), True), + ], +) +def test_eval(config, expected): + trajopt_solver = TrajOptSolver(config) + q_start = trajopt_solver.retract_config + q_goal = q_start.clone() + 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + js_goal = Goal(goal_pose=goal_pose, goal_state=goal_state, current_state=current_state) + result = trajopt_solver.solve_single(js_goal) + + assert result.success.item() == expected diff --git a/tests/trajopt_test.py b/tests/trajopt_test.py new file mode 100644 index 00000000..2f7f2260 --- /dev/null +++ b/tests/trajopt_test.py @@ -0,0 +1,277 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# +# Third Party +import pytest +import torch + +# CuRobo +from curobo.geom.types import WorldConfig +from curobo.rollout.rollout_base import Goal +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.types.robot import JointState, RobotConfig +from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml +from curobo.wrap.reacher.trajopt import TrajOptSolver, TrajOptSolverConfig + + +@pytest.fixture(scope="function") +def trajopt_solver(): + tensor_args = TensorDeviceType() + world_file = "collision_table.yml" + + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + evaluate_interpolated_trajectory=True, + ) + trajopt_solver = TrajOptSolver(trajopt_config) + + return trajopt_solver + + +@pytest.fixture(scope="function") +def trajopt_solver_batch_env(): + tensor_args = TensorDeviceType() + world_files = ["collision_table.yml", "collision_cubby.yml", "collision_test.yml"] + world_cfg = [ + WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + for world_file in world_files + ] + robot_file = "franka.yml" + robot_cfg = RobotConfig.from_dict( + load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"] + ) + # world_cfg = WorldConfig.from_dict(load_yaml(join_path(get_world_configs_path(), world_file))) + + trajopt_config = TrajOptSolverConfig.load_from_robot_config( + robot_cfg, + world_cfg, + tensor_args, + use_cuda_graph=False, + num_seeds=10, + 
evaluate_interpolated_trajectory=True, + ) + trajopt_solver = TrajOptSolver(trajopt_config) + + return trajopt_solver + + +def test_trajopt_single_js(trajopt_solver): + q_start = trajopt_solver.retract_config + q_goal = q_start.clone() + 0.2 + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + # do single planning: + js_goal = Goal(goal_state=goal_state, current_state=current_state) + result = trajopt_solver.solve_single(js_goal) + + traj = result.solution.position[..., -1, :].view(q_goal.shape) + assert torch.linalg.norm((goal_state.position - traj)).item() < 5e-3 + assert result.success.item() + + +def test_trajopt_single_pose(trajopt_solver): + trajopt_solver.reset_seed() + q_start = trajopt_solver.retract_config + q_goal = q_start.clone() + 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + js_goal = Goal(goal_pose=goal_pose, goal_state=goal_state, current_state=current_state) + result = trajopt_solver.solve_single(js_goal) + + assert result.success.item() + + +def test_trajopt_single_pose_no_seed(trajopt_solver): + trajopt_solver.reset_seed() + q_start = trajopt_solver.retract_config + q_goal = q_start.clone() + 0.05 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + current_state = JointState.from_position(q_start) + js_goal = Goal(goal_pose=goal_pose, current_state=current_state) + result = trajopt_solver.solve_single(js_goal) + + # NOTE: solving without a goal-state seed currently fails in some instances, + # so success is not asserted; the call only checks that the solver runs. + assert result.success.item() in (True, False) + + +def test_trajopt_single_goalset(trajopt_solver): + # run goalset planning: + q_start = trajopt_solver.retract_config + q_goal = q_start.clone() + 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + g_set = Pose( + kin_state.ee_position.repeat(2, 1).view(1, 2, 3), + kin_state.ee_quaternion.repeat(2, 1).view(1, 2, 4), + ) + js_goal = Goal(goal_pose=g_set, goal_state=goal_state, current_state=current_state) + result = trajopt_solver.solve_goalset(js_goal) + assert result.success.item() + + +def test_trajopt_batch(trajopt_solver): + # run batch planning: + q_start = trajopt_solver.retract_config.repeat(2, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + g_set = Pose( + kin_state.ee_position, + kin_state.ee_quaternion, + ) + + js_goal = Goal(goal_pose=g_set, goal_state=goal_state, current_state=current_state) + result = trajopt_solver.solve_batch(js_goal) + assert torch.count_nonzero(result.success) > 0 + + +def test_trajopt_batch_js(trajopt_solver): + # run batch joint-state planning: + q_start = trajopt_solver.retract_config.repeat(2, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, current_state=current_state) + result = 
trajopt_solver.solve_batch(js_goal) + traj = result.solution.position + interpolated_traj = result.interpolated_solution.position + assert torch.count_nonzero(result.success) > 0 + assert torch.linalg.norm((goal_state.position - traj[:, -1, :])).item() < 0.05 + assert torch.linalg.norm((goal_state.position - interpolated_traj[:, -1, :])).item() < 0.05 + + +def test_trajopt_batch_goalset(trajopt_solver): + # run batch goalset planning: + q_start = trajopt_solver.retract_config.repeat(3, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + q_goal[2, -1] += 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose( + kin_state.ee_position.view(3, 1, 3).repeat(1, 5, 1), + kin_state.ee_quaternion.view(3, 1, 4).repeat(1, 5, 1), + ) + goal_pose.position[:, 0, 0] -= 0.01 + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, goal_pose=goal_pose, current_state=current_state) + result = trajopt_solver.solve_batch_goalset(js_goal) + assert torch.count_nonzero(result.success) > 0 + + +def test_trajopt_batch_env_js(trajopt_solver_batch_env): + # run batch planning across environments: + q_start = trajopt_solver_batch_env.retract_config.repeat(3, 1) + q_goal = q_start.clone() + q_goal += 0.1 + q_goal[2] += 0.1 + q_goal[1] -= 0.2 + # q_goal[2, -1] += 0.1 + kin_state = trajopt_solver_batch_env.fk(q_goal) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, current_state=current_state) + result = trajopt_solver_batch_env.solve_batch_env(js_goal) + + traj = result.solution.position + interpolated_traj = result.interpolated_solution.position + assert torch.count_nonzero(result.success) == 3 + assert torch.linalg.norm((goal_state.position - traj[:, -1, :])).item() < 0.005 + assert torch.linalg.norm((goal_state.position - interpolated_traj[:, -1, :])).item() < 0.005 + assert len(result) == 3 + + +def test_trajopt_batch_env(trajopt_solver_batch_env): + # run batch planning across environments: + q_start = trajopt_solver_batch_env.retract_config.repeat(3, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + q_goal[2, -1] += 0.1 + kin_state = trajopt_solver_batch_env.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, goal_pose=goal_pose, current_state=current_state) + result = trajopt_solver_batch_env.solve_batch_env(js_goal) + assert torch.count_nonzero(result.success) == 3 + + +def test_trajopt_batch_env_goalset(trajopt_solver_batch_env): + # run batch goalset planning across environments: + q_start = trajopt_solver_batch_env.retract_config.repeat(3, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + q_goal[2, -1] += 0.1 + kin_state = trajopt_solver_batch_env.fk(q_goal) + goal_pose = Pose( + kin_state.ee_position.view(3, 1, 3).repeat(1, 5, 1), + kin_state.ee_quaternion.view(3, 1, 4).repeat(1, 5, 1), + ) + goal_pose.position[:, 0, 0] -= 0.01 + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, goal_pose=goal_pose, current_state=current_state) + 
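+ # each of the three batch items is planned in its own environment, with five candidate goal poses per item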
result = trajopt_solver_batch_env.solve_batch_env_goalset(js_goal) + assert torch.count_nonzero(result.success) > 0 + + +def test_trajopt_batch_env_mismatch(trajopt_solver): + # a single-environment solver must reject a batched multi-environment query: + q_start = trajopt_solver.retract_config.repeat(3, 1) + q_goal = q_start.clone() + q_goal[0] += 0.1 + q_goal[1] -= 0.1 + q_goal[2, -1] += 0.1 + kin_state = trajopt_solver.fk(q_goal) + goal_pose = Pose(kin_state.ee_position, kin_state.ee_quaternion) + goal_state = JointState.from_position(q_goal) + current_state = JointState.from_position(q_start) + + js_goal = Goal(goal_state=goal_state, goal_pose=goal_pose, current_state=current_state) + with pytest.raises(ValueError): + trajopt_solver.solve_batch_env(js_goal) diff --git a/tests/warp_mesh_test.py b/tests/warp_mesh_test.py new file mode 100644 index 00000000..fa2bf5d1 --- /dev/null +++ b/tests/warp_mesh_test.py @@ -0,0 +1,106 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. +# + +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.world import CollisionQueryBuffer, WorldCollisionConfig +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import Mesh +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import get_assets_path, join_path + + +def test_sdf_pose(): + mesh_file = join_path(get_assets_path(), "scene/nvblox/srl_ur10_bins.obj") + tensor_args = TensorDeviceType() + world_config = WorldCollisionConfig(tensor_args) + world_ccheck = WorldMeshCollision(world_config) + world_ccheck.create_collision_cache(1) + new_mesh = Mesh(name="test_mesh", file_path=mesh_file, pose=[0, 0, 0, 1, 0, 0, 0]) + world_ccheck.add_mesh( + new_mesh, + env_idx=0, + ) + query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres[..., 2] = 10.0 + query_spheres[..., 1, :] = 0.0 + query_spheres[..., 3] = 1.0 + act_distance = tensor_args.to_device([0.01]) + + weight = tensor_args.to_device([1.0]) + collision_buffer = CollisionQueryBuffer.initialize_from_shape( + query_spheres.shape, tensor_args, world_ccheck.collision_types + ) + out = world_ccheck.get_sphere_distance(query_spheres, collision_buffer, act_distance, weight) + out = out.view(-1) + assert out[0] <= 0.0 + assert out[1] >= 0.007 + + +def test_swept_sdf_speed_pose(): + mesh_file = join_path(get_assets_path(), "scene/nvblox/srl_ur10_bins.obj") + tensor_args = TensorDeviceType() + world_config = WorldCollisionConfig(tensor_args) + world_ccheck = WorldMeshCollision(world_config) + world_ccheck.create_collision_cache(1) + new_mesh = Mesh(name="test_mesh", file_path=mesh_file, pose=[0, 0, 0, 1, 0, 0, 0]) + world_ccheck.add_mesh( + new_mesh, + env_idx=0, + ) + query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres[..., 2] = 10.0 + query_spheres[..., 1, :] = 0.0 + query_spheres[..., 3] = 1.0 + act_distance = tensor_args.to_device([0.01]) + dt = act_distance.clone() + weight = tensor_args.to_device([1.0]) + collision_buffer = 
CollisionQueryBuffer.initialize_from_shape( + query_spheres.shape, tensor_args, world_ccheck.collision_types + ) + out = world_ccheck.get_swept_sphere_distance( + query_spheres, collision_buffer, weight, act_distance, dt, 2, True, None + ) + out = out.view(-1) + assert out[0] <= 0.0 + assert out[1] >= 0.3 + + +def test_swept_sdf_pose(): + mesh_file = join_path(get_assets_path(), "scene/nvblox/srl_ur10_bins.obj") + tensor_args = TensorDeviceType() + world_config = WorldCollisionConfig(tensor_args) + world_ccheck = WorldMeshCollision(world_config) + world_ccheck.create_collision_cache(1) + new_mesh = Mesh(name="test_mesh", file_path=mesh_file, pose=[0, 0, 0, 1, 0, 0, 0]) + world_ccheck.add_mesh( + new_mesh, + env_idx=0, + ) + query_spheres = torch.zeros((1, 1, 2, 4), **vars(tensor_args)) + query_spheres[..., 2] = 10.0 + query_spheres[..., 1, :] = 0.0 + query_spheres[..., 3] = 1.0 + act_distance = tensor_args.to_device([0.01]) + dt = act_distance.clone() + weight = tensor_args.to_device([1.0]) + collision_buffer = CollisionQueryBuffer.initialize_from_shape( + query_spheres.shape, tensor_args, world_ccheck.collision_types + ) + out = world_ccheck.get_swept_sphere_distance( + query_spheres, collision_buffer, weight, act_distance, dt, 2, False, None + ) + out = out.view(-1) + assert out[0] <= 0.0 + assert out[1] >= 0.01 diff --git a/tests/world_config_test.py b/tests/world_config_test.py new file mode 100644 index 00000000..d8e8f9f3 --- /dev/null +++ b/tests/world_config_test.py @@ -0,0 +1,171 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+# + +# Third Party +import torch + +# CuRobo +from curobo.geom.sdf.world import ( + CollisionQueryBuffer, + WorldCollisionConfig, + WorldPrimitiveCollision, +) +from curobo.geom.sdf.world_mesh import WorldMeshCollision +from curobo.geom.types import Capsule, Cuboid, Cylinder, Mesh, Sphere, WorldConfig +from curobo.types.base import TensorDeviceType +from curobo.types.math import Pose +from curobo.util_file import get_assets_path, join_path + + +def test_world_modify(): + tensor_args = TensorDeviceType() + obstacle_1 = Cuboid( + name="cube_1", + pose=[0.0, 0.0, 0.0, 0.043, -0.471, 0.284, 0.834], + dims=[0.2, 1.0, 0.2], + color=[0.8, 0.0, 0.0, 1.0], + ) + + # describe a mesh obstacle + # import a mesh file: + + mesh_file = join_path(get_assets_path(), "scene/nvblox/srl_ur10_bins.obj") + + obstacle_2 = Mesh( + name="mesh_1", + pose=[0.0, 2, 0.5, 0.043, -0.471, 0.284, 0.834], + file_path=mesh_file, + scale=[0.5, 0.5, 0.5], + ) + + obstacle_3 = Capsule( + name="capsule", + radius=0.2, + base=[0, 0, 0], + tip=[0, 0, 0.5], + pose=[0.0, 5, 0.0, 0.043, -0.471, 0.284, 0.834], + # pose=[0.0, 5, 0.0, 1,0,0,0], + color=[0, 1.0, 0, 1.0], + ) + + obstacle_4 = Cylinder( + name="cylinder_1", + radius=0.2, + height=0.5, + pose=[0.0, 6, 0.0, 0.043, -0.471, 0.284, 0.834], + color=[0, 1.0, 0, 1.0], + ) + + obstacle_5 = Sphere( + name="sphere_1", + radius=0.2, + pose=[0.0, 7, 0.0, 0.043, -0.471, 0.284, 0.834], + color=[0, 1.0, 0, 1.0], + ) + + world_model = WorldConfig( + mesh=[obstacle_2], + cuboid=[obstacle_1], + capsule=[obstacle_3], + cylinder=[obstacle_4], + sphere=[obstacle_5], + ) + world_model.randomize_color(r=[0.2, 0.7], g=[0.4, 0.8], b=[0.0, 0.4]) + + collision_support_world = WorldConfig.create_collision_support_world(world_model) + + world_collision_config = WorldCollisionConfig(tensor_args, world_model=collision_support_world) + world_ccheck = WorldMeshCollision(world_collision_config) + + world_ccheck.enable_obstacle("sphere_1", False) + + w_pose = Pose.from_list([0, 0, 1, 1, 0, 0, 0], tensor_args) + world_ccheck.update_obstacle_pose(name="cylinder_1", w_obj_pose=w_pose) + + x_sph = torch.as_tensor( + [[0.0, 0.0, 0.0, 0.1], [10.0, 0.0, 0.0, 0.1], [0.01, 0.01, 0.0, 0.1]], **vars(tensor_args) + ).view(1, 1, -1, 4) + # create buffers: + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, world_ccheck.collision_types + ) + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1]) + d_sph = world_ccheck.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + assert d_sph[0] >= 0.2 + assert d_sph[1] == 0.0 + assert d_sph[2] >= 0.19 + world_ccheck.update_obstacle_pose("cube_1", Pose.from_list([1, 0, 0, 1, 0, 0, 0], tensor_args)) + d_sph = world_ccheck.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + assert torch.sum(d_sph) == 0.0 + + x_sph[0, 0, 0, 1] = 5.0 + x_sph[0, 0, 0, 3] = 0.2 + + d_sph = world_ccheck.get_sphere_distance(x_sph, query_buffer, weight, act_distance).view(-1) + assert d_sph[0] >= 0.35 + + +def test_batch_collision(): + tensor_args = TensorDeviceType() + world_config_1 = WorldConfig( + cuboid=[ + Cuboid( + name="cube_env_1", + pose=[0.0, 0.0, 0.0, 1.0, 0, 0, 0], + dims=[0.2, 1.0, 0.2], + color=[0.8, 0.0, 0.0, 1.0], + ) + ] + ) + world_config_2 = WorldConfig( + cuboid=[ + Cuboid( + name="cube_env_2", + pose=[0.0, 0.0, 1.0, 1, 0, 0, 0], + dims=[0.2, 1.0, 0.2], + color=[0.8, 0.0, 0.0, 1.0], + ) + ] + ) + + world_coll_config = WorldCollisionConfig( + tensor_args, 
world_model=[world_config_1, world_config_2] + ) + + world_ccheck = WorldPrimitiveCollision(world_coll_config) + + x_sph = torch.zeros((2, 1, 1, 4), device=tensor_args.device, dtype=tensor_args.dtype) + x_sph[..., 3] = 0.1 + + query_buffer = CollisionQueryBuffer.initialize_from_shape( + x_sph.shape, tensor_args, world_ccheck.collision_types + ) + act_distance = tensor_args.to_device([0.0]) + + weight = tensor_args.to_device([1]) + + d = world_ccheck.get_sphere_distance( + x_sph, + query_buffer, + weight, + act_distance, + ) + + assert d[0] == 0.2 and d[1] == 0.2 + env_query_idx = torch.zeros((x_sph.shape[0]), device=tensor_args.device, dtype=torch.int32) + env_query_idx[1] = 1 + d = world_ccheck.get_sphere_distance( + x_sph, query_buffer, weight, act_distance, env_query_idx=env_query_idx + ) + + assert d[0] == 0.2 and d[1] == 0.0
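+ # with per-environment query indices, the second sphere is checked against the second world, whose cuboid is centered 1.0 m away in z and outside the activation distance, so its distance cost is zero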