Merge pull request #138 from utn-mi/136-read-depth-data-from-realsense-cam

juelg · web-flow · commit 0febbcc3ff0a · 2024-10-01T09:14:44.000+02:00
feat(sim): depth image data read out of sim and forwarded to frameset and camera env
diff --git a/python/rcsss/_core/sim.pyi b/python/rcsss/_core/sim.pyi
@@ -121,6 +121,8 @@ class FrameSet:
     @property
     def color_frames(self) -> dict[str, numpy.ndarray[M, numpy.dtype[numpy.uint8]]]: ...
     @property
+    def depth_frames(self) -> dict[str, numpy.ndarray[M, numpy.dtype[numpy.float32]]]: ...
+    @property
     def timestamp(self) -> float: ...
 
 class FrankaHand(rcsss._core.common.Gripper):
diff --git a/python/rcsss/camera/sim.py b/python/rcsss/camera/sim.py
@@ -75,12 +75,20 @@ def _cpp_to_python_frames(self, cpp_frameset: _FrameSet | None) -> FrameSet | No
         if cpp_frameset is None:
             return None
         frames: dict[str, Frame] = {}
-        for frame_name, cpp_frame in cpp_frameset.color_frames.items():
-            # copy, reshape and flip the frame
-            np_frame = np.copy(cpp_frame).reshape(self._cfg.resolution_height, self._cfg.resolution_width, 3)[::-1]
-            cameraframe = CameraFrame(color=DataFrame(data=np_frame, timestamp=cpp_frameset.timestamp))
+        # OpenGL read-back (mjr_readPixels) is bottom-up: flip color and depth alike so both stay pixel-aligned
+        height, width = self._cfg.resolution_height, self._cfg.resolution_width
+        c_frames_iter = cpp_frameset.color_frames.items()
+        d_frames_iter = cpp_frameset.depth_frames.items()
+        for (color_name, color_frame), (depth_name, depth_frame) in zip(c_frames_iter, d_frames_iter, strict=True):
+            assert color_name == depth_name
+            color_np_frame = np.copy(color_frame).reshape(height, width, 3)[::-1]
+            depth_np_frame = np.copy(depth_frame).reshape(height, width, 1)[::-1]
+            cameraframe = CameraFrame(
+                color=DataFrame(data=color_np_frame, timestamp=cpp_frameset.timestamp),
+                depth=DataFrame(data=depth_np_frame, timestamp=cpp_frameset.timestamp),
+            )
             frame = Frame(camera=cameraframe, avg_timestamp=cpp_frameset.timestamp)
-            frames[frame_name] = frame
+            frames[color_name] = frame
         return FrameSet(frames=frames, avg_timestamp=cpp_frameset.timestamp)
 
     @property
@@ -90,7 +98,7 @@ def config(self) -> SimCameraSetConfig:
     @property
     def camera_names(self) -> list[str]:
         """Should return a list of the activated human readable names of the cameras."""
-        return [camera.identifier for camera in self._cfg.cameras.values()]
+        return list(self._cfg.cameras.keys())
 
     @property
     def name_to_identifier(self) -> dict[str, str]:
diff --git a/python/rcsss/envs/base.py b/python/rcsss/envs/base.py
@@ -97,16 +97,19 @@ class GripperDictType(RCSpaceType):
 class CameraDictType(RCSpaceType):
     frames: dict[
         Annotated[str, "camera_names"],
-        Annotated[
-            np.ndarray,
-            # needs to be filled with values downstream
-            lambda height, width: gym.spaces.Box(
-                low=0,
-                high=255,
-                shape=(height, width, 3),
-                dtype=np.uint8,
-            ),
-            "frame",
+        dict[
+            Annotated[str, "camera_type"],  # "rgb" or "depth"
+            Annotated[
+                np.ndarray,
+                # needs to be filled with values downstream
+                lambda height, width, color_dim=3, dtype=np.uint8, low=0, high=255: gym.spaces.Box(
+                    low=low,
+                    high=high,
+                    shape=(height, width, color_dim),
+                    dtype=dtype,
+                ),
+                "frame",
+            ],
         ],
     ]
 
@@ -387,22 +390,46 @@ def action(self, action: dict[str, Any]) -> dict[str, Any]:
 
 
 class CameraSetWrapper(ActObsInfoWrapper):
-    def __init__(self, env, camera_set: BaseCameraSet):
+    RGB_KEY = "rgb"
+    DEPTH_KEY = "depth"
+
+    def __init__(self, env, camera_set: BaseCameraSet, include_depth: bool = False):
         super().__init__(env)
         self.unwrapped: FR3Env
         self.camera_set = camera_set
+        self.include_depth = include_depth
 
         self.observation_space: gym.spaces.Dict
+        # rgb is always included
+        params: dict = {
+            "frame": {
+                "height": camera_set.config.resolution_height,
+                "width": camera_set.config.resolution_width,
+            }
+        }
+        if self.include_depth:
+            # depth is optional
+            params.update(
+                {
+                    f"/{name}/{self.DEPTH_KEY}/frame": {
+                        "height": camera_set.config.resolution_height,
+                        "width": camera_set.config.resolution_width,
+                        "color_dim": 1,
+                        "dtype": np.float32,
+                        "low": 0.0,
+                        "high": 1.0,
+                    }
+                    for name in camera_set.camera_names
+                }
+            )
         self.observation_space.spaces.update(
             get_space(
                 CameraDictType,
-                child_dict_keys_to_unfold={"camera_names": camera_set.camera_names},
-                params={
-                    "frame": {
-                        "height": camera_set.config.resolution_height,
-                        "width": camera_set.config.resolution_height,
-                    }
+                child_dict_keys_to_unfold={
+                    "camera_names": camera_set.camera_names,
+                    "camera_type": [self.RGB_KEY, self.DEPTH_KEY] if self.include_depth else [self.RGB_KEY],
                 },
+                params=params,
             ).spaces
         )
         self.camera_key = get_space_keys(CameraDictType)[0]
@@ -419,11 +446,27 @@ def observation(self, observation: dict, info: dict[str, Any]) -> tuple[dict[str
             observation[self.camera_key] = {}
             info["camera_available"] = False
             return observation, info
-        assert frameset is not None, "No frame available."
-        color_frame_dict: dict[str, np.ndarray] = {
-            camera_name: frame.camera.color.data for camera_name, frame in frameset.frames.items()
+
+        def check_depth(depth):
+            if self.include_depth and depth is None:
+                msg = "Depth is not available in data but still requested."
+                raise ValueError(msg)
+            return self.include_depth
+
+        # rgb is always returned; depth is added only when requested (check_depth raises if it is then missing)
+        frame_dict: dict[str, dict[str, np.ndarray]] = {
+            camera_name: (
+                {
+                    self.RGB_KEY: frame.camera.color.data,
+                    self.DEPTH_KEY: frame.camera.depth.data,  # type: ignore
+                }
+                if check_depth(frame.camera.depth)
+                # depth not requested: expose the color image only
+                else {self.RGB_KEY: frame.camera.color.data}
+            )
+            for camera_name, frame in frameset.frames.items()
+        }
-        observation[self.camera_key] = color_frame_dict
+        observation[self.camera_key] = frame_dict
 
         info["camera_available"] = True
         if frameset.avg_timestamp is not None:
diff --git a/python/rcsss/envs/factories.py b/python/rcsss/envs/factories.py
@@ -138,7 +138,7 @@ def fr3_sim_env(
 
     if camera_set_cfg is not None:
         camera_set = SimCameraSet(simulation, camera_set_cfg)
-        env = CameraSetWrapper(env, camera_set)
+        env = CameraSetWrapper(env, camera_set, include_depth=True)
 
     if gripper_cfg is not None:
         gripper = sim.FrankaHand(simulation, "0", gripper_cfg)
diff --git a/src/pybind/rcsss.cpp b/src/pybind/rcsss.cpp
@@ -481,6 +481,7 @@ PYBIND11_MODULE(_core, m) {
   py::class_<rcs::sim::FrameSet>(sim, "FrameSet")
       .def(py::init<>())
       .def_readonly("color_frames", &rcs::sim::FrameSet::color_frames)
+      .def_readonly("depth_frames", &rcs::sim::FrameSet::depth_frames)
       .def_readonly("timestamp", &rcs::sim::FrameSet::timestamp);
   py::class_<rcs::sim::SimCameraSet>(sim, "SimCameraSet")
       .def(py::init<std::shared_ptr<rcs::sim::Sim>,
diff --git a/src/sim/camera.cpp b/src/sim/camera.cpp
@@ -77,9 +77,9 @@ void SimCameraSet::frame_callback(const std::string& id, mjrContext& ctx,
   int W = viewport.width;
   int H = viewport.height;
 
-  // allocate rgb buffers
+  // allocate rgb and depth buffers
   ColorFrame frame = ColorFrame::Zero(3 * W * H);
-
+  DepthFrame depth = DepthFrame::Zero(1 * W * H);
   // update abstract scene
   // TODO: we might be able to call this once for all cameras
   // there is also a mjv_updateCamera function
@@ -91,18 +91,22 @@ void SimCameraSet::frame_callback(const std::string& id, mjrContext& ctx,
   mjr_render(viewport, &scene, &ctx);
 
   // read rgb and depth buffers
-  mjr_readPixels(frame.data(), NULL, viewport, &ctx);
+  // depth documentation can be found here:
+  // https://registry.khronos.org/OpenGL-Refpages/gl4/html/glReadPixels.xhtml
+  mjr_readPixels(frame.data(), depth.data(), viewport, &ctx);
 
   auto ts = this->sim->d->time;
   std::lock_guard<std::mutex> lock(buffer_lock);
   // The following code assumes that all render callbacks for a timestep
   // happen directly after each other
   if (this->last_ts == ts) {
     buffer[buffer.size() - 1].color_frames[id] = frame;
+    buffer[buffer.size() - 1].depth_frames[id] = depth;
   } else {
     FrameSet fs;
     fs.timestamp = ts;
     fs.color_frames[id] = frame;
+    fs.depth_frames[id] = depth;
     buffer.push_back(fs);
     this->last_ts = ts;
   }
diff --git a/src/sim/camera.h b/src/sim/camera.h
@@ -36,9 +36,11 @@ struct SimCameraSetConfig {
 
 // (H,W,3)
 typedef Eigen::Matrix<unsigned char, Eigen::Dynamic, 1> ColorFrame;
+typedef Eigen::Matrix<float, Eigen::Dynamic, 1> DepthFrame;  // flat W*H floats, viewed as (H,W,1)
 
 struct FrameSet {
   std::unordered_map<std::string, ColorFrame> color_frames;
+  std::unordered_map<std::string, DepthFrame> depth_frames;
   mjtNum timestamp;
 };