|
3 | 3 | :external-requirements [] |
4 | 4 | :quarto {:author [:janwedekind] |
5 | 5 | :draft true |
6 | | - :description "A Clojure port of XinJingHao's PPO implementation using Pytorch and Quil" |
| 6 | + :description "A Clojure port of XinJingHao's PPO implementation using libpython-clj2, Pytorch, and Quil" |
7 | 7 | :image "pendulum.png" |
8 | 8 | :type :post |
9 | 9 | :date "2026-04-18" |
|
15 | 15 | [clojure.core.async :as async] |
16 | 16 | [quil.core :as q] |
17 | 17 | [quil.middleware :as m] |
18 | | - [libpython-clj2.require :refer (require-python)])) |
| 18 | + [libpython-clj2.require :refer (require-python)] |
| 19 | + [libpython-clj2.python :refer (py.) :as py])) |
19 | 20 |
|
20 | | -(require-python '[torch :as torch]) |
| 21 | +(require-python '[builtins :as python] |
| 22 | + '[torch :as torch] |
| 23 | + '[torch.nn :as nn] |
| 24 | + '[torch.nn.functional :as F] |
| 25 | + '[torch.optim :as optim] |
| 26 | + '[torch.distributions :refer (Beta)]) |
21 | 27 |
|
22 | 28 | ;; ## Motivation |
23 | 29 | ;; |
|
26 | 32 | ;; However I had stability issues. |
27 | 33 | ;; The algorithm would learn a strategy and then suddenly diverge again. |
28 | 34 | ;; |
29 | | -;; More recently (2017) the Proximal Policy Optimization (PPO) algorithm was published and it has gained in popularity. |
| 35 | +;; More recently (2017) the [Proximal Policy Optimization (PPO) algorithm was published](https://arxiv.org/abs/1707.06347) and it has gained in popularity. |
30 | 36 | ;; PPO is inspired by Trust Region Policy Optimization (TRPO) but is much easier to implement. |
| 37 | +;; Most importantly PPO can handle continuous observation and action spaces. |
31 | 38 | ;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library has an implementation of PPO, TRPO, and other reinforcement learning algorithms.
32 | 39 | ;; However I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/), which was easier to follow.
33 | 40 | ;; |
34 | 41 | ;; In order to use PPO with a simulation environment in Clojure and also in order to get a better understanding of PPO, I decided to do an implementation of PPO in Clojure.
35 | 42 | ;; |
36 | 43 | ;; ## Pendulum environment |
37 | 44 | ;; |
38 | | -;;  |
| 45 | +;;  |
39 | 46 | ;; |
40 | 47 | ;; First we implement a simple pendulum environment to test the PPO algorithm. |
41 | 48 | ;; In order to be able to switch environments, we define a protocol according to the environment abstract class used in OpenAI's gym. |
|
129 | 136 | (observation {:angle 0.0 :velocity 0.5} config) |
130 | 137 | (observation {:angle (/ PI 2) :velocity 0.0} config) |
131 | 138 |
|
| 139 | +;; Note that the observation needs to capture all information required for achieving the objective, because it is the only information available to the policy for deciding on the next action.
| 140 | + |
132 | 141 | ;; ### Action |
133 | 142 | ;; |
134 | 143 | ;; The action of a pendulum is a vector with one element between 0 and 1. |
|
257 | 266 | :on-close (fn [& _] (async/close! done-chan))) |
258 | 267 | (async/<!! done-chan)) |
259 | 268 | (System/exit 0)) |
| 269 | + |
| 270 | +;;  |
| 271 | + |
| 272 | +;; ## Neural networks |
| 273 | +;; |
| 274 | +;; PPO is a machine learning technique using backpropagation to learn the parameters of two neural networks. |
| 275 | +;; |
| 276 | +;; * The **actor** network takes an observation as an input and outputs the parameters of a probability distribution for sampling the next action to take. |
| 277 | +;; * The **critic** takes an observation as an input and outputs the expected cumulative reward for the current state. |
| 278 | +;; |
| 279 | +;; ### Pytorch |
| 280 | +;; |
| 281 | +;; For implementing the neural networks and backpropagation, I am using the Python-Clojure bridge [libpython-clj2](https://github.com/clj-python/libpython-clj) and [Pytorch](https://pytorch.org/). |
| 282 | +;; The Pytorch library is quite comprehensive, is free software, and you can find a lot of documentation on how to use it. |
| 283 | +;; The default version of [Pytorch on pypi.org](https://pypi.org/project/torch/) comes with CUDA (Nvidia) GPU support. |
| 284 | +;; There is also a [Pytorch wheel on AMD's website](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html#use-a-wheels-package) which comes with [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html) support. |
| 285 | +;; Here we are going to use a CPU version of Pytorch which is a much smaller install. |
| 286 | +;; |
| 287 | +;; You need to install [Python 3.10](https://www.python.org/) or later. |
| 288 | +;; For package management we are going to use the [uv](https://docs.astral.sh/uv/) package manager. |
| 289 | +;; The following *pyproject.toml* file is used to install Pytorch and NumPy. |
| 290 | +;; |
| 291 | +;; ```toml |
| 292 | +;; [project] |
| 293 | +;; name = "ppo" |
| 294 | +;; version = "0.1.0" |
| 295 | +;; description = "Proximal Policy Optimization" |
| 296 | +;; authors = [{ name="Jan Wedekind", email="jan@wedesoft.de" }] |
| 297 | +;; requires-python = ">=3.10.0" |
| 298 | +;; dependencies = [ |
| 299 | +;; "numpy", |
| 300 | +;; "torch", |
| 301 | +;; ] |
| 302 | +;; |
| 303 | +;; [tool.uv] |
| 304 | +;; python-preference = "only-system" |
| 305 | +;; |
| 306 | +;; [tool.uv.sources] |
| 307 | +;; torch = { index = "pytorch" } |
| 308 | +;; numpy = { index = "pytorch" } |
| 309 | +;; |
| 310 | +;; [[tool.uv.index]] |
| 311 | +;; name = "pytorch" |
| 312 | +;; url = "https://download.pytorch.org/whl/cpu" |
| 313 | +;; |
| 314 | +;; [build-system] |
| 315 | +;; requires = ["setuptools", "wheel"] |
| 316 | +;; build-backend = "setuptools.build_meta" |
| 317 | +;; ``` |
| 318 | +;; |
| 319 | +;; Note that we are specifying a custom repository index to get the CPU-only version of Pytorch. |
| 320 | +;; Also we are using the system version of Python to prevent *uv* from trying to install its own version which lacks the *\_cython* module. |
| 321 | +;; To freeze the dependencies and create a *uv.lock* file, you need to run |
| 322 | +;; |
| 323 | +;; ```bash |
| 324 | +;; uv lock |
| 325 | +;; ``` |
| 326 | +;; |
| 327 | +;; You can install the dependencies using |
| 328 | +;; ```bash |
| 329 | +;; uv sync |
| 330 | +;; ``` |
| 331 | +;; |
| 332 | +;; In order to access Pytorch from Clojure you need to run the `clj` command via `uv`: |
| 333 | +;; |
| 334 | +;; ```bash |
| 335 | +;; uv run clj |
| 336 | +;; ``` |
| 337 | +;; |
| 338 | +;; Now you should be able to import the Python modules using *require-python*. |
| 339 | +(require-python '[builtins :as python] |
| 340 | + '[torch :as torch] |
| 341 | + '[torch.nn :as nn] |
| 342 | + '[torch.nn.functional :as F] |
| 343 | + '[torch.optim :as optim] |
| 344 | + '[torch.distributions :refer (Beta)]) |
0 commit comments