Commit 49057ea

Upload file for comparison RL algorithms
1 parent 35718e8 commit 49057ea

5 files changed

Lines changed: 2364 additions & 0 deletions

_RL_helper.py

Lines changed: 323 additions & 0 deletions
@@ -0,0 +1,323 @@
import os

# limit BLAS threads before numpy is imported
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import numpy as np
import math
import random

from stable_baselines3 import DQN


def q_learning_augmented_state(env, network=None, lagrange_multiplier=0, num_episodes=25, learning_rate=.05, exploration_rate=.25, exploration_decay=0.995, episodes_with_no_exploration_rate=10):
    """
    Q-learning algorithm for reinforcement learning with an augmented state. The state has to contain the Lagrange multiplier.
    :param env: The environment to train on.
    :return: an estimate of the cost given by the policy when choosing the action
    """

    # the state has to include the Lagrange multiplier, e.g. {'x': x, 'e': e, 'lagrange_multiplier': lagrange_multiplier}
    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")

    non_exploration_action = 0
    non_exploration_offloading = 0

    if network is None:
        # we use a DQN network to approximate the Q-values
        network = DQN('MlpPolicy', env, verbose=0, device='cpu', gamma=env.gamma)

    # train the network
    network.learn(total_timesteps=num_episodes * env.max_episodes_steps, log_interval=1)

    # then we evaluate the network
    # total_discounted_reward = []
    # total_discounted_cost = []
    # total_average_reward = []
    # total_average_cost = []

    # for episode in range(num_episodes):
    #     state = env.reset()
    #     done = False
    #     episode_discounted_reward = 0
    #     episode_discounted_cost = 0
    #     episode_average_reward = 0
    #     episode_average_cost = 0

    #     for t in range(env.max_episodes_steps):
    #         if env._agents_energy < 0:
    #             action = 0
    #         else:
    #             action, _states = network.predict(state, deterministic=True)
    #
    #         next_state, reward, done, info = env.step(action)
    #         print(state)
    #         episode_discounted_reward += (env.gamma ** t) * reward
    #         episode_discounted_cost += (env.gamma ** t) * info['cost']
    #
    #         episode_average_reward += reward
    #         episode_average_cost += info['cost']
    #
    #         state = next_state

    #     total_discounted_reward.append(episode_discounted_reward)
    #     total_discounted_cost.append(episode_discounted_cost)

    #     total_average_reward.append(episode_average_reward / (t + 1))
    #     total_average_cost.append(episode_average_cost / (t + 1))

    # the evaluation above is disabled, so the averages are reported as zero
    average_discounted_reward = 0  # np.mean(total_discounted_reward)
    average_discounted_cost = 0  # np.mean(total_discounted_cost)
    average_average_reward = 0  # np.mean(total_average_reward)
    average_average_cost = 0  # np.mean(total_average_cost)

    # now we compute the policy
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            env._lagrange_multiplier = lagrange_multiplier
            env._agents_aoi = x
            env._agents_energy = e
            policy[x - 1, e] = network.predict(env._get_obs(), deterministic=True)[0]

    return [network, average_discounted_reward, average_discounted_cost, average_average_reward, average_average_cost, policy]

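# Example usage (a sketch; assumes an augmented-state env exposing gamma,
# max_episodes_steps, M, B, _lagrange_multiplier, _agents_aoi, _agents_energy
# and _get_obs(), as referenced above):
#
#     network, disc_r, disc_c, avg_r, avg_c, policy = q_learning_augmented_state(
#         env, lagrange_multiplier=0.5, num_episodes=50)
#     # policy[x - 1, e] is the greedy action for AoI x and energy level e;
#     # passing `network` back in continues training from the learned weights.
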
# this applies Q-learning to the single agent
def q_learning(env, q_table, num_episodes=25, learning_rate=.05, exploration_rate=.25, exploration_decay=0.995, episodes_with_no_exploration_rate=10):
    """
    Q-learning algorithm for reinforcement learning.
    :param env: The environment to train on.
    :return: A list with the Q-table, the policy, and the averaged reward and cost statistics.
    """

    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")
    non_exploration_action = 0
    non_exploration_offloading = 0

    # Initialize the Q-table and the per-state visit counts
    if q_table is None:
        q_table = np.zeros((env.num_states, env.action_space_size))
    state_visits = np.zeros((env.num_states,))

    # edit the table to make sure we never choose the forbidden actions
    # (Q-values are minimized below, so math.inf marks an action as unavailable)
    for x in range(1, env.M + 1):
        for e in range(1, -env.min_energy):
            index = env.compute_state_index(x, -e)
            q_table[index, 1] = math.inf
            q_table[index, 2] = math.inf

    for x in range(1, env.M + 1):
        e = env.B
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    for e in range(1, env.B + 1):
        x = env.M
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    # Initialize the reward and cost statistics lists
    discounted_rewards = []
    discounted_penalized_rewards = []
    average_rewards = []
    discounted_costs = []
    average_costs = []

    for episode in range(num_episodes):
        state = env.reset()
        state_visits[state['index']] += 1

        # we need to compute the index of each state
        done = False
        total_reward = 0

        episode_discounted_reward = 0
        episode_discounted_penalized_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        exploration_rate *= exploration_decay

        for t in range(env.max_episodes_steps):
            # epsilon-greedy action selection; the exploration probability decays with the visit count of the current state
            if random.uniform(0, 1) < exploration_rate / state_visits[state['index']] ** .75 and episode < num_episodes - episodes_with_no_exploration_rate:
                # depending on the state, the possible actions are limited
                if state['e'] < 0:
                    action = 0
                elif state['e'] == env.B or state['x'] == env.M:
                    action = np.random.choice([1, 2])
                else:
                    action = np.random.randint(0, env.action_space_size)
            else:
                non_exploration_action += 1
                if state['e'] < 0:
                    action = 0
                else:
                    action = np.argmin(q_table[state['index']])
                if action == 2:
                    non_exploration_offloading += 1

            # we also collect the reward without the Lagrangian penalty
            reward_no_penalty = env.reward_function(state, action, training=False)
            next_state, reward, _, info = env.step(state, action, training=True)
            if info['cost'] > 1:
                print(env.lagrange_multiplier, reward, reward_no_penalty, info['cost'])

            if episode >= episodes_with_no_exploration_rate:
                # accumulate the discounted reward and cost
                episode_discounted_reward += (env.gamma ** t) * reward_no_penalty
                episode_discounted_penalized_reward += (env.gamma ** t) * reward
                episode_discounted_cost += (env.gamma ** t) * info['cost']

                episode_average_reward += reward_no_penalty
                episode_average_cost += info['cost']

            # Update the Q-value using the Bellman equation (Q-values are treated as costs, hence the min)
            if state_visits[state['index']] <= 0:
                # we completely substitute with the new value
                q_table[state['index'], action] = reward + env.gamma * np.min(q_table[next_state['index']])
            else:
                # the learning rate decays polynomially with the visit count of the current state
                q_table[state['index'], action] += (learning_rate / state_visits[state['index']] ** .75) * (reward + env.gamma * np.min(q_table[next_state['index']]) - q_table[state['index'], action])
            state = next_state
            state_visits[state['index']] += 1

        # rewards.append(total_reward)
        exploration_rate *= exploration_decay
        if episode >= episodes_with_no_exploration_rate:
            discounted_rewards.append(episode_discounted_reward)
            discounted_penalized_rewards.append(episode_discounted_penalized_reward)
            discounted_costs.append(episode_discounted_cost)
            average_rewards.append(episode_average_reward / (t + 1))
            average_costs.append(episode_average_cost / (t + 1))

    # given the q_table, we can compute the policy
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            index = env.compute_state_index(x, e)
            policy[x - 1, e] = q_table[index].argmin()
    # print("Average cost = ", non_exploration_offloading / non_exploration_action)

    return [q_table, policy, np.mean(discounted_rewards), np.mean(discounted_costs), np.mean(average_rewards), np.mean(average_costs), np.mean(discounted_penalized_rewards)]

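# Example usage (a sketch; assumes a tabular env exposing num_states,
# action_space_size, compute_state_index(x, e), M, B, min_energy, gamma,
# max_episodes_steps, reward_function and the env.step(state, action, training)
# API used above):
#
#     q_table, policy, disc_r, disc_c, avg_r, avg_c, disc_pen_r = q_learning(
#         env, q_table=None, num_episodes=100)
#     # a learned table can be passed back in to warm-start another run
#     q_table, policy, *stats = q_learning(env, q_table, num_episodes=50)
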
def evaluate_single_agent_policy(env, policy, num_episodes=50, augmented_state=False):
    """
    Evaluate the policy of a single agent.
    :param env: The environment to evaluate on.
    :param policy: The policy to evaluate, in the form of the policy array produced by Q-learning.
    :param num_episodes: The number of episodes over which to evaluate the policy.
    :return: The mean discounted and per-step average rewards and costs over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if state['e'] < 0:
                action = 0
            else:
                action = int(policy[state['x'] - 1, state['e']])
            next_state, reward, done, info = env.step(state, action, training=False)
            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)

        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), np.mean(total_average_reward), np.mean(total_average_cost)

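# Example usage (a sketch; `policy` is the (M, B + 1) action array returned
# by q_learning above):
#
#     q_table, policy, *stats = q_learning(env, None, num_episodes=100)
#     disc_r, disc_c, avg_r, avg_c = evaluate_single_agent_policy(env, policy, num_episodes=200)
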
def evaluate_single_agent_policy_augmented_state(env, policy, num_episodes=50):
    """
    Evaluate the policy of a single agent in the augmented-state environment.
    :param env: The environment to evaluate on.
    :param policy: The policy to evaluate, in the form of the policy array produced by Q-learning.
    :param num_episodes: The number of episodes over which to evaluate the policy.
    :return: The mean discounted and per-step average rewards and costs over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if env._agents_energy < 0:
                action = 0
            else:
                action = int(policy[env._agents_aoi - 1, env._agents_energy])
            next_state, reward, done, info = env.step(action)

            # in the policy evaluation we do not consider the Lagrange multiplier, so we do not penalize the cost
            if action == 2:
                reward = reward - env.lagrange_multiplier * info['cost']

            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            # print(state, action, reward, info['cost'])
            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)

        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), np.mean(total_average_reward), np.mean(total_average_cost)

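# Example usage (a sketch; pairs with q_learning_augmented_state above, whose
# last return value is the tabulated greedy policy of the trained DQN):
#
#     network, *_, policy = q_learning_augmented_state(env, lagrange_multiplier=0.5)
#     disc_r, disc_c, avg_r, avg_c = evaluate_single_agent_policy_augmented_state(env, policy)
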
def centralized_q_learning():
    # placeholder: the centralized variant is not implemented yet
    return 0
