import os

os.environ['OPENBLAS_NUM_THREADS'] = '1'

import math
import random

import numpy as np

from stable_baselines3 import DQN


def q_learning_augmented_state(env, network=None, lagrange_multiplier=0, num_episodes=25,
                               learning_rate=.05, exploration_rate=.25, exploration_decay=0.995,
                               episodes_with_no_exploration_rate=10):
    """
    Q-learning for reinforcement learning with an augmented state. The state must contain
    the Lagrange multiplier, i.e. something like
    {'x': x, 'e': e, 'lagrange_multiplier': lagrange_multiplier}.
    :param env: the environment to train on.
    :return: a list [network, average_discounted_reward, average_discounted_cost,
             average_average_reward, average_average_cost, policy].
    """
    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")

    if network is None:
        # use a DQN network to approximate the Q-values
        network = DQN('MlpPolicy', env, verbose=0, device='cpu', gamma=env.gamma)

    # train (or continue training) the network
    network.learn(total_timesteps=num_episodes * env.max_episodes_steps, log_interval=1)

    # the evaluation below is currently disabled; the averages are returned as placeholders
    # total_discounted_reward = []
    # total_discounted_cost = []
    # total_average_reward = []
    # total_average_cost = []

    # for episode in range(num_episodes):
    #     state = env.reset()
    #     done = False
    #     episode_discounted_reward = 0
    #     episode_discounted_cost = 0
    #     episode_average_reward = 0
    #     episode_average_cost = 0
    #
    #     for t in range(env.max_episodes_steps):
    #         if env._agents_energy < 0:
    #             action = 0
    #         else:
    #             action, _states = network.predict(state, deterministic=True)
    #
    #         next_state, reward, done, info = env.step(action)
    #         episode_discounted_reward += (env.gamma ** t) * reward
    #         episode_discounted_cost += (env.gamma ** t) * info['cost']
    #
    #         episode_average_reward += reward
    #         episode_average_cost += info['cost']
    #
    #         state = next_state
    #
    #     total_discounted_reward.append(episode_discounted_reward)
    #     total_discounted_cost.append(episode_discounted_cost)
    #     total_average_reward.append(episode_average_reward / (t + 1))
    #     total_average_cost.append(episode_average_cost / (t + 1))

    average_discounted_reward = 0  # np.mean(total_discounted_reward)
    average_discounted_cost = 0  # np.mean(total_discounted_cost)
    average_average_reward = 0  # np.mean(total_average_reward)
    average_average_cost = 0  # np.mean(total_average_cost)

    # now we compute the greedy policy for every (AoI, energy) pair
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            env._lagrange_multiplier = lagrange_multiplier
            env._agents_aoi = x
            env._agents_energy = e
            policy[x - 1, e] = network.predict(env._get_obs(), deterministic=True)[0]

    return [network, average_discounted_reward, average_discounted_cost,
            average_average_reward, average_average_cost, policy]
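

# Usage sketch (not part of the original module): how q_learning_augmented_state might be
# driven across two values of the Lagrange multiplier, reusing the DQN between calls.
# The environment construction and the multiplier values are assumptions; `env` must
# expose the attributes used above (gamma, M, B, max_episodes_steps, _get_obs, ...).
def demo_q_learning_augmented_state(env):
    # first call trains a fresh DQN for a hypothetical multiplier of 0.5
    network, _, _, _, _, policy_a = q_learning_augmented_state(
        env, lagrange_multiplier=0.5)
    # second call continues training the same network for a new multiplier
    _, _, _, _, _, policy_b = q_learning_augmented_state(
        env, network=network, lagrange_multiplier=1.0)
    return policy_a, policy_b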


# apply Q-learning to the single agent
def q_learning(env, q_table, num_episodes=25, learning_rate=.05, exploration_rate=.25,
               exploration_decay=0.995, episodes_with_no_exploration_rate=10):
    """
    Q-learning algorithm for reinforcement learning.
    :param env: the environment to train on.
    :return: a list [q_table, policy, mean discounted reward, mean discounted cost,
             mean average reward, mean average cost, mean discounted penalized reward].
    """
    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")
    non_exploration_action = 0
    non_exploration_offloading = 0

    # initialize the Q-table; the visit counts are always (re)initialized so that a
    # warm start with a supplied q_table does not fail
    if q_table is None:
        q_table = np.zeros((env.num_states, env.action_space_size))
    state_visits = np.zeros((env.num_states,))

    # edit the table so the forbidden actions are masked with +inf and argmin never
    # selects them: with negative energy, only action 0 is allowed
    for x in range(1, env.M + 1):
        for e in range(1, -env.min_energy):
            index = env.compute_state_index(x, -e)
            q_table[index, 1] = math.inf
            q_table[index, 2] = math.inf

    # action 0 is forbidden when the energy is at its maximum (e == env.B)
    for x in range(1, env.M + 1):
        e = env.B
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    # action 0 is also forbidden when the AoI is at its maximum (x == env.M)
    for e in range(1, env.B + 1):
        x = env.M
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    # initialize the statistics collected over the evaluated episodes
    discounted_rewards = []
    discounted_penalized_rewards = []
    average_rewards = []
    discounted_costs = []
    average_costs = []

    for episode in range(num_episodes):
        state = env.reset()
        state_visits[state['index']] += 1

        episode_discounted_reward = 0
        episode_discounted_penalized_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            # epsilon-greedy: explore with a probability that decays with the visit count
            if (random.uniform(0, 1) < exploration_rate / state_visits[state['index']] ** .75
                    and episode < num_episodes - episodes_with_no_exploration_rate):
                # depending on the state, the possible actions are limited
                if state['e'] < 0:
                    action = 0
                elif state['e'] == env.B or state['x'] == env.M:
                    action = np.random.choice([1, 2])
                else:
                    action = np.random.randint(0, env.action_space_size)
            else:
                non_exploration_action += 1
                if state['e'] < 0:
                    action = 0
                else:
                    action = np.argmin(q_table[state['index']])
                if action == 2:
                    non_exploration_offloading += 1

            # reward without the Lagrangian penalty, kept for bookkeeping
            reward_no_penalty = env.reward_function(state, action, training=False)
            next_state, reward, _, info = env.step(state, action, training=True)
            if info['cost'] > 1:
                # debug output: a per-step cost above 1 is unexpected
                print(env.lagrange_multiplier, reward, reward_no_penalty, info['cost'])

            if episode >= episodes_with_no_exploration_rate:
                # skip the earliest episodes when accumulating the statistics;
                # we also collect the reward without the penalty

                # accumulate the discounted reward and cost
                episode_discounted_reward += (env.gamma ** t) * reward_no_penalty
                episode_discounted_penalized_reward += (env.gamma ** t) * reward
                episode_discounted_cost += (env.gamma ** t) * info['cost']

                episode_average_reward += reward_no_penalty
                episode_average_cost += info['cost']

            # update the Q-value using the Bellman equation:
            # Q(s, a) <- Q(s, a) + alpha_t * (r + gamma * min_a' Q(s', a') - Q(s, a)),
            # with step size alpha_t = learning_rate / n(s)^0.75
            if state_visits[state['index']] <= 1:
                # first visit: we completely substitute with the new value
                q_table[state['index'], action] = reward + env.gamma * np.min(q_table[next_state['index']])
            else:
                q_table[state['index'], action] += (learning_rate / state_visits[state['index']] ** .75) * \
                    (reward + env.gamma * np.min(q_table[next_state['index']]) - q_table[state['index'], action])
            state = next_state
            state_visits[state['index']] += 1

        # decay the exploration rate once per episode
        exploration_rate *= exploration_decay
        if episode >= episodes_with_no_exploration_rate:
            discounted_rewards.append(episode_discounted_reward)
            discounted_penalized_rewards.append(episode_discounted_penalized_reward)
            discounted_costs.append(episode_discounted_cost)
            average_rewards.append(episode_average_reward / (t + 1))
            average_costs.append(episode_average_cost / (t + 1))

    # given the q_table, we can compute the greedy policy
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            index = env.compute_state_index(x, e)
            policy[x - 1, e] = q_table[index].argmin()
    # print("Average cost = ", non_exploration_offloading / non_exploration_action)

    return [q_table, policy, np.mean(discounted_rewards), np.mean(discounted_costs),
            np.mean(average_rewards), np.mean(average_costs), np.mean(discounted_penalized_rewards)]
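

# Usage sketch (not part of the original module): a typical tabular run, warm-starting a
# second run from the learned Q-table, then evaluating the greedy policy. It assumes
# `env` implements the interface used above: num_states, action_space_size,
# compute_state_index, dict states with keys 'x', 'e', 'index', and step(state, action, training).
def demo_q_learning(env):
    # first run initializes the Q-table from scratch
    q_table, policy, *stats = q_learning(env, q_table=None, num_episodes=50)
    # warm start: continue learning from the same table, e.g. after the environment's
    # Lagrange multiplier has been updated externally
    q_table, policy, *stats = q_learning(env, q_table=q_table, num_episodes=25)
    # evaluate the resulting greedy policy without exploration noise
    return evaluate_single_agent_policy(env, policy, num_episodes=100)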


def evaluate_single_agent_policy(env, policy, num_episodes=50, augmented_state=False):
    """
    Evaluate the policy of a single agent.
    :param env: the environment to evaluate on.
    :param policy: the policy to evaluate, in the form produced by Q-learning.
    :param num_episodes: the number of episodes over which to evaluate the policy.
    :return: the mean discounted reward, mean discounted cost, mean average reward
             and mean average cost over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if state['e'] < 0:
                action = 0
            else:
                action = int(policy[state['x'] - 1, state['e']])
            next_state, reward, done, info = env.step(state, action, training=False)
            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)
        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), \
        np.mean(total_average_reward), np.mean(total_average_cost)


def evaluate_single_agent_policy_augmented_state(env, policy, num_episodes=50):
    """
    Evaluate the policy of a single agent with the augmented state.
    :param env: the environment to evaluate on.
    :param policy: the policy to evaluate, in the form produced by Q-learning.
    :param num_episodes: the number of episodes over which to evaluate the policy.
    :return: the mean discounted reward, mean discounted cost, mean average reward
             and mean average cost over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if env._agents_energy < 0:
                action = 0
            else:
                action = int(policy[env._agents_aoi - 1, env._agents_energy])
            next_state, reward, done, info = env.step(action)
            # in the policy evaluation we do not consider the Lagrange multiplier,
            # so we remove the penalty that the environment added to the reward
            if action == 2:
                reward = reward - env.lagrange_multiplier * info['cost']

            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)
        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), \
        np.mean(total_average_reward), np.mean(total_average_cost)
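

# Usage sketch (not part of the original module): evaluating a policy produced by
# q_learning_augmented_state under a given multiplier. The multiplier value is
# hypothetical, and env.lagrange_multiplier is assumed to be settable here; the other
# attribute names follow the ones used above (_agents_aoi, _agents_energy).
def demo_evaluate_augmented(env, policy, lagrange_multiplier=0.5):
    # fix the multiplier so the environment's reward penalization is well defined
    env.lagrange_multiplier = lagrange_multiplier
    disc_reward, disc_cost, avg_reward, avg_cost = \
        evaluate_single_agent_policy_augmented_state(env, policy, num_episodes=100)
    return disc_reward, disc_cost, avg_reward, avg_cost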


def centralized_q_learning():
    # TODO: not implemented yet
    return 0