import os

os.environ['OPENBLAS_NUM_THREADS'] = '1'

import math
import random

import numpy as np

from stable_baselines3 import DQN


def q_learning_augmented_state(env, network=None, lagrange_multiplier=0, num_episodes=25,
                               learning_rate=.05, exploration_rate=.25, exploration_decay=0.995,
                               episodes_with_no_exploration_rate=10):
    """
    Q-learning for reinforcement learning with an augmented state. The state must contain
    the Lagrange multiplier, i.e. something like
    {'x': x, 'e': e, 'lagrange_multiplier': lagrange_multiplier}.
    :param env: the environment to train on.
    :return: a list [network, average_discounted_reward, average_discounted_cost,
             average_average_reward, average_average_cost, policy].
    """
    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")

    if network is None:
        # use a DQN network to approximate the Q-values
        network = DQN('MlpPolicy', env, verbose=0, device='cpu', gamma=env.gamma)

    # train (or continue training) the network
    network.learn(total_timesteps=num_episodes * env.max_episodes_steps, log_interval=1)

    # the evaluation below is currently disabled; the averages are returned as placeholders
    # total_discounted_reward = []
    # total_discounted_cost = []
    # total_average_reward = []
    # total_average_cost = []

    # for episode in range(num_episodes):
    #     state = env.reset()
    #     done = False
    #     episode_discounted_reward = 0
    #     episode_discounted_cost = 0
    #     episode_average_reward = 0
    #     episode_average_cost = 0
    #
    #     for t in range(env.max_episodes_steps):
    #         if env._agents_energy < 0:
    #             action = 0
    #         else:
    #             action, _states = network.predict(state, deterministic=True)
    #
    #         next_state, reward, done, info = env.step(action)
    #         episode_discounted_reward += (env.gamma ** t) * reward
    #         episode_discounted_cost += (env.gamma ** t) * info['cost']
    #
    #         episode_average_reward += reward
    #         episode_average_cost += info['cost']
    #
    #         state = next_state
    #
    #     total_discounted_reward.append(episode_discounted_reward)
    #     total_discounted_cost.append(episode_discounted_cost)
    #     total_average_reward.append(episode_average_reward / (t + 1))
    #     total_average_cost.append(episode_average_cost / (t + 1))

    average_discounted_reward = 0  # np.mean(total_discounted_reward)
    average_discounted_cost = 0  # np.mean(total_discounted_cost)
    average_average_reward = 0  # np.mean(total_average_reward)
    average_average_cost = 0  # np.mean(total_average_cost)

    # now we compute the greedy policy for every (AoI, energy) pair
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            env._lagrange_multiplier = lagrange_multiplier
            env._agents_aoi = x
            env._agents_energy = e
            policy[x - 1, e] = network.predict(env._get_obs(), deterministic=True)[0]

    return [network, average_discounted_reward, average_discounted_cost,
            average_average_reward, average_average_cost, policy]
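

# Usage sketch (not part of the original module): how q_learning_augmented_state might be
# driven across two values of the Lagrange multiplier, reusing the DQN between calls.
# The environment construction and the multiplier values are assumptions; `env` must
# expose the attributes used above (gamma, M, B, max_episodes_steps, _get_obs, ...).
def demo_q_learning_augmented_state(env):
    # first call trains a fresh DQN for a hypothetical multiplier of 0.5
    network, _, _, _, _, policy_a = q_learning_augmented_state(
        env, lagrange_multiplier=0.5)
    # second call continues training the same network for a new multiplier
    _, _, _, _, _, policy_b = q_learning_augmented_state(
        env, network=network, lagrange_multiplier=1.0)
    return policy_a, policy_b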


# apply Q-learning to the single agent
def q_learning(env, q_table, num_episodes=25, learning_rate=.05, exploration_rate=.25,
               exploration_decay=0.995, episodes_with_no_exploration_rate=10):
    """
    Q-learning algorithm for reinforcement learning.
    :param env: the environment to train on.
    :return: a list [q_table, policy, mean discounted reward, mean discounted cost,
             mean average reward, mean average cost, mean discounted penalized reward].
    """
    if episodes_with_no_exploration_rate > num_episodes:
        raise ValueError("episodes_with_no_exploration_rate must not exceed num_episodes")
    non_exploration_action = 0
    non_exploration_offloading = 0

    # initialize the Q-table; the visit counts are always (re)initialized so that a
    # warm start with a supplied q_table does not fail
    if q_table is None:
        q_table = np.zeros((env.num_states, env.action_space_size))
    state_visits = np.zeros((env.num_states,))

    # edit the table so the forbidden actions are masked with +inf and argmin never
    # selects them: with negative energy, only action 0 is allowed
    for x in range(1, env.M + 1):
        for e in range(1, -env.min_energy):
            index = env.compute_state_index(x, -e)
            q_table[index, 1] = math.inf
            q_table[index, 2] = math.inf

    # action 0 is forbidden when the energy is at its maximum (e == env.B)
    for x in range(1, env.M + 1):
        e = env.B
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    # action 0 is also forbidden when the AoI is at its maximum (x == env.M)
    for e in range(1, env.B + 1):
        x = env.M
        index = env.compute_state_index(x, e)
        q_table[index, 0] = math.inf

    # initialize the statistics collected over the evaluated episodes
    discounted_rewards = []
    discounted_penalized_rewards = []
    average_rewards = []
    discounted_costs = []
    average_costs = []

    for episode in range(num_episodes):
        state = env.reset()
        state_visits[state['index']] += 1

        episode_discounted_reward = 0
        episode_discounted_penalized_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            # epsilon-greedy: explore with a probability that decays with the visit count
            if (random.uniform(0, 1) < exploration_rate / state_visits[state['index']] ** .75
                    and episode < num_episodes - episodes_with_no_exploration_rate):
                # depending on the state, the possible actions are limited
                if state['e'] < 0:
                    action = 0
                elif state['e'] == env.B or state['x'] == env.M:
                    action = np.random.choice([1, 2])
                else:
                    action = np.random.randint(0, env.action_space_size)
            else:
                non_exploration_action += 1
                if state['e'] < 0:
                    action = 0
                else:
                    action = np.argmin(q_table[state['index']])
                if action == 2:
                    non_exploration_offloading += 1

            # reward without the Lagrangian penalty, kept for bookkeeping
            reward_no_penalty = env.reward_function(state, action, training=False)
            next_state, reward, _, info = env.step(state, action, training=True)
            if info['cost'] > 1:
                # debug output: a per-step cost above 1 is unexpected
                print(env.lagrange_multiplier, reward, reward_no_penalty, info['cost'])

            if episode >= episodes_with_no_exploration_rate:
                # skip the earliest episodes when accumulating the statistics;
                # we also collect the reward without the penalty

                # accumulate the discounted reward and cost
                episode_discounted_reward += (env.gamma ** t) * reward_no_penalty
                episode_discounted_penalized_reward += (env.gamma ** t) * reward
                episode_discounted_cost += (env.gamma ** t) * info['cost']

                episode_average_reward += reward_no_penalty
                episode_average_cost += info['cost']

            # update the Q-value using the Bellman equation:
            # Q(s, a) <- Q(s, a) + alpha_t * (r + gamma * min_a' Q(s', a') - Q(s, a)),
            # with step size alpha_t = learning_rate / n(s)^0.75
            if state_visits[state['index']] <= 1:
                # first visit: we completely substitute with the new value
                q_table[state['index'], action] = reward + env.gamma * np.min(q_table[next_state['index']])
            else:
                q_table[state['index'], action] += (learning_rate / state_visits[state['index']] ** .75) * \
                    (reward + env.gamma * np.min(q_table[next_state['index']]) - q_table[state['index'], action])
            state = next_state
            state_visits[state['index']] += 1

        # decay the exploration rate once per episode
        exploration_rate *= exploration_decay
        if episode >= episodes_with_no_exploration_rate:
            discounted_rewards.append(episode_discounted_reward)
            discounted_penalized_rewards.append(episode_discounted_penalized_reward)
            discounted_costs.append(episode_discounted_cost)
            average_rewards.append(episode_average_reward / (t + 1))
            average_costs.append(episode_average_cost / (t + 1))

    # given the q_table, we can compute the greedy policy
    policy = np.zeros((env.M, env.B + 1))
    for x in range(1, env.M + 1):
        for e in range(env.B + 1):
            index = env.compute_state_index(x, e)
            policy[x - 1, e] = q_table[index].argmin()
    # print("Average cost = ", non_exploration_offloading / non_exploration_action)

    return [q_table, policy, np.mean(discounted_rewards), np.mean(discounted_costs),
            np.mean(average_rewards), np.mean(average_costs), np.mean(discounted_penalized_rewards)]
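

# Usage sketch (not part of the original module): a typical tabular run, warm-starting a
# second run from the learned Q-table, then evaluating the greedy policy. It assumes
# `env` implements the interface used above: num_states, action_space_size,
# compute_state_index, dict states with keys 'x', 'e', 'index', and step(state, action, training).
def demo_q_learning(env):
    # first run initializes the Q-table from scratch
    q_table, policy, *stats = q_learning(env, q_table=None, num_episodes=50)
    # warm start: continue learning from the same table, e.g. after the environment's
    # Lagrange multiplier has been updated externally
    q_table, policy, *stats = q_learning(env, q_table=q_table, num_episodes=25)
    # evaluate the resulting greedy policy without exploration noise
    return evaluate_single_agent_policy(env, policy, num_episodes=100)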


def evaluate_single_agent_policy(env, policy, num_episodes=50, augmented_state=False):
    """
    Evaluate the policy of a single agent.
    :param env: the environment to evaluate on.
    :param policy: the policy to evaluate, in the form produced by Q-learning.
    :param num_episodes: the number of episodes over which to evaluate the policy.
    :return: the mean discounted reward, mean discounted cost, mean average reward
             and mean average cost over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if state['e'] < 0:
                action = 0
            else:
                action = int(policy[state['x'] - 1, state['e']])
            next_state, reward, done, info = env.step(state, action, training=False)
            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)
        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), \
        np.mean(total_average_reward), np.mean(total_average_cost)


def evaluate_single_agent_policy_augmented_state(env, policy, num_episodes=50):
    """
    Evaluate the policy of a single agent with the augmented state.
    :param env: the environment to evaluate on.
    :param policy: the policy to evaluate, in the form produced by Q-learning.
    :param num_episodes: the number of episodes over which to evaluate the policy.
    :return: the mean discounted reward, mean discounted cost, mean average reward
             and mean average cost over the episodes.
    """
    total_discounted_reward = []
    total_discounted_cost = []
    total_average_reward = []
    total_average_cost = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_discounted_reward = 0
        episode_discounted_cost = 0
        episode_average_reward = 0
        episode_average_cost = 0

        for t in range(env.max_episodes_steps):
            if env._agents_energy < 0:
                action = 0
            else:
                action = int(policy[env._agents_aoi - 1, env._agents_energy])
            next_state, reward, done, info = env.step(action)
            # in the policy evaluation we do not consider the Lagrange multiplier,
            # so we remove the penalty that the environment added to the reward
            if action == 2:
                reward = reward - env.lagrange_multiplier * info['cost']

            episode_discounted_reward += (env.gamma ** t) * reward
            episode_discounted_cost += (env.gamma ** t) * info['cost']

            episode_average_reward += reward
            episode_average_cost += info['cost']

            state = next_state

        total_discounted_reward.append(episode_discounted_reward)
        total_discounted_cost.append(episode_discounted_cost)
        total_average_reward.append(episode_average_reward / (t + 1))
        total_average_cost.append(episode_average_cost / (t + 1))

    return np.mean(total_discounted_reward), np.mean(total_discounted_cost), \
        np.mean(total_average_reward), np.mean(total_average_cost)
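

# Usage sketch (not part of the original module): evaluating a policy produced by
# q_learning_augmented_state under a given multiplier. The multiplier value is
# hypothetical, and env.lagrange_multiplier is assumed to be settable here; the other
# attribute names follow the ones used above (_agents_aoi, _agents_energy).
def demo_evaluate_augmented(env, policy, lagrange_multiplier=0.5):
    # fix the multiplier so the environment's reward penalization is well defined
    env.lagrange_multiplier = lagrange_multiplier
    disc_reward, disc_cost, avg_reward, avg_cost = \
        evaluate_single_agent_policy_augmented_state(env, policy, num_episodes=100)
    return disc_reward, disc_cost, avg_reward, avg_cost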


def centralized_q_learning():
    # TODO: not implemented yet
    return 0