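"""Training entry point for the Pneuma game.

Parses the command-line arguments, optionally seeds the RNGs, creates the
checkpoint/figure directories, attaches an agent to every player, runs the
training episodes while recording per-agent metrics, plots those metrics,
and saves the final models before quitting the game.
"""
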
import os
import random

import numpy as np
import matplotlib.pyplot as plt
import torch as T
from tqdm import tqdm

import args
import folder_struct
import utils.seeds as seeds
import utils.metrics as metrics
from game import Pneuma


def main():
    parsed_args = args.parse_args()

    if not parsed_args.no_seed:
        seeds.set_seeds(parsed_args.seed)
        print(f"Seed set as {parsed_args.seed}")
    else:
        print("No seed set")

    chkpt_path, figure_path = folder_struct.setup_dirs()

    # Setup AI params
    n_episodes = parsed_args.n_episodes
    episode_length = parsed_args.ep_length
    n_agents = parsed_args.n_agents

    horizon = parsed_args.horizon
    no_training = parsed_args.no_training

    learnings_per_episode = int(episode_length / horizon)
    learn_iters = 0

    show_pygame = parsed_args.show_pg

    # Setup parameter monitoring
    score_history = np.zeros(
        shape=(parsed_args.n_agents, parsed_args.n_episodes))

    best_score = np.zeros(parsed_args.n_agents)

    actor_loss = np.zeros(shape=(parsed_args.n_agents,
                                 parsed_args.n_episodes))

    critic_loss = np.zeros(shape=(parsed_args.n_agents,
                                  parsed_args.n_episodes))

    total_loss = np.zeros(shape=(parsed_args.n_agents,
                                 parsed_args.n_episodes))

    entropy = np.zeros(shape=(parsed_args.n_agents,
                              parsed_args.n_episodes))

    advantage = np.zeros(shape=(parsed_args.n_agents,
                                parsed_args.n_episodes))

    time_alive = np.zeros(shape=(parsed_args.n_agents,
                                 parsed_args.n_episodes))

    game = Pneuma(show_pg=show_pygame, n_players=parsed_args.n_agents)

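    # Attach an agent to every player. The keyword arguments below are the
    # PPO-style hyperparameters (gamma, alpha, policy_clip, gae_lambda,
    # entropy_coef, ...) parsed from the command line.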
print("Initializing agents ...")
|
|
|
|
for player in tqdm(game.level.player_sprites,
|
|
|
|
dynamic_ncols=True):
|
|
|
|
player.setup_agent(
|
|
|
|
gamma=parsed_args.gamma,
|
|
|
|
alpha=parsed_args.alpha,
|
|
|
|
policy_clip=parsed_args.policy_clip,
|
|
|
|
batch_size=parsed_args.batch_size,
|
|
|
|
n_epochs=parsed_args.n_epochs,
|
|
|
|
gae_lambda=parsed_args.gae_lambda,
|
|
|
|
entropy_coef=parsed_args.entropy_coeff,
|
|
|
|
chkpt_dir=chkpt_path,
|
2024-02-13 10:48:24 +00:00
|
|
|
load=parsed_args.load
|
2024-02-10 17:11:28 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
    # Episodes start
    for episode in tqdm(range(n_episodes),
                        dynamic_ncols=True):

        game.level.reset()

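        # Per-episode buffers: one row per agent, one column per step/update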
        episode_reward = np.zeros(
            shape=(n_agents, episode_length))

        episode_actor_loss = np.zeros(
            shape=(n_agents, learnings_per_episode))

        episode_critic_loss = np.zeros(
            shape=(n_agents, learnings_per_episode))

        episode_total_loss = np.zeros(
            shape=(n_agents, learnings_per_episode))

        episode_entropy = np.zeros(
            shape=(n_agents, learnings_per_episode))

        episode_advantage = np.zeros(
            shape=(n_agents, learnings_per_episode))

        # Main game loop
        for step in tqdm(range(episode_length),
                         leave=False,
                         ascii=True,
                         dynamic_ncols=True):

            if not game.level.done:
                game.run()

                for player in game.level.player_sprites:

                    episode_reward[player.player_id][step] = player.reward

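                    # Learn every `horizon` steps (skipping step 0), or
                    # immediately when this player dies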
                    if not no_training and ((step % horizon == 0 and step != 0)
                                            or player.is_dead()):

                        player.agent.learn()

                        # Record this update's statistics for the agent
                        idx = learn_iters % learnings_per_episode
                        episode_actor_loss[player.player_id][idx] = player.agent.actor_loss
                        episode_critic_loss[player.player_id][idx] = player.agent.critic_loss
                        episode_total_loss[player.player_id][idx] = player.agent.total_loss
                        episode_entropy[player.player_id][idx] = player.agent.entropy
                        episode_advantage[player.player_id][idx] = player.agent.gae

                        learn_iters += 1

        # Gather information about the episode
        for player in game.level.player_sprites:

            score = np.mean(episode_reward[player.player_id])

            # Update score
            score_history[player.player_id][episode] = score

            # Update actor/critic loss (per-agent means over this episode's updates)
            actor_loss[player.player_id][episode] = np.mean(
                episode_actor_loss[player.player_id])

            critic_loss[player.player_id][episode] = np.mean(
                episode_critic_loss[player.player_id])

            total_loss[player.player_id][episode] = np.mean(
                episode_total_loss[player.player_id])

            time_alive[player.player_id][episode] = step

            entropy[player.player_id][episode] = np.mean(
                episode_entropy[player.player_id])

            advantage[player.player_id][episode] = np.mean(
                episode_advantage[player.player_id])

            # Check for new best score
            if score > best_score[player.player_id]:
                print(f"\nEpisode: {episode}"
                      f"\nNew best score for player {player.player_id}: {score}"
                      f"\nOld best score for player {player.player_id}: "
                      f"{best_score[player.player_id]}")

                best_score[player.player_id] = score

                print(f"Saving models for player {player.player_id}...")

                # Save models
                player.agent.save_models(
                    f"A{player.player_id}",
                    f"C{player.player_id}")

                print(f"Models saved to {chkpt_path}")

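    # Plot the recorded metrics for every agent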
    metrics.plot_learning_curve(score_history, parsed_args.n_agents,
                                figure_path, n_episodes)

    metrics.plot_score(score_history, parsed_args.n_agents, figure_path)

    metrics.plot_loss('actor', actor_loss, parsed_args.n_agents,
                      figure_path, n_episodes)

    metrics.plot_loss('critic', critic_loss, parsed_args.n_agents,
                      figure_path, n_episodes)

    metrics.plot_parameter('entropy', entropy, parsed_args.n_agents,
                           figure_path, n_episodes)

    metrics.plot_parameter('advantage', advantage, parsed_args.n_agents,
                           figure_path, n_episodes)

    metrics.plot_avg_time(time_alive, parsed_args.n_agents, figure_path)

    # End of training session
    print("End of episodes."
          "\nSaving models and exiting game...")

    # Save the final models for every agent
    for player in game.level.player_sprites:
        player.agent.save_models(
            f"A{player.player_id}_end",
            f"C{player.player_id}_end")

    print(f"Models saved to {chkpt_path}")

    game.quit()


if __name__ == '__main__':
    main()