From 143ebc713233f0e49fe4b771212edd05cf5bac7f Mon Sep 17 00:00:00 2001 From: Max Zuo Date: Mon, 2 Aug 2021 01:22:39 -0400 Subject: [PATCH] Faster Q value update for DDDQN - TF 2 Faster implementation for calculating q_target for training the DDDQN - in your video you mention the slow speed at which it runs. With this small change, it should run significantly faster. I've tested it using `np.array_equal` and it produces the same results as the original method. --- .../DeepQLearning/dueling_ddqn_tf2.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ReinforcementLearning/DeepQLearning/dueling_ddqn_tf2.py b/ReinforcementLearning/DeepQLearning/dueling_ddqn_tf2.py index ebc840f..bb54ef1 100644 --- a/ReinforcementLearning/DeepQLearning/dueling_ddqn_tf2.py +++ b/ReinforcementLearning/DeepQLearning/dueling_ddqn_tf2.py @@ -40,7 +40,7 @@ def __init__(self, max_size, input_shape): dtype=np.float32) self.action_memory = np.zeros(self.mem_size, dtype=np.int32) self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) - self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) + self.terminal_memory = np.zeros(self.mem_size, dtype=bool) #np.bool deprecated def store_transition(self, state, action, reward, state_, done): index = self.mem_cntr % self.mem_size @@ -118,12 +118,10 @@ def learn(self): q_target = q_pred.numpy() max_actions = tf.math.argmax(self.q_eval(states_), axis=1) - # improve on my solution! - for idx, terminal in enumerate(dones): - #if terminal: - #q_next[idx] = 0.0 - q_target[idx, actions[idx]] = rewards[idx] + \ - self.gamma*q_next[idx, max_actions[idx]]*(1-int(dones[idx])) + # faster numpy implementation: + q_target[np.arange(self.batch_size),actions] = rewards + self.gamma * \
 q_next.numpy()[np.arange(self.batch_size),max_actions] * (1-dones) + self.q_eval.train_on_batch(states, q_target) self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \