From 1e26af791e5e8640bf9c39a32480c2c39dc8f62e Mon Sep 17 00:00:00 2001
From: Lorenzo Mambretti <lmambretti@ucdavis.edu>
Date: Thu, 16 Aug 2018 21:14:38 -0700
Subject: [PATCH 1/3] add act_with_exploration() method

This method allows using the explorer to alternate between random and greedy actions without training.
---
 chainerrl/agents/dqn.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py
index a59e5cad2..8419c79f6 100644
--- a/chainerrl/agents/dqn.py
+++ b/chainerrl/agents/dqn.py
@@ -352,13 +352,33 @@ def _compute_loss(self, exp_batch, gamma, errors_out=None):
             return compute_value_loss(y, t, clip_delta=self.clip_delta,
                                       batch_accumulator=self.batch_accumulator)
 
-    def act(self, obs):
+    def act(self, obs):
+        with chainer.using_config('train', False):
+            with chainer.no_backprop_mode():
+                action_value = self.model(
+                    self.batch_states([obs], self.xp, self.phi))
+                q = float(action_value.max.data)
+                action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+
+        # Update stats
+        self.average_q *= self.average_q_decay
+        self.average_q += (1 - self.average_q_decay) * q
+
+        self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)
+        return action
+
+    def act_with_exploration(self, obs):
+
         with chainer.using_config('train', False):
             with chainer.no_backprop_mode():
                 action_value = self.model(
                     self.batch_states([obs], self.xp, self.phi))
                 q = float(action_value.max.data)
-                action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+                greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+                action = self.explorer.select_action(
+                    self.t,
+                    lambda: greedy_action,
+                    action_value=action_value)
 
         # Update stats
         self.average_q *= self.average_q_decay

From 4dbc6d6e1306206d694e3a1013de6944778f3729 Mon Sep 17 00:00:00 2001
From: Lorenzo Mambretti <lmambretti@ucdavis.edu>
Date: Fri, 17 Aug 2018 01:15:33 -0700
Subject: [PATCH 2/3] corrected style to pass flake8

---
 chainerrl/agents/dqn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py
index 8419c79f6..a8c28e038 100644
--- a/chainerrl/agents/dqn.py
+++ b/chainerrl/agents/dqn.py
@@ -374,7 +374,8 @@ def act_with_exploration(self, obs):
                 action_value = self.model(
                     self.batch_states([obs], self.xp, self.phi))
                 q = float(action_value.max.data)
-                greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+                greedy_action = cuda.to_cpu(
+                    action_value.greedy_actions.data)[0]
                 action = self.explorer.select_action(
                     self.t,
                     lambda: greedy_action,

From 3db0754ec54c61020bd8109493701f12c99e249d Mon Sep 17 00:00:00 2001
From: Lorenzo Mambretti <lmambretti@ucdavis.edu>
Date: Fri, 17 Aug 2018 15:40:39 -0700
Subject: [PATCH 3/3] self.t update added

The missing self.t update may have been a source of the error when building chainer.
---
 chainerrl/agents/dqn.py | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py
index a8c28e038..fd8e454c9 100644
--- a/chainerrl/agents/dqn.py
+++ b/chainerrl/agents/dqn.py
@@ -352,21 +352,6 @@ def _compute_loss(self, exp_batch, gamma, errors_out=None):
             return compute_value_loss(y, t, clip_delta=self.clip_delta,
                                       batch_accumulator=self.batch_accumulator)
 
-    def act(self, obs):
-        with chainer.using_config('train', False):
-            with chainer.no_backprop_mode():
-                action_value = self.model(
-                    self.batch_states([obs], self.xp, self.phi))
-                q = float(action_value.max.data)
-                action = cuda.to_cpu(action_value.greedy_actions.data)[0]
-
-        # Update stats
-        self.average_q *= self.average_q_decay
-        self.average_q += (1 - self.average_q_decay) * q
-
-        self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)
-        return action
-
     def act_with_exploration(self, obs):
 
         with chainer.using_config('train', False):
@@ -374,18 +359,32 @@ def act_with_exploration(self, obs):
                 action_value = self.model(
                     self.batch_states([obs], self.xp, self.phi))
                 q = float(action_value.max.data)
-                greedy_action = cuda.to_cpu(
-                    action_value.greedy_actions.data)[0]
-                action = self.explorer.select_action(
-                    self.t,
+                greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+                
+        action = self.explorer.select_action(self.t,
                     lambda: greedy_action,
                     action_value=action_value)
-
+        self.t += 1
         # Update stats
         self.average_q *= self.average_q_decay
         self.average_q += (1 - self.average_q_decay) * q
 
         self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)
+        return action
+        
+    def act(self, obs):
+        with chainer.using_config('train', False):
+            with chainer.no_backprop_mode():
+                action_value = self.model(
+                    self.batch_states([obs], self.xp, self.phi))
+                q = float(action_value.max.data)
+                action = cuda.to_cpu(action_value.greedy_actions.data)[0]
+
+        # Update stats
+        self.average_q *= self.average_q_decay
+        self.average_q += (1 - self.average_q_decay) * q
+
+        self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value)
         return action
 
     def act_and_train(self, obs, reward):