From 1e26af791e5e8640bf9c39a32480c2c39dc8f62e Mon Sep 17 00:00:00 2001 From: Lorenzo Mambretti Date: Thu, 16 Aug 2018 21:14:38 -0700 Subject: [PATCH 1/3] add act_with_exploration() method. This method allows using the explorer to alternate between random and greedy actions without training. --- chainerrl/agents/dqn.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py index a59e5cad2..8419c79f6 100644 --- a/chainerrl/agents/dqn.py +++ b/chainerrl/agents/dqn.py @@ -352,13 +352,33 @@ def _compute_loss(self, exp_batch, gamma, errors_out=None): return compute_value_loss(y, t, clip_delta=self.clip_delta, batch_accumulator=self.batch_accumulator) - def act(self, obs): + def act(self, obs): + with chainer.using_config('train', False): + with chainer.no_backprop_mode(): + action_value = self.model( + self.batch_states([obs], self.xp, self.phi)) + q = float(action_value.max.data) + action = cuda.to_cpu(action_value.greedy_actions.data)[0] + + # Update stats + self.average_q *= self.average_q_decay + self.average_q += (1 - self.average_q_decay) * q + + self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value) + return action + + def act_with_exploration(self, obs): + with chainer.using_config('train', False): with chainer.no_backprop_mode(): action_value = self.model( self.batch_states([obs], self.xp, self.phi)) q = float(action_value.max.data) - action = cuda.to_cpu(action_value.greedy_actions.data)[0] + greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0] + action = self.explorer.select_action( + self.t, + lambda: greedy_action, + action_value=action_value) # Update stats self.average_q *= self.average_q_decay From 4dbc6d6e1306206d694e3a1013de6944778f3729 Mon Sep 17 00:00:00 2001 From: Lorenzo Mambretti Date: Fri, 17 Aug 2018 01:15:33 -0700 Subject: [PATCH 2/3] corrected style to pass flake8 --- chainerrl/agents/dqn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py index 8419c79f6..a8c28e038 100644 --- a/chainerrl/agents/dqn.py +++ b/chainerrl/agents/dqn.py @@ -374,7 +374,8 @@ def act_with_exploration(self, obs): action_value = self.model( self.batch_states([obs], self.xp, self.phi)) q = float(action_value.max.data) - greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0] + greedy_action = cuda.to_cpu( + action_value.greedy_actions.data)[0] action = self.explorer.select_action( self.t, lambda: greedy_action, From 3db0754ec54c61020bd8109493701f12c99e249d Mon Sep 17 00:00:00 2001 From: Lorenzo Mambretti Date: Fri, 17 Aug 2018 15:40:39 -0700 Subject: [PATCH 3/3] self.t update added; its absence may have been a possible source of error when building chainer --- chainerrl/agents/dqn.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/chainerrl/agents/dqn.py b/chainerrl/agents/dqn.py index a8c28e038..fd8e454c9 100644 --- a/chainerrl/agents/dqn.py +++ b/chainerrl/agents/dqn.py @@ -352,21 +352,6 @@ def _compute_loss(self, exp_batch, gamma, errors_out=None): return compute_value_loss(y, t, clip_delta=self.clip_delta, batch_accumulator=self.batch_accumulator) - def act(self, obs): - with chainer.using_config('train', False): - with chainer.no_backprop_mode(): - action_value = self.model( - self.batch_states([obs], self.xp, self.phi)) - q = float(action_value.max.data) - action = cuda.to_cpu(action_value.greedy_actions.data)[0] - - # Update stats - self.average_q *= self.average_q_decay - self.average_q += (1 - self.average_q_decay) * q - - self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value) - return action - def act_with_exploration(self, obs): with chainer.using_config('train', False): @@ -374,18 +359,32 @@ def act_with_exploration(self, obs): action_value = self.model( self.batch_states([obs], self.xp, self.phi)) q = float(action_value.max.data) - greedy_action = cuda.to_cpu( - 
action_value.greedy_actions.data)[0] - action = self.explorer.select_action( - self.t, + greedy_action = cuda.to_cpu(action_value.greedy_actions.data)[0] + + action = self.explorer.select_action(self.t, lambda: greedy_action, action_value=action_value) - + self.t += 1 # Update stats self.average_q *= self.average_q_decay self.average_q += (1 - self.average_q_decay) * q self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value) + return action + + def act(self, obs): + with chainer.using_config('train', False): + with chainer.no_backprop_mode(): + action_value = self.model( + self.batch_states([obs], self.xp, self.phi)) + q = float(action_value.max.data) + action = cuda.to_cpu(action_value.greedy_actions.data)[0] + + # Update stats + self.average_q *= self.average_q_decay + self.average_q += (1 - self.average_q_decay) * q + + self.logger.debug('t:%s q:%s action_value:%s', self.t, q, action_value) return action def act_and_train(self, obs, reward):