# Source: HACKMIT/rps_learning.py at master - KevinJiao/HACKMIT - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import sys
import random


class GameInfo(object):
	"""Record of a single rock-paper-scissors game.

	Attributes:
		pose_ai: the AI's pose, one of 'r', 'p' or 's'.
		pose_human: the human's pose, one of 'r', 'p' or 's'.
		outcome: the result, one of 'w', 'l' or 'd'.

	Every attribute starts out as the empty string, which stands for a
	game that does not exist.
	"""

	def __init__(self):
		# '' marks "no such game" for the poses and the outcome alike.
		self.pose_ai = self.pose_human = self.outcome = ''

	def __str__(self):
		"""Return the three fields concatenated: AI pose, human pose, outcome."""
		both_poses = self.pose_ai + self.pose_human
		return both_poses + self.outcome

class AIPlayer(object):
	"""Rock-paper-scissors player that learns which pose pays off.

	The player keeps a table (``q_matrix``) keyed by four-character strings:
	the previous game as ``<ai pose><human pose><outcome>`` followed by the
	pose the AI played next.  Each value is the running reward for that
	choice: +1 per win, -1 per loss, unchanged on a draw.

	Attributes:
		debug: when true, ``get_pose_choice`` prints the table and its decision.
		greedy_chance: a greedy pick is made only when a fresh random number
			in [0, 1) is <= this value (and a positive score exists).
		prev_games: history of played games, oldest first.
		q_matrix: dict mapping state+action keys to accumulated reward.
	"""

	def __init__(self):
		self.debug = True
		self.greedy_chance = 1
		self.prev_games = []
		self.q_matrix = dict()

	def random_pose(self):
		"""Return one of 'r', 'p', 's' chosen uniformly at random."""
		return random.choice("rps")

	def valid_state(self, state):
		"""Return True when ``state`` has the three-character state length."""
		return len(state) == 3 # <ai pose> <human pose> <outcome>

	def find_greedy_pose(self, q_matrix, current_state):
		"""Find the highest-scoring pose recorded for ``current_state``.

		Args:
			q_matrix: dict mapping ``state + action`` keys to scores.
			current_state: three-character state string, or '' when there
				is no previous game.

		Returns:
			A ``(score, pose)`` tuple.  ``(0, <random pose>)`` when the state
			is not valid or the table is empty.  When the table holds no
			entry for the state, the score is the smallest platform integer
			and the pose is random.
		"""
		if not self.valid_state(current_state) or not q_matrix:
			return 0, self.random_pose()

		# Sentinel below any real score, so the first matching entry wins.
		max_q = -sys.maxsize - 1
		best_pose = self.random_pose()

		# Scan the table that was passed in, one pass over its items.
		for key, q_value in q_matrix.items():
			if key[0:3] != current_state:
				continue # only look at entries for the current state

			if q_value > max_q:
				max_q = q_value
				best_pose = key[-1] # last character is the action

		return max_q, best_pose

	def get_pose_choice(self):
		"""Choose the AI's next pose.

		The state is the most recent game (or '' if none was played).  The
		greedy pose is used only when its score is positive and the random
		draw does not exceed ``greedy_chance``; otherwise a random pose is
		returned.
		"""
		state = str(self.prev_games[-1]) if self.prev_games else ''

		q_greedy, pose_greedy = self.find_greedy_pose(self.q_matrix, state)

		# Single-argument print calls give the same output on Python 2 and 3.
		if self.debug: print('Q Matrix:  ' + str(self.q_matrix))

		if random.random() <= self.greedy_chance and q_greedy > 0:
			# pick greedily
			if self.debug: print('picks greedy - q = ' + str(q_greedy))
			return pose_greedy

		# pick randomly
		pose = self.random_pose()
		if self.debug: print('picks random')
		return pose

	def update_with_game_outcome(self, pose_ai, pose_human, outcome):
		"""Record a finished game and reward the pose the AI chose.

		Args:
			pose_ai: pose the AI played ('r', 'p' or 's').
			pose_human: pose the human played ('r', 'p' or 's').
			outcome: 'w', 'l' or 'd' from the AI's point of view.

		The table is only updated when a previous game exists, because the
		key is built from the previous game plus ``pose_ai``.  The game is
		always appended to ``prev_games``.
		"""
		game = GameInfo()
		game.pose_ai = pose_ai
		game.pose_human = pose_human
		game.outcome = outcome

		state = str(self.prev_games[-1]) if self.prev_games else ''

		if self.valid_state(state):
			state_action_pair = state + game.pose_ai

			# reward for ai pose choice given previous game
			reward = {'w': 1, 'l': -1}.get(game.outcome, 0)

			# A missing entry starts at 0; a draw still creates the entry.
			self.q_matrix[state_action_pair] = self.q_matrix.get(state_action_pair, 0) + reward

		# save game to history
		self.prev_games.append(game)


def main():
	"""Play 50 games between the learning AI and a scripted human.

	The human cycles through a fixed pose pattern.  After every game the
	result is printed and fed back to the AI.  At the end the AI's share
	of the decided (non-draw) games is printed.
	"""
	ai = AIPlayer()

	wins_ai = 0
	wins_human = 0
	num_games = 50
	human_pattern = ['r', 's', 's', 'r', 'p', 'p', 's']

	# Maps each pose to the pose it defeats.
	beats = {'r': 's', 'p': 'r', 's': 'p'}

	for i in range(num_games):

		#-----------------------------------------
		# Determine poses

		# Wrap around using the pattern's own length, not a hard-coded 7.
		pose_human = human_pattern[i % len(human_pattern)]
		pose_ai = ai.get_pose_choice()

		#-----------------------------------------
		# Game outcome (from the AI's point of view)

		if pose_ai == pose_human:
			outcome = 'd'
		elif beats.get(pose_ai) == pose_human:
			outcome = 'w'
		else:
			outcome = 'l'

		ai.update_with_game_outcome(pose_ai, pose_human, outcome)

		# Single-argument print calls give the same output on Python 2 and 3.
		print('human: ' + pose_human + ' ai: ' + pose_ai + ' outcome: ' + 'ai win? ' + outcome)
		print('')

		if outcome == 'w': wins_ai += 1
		elif outcome == 'l': wins_human += 1

	decided_games = wins_ai + wins_human
	if decided_games == 0:
		# Every game was a draw; avoid dividing by zero.
		print('AI won 0% of won games (no game was won by either side)')
	else:
		print('AI won ' + str((float(wins_ai) / decided_games) * 100.0) + '% of won games')


#-----------------------------------------
# Entry Point
# Runs the simulation unconditionally at module level.
# NOTE(review): there is no `__name__` guard, so importing this module
# also plays the games and prints their output.
main()