-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy path SarsaLambdaLearner.py
More file actions
124 lines (104 loc) · 2.83 KB
/
SarsaLambdaLearner.py
File metadata and controls
124 lines (104 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
__author__ = 'philippe'
import World
import threading
import time
import random
# Might not be working properly, seems a bit slow to learn
# SARSA(lambda) hyperparameters.
lambda_ = 0.9
discount = 0.5
actions = World.actions

# Every (x, y) cell of the grid is a state.
states = [(i, j) for i in range(World.x) for j in range(World.y)]

# Q: optimistic initial action-values; e: eligibility traces (start at zero).
Q = {}
e = {}
for state in states:
    Q[state] = {action: 0.1 for action in actions}
    e[state] = {action: 0.0 for action in actions}
    for action in actions:
        World.set_cell_action_score(state, action, Q[state][action])

# Special (terminal) cells get their reward w as the fixed action-value.
for (i, j, c, w) in World.specials:
    for action in actions:
        Q[(i, j)][action] = w
        World.set_cell_action_score((i, j), action, w)
def do_action(action):
    """Apply `action` to the World and return the observed transition.

    Returns a (state, action, reward, next_state) tuple where the reward
    is the change in World.score caused by the move, or None when the
    action label is not one of the four known moves.
    """
    state_before = World.player
    score_before = World.score
    # Translate the action label into a (dx, dy) move request.
    if action == actions[0]:
        World.try_move(0, -1)
    elif action == actions[1]:
        World.try_move(0, 1)
    elif action == actions[2]:
        World.try_move(-1, 0)
    elif action == actions[3]:
        World.try_move(1, 0)
    else:
        return
    reward = World.score - score_before
    return state_before, action, reward, World.player
def policy(max_act, epsilon=0.1):
    """Epsilon-greedy action selection.

    With probability 1 - epsilon return the greedy action `max_act`;
    otherwise return a uniformly random *non-greedy* action.

    Args:
        max_act: the greedy (highest-Q) action for the current state.
        epsilon: exploration probability; defaults to the original
            hard-coded 0.1, so existing callers are unchanged.

    Returns:
        One of the entries of the module-level `actions` list.
    """
    if random.random() > epsilon:
        return max_act
    # Draw uniformly from the first len(actions)-1 actions; if the draw
    # happens to be the greedy action, substitute the last action.  This
    # yields a uniform distribution over the non-greedy actions.
    random_idx = random.randint(0, len(actions) - 2)
    if actions[random_idx] == max_act:
        return actions[len(actions) - 1]
    return actions[random_idx]
def max_Q(s):
    """Return the (action, value) pair with the highest Q-value in state s.

    Ties break toward the action iterated first; returns (None, None)
    when the state has no action entries.
    """
    if not Q[s]:
        return None, None
    best_act, best_val = max(Q[s].items(), key=lambda kv: kv[1])
    return best_act, best_val
def inc_Q(s, a, inc):
    """Add `inc` to Q[s][a] and push the updated value to the World display."""
    new_value = Q[s][a] + inc
    Q[s][a] = new_value
    World.set_cell_action_score(s, a, new_value)
def run():
    """Background SARSA(lambda) learning loop (runs forever on a thread).

    Each iteration: execute the current action, observe (s, a, r, s2),
    choose the next action epsilon-greedily, then apply the
    eligibility-trace update to every (state, action) pair.  On episode
    restart the traces are cleared and a fresh first action is chosen.
    """
    global discount
    time.sleep(1)  # give the World UI a moment to come up
    alpha = 1.0
    t = 1
    # First action of the first episode is uniformly random.
    act = actions[random.randint(0, len(actions) - 1)]
    while True:
        # Take the chosen action and observe the transition.
        # (The dead `s = World.player` that was immediately shadowed by
        # the unpack below has been removed.)
        (s, a, r, s2) = do_action(act)

        # SARSA: the TD target uses the action we will actually take next.
        max_act, max_val = max_Q(s2)
        next_act = policy(max_act)

        # TD error, then decay-and-apply the eligibility traces everywhere.
        delta = r + discount * Q[s2][next_act] - Q[s][a]
        e[s][a] += 1.0
        for state in states:
            for action in actions:
                inc_Q(state, action, alpha * delta * e[state][action])
                e[state][action] *= discount * lambda_

        t += 1.0
        if World.has_restarted():
            # Episode ended: reset the game, pick the first action of the
            # new episode from the new start state, and zero all traces.
            World.restart_game()
            time.sleep(0.01)
            t = 1.0
            max_act, max_val = max_Q(World.player)
            act = policy(max_act)
            for state in states:
                for action in actions:
                    e[state][action] = 0.0
        else:
            # BUG FIX: previously `act = next_act` ran unconditionally,
            # discarding the freshly chosen post-restart action above and
            # carrying the terminal-state successor action into the new
            # episode.  Only carry next_act forward when no restart occurred.
            act = next_act

        # Decaying learning rate.
        alpha = pow(t, -0.1)
        # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST.
        time.sleep(0.02)
# Run the learner on a daemon thread so the World UI keeps the main thread
# and the process exits cleanly when the window closes.
learner_thread = threading.Thread(target=run)
learner_thread.daemon = True
learner_thread.start()
World.start_game()