
Commit be2d2d3

committed
adding the project files
0 parents (initial commit)

File tree

142 files changed, +52991 -0 lines changed


DDPG.py

Lines changed: 264 additions & 0 deletions
@@ -0,0 +1,264 @@
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

# Selecting which GPU the code should run on.
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# If a GPU is available use it, otherwise fall back to the CPU.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# if torch.cuda.is_available():
#     print("training on the NVIDIA GPU........")

# torch.cuda.empty_cache()

# Re-tuned version of Deep Deterministic Policy Gradients (DDPG)
# Paper: https://arxiv.org/abs/1509.02971

class Actor(nn.Module):

    """
    Actor class defines the neural-network structure of the actor part of the
    actor-critic framework and performs the forward pass.

    Arguments :
        state_dim (int) : number of observations the actor receives from the environment.
        action_dim (int) : number of actions the actor outputs to the environment.
    """

    def __init__(self, state_dim, action_dim):

        """
        __init__ initializes the actor network.

        Arguments :
            state_dim (int) : number of observations the actor receives from the environment.
            action_dim (int) : number of actions the actor outputs to the environment.

        Return :
            None
        """
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, action_dim)

    def forward(self, state):

        """
        forward takes the state and predicts the action, squashed to [-1, 1] by tanh.

        Arguments :
            state (array) : observations fed to the actor network.
        Return :
            action (tensor) : action predicted by the actor for this state.
        """

        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))

        return torch.tanh(self.l3(a))


class Critic(nn.Module):

    """
    Critic class defines the neural-network structure of the critic part of the
    actor-critic framework and performs the forward pass.
    """

    def __init__(self, state_dim, action_dim):

        """
        __init__ initializes the critic network.

        Arguments :
            state_dim (int) : number of observations the actor receives from the environment.
            action_dim (int) : number of actions the actor outputs to the environment.

        Return :
            None
        """
        super(Critic, self).__init__()

        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)


    def forward(self, state, action):

        """
        forward takes the (state, action) pair and predicts its Q-value.

        Arguments :
            state (array) : observations fed to the critic network.
            action (array) : output of the actor network.
        Return :
            Q-value (tensor) : predicted Q-value of the (state, action) pair.
        """
        q = F.relu(self.l1(torch.cat([state, action], 1)))
        q = F.relu(self.l2(q))
        return self.l3(q)


class DDPG(object):

    """
    DDPG class implements the DDPG algorithm.
    """


    def __init__(self, state_dim, action_dim, discount=0.99, tau=0.001):

        """
        __init__ is called when an object of the class is created.

        Arguments :
            state_dim (int) : number of observations the actor receives from the environment.
            action_dim (int) : number of actions the actor outputs to the environment.
            discount (float) : discount factor (gamma) used when updating the Q-value of (state, action).
            tau (float) : coefficient of the soft update of the target actor and critic.

        Return :
            None
        """

        self.actor = Actor(state_dim, action_dim).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.discount = discount
        self.tau = tau

        self.actor_loss_list = []
        self.critic_loss_list = []

    def select_action(self, state):

        """
        select_action takes a state as input and returns the action the agent takes in that state.

        Arguments :
            state (array) : state for which the action is required.
        Return :
            action (array) : action taken by the actor in this state.
        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=256):

        """
        train updates the actor and critic networks using data sampled from the replay buffer.

        Arguments :
            replay_buffer (object) : instance of the buffer.ReplayBuffer class where the interaction data is stored.
            batch_size (int) : number of samples drawn at random from the replay buffer to update the
                               actor and critic networks. The default value is 256.
        """

        ### Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        ### Compute the target Q value
        target_Q = self.critic_target(next_state, self.actor_target(next_state))
        target_Q = reward + (not_done * self.discount * target_Q).detach()

        ### Get current Q estimate
        current_Q = self.critic(state, action)

        ### Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        ### Append the critic loss to critic_loss_list
        self.critic_loss_list.append(critic_loss.cpu().data.numpy())

        ### Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Compute actor loss
        actor_loss = -self.critic(state, self.actor(state)).mean()

        ### Append the actor loss to actor_loss_list
        self.actor_loss_list.append(actor_loss.cpu().data.numpy())

        ### Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Soft-update the frozen target networks
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


    def save(self, dir, ep):

        """
        save stores the actor and critic models, their optimizers, and the loss histories.

        Arguments :
            dir (str) : directory in which the models and losses are saved.
            ep (int) : episode number at which the data is saved.
        Return :
            None
        """

        torch.save(self.critic.state_dict(), dir + "/model/_critic" + str(ep))
        torch.save(self.critic_optimizer.state_dict(), dir + "/model/_critic_optimizer" + str(ep))

        torch.save(self.actor.state_dict(), dir + "/model/_actor" + str(ep))
        torch.save(self.actor_optimizer.state_dict(), dir + "/model/_actor_optimizer" + str(ep))

        ac_loss = np.asarray(self.actor_loss_list)
        c_loss = np.asarray(self.critic_loss_list)

        np.savetxt(dir + "/mat/actor_loss" + str(ep) + ".csv", ac_loss, delimiter=',')
        np.savetxt(dir + "/mat/critic_loss" + str(ep) + ".csv", c_loss, delimiter=',')


    def load(self, dir, ep):

        """
        load restores the actor and critic models and their optimizers from disk.

        Arguments :
            dir (str) : directory from which the models are loaded.
            ep (int) : episode number at which the data was saved.
        Return :
            None
        """

        self.critic.load_state_dict(torch.load(dir + "/model/_critic" + str(ep)))
        self.critic_optimizer.load_state_dict(torch.load(dir + "/model/_critic_optimizer" + str(ep)))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(dir + "/model/_actor" + str(ep)))
        self.actor_optimizer.load_state_dict(torch.load(dir + "/model/_actor_optimizer" + str(ep)))
        self.actor_target = copy.deepcopy(self.actor)
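
For context, a minimal usage sketch of the class above follows; it is not part of the commit. The state/action dimensions, results directory, and episode number are placeholder assumptions.

# Hypothetical usage sketch (not part of this commit): create an agent,
# optionally restore a checkpoint, and query a deterministic action.
import numpy as np
from DDPG import DDPG

state_dim, action_dim = 24, 4            # placeholder sizes; use the real environment's values
agent = DDPG(state_dim, action_dim)
# agent.load("./results", ep=100)        # expects ./results/model/_actor100 etc. to exist

state = np.zeros(state_dim, dtype=np.float32)
action = agent.select_action(state)      # one action per dimension, each in [-1, 1]
print(action.shape)                      # -> (4,)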

__pycache__/DDPG.cpython-310.pyc (6.57 KB, binary file not shown)

__pycache__/buffer.cpython-310.pyc (1.79 KB, binary file not shown)

__pycache__/env.cpython-310.pyc (6.22 KB, binary file not shown)

Three further binary files (8.95 KB, 661 Bytes, 1.57 KB); file names not shown.

__pycache__/utils.cpython-310.pyc (6.12 KB, binary file not shown)

buffer.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
import numpy as np
import torch
import utils

class ReplayBuffer(object):

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        # Fixed-size circular buffer of transitions (state, action, next_state, reward, not_done).
        self.max_size = max_size
        self.ptr = 0
        self.size = 0

        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

        # Device onto which sampled tensors are moved; this should match the
        # device used by the networks in DDPG.py.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


    def add(self, state, action, next_state, reward, done):
        # Store one transition, overwriting the oldest entry once the buffer is full.
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)


    def sample(self, batch_size):
        # Draw a random minibatch of stored transitions and return them as tensors.
        ind = np.random.randint(0, self.size, size=batch_size)

        return (
            torch.FloatTensor(self.state[ind]).to(self.device),
            torch.FloatTensor(self.action[ind]).to(self.device),
            torch.FloatTensor(self.next_state[ind]).to(self.device),
            torch.FloatTensor(self.reward[ind]).to(self.device),
            torch.FloatTensor(self.not_done[ind]).to(self.device)
        )

    def save_buffer(self):
        # Dump the full buffer contents to a CSV file under the global data directory.
        buffer_data = np.concatenate([self.state, self.action, self.reward,
                                      self.next_state, self.not_done], axis=1)

        file = utils.global_dir + '/data/buffer_data.csv'
        np.savetxt(file, buffer_data, delimiter=',')
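
To show how the two files fit together, here is a minimal training-loop sketch; it is not part of the commit. It assumes a classic Gym-style environment (the environment name, noise scale, and timestep counts are placeholders) and rescales the actor's tanh output to the environment's action range.

# Hypothetical training-loop sketch (not part of this commit) wiring
# DDPG.DDPG and buffer.ReplayBuffer together with Gaussian exploration noise.
import numpy as np
import gym                                   # assumes the classic Gym API (4-tuple step)
from DDPG import DDPG
from buffer import ReplayBuffer

env = gym.make("Pendulum-v1")                # placeholder environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

agent = DDPG(state_dim, action_dim)
replay_buffer = ReplayBuffer(state_dim, action_dim)

state, done = env.reset(), False
for t in range(10000):                       # placeholder number of environment steps
    # Deterministic policy output plus exploration noise, clipped to [-1, 1].
    action = agent.select_action(np.array(state))
    action = (action + np.random.normal(0, 0.1, size=action_dim)).clip(-1, 1)

    next_state, reward, done, _ = env.step(max_action * action)
    replay_buffer.add(state, action, next_state, reward, float(done))
    state = next_state

    if t >= 1000:                            # start updates once the buffer holds some data
        agent.train(replay_buffer, batch_size=256)

    if done:
        state, done = env.reset(), False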
