def reward(self, agent, world):
    """Return the negative squared distance between the first agent's goals.

    The value is computed only from ``world.agents[0]``; the ``agent``
    argument is not used, so every caller receives the same reward for a
    given world state.

    Args:
        agent: The agent the reward is requested for (unused here).
        world: Object exposing ``agents``, a sequence whose first element
            carries ``goal_a`` and ``goal_b`` attributes, each having a
            ``state.p_pos`` position array.

    Returns:
        ``-sum((goal_a.state.p_pos - goal_b.state.p_pos) ** 2)``, i.e. zero
        when the two positions coincide and more negative as they separate.
    """
    # NOTE(review): world.agents[0] is the speaker, not the listener. The
    # "listener to landmark" distance is obtained through the goal references
    # stored on that agent -- presumably goal_a is the listener and goal_b is
    # the target landmark; confirm against the scenario's reset/setup code.
    speaker = world.agents[0]

    # Squared distance from listener to landmark.
    offset = speaker.goal_a.state.p_pos - speaker.goal_b.state.p_pos
    dist2 = np.sum(np.square(offset))
    return -dist2