From c70c5f2484ea70d673b6c7fe8eed6614bed02e19 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Wed, 18 Jul 2018 16:35:46 -0400 Subject: [PATCH 01/24] Update network-mpi.c Dan's Version of network-mpi.c with queue --- core/network-mpi.c | 191 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 164 insertions(+), 27 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 5ccf0adee..c38376cfc 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -14,13 +14,68 @@ struct act_q MPI_Request *req_list; int *idx_list; MPI_Status *status_list; + int *free_idx_list;//add, que of free indices + + #if ROSS_MEMORY char **buffers; #endif - unsigned int cur; + unsigned int cur; + int front;//add, front of queue + int coda;//add, back of queue but back is already a variable somewhere + int sizeOfQ;//add, size of queue array + int numInQ;//add, number of elements in queue + +// Deal with filling queue, then plateauing + }; +int deal_with_cur(struct act_q *q)// try this +{ + if(q->cur < (q->sizeOfQ-1)) + { + q->cur++; + return 1; + } + else + { + return 1; + } +} + + +int fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que +{ + if(*frontOrCoda != q->sizeOfQ)//don't mess with queue + { + return 0;// return probably not necessary + } + else//mess with queue + { + *frontOrCoda = 0; + return 0; + } +} + +void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue +{ + q->free_idx_list[q->coda] = ele; + q->coda++; + q->numInQ++; + fr_q_chq(q,&q->coda);//wraps the queue array around + +} + +int fr_q_dq(struct act_q *q) // free index queue; dequeue +{ + int rv =q->free_idx_list[q->front]; + q->front++; + q->numInQ--; + fr_q_chq(q,&q->front);// wraps the queue array around + + return rv; +} #define EVENT_TAG 1 #if ROSS_MEMORY @@ -101,7 +156,19 @@ init_q(struct act_q *q, const char *name) q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); - q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n+1);// queue, n+1 is meant to prevent a full queue + q->front = 0;// front of queue + q->coda = 0;// end of queue + q->sizeOfQ=n+1;// for wraparound + q->numInQ= 0;// number of elements in queue + + int i = 0; + while(ibuffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); @@ -207,7 +274,7 @@ tw_net_minimum(tw_pe *me) e = e->next; } - for (i = 0; i < posted_sends.cur; i++) { + for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) e = posted_sends.event_list[i]; if (m > e->recv_ts) m = e->recv_ts; @@ -228,7 +295,10 @@ test_q( char *tmp; #endif - if (!q->cur) +// if ( !q->cur || q->numInQ == ((q->sizeOfQ)-1) ) //fixed this line (?) if queue is full, no elements are being processed +// return 0; + + if( q->numInQ == ((q->sizeOfQ)-1) ) return 0; if (MPI_Testsome( @@ -254,6 +324,7 @@ test_q( n = q->idx_list[i]; e = q->event_list[n]; q->event_list[n] = NULL; + fr_q_aq(q,n);//add n onto queue #if ROSS_MEMORY finish(me, e, q->buffers[n]); @@ -263,7 +334,8 @@ test_q( } /* Collapse the lists to remove any holes we left. 
*/ - for (i = 0, n = 0; i < q->cur; i++) + /* + for (i = 0, n = 0; i < q->cur; i++)//fix these lines { if (q->event_list[i]) { @@ -288,8 +360,8 @@ test_q( n++; } // endif (q->event_list[i]) } - q->cur -= ready; - + q->cur -= ready;//fix this line + */ return 1; } @@ -303,14 +375,15 @@ recv_begin(tw_pe *me) int flag = 0; int changed = 0; - while (posted_recvs.cur < read_buffer) + while (0 < posted_recvs.numInQ)//fix these lines { - unsigned id = posted_recvs.cur; + + int id = fr_q_dq(&posted_recvs); if(!(e = tw_event_grab(me))) { if(tw_gvt_inprogress(me)) - tw_error(TW_LOC, "Out of events in GVT! Consider increasing --extramem"); + tw_error(TW_LOC, "out of events in GVT!"); return changed; } @@ -337,7 +410,8 @@ recv_begin(tw_pe *me) } posted_recvs.event_list[id] = e; - posted_recvs.cur++; + deal_with_cur(&posted_recvs); + // fixed in fr_q_dq //posted_recvs.cur++; //fix this line changed = 1; } @@ -348,7 +422,6 @@ static void recv_finish(tw_pe *me, tw_event *e, char * buffer) { tw_pe *dest_pe; - tw_clock start; #if ROSS_MEMORY tw_memory *memory; @@ -460,9 +533,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) /* Fast case, we are sending to our own PE and * there is no rollback caused by this send. */ - start = tw_clock_read(); tw_pq_enqueue(dest_pe->pq, e); - dest_pe->stats.s_pq += tw_clock_read() - start; return; } @@ -493,12 +564,13 @@ send_begin(tw_pe *me) { int changed = 0; - while (posted_sends.cur < send_buffer) + while (0 < posted_sends.numInQ)//fixed these line (hopefully) { - tw_event *e = tw_eventq_peek(&outq); + tw_event *e = tw_eventq_peek(&outq);//next event? tw_node *dest_node = NULL; - unsigned id = posted_sends.cur; + int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element + // posted_sends.cur; //fix this line #if ROSS_MEMORY tw_event *tmp_prev = NULL; @@ -609,7 +681,9 @@ send_begin(tw_pe *me) : TW_net_asend; posted_sends.event_list[id] = e; - posted_sends.cur++; + deal_with_cur(&posted_sends); + + // fixed in fr_q_dq //posted_sends.cur++;//fix this line me->s_nwhite_sent++; changed = 1; @@ -786,13 +860,31 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) if(MPI_Reduce(&(s->s_net_events), &me->stats.s_net_events, - 17, + 16, MPI_UNSIGNED_LONG_LONG, MPI_SUM, (int)g_tw_masternode, MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); + if(MPI_Reduce(&s->s_total, + &me->stats.s_total, + 8, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&s->s_pe_event_ties, + &me->stats.s_pe_event_ties, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + if(MPI_Reduce(&s->s_min_detected_offset, &me->stats.s_min_detected_offset, 1, @@ -802,24 +894,69 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); - if(MPI_Reduce(&(s->s_total), - &me->stats.s_total, - 16, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) + if(MPI_Reduce(&s->s_avl, + &me->stats.s_avl, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if (MPI_Reduce(&s->s_buddy, + &me->stats.s_buddy, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to 
reduce statistics!"); + + if (MPI_Reduce(&s->s_lz4, + &me->stats.s_lz4, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); if (MPI_Reduce(&s->s_events_past_end, &me->stats.s_events_past_end, - 3, + 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, (int)g_tw_masternode, MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); + if (MPI_Reduce(&g_st_stat_comp_ctr, + &me->stats.s_stat_comp, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if (MPI_Reduce(&g_st_stat_write_ctr, + &me->stats.s_stat_write, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_alp_nevent_processed), + &me->stats.s_alp_nevent_processed, + 2, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + #ifdef USE_RIO if (MPI_Reduce(&s->s_rio_load, &me->stats.s_rio_load, From 0d7cb73d622488704bbf39de202f814bc008819a Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Thu, 19 Jul 2018 23:43:45 -0400 Subject: [PATCH 02/24] Update network-mpi.c --- core/network-mpi.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index c38376cfc..9ce91cc5b 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -157,7 +157,8 @@ init_q(struct act_q *q, const char *name) q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); - q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n+1);// queue, n+1 is meant to prevent a full queue + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue q->front = 0;// front of queue q->coda = 0;// end of queue q->sizeOfQ=n+1;// for wraparound @@ -170,6 +171,15 @@ init_q(struct act_q *q, const char *name) i++; } +// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->sizeOfQ, q->numInQ,q->coda, q->front ); +// printf("dequeue twice, requeue those elements\n"); +// fr_q_dq(q); +// fr_q_dq(q); +// fr_q_aq(q,0); +// fr_q_aq(q,1); +// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->sizeOfQ, q->numInQ, q->coda, q->front ); +// printf("check: num in q = %d, size of q = %d\n",q->numInQ,q->sizeOfQ); + #if ROSS_MEMORY q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); From 98d21e18dfaf90107c436a39579000746ecc59f8 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Sat, 21 Jul 2018 18:35:41 -0400 Subject: [PATCH 03/24] Update network-mpi.c Changed camelCase to snake_case --- core/network-mpi.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 9ce91cc5b..211d057be 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -24,8 +24,8 @@ struct act_q unsigned int cur; int front;//add, front of queue int coda;//add, back of queue but back is already a variable somewhere - int 
sizeOfQ;//add, size of queue array - int numInQ;//add, number of elements in queue + int size_of_fr_q;//add, size of queue array + int num_in_fr_q;//add, number of elements in queue // Deal with filling queue, then plateauing @@ -33,7 +33,7 @@ struct act_q int deal_with_cur(struct act_q *q)// try this { - if(q->cur < (q->sizeOfQ-1)) + if(q->cur < (q->size_of_fr_q-1)) { q->cur++; return 1; @@ -47,7 +47,7 @@ int deal_with_cur(struct act_q *q)// try this int fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que { - if(*frontOrCoda != q->sizeOfQ)//don't mess with queue + if(*frontOrCoda != q->size_of_fr_q)//don't mess with queue { return 0;// return probably not necessary } @@ -62,7 +62,7 @@ void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue { q->free_idx_list[q->coda] = ele; q->coda++; - q->numInQ++; + q->num_in_fr_q++; fr_q_chq(q,&q->coda);//wraps the queue array around } @@ -71,7 +71,7 @@ int fr_q_dq(struct act_q *q) // free index queue; dequeue { int rv =q->free_idx_list[q->front]; q->front++; - q->numInQ--; + q->num_in_fr_q--; fr_q_chq(q,&q->front);// wraps the queue array around return rv; @@ -161,8 +161,8 @@ init_q(struct act_q *q, const char *name) q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue q->front = 0;// front of queue q->coda = 0;// end of queue - q->sizeOfQ=n+1;// for wraparound - q->numInQ= 0;// number of elements in queue + q->size_of_fr_q=n+1;// for wraparound + q->num_in_fr_q= 0;// number of elements in queue int i = 0; while(isizeOfQ, q->numInQ,q->coda, q->front ); +// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q,q->coda, q->front ); // printf("dequeue twice, requeue those elements\n"); // fr_q_dq(q); // fr_q_dq(q); // fr_q_aq(q,0); // fr_q_aq(q,1); -// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->sizeOfQ, q->numInQ, q->coda, q->front ); -// printf("check: num in q = %d, size of q = %d\n",q->numInQ,q->sizeOfQ); +// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q, q->coda, q->front ); +// printf("check: num in q = %d, size of q = %d\n",q->num_in_fr_q,q->size_of_fr_q); #if ROSS_MEMORY q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); @@ -305,10 +305,10 @@ test_q( char *tmp; #endif -// if ( !q->cur || q->numInQ == ((q->sizeOfQ)-1) ) //fixed this line (?) if queue is full, no elements are being processed +// if ( !q->cur || q->num_in_fr_q == ((q->size_of_fr_q)-1) ) //fixed this line (?) if queue is full, no elements are being processed // return 0; - if( q->numInQ == ((q->sizeOfQ)-1) ) + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) return 0; if (MPI_Testsome( @@ -385,7 +385,7 @@ recv_begin(tw_pe *me) int flag = 0; int changed = 0; - while (0 < posted_recvs.numInQ)//fix these lines + while (0 < posted_recvs.num_in_fr_q)//fix these lines { int id = fr_q_dq(&posted_recvs); @@ -574,7 +574,7 @@ send_begin(tw_pe *me) { int changed = 0; - while (0 < posted_sends.numInQ)//fixed these line (hopefully) + while (0 < posted_sends.num_in_fr_q)//fixed these line (hopefully) { tw_event *e = tw_eventq_peek(&outq);//next event? 
tw_node *dest_node = NULL; From 36de58781e646a3219636d1d9c07f841827043cd Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Sun, 22 Jul 2018 00:52:19 -0400 Subject: [PATCH 04/24] Update network-mpi.c variable names changed, id assigned after events are checked for null in sends, and e null check added in tw_net_minimum --- core/network-mpi.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 211d057be..00e2761b3 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -286,8 +286,12 @@ tw_net_minimum(tw_pe *me) for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) e = posted_sends.event_list[i]; - if (m > e->recv_ts) + if(e == NULL) + {} + else if(m > e->recv_ts) m = e->recv_ts; + else + {} } return m; @@ -388,7 +392,6 @@ recv_begin(tw_pe *me) while (0 < posted_recvs.num_in_fr_q)//fix these lines { - int id = fr_q_dq(&posted_recvs); if(!(e = tw_event_grab(me))) { @@ -396,6 +399,8 @@ recv_begin(tw_pe *me) tw_error(TW_LOC, "out of events in GVT!"); return changed; } + + int id = fr_q_dq(&posted_recvs); #if ROSS_MEMORY if( MPI_Irecv(posted_recvs.buffers[id], @@ -578,9 +583,7 @@ send_begin(tw_pe *me) { tw_event *e = tw_eventq_peek(&outq);//next event? tw_node *dest_node = NULL; - - int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element - // posted_sends.cur; //fix this line + // posted_sends.cur; //fixed this line #if ROSS_MEMORY tw_event *tmp_prev = NULL; @@ -602,7 +605,8 @@ send_begin(tw_pe *me) if(e == me->abort_event) tw_error(TW_LOC, "Sending abort event!"); - + + int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element dest_node = tw_net_onnode((*e->src_lp->type->map) ((tw_lpid) e->dest_lp)); From 4231e96633e574f147bdb66a4f11e3bee200dd27 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 24 Jul 2018 11:45:11 -0400 Subject: [PATCH 05/24] Temporary changes to recv_finish added NULL check before return --- core/network-mpi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 00e2761b3..002531908 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -491,12 +491,15 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) // MPI module lets me read cancel events during // event sends over the network. 
- cancel->state.cancel_q = 1; - cancel->state.remote = 0; - - cancel->cancel_next = dest_pe->cancel_q; - dest_pe->cancel_q = cancel; + if(e!=NULL) // Temporary, for performance testing + { + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + } + tw_event_free(me, e); return; From 4f21dc2c2ac223577f44fc8155239d769cbc20f5 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 24 Jul 2018 11:48:16 -0400 Subject: [PATCH 06/24] Temporary error change Commented out line 257 --- core/avl_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/avl_tree.c b/core/avl_tree.c index 6b12bd037..20c2d88a4 100644 --- a/core/avl_tree.c +++ b/core/avl_tree.c @@ -254,7 +254,7 @@ avlDelete(AvlTree *t, tw_event *key) AvlTree oldroot; if (*t == AVL_EMPTY) { - tw_error(TW_LOC, "We never look for non-existent events!"); +// tw_error(TW_LOC, "We never look for non-existent events!"); return target; } From d84c56bc0f89be6dde681d21be12ea55c52f4562 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 24 Jul 2018 17:27:09 -0400 Subject: [PATCH 07/24] Update network-mpi.c --- core/network-mpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 002531908..81549db31 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -491,7 +491,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) // MPI module lets me read cancel events during // event sends over the network. - if(e!=NULL) // Temporary, for performance testing + if(cancel!=NULL) // Temporary, for performance testing { cancel->state.cancel_q = 1; cancel->state.remote = 0; From 8ee4f477f98d352dcb06070177488cb9953634a9 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 7 Aug 2018 17:28:23 -0400 Subject: [PATCH 08/24] Update network-mpi.c Added line tw_clock start;, start = tw_clock_read();, and dest_pe->stats.s_pq += tw_clock_read() - start;. Changed error message to be more explicit, like in the master. Edited tw_net_statistics added to match master. --- core/network-mpi.c | 87 +++++++--------------------------------------- 1 file changed, 13 insertions(+), 74 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 81549db31..ba29a8d8d 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -396,7 +396,7 @@ recv_begin(tw_pe *me) if(!(e = tw_event_grab(me))) { if(tw_gvt_inprogress(me)) - tw_error(TW_LOC, "out of events in GVT!"); + tw_error(TW_LOC, "Out of events in GVT! Consider increasing --extramem"); return changed; } @@ -437,11 +437,11 @@ static void recv_finish(tw_pe *me, tw_event *e, char * buffer) { tw_pe *dest_pe; + tw_clock start; #if ROSS_MEMORY tw_memory *memory; tw_memory *last; - tw_fd mem_fd; size_t mem_size; @@ -551,7 +551,9 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) /* Fast case, we are sending to our own PE and * there is no rollback caused by this send. 
*/ + start = tw_clock_read(); tw_pq_enqueue(dest_pe->pq, e); + dest_pe->stats.s_pq += tw_clock_read() - start; return; } @@ -877,31 +879,13 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) if(MPI_Reduce(&(s->s_net_events), &me->stats.s_net_events, - 16, + 17, MPI_UNSIGNED_LONG_LONG, MPI_SUM, (int)g_tw_masternode, MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); - if(MPI_Reduce(&s->s_total, - &me->stats.s_total, - 8, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if(MPI_Reduce(&s->s_pe_event_ties, - &me->stats.s_pe_event_ties, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_SUM, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - if(MPI_Reduce(&s->s_min_detected_offset, &me->stats.s_min_detected_offset, 1, @@ -911,69 +895,24 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); - if(MPI_Reduce(&s->s_avl, - &me->stats.s_avl, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if (MPI_Reduce(&s->s_buddy, - &me->stats.s_buddy, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if (MPI_Reduce(&s->s_lz4, - &me->stats.s_lz4, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) + if(MPI_Reduce(&(s->s_total), + &me->stats.s_total, + 16, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); if (MPI_Reduce(&s->s_events_past_end, &me->stats.s_events_past_end, - 1, + 3, MPI_UNSIGNED_LONG_LONG, MPI_SUM, (int)g_tw_masternode, MPI_COMM_ROSS) != MPI_SUCCESS) tw_error(TW_LOC, "Unable to reduce statistics!"); - if (MPI_Reduce(&g_st_stat_comp_ctr, - &me->stats.s_stat_comp, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if (MPI_Reduce(&g_st_stat_write_ctr, - &me->stats.s_stat_write, - 1, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if(MPI_Reduce(&(s->s_alp_nevent_processed), - &me->stats.s_alp_nevent_processed, - 2, - MPI_UNSIGNED_LONG_LONG, - MPI_SUM, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - #ifdef USE_RIO if (MPI_Reduce(&s->s_rio_load, &me->stats.s_rio_load, From 39d24e3244fb0c1985ab66915e5d2e56aeb51237 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Wed, 8 Aug 2018 18:14:24 -0400 Subject: [PATCH 09/24] Update network-mpi.c Exchanged incrementing in the queueing function for an addition outside of the loops (wherever the function was used). 
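
For reference, the sketch below shows the free-index bookkeeping these patches converge on: a ring buffer of free request-slot indices with wraparound, where a dequeue decrements the element count immediately but completed slots are counted back in bulk after the MPI_Testsome loop. The standalone struct, the helper names fq_init/fq_put/fq_get, and the main() harness are illustrative assumptions for this note only, not the actual act_q code in network-mpi.c.

/* Minimal, self-contained sketch of the free-index ring buffer.
 * Field names (front, coda, size, count) follow the patch's
 * front/coda/size_of_fr_q/num_in_fr_q; everything else is a stand-in. */
#include <stdio.h>
#include <stdlib.h>

struct free_q {
    int *slots;   /* ring buffer of free request-slot indices      */
    int front;    /* next index to hand out (dequeue position)     */
    int coda;     /* next empty cell to fill (enqueue position)    */
    int size;     /* array length: n + 1, so full never wraps onto front */
    int count;    /* number of free slots currently stored         */
};

static void fq_init(struct free_q *q, int n)
{
    q->slots = malloc((n + 1) * sizeof(*q->slots));
    q->size  = n + 1;
    q->front = q->coda = 0;
    q->count = n;
    for (int i = 0; i < n; i++)        /* initially every slot 0..n-1 is free */
        q->slots[q->coda++] = i;
}

static void fq_put(struct free_q *q, int slot)    /* like fr_q_aq() */
{
    q->slots[q->coda++] = slot;
    if (q->coda == q->size)            /* wrap the ring around */
        q->coda = 0;
    /* count is bumped by the caller in bulk (count += ready), mirroring
     * the q->num_in_fr_q += ready done after the MPI_Testsome loop here */
}

static int fq_get(struct free_q *q)               /* like fr_q_dq() */
{
    int slot = q->slots[q->front++];
    q->count--;
    if (q->front == q->size)
        q->front = 0;
    return slot;
}

int main(void)
{
    struct free_q q;
    fq_init(&q, 4);                    /* pretend send_buffer == 4 */

    int a = fq_get(&q), b = fq_get(&q);
    printf("posted into slots %d and %d, %d free left\n", a, b, q.count);

    fq_put(&q, a);                     /* request in slot a completed */
    q.count += 1;                      /* bulk update, as in test_q() */
    printf("slot %d recycled, %d free left\n", a, q.count);

    free(q.slots);
    return 0;
}

Sizing the array at n + 1 keeps a completely full ring from ever making coda collide with front, which is the same reason free_idx_list is allocated with n+1 entries in init_q().
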
--- core/network-mpi.c | 977 +++++++++++++++++++++++---------------------- 1 file changed, 491 insertions(+), 486 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index ba29a8d8d..21c0b40d0 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -8,24 +8,24 @@ static long id_tmp; struct act_q { - const char *name; + const char *name; - tw_event **event_list; - MPI_Request *req_list; - int *idx_list; - MPI_Status *status_list; - int *free_idx_list;//add, que of free indices + tw_event **event_list; + MPI_Request *req_list; + int *idx_list; + MPI_Status *status_list; + int *free_idx_list;//add, que of free indices #if ROSS_MEMORY - char **buffers; + char **buffers; #endif - unsigned int cur; - int front;//add, front of queue - int coda;//add, back of queue but back is already a variable somewhere - int size_of_fr_q;//add, size of queue array - int num_in_fr_q;//add, number of elements in queue + unsigned int cur; + int front;//add, front of queue + int coda;//add, back of queue but back is already a variable somewhere + int size_of_fr_q;//add, size of queue array + int num_in_fr_q;//add, number of elements in queue // Deal with filling queue, then plateauing @@ -62,7 +62,7 @@ void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue { q->free_idx_list[q->coda] = ele; q->coda++; - q->num_in_fr_q++; +// q->num_in_fr_q++; //fixed in function fr_q_chq(q,&q->coda);//wraps the queue array around } @@ -93,16 +93,16 @@ static unsigned int send_buffer = 1024; static int world_size = 1; static const tw_optdef mpi_opts[] = { - TWOPT_GROUP("ROSS MPI Kernel"), - TWOPT_UINT( - "read-buffer", - read_buffer, - "network read buffer size in # of events"), - TWOPT_UINT( - "send-buffer", - send_buffer, - "network send buffer size in # of events"), - TWOPT_END() + TWOPT_GROUP("ROSS MPI Kernel"), + TWOPT_UINT( + "read-buffer", + read_buffer, + "network read buffer size in # of events"), + TWOPT_UINT( + "send-buffer", + send_buffer, + "network send buffer size in # of events"), + TWOPT_END() }; // Forward declarations of functions used in MPI network message processing @@ -114,62 +114,64 @@ static void send_finish(tw_pe *me, tw_event *e, char * buffer); // Start of implmentation of network processing routines/functions void tw_comm_set(MPI_Comm comm) { - MPI_COMM_ROSS = comm; - custom_communicator = 1; + MPI_COMM_ROSS = comm; + custom_communicator = 1; } const tw_optdef * tw_net_init(int *argc, char ***argv) { - int my_rank; + int my_rank; - int initialized; - MPI_Initialized(&initialized); + int initialized; + MPI_Initialized(&initialized); - if (!initialized) { - if (MPI_Init(argc, argv) != MPI_SUCCESS) - tw_error(TW_LOC, "MPI_Init failed."); - } - if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) != MPI_SUCCESS) - tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); + if (!initialized) { + if (MPI_Init(argc, argv) != MPI_SUCCESS) + tw_error(TW_LOC, "MPI_Init failed."); + } + if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); - g_tw_masternode = 0; - g_tw_mynode = my_rank; + g_tw_masternode = 0; + g_tw_mynode = my_rank; - return mpi_opts; + return mpi_opts; } static void init_q(struct act_q *q, const char *name) { - unsigned int n; + unsigned int n; #if ROSS_MEMORY - unsigned int i; + unsigned int i; #endif - if(q == &posted_sends) - n = send_buffer; - else - n = read_buffer; - - q->name = name; - q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); - q->req_list = (MPI_Request 
*) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); - q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); - q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); - q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); - q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue - q->front = 0;// front of queue - q->coda = 0;// end of queue - q->size_of_fr_q=n+1;// for wraparound - q->num_in_fr_q= 0;// number of elements in queue - - int i = 0; - while(iname = name; + q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); + q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); + q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue + q->front = 0;// front of queue + q->coda = 0;// end of queue + q->size_of_fr_q=n+1;// for wraparound + q->num_in_fr_q= 0;// number of elements in queue + + int i = 0; + while(inum_in_fr_q = n; // printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q,q->coda, q->front ); // printf("dequeue twice, requeue those elements\n"); @@ -181,7 +183,7 @@ init_q(struct act_q *q, const char *name) // printf("check: num in q = %d, size of q = %d\n",q->num_in_fr_q,q->size_of_fr_q); #if ROSS_MEMORY - q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); + q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); for(i = 0; i < n; i++) q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); @@ -189,221 +191,224 @@ init_q(struct act_q *q, const char *name) } tw_node * tw_net_onnode(tw_peid gid) { - id_tmp = gid; - return &id_tmp; + id_tmp = gid; + return &id_tmp; } unsigned int tw_nnodes(void) { - return world_size; + return world_size; } void tw_net_start(void) { - if (MPI_Comm_size(MPI_COMM_ROSS, &world_size) != MPI_SUCCESS) - tw_error(TW_LOC, "Cannot get MPI_Comm_size(MPI_COMM_ROSS)"); + if (MPI_Comm_size(MPI_COMM_ROSS, &world_size) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_size(MPI_COMM_ROSS)"); - if( g_tw_mynode == 0) + if( g_tw_mynode == 0) { - printf("tw_net_start: Found world size to be %d \n", world_size ); + printf("tw_net_start: Found world size to be %d \n", world_size ); } - // Check after tw_nnodes is defined - if(tw_nnodes() == 1 && g_tw_npe == 1) { - // force the setting of SEQUENTIAL protocol - if (g_tw_synchronization_protocol == NO_SYNCH) { - g_tw_synchronization_protocol = SEQUENTIAL; - } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { - g_tw_synchronization_protocol = SEQUENTIAL; - fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); - } - } + // Check after tw_nnodes is defined + if(tw_nnodes() == 1 && g_tw_npe == 1) { + // force the setting of SEQUENTIAL protocol + if (g_tw_synchronization_protocol == NO_SYNCH) { + g_tw_synchronization_protocol = SEQUENTIAL; + } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { + g_tw_synchronization_protocol = SEQUENTIAL; + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs 
defined.\n"); + } + } - tw_pe_create(1); - tw_pe_init(0, g_tw_mynode); + tw_pe_create(1); + tw_pe_init(0, g_tw_mynode); - //If we're in (some variation of) optimistic mode, we need this hash - if (g_tw_synchronization_protocol == OPTIMISTIC || - g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || - g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) { - g_tw_pe[0]->hash_t = tw_hash_create(); - } else { - g_tw_pe[0]->hash_t = NULL; - } + //If we're in (some variation of) optimistic mode, we need this hash + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) { + g_tw_pe[0]->hash_t = tw_hash_create(); + } else { + g_tw_pe[0]->hash_t = NULL; + } - if (send_buffer < 1) - tw_error(TW_LOC, "network send buffer must be >= 1"); - if (read_buffer < 1) - tw_error(TW_LOC, "network read buffer must be >= 1"); + if (send_buffer < 1) + tw_error(TW_LOC, "network send buffer must be >= 1"); + if (read_buffer < 1) + tw_error(TW_LOC, "network read buffer must be >= 1"); - init_q(&posted_sends, "MPI send queue"); - init_q(&posted_recvs, "MPI recv queue"); + init_q(&posted_sends, "MPI send queue"); + init_q(&posted_recvs, "MPI recv queue"); - g_tw_net_device_size = read_buffer; + g_tw_net_device_size = read_buffer; - // pre-post all the Irecv operations - recv_begin( g_tw_pe[0] ); + // pre-post all the Irecv operations + recv_begin( g_tw_pe[0] ); } void tw_net_abort(void) { - MPI_Abort(MPI_COMM_ROSS, 1); - exit(1); + MPI_Abort(MPI_COMM_ROSS, 1); + exit(1); } void tw_net_stop(void) { - if (!custom_communicator) { - if (MPI_Finalize() != MPI_SUCCESS) - tw_error(TW_LOC, "Failed to finalize MPI"); - } + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } } void tw_net_barrier(tw_pe * pe) { - if (MPI_Barrier(MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Failed to wait for MPI_Barrier"); + if (MPI_Barrier(MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to wait for MPI_Barrier"); } tw_stime tw_net_minimum(tw_pe *me) { - tw_stime m = DBL_MAX; - tw_event *e; - int i; - - e = outq.head; - while (e) { - if (m > e->recv_ts) - m = e->recv_ts; - e = e->next; - } - - for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) - e = posted_sends.event_list[i]; - if(e == NULL) - {} - else if(m > e->recv_ts) - m = e->recv_ts; - else - {} - } + tw_stime m = DBL_MAX; + tw_event *e; + int i; + + e = outq.head; + while (e) { + if (m > e->recv_ts) + m = e->recv_ts; + e = e->next; + } + + for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) + e = posted_sends.event_list[i]; + if(e == NULL) + {} + else if(m > e->recv_ts) + m = e->recv_ts; + else + {} + } - return m; + return m; } static int test_q( - struct act_q *q, - tw_pe *me, - void (*finish)(tw_pe *, tw_event *, char *)) + struct act_q *q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *)) { - int ready, i, n; + int ready, i, n; #if ROSS_MEMORY - char *tmp; + char *tmp; #endif // if ( !q->cur || q->num_in_fr_q == ((q->size_of_fr_q)-1) ) //fixed this line (?) 
if queue is full, no elements are being processed // return 0; - if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) - return 0; + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) + return 0; - if (MPI_Testsome( - q->cur, - q->req_list, - &ready, - q->idx_list, - q->status_list) != MPI_SUCCESS) { - tw_error( - TW_LOC, - "MPI_testsome failed with %u items in %s", - q->cur, - q->name); - } + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } - if (1 > ready) - return 0; - for (i = 0; i < ready; i++) + if (1 > ready) + return 0; + + for (i = 0; i < ready; i++) { - tw_event *e; + tw_event *e; - n = q->idx_list[i]; - e = q->event_list[n]; - q->event_list[n] = NULL; - fr_q_aq(q,n);//add n onto queue + n = q->idx_list[i]; + e = q->event_list[n]; + q->event_list[n] = NULL; + fr_q_aq(q,n);//add n onto queue #if ROSS_MEMORY - finish(me, e, q->buffers[n]); + finish(me, e, q->buffers[n]); #else - finish(me, e, NULL); + finish(me, e, NULL); #endif } - /* Collapse the lists to remove any holes we left. */ - /* - for (i = 0, n = 0; i < q->cur; i++)//fix these lines - { - if (q->event_list[i]) + q->num_in_fr_q+=ready; + + + /* Collapse the lists to remove any holes we left. */ + /* + for (i = 0, n = 0; i < q->cur; i++)//fix these lines { - if (i != n) + if (q->event_list[i]) { - // swap the event pointers - q->event_list[n] = q->event_list[i]; - - // copy the request handles - memcpy( - &q->req_list[n], - &q->req_list[i], - sizeof(q->req_list[0])); - -#if ROSS_MEMORY - // swap the buffers - tmp = q->buffers[n]; - q->buffers[n] = q->buffers[i]; - q->buffers[i] = tmp; -#endif - } // endif (i != n) - n++; - } // endif (q->event_list[i]) - } - q->cur -= ready;//fix this line - */ - return 1; + if (i != n) + { + // swap the event pointers + q->event_list[n] = q->event_list[i]; + // copy the request handles + memcpy( + &q->req_list[n], + &q->req_list[i], + sizeof(q->req_list[0])); + + #if ROSS_MEMORY + // swap the buffers + tmp = q->buffers[n]; + q->buffers[n] = q->buffers[i]; + q->buffers[i] = tmp; + #endif + } // endif (i != n) + n++; + } // endif (q->event_list[i]) + } + q->cur -= ready;//fix this line + */ + return 1; } static int recv_begin(tw_pe *me) { - MPI_Status status; + MPI_Status status; - tw_event *e = NULL; + tw_event *e = NULL; - int flag = 0; - int changed = 0; + int flag = 0; + int changed = 0; - while (0 < posted_recvs.num_in_fr_q)//fix these lines + while (0 < posted_recvs.num_in_fr_q)//fix these lines { - if(!(e = tw_event_grab(me))) - { - if(tw_gvt_inprogress(me)) - tw_error(TW_LOC, "Out of events in GVT! Consider increasing --extramem"); - return changed; - } - - int id = fr_q_dq(&posted_recvs); + if(!(e = tw_event_grab(me))) + { + if(tw_gvt_inprogress(me)) + tw_error(TW_LOC, "Out of events in GVT! 
Consider increasing --extramem"); + return changed; + } + + int id = fr_q_dq(&posted_recvs); #if ROSS_MEMORY - if( MPI_Irecv(posted_recvs.buffers[id], + if( MPI_Irecv(posted_recvs.buffers[id], EVENT_SIZE(e), MPI_BYTE, MPI_ANY_SOURCE, @@ -411,36 +416,36 @@ recv_begin(tw_pe *me) MPI_COMM_ROSS, &posted_recvs.req_list[id]) != MPI_SUCCESS) #else - if( MPI_Irecv(e, - (int)EVENT_SIZE(e), - MPI_BYTE, - MPI_ANY_SOURCE, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_recvs.req_list[id]) != MPI_SUCCESS) + if( MPI_Irecv(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) #endif - { - tw_event_free(me, e); - return changed; - } - - posted_recvs.event_list[id] = e; - deal_with_cur(&posted_recvs); - // fixed in fr_q_dq //posted_recvs.cur++; //fix this line - changed = 1; + { + tw_event_free(me, e); + return changed; + } + + posted_recvs.event_list[id] = e; + deal_with_cur(&posted_recvs); + // fixed in fr_q_dq //posted_recvs.cur++; //fix this line + changed = 1; } - return changed; + return changed; } static void recv_finish(tw_pe *me, tw_event *e, char * buffer) { - tw_pe *dest_pe; - tw_clock start; + tw_pe *dest_pe; + tw_clock start; #if ROSS_MEMORY - tw_memory *memory; + tw_memory *memory; tw_memory *last; tw_fd mem_fd; @@ -452,68 +457,68 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) position += g_tw_event_msg_sz; #endif - me->stats.s_nread_network++; - me->s_nwhite_recv++; + me->stats.s_nread_network++; + me->s_nwhite_recv++; - // printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", - // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + // printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", + // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); - dest_pe = e->dest_lp->pe; - // instrumentation - e->dest_lp->kp->kp_stats->s_nread_network++; - e->dest_lp->lp_stats->s_nread_network++; + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; + // instrumentation + e->dest_lp->kp->kp_stats->s_nread_network++; + e->dest_lp->lp_stats->s_nread_network++; - if(e->send_pe > tw_nnodes()-1) - tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); - e->cancel_next = NULL; - e->caused_by_me = NULL; - e->cause_next = NULL; + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; - if(e->recv_ts < me->GVT) - tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", - me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); - if(tw_gvt_inprogress(me)) - me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); - // if cancel event, retrieve and flush - // else, store in hash table - if(e->state.cancel_q) + // if cancel event, retrieve and flush + // else, store in hash table + if(e->state.cancel_q) { - tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); - // NOTE: it is possible to cancel the event we - // are currently processing at this PE since this - // MPI module lets me read 
cancel events during - // event sends over the network. + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. - if(cancel!=NULL) // Temporary, for performance testing - { - cancel->state.cancel_q = 1; - cancel->state.remote = 0; + if(cancel!=NULL) // Temporary, for performance testing + { + cancel->state.cancel_q = 1; + cancel->state.remote = 0; - cancel->cancel_next = dest_pe->cancel_q; - dest_pe->cancel_q = cancel; - } - - tw_event_free(me, e); + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + } + + tw_event_free(me, e); - return; + return; } - if (g_tw_synchronization_protocol == OPTIMISTIC || - g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || - g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { - tw_hash_insert(me->hash_t, e, e->send_pe); - e->state.remote = 1; - } + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { + tw_hash_insert(me->hash_t, e, e->send_pe); + e->state.remote = 1; + } #if ROSS_MEMORY - mem_size = (size_t) e->memory; + mem_size = (size_t) e->memory; mem_fd = (tw_fd) e->prev; last = NULL; @@ -539,59 +544,59 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) } #endif - /* NOTE: the final check in the if conditional below was added to make sure - * that we do not execute the fast case unless the cancellation queue is - * empty on the destination PE. Otherwise we need to invoke the normal - * scheduling routines to make sure that a forward event doesn't bypass a - * cancellation event with an earlier timestamp. This is helpful for - * stateful models that produce incorrect results when presented with - * duplicate messages with no rollback between them. - */ - if(me == dest_pe && e->dest_lp->kp->last_time <= e->recv_ts && !dest_pe->cancel_q) { - /* Fast case, we are sending to our own PE and - * there is no rollback caused by this send. + /* NOTE: the final check in the if conditional below was added to make sure + * that we do not execute the fast case unless the cancellation queue is + * empty on the destination PE. Otherwise we need to invoke the normal + * scheduling routines to make sure that a forward event doesn't bypass a + * cancellation event with an earlier timestamp. This is helpful for + * stateful models that produce incorrect results when presented with + * duplicate messages with no rollback between them. */ - start = tw_clock_read(); - tw_pq_enqueue(dest_pe->pq, e); - dest_pe->stats.s_pq += tw_clock_read() - start; - return; - } - - if (me->node == dest_pe->node) { - /* Slower, but still local send, so put into top - * of dest_pe->event_q. + if(me == dest_pe && e->dest_lp->kp->last_time <= e->recv_ts && !dest_pe->cancel_q) { + /* Fast case, we are sending to our own PE and + * there is no rollback caused by this send. + */ + start = tw_clock_read(); + tw_pq_enqueue(dest_pe->pq, e); + dest_pe->stats.s_pq += tw_clock_read() - start; + return; + } + + if (me->node == dest_pe->node) { + /* Slower, but still local send, so put into top + * of dest_pe->event_q. + */ + e->state.owner = TW_pe_event_q; + tw_eventq_push(&dest_pe->event_q, e); + return; + } + + /* Never should happen; MPI should have gotten the + * message to the correct node without needing us + * to redirect the message there for it. 
This is + * probably a serious bug with the event headers + * not being formatted right. */ - e->state.owner = TW_pe_event_q; - tw_eventq_push(&dest_pe->event_q, e); - return; - } - - /* Never should happen; MPI should have gotten the - * message to the correct node without needing us - * to redirect the message there for it. This is - * probably a serious bug with the event headers - * not being formatted right. - */ - tw_error( - TW_LOC, - "Event recived by PE %u but meant for PE %u", - me->id, - dest_pe->id); + tw_error( + TW_LOC, + "Event recived by PE %u but meant for PE %u", + me->id, + dest_pe->id); } static int send_begin(tw_pe *me) { - int changed = 0; + int changed = 0; - while (0 < posted_sends.num_in_fr_q)//fixed these line (hopefully) + while (0 < posted_sends.num_in_fr_q)//fixed these line (hopefully) { - tw_event *e = tw_eventq_peek(&outq);//next event? - tw_node *dest_node = NULL; - // posted_sends.cur; //fixed this line + tw_event *e = tw_eventq_peek(&outq);//next event? + tw_node *dest_node = NULL; + // posted_sends.cur; //fixed this line #if ROSS_MEMORY - tw_event *tmp_prev = NULL; + tw_event *tmp_prev = NULL; tw_lp *tmp_lp = NULL; @@ -605,24 +610,24 @@ send_begin(tw_pe *me) unsigned position = 0; #endif - if (!e) - break; + if (!e) + break; + + if(e == me->abort_event) + tw_error(TW_LOC, "Sending abort event!"); - if(e == me->abort_event) - tw_error(TW_LOC, "Sending abort event!"); - - int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element - dest_node = tw_net_onnode((*e->src_lp->type->map) - ((tw_lpid) e->dest_lp)); + int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element + dest_node = tw_net_onnode((*e->src_lp->type->map) + ((tw_lpid) e->dest_lp)); - //if(!e->state.cancel_q) - //e->event_id = (tw_eventid) ++me->seq_num; + //if(!e->state.cancel_q) + //e->event_id = (tw_eventid) ++me->seq_num; - e->send_pe = (tw_peid) g_tw_mynode; - e->send_lp = e->src_lp->gid; + e->send_pe = (tw_peid) g_tw_mynode; + e->send_lp = e->src_lp->gid; #if ROSS_MEMORY - // pack pointers + // pack pointers tmp_prev = e->prev; tmp_lp = e->src_lp; @@ -683,96 +688,96 @@ send_begin(tw_pe *me) return changed; } #else - if (MPI_Isend(e, - (int)EVENT_SIZE(e), - MPI_BYTE, - (int)*dest_node, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_sends.req_list[id]) != MPI_SUCCESS) { - return changed; - } + if (MPI_Isend(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + (int)*dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } #endif - tw_eventq_pop(&outq); - e->state.owner = e->state.cancel_q - ? TW_net_acancel - : TW_net_asend; + tw_eventq_pop(&outq); + e->state.owner = e->state.cancel_q + ? TW_net_acancel + : TW_net_asend; - posted_sends.event_list[id] = e; - deal_with_cur(&posted_sends); + posted_sends.event_list[id] = e; + deal_with_cur(&posted_sends); - // fixed in fr_q_dq //posted_sends.cur++;//fix this line - me->s_nwhite_sent++; + // fixed in fr_q_dq //posted_sends.cur++;//fix this line + me->s_nwhite_sent++; - changed = 1; + changed = 1; } - return changed; + return changed; } static void send_finish(tw_pe *me, tw_event *e, char * buffer) { - me->stats.s_nsend_network++; - // instrumentation - e->src_lp->kp->kp_stats->s_nsend_network++; - e->src_lp->lp_stats->s_nsend_network++; - - if (e->state.owner == TW_net_asend) { - if (e->state.cancel_asend) { - /* Event was cancelled during transmission. We must - * send another message to pass the cancel flag to - * the other node. 
- */ - e->state.cancel_asend = 0; - e->state.cancel_q = 1; - tw_eventq_push(&outq, e); - } else { - /* Event finished transmission and was not cancelled. - * Add to our sent event queue so we can retain the - * event in case we need to cancel it later. Note it - * is currently in remote format and must be converted - * back to local format for fossil collection. - */ - e->state.owner = TW_pe_sevent_q; - if( g_tw_synchronization_protocol == CONSERVATIVE ) - tw_event_free(me, e); + me->stats.s_nsend_network++; + // instrumentation + e->src_lp->kp->kp_stats->s_nsend_network++; + e->src_lp->lp_stats->s_nsend_network++; + + if (e->state.owner == TW_net_asend) { + if (e->state.cancel_asend) { + /* Event was cancelled during transmission. We must + * send another message to pass the cancel flag to + * the other node. + */ + e->state.cancel_asend = 0; + e->state.cancel_q = 1; + tw_eventq_push(&outq, e); + } else { + /* Event finished transmission and was not cancelled. + * Add to our sent event queue so we can retain the + * event in case we need to cancel it later. Note it + * is currently in remote format and must be converted + * back to local format for fossil collection. + */ + e->state.owner = TW_pe_sevent_q; + if( g_tw_synchronization_protocol == CONSERVATIVE ) + tw_event_free(me, e); + } + + return; } - return; - } + if (e->state.owner == TW_net_acancel) { + /* We just finished sending the cancellation message + * for this event. We need to free the buffer and + * make it available for reuse. + */ + tw_event_free(me, e); + return; + } - if (e->state.owner == TW_net_acancel) { - /* We just finished sending the cancellation message - * for this event. We need to free the buffer and - * make it available for reuse. + /* Never should happen, not unless we somehow broke this + * module's other functions related to sending an event. */ - tw_event_free(me, e); - return; - } - /* Never should happen, not unless we somehow broke this - * module's other functions related to sending an event. - */ - - tw_error( - TW_LOC, - "Don't know how to finish send of owner=%u, cancel_q=%d", - e->state.owner, - e->state.cancel_q); + tw_error( + TW_LOC, + "Don't know how to finish send of owner=%u, cancel_q=%d", + e->state.owner, + e->state.cancel_q); } static void service_queues(tw_pe *me) { - int changed; - do { - changed = test_q(&posted_recvs, me, recv_finish); - changed |= test_q(&posted_sends, me, send_finish); - changed |= recv_begin(me); - changed |= send_begin(me); - } while (changed); + int changed; + do { + changed = test_q(&posted_recvs, me, recv_finish); + changed |= test_q(&posted_sends, me, send_finish); + changed |= recv_begin(me); + changed |= send_begin(me); + } while (changed); } /* @@ -784,78 +789,78 @@ service_queues(tw_pe *me) void tw_net_read(tw_pe *me) { - service_queues(me); + service_queues(me); } void tw_net_send(tw_event *e) { - tw_pe * me = e->src_lp->pe; - int changed = 0; + tw_pe * me = e->src_lp->pe; + int changed = 0; - e->state.remote = 0; - e->state.owner = TW_net_outq; - tw_eventq_unshift(&outq, e); + e->state.remote = 0; + e->state.owner = TW_net_outq; + tw_eventq_unshift(&outq, e); - do + do { - changed = test_q(&posted_sends, me, send_finish); - changed |= send_begin(me); + changed = test_q(&posted_sends, me, send_finish); + changed |= send_begin(me); } while (changed); } void tw_net_cancel(tw_event *e) { - tw_pe *src_pe = e->src_lp->pe; - - switch (e->state.owner) { - case TW_net_outq: - /* Cancelled before we could transmit it. 
Do not - * transmit the event and instead just release the - * buffer back into our own free list. - */ - tw_eventq_delete_any(&outq, e); - tw_event_free(src_pe, e); - - return; - - break; - - case TW_net_asend: - /* Too late. We've already let MPI start to send - * this event over the network. We can't pull it - * back now without sending another message to do - * the cancel. - * - * Setting the cancel_q flag will signal us to do - * another message send once the current send of - * this message is completed. - */ - e->state.cancel_asend = 1; - break; - - case TW_pe_sevent_q: - /* Way late; the event was already sent and is in - * our sent event queue. Mark it as a cancel and - * place it at the front of the outq. - */ - e->state.cancel_q = 1; - tw_eventq_unshift(&outq, e); - break; - - default: - /* Huh? Where did you come from? Why are we being - * told about you? We did not send you so we cannot - * cancel you! - */ - tw_error( - TW_LOC, - "Don't know how to cancel event owned by %u", - e->state.owner); - } + tw_pe *src_pe = e->src_lp->pe; + + switch (e->state.owner) { + case TW_net_outq: + /* Cancelled before we could transmit it. Do not + * transmit the event and instead just release the + * buffer back into our own free list. + */ + tw_eventq_delete_any(&outq, e); + tw_event_free(src_pe, e); + + return; + + break; + + case TW_net_asend: + /* Too late. We've already let MPI start to send + * this event over the network. We can't pull it + * back now without sending another message to do + * the cancel. + * + * Setting the cancel_q flag will signal us to do + * another message send once the current send of + * this message is completed. + */ + e->state.cancel_asend = 1; + break; + + case TW_pe_sevent_q: + /* Way late; the event was already sent and is in + * our sent event queue. Mark it as a cancel and + * place it at the front of the outq. + */ + e->state.cancel_q = 1; + tw_eventq_unshift(&outq, e); + break; + + default: + /* Huh? Where did you come from? Why are we being + * told about you? We did not send you so we cannot + * cancel you! 
+ */ + tw_error( + TW_LOC, + "Don't know how to cancel event owned by %u", + e->state.owner); + } - service_queues(src_pe); + service_queues(src_pe); } /** @@ -868,50 +873,50 @@ tw_net_cancel(tw_event *e) tw_statistics * tw_net_statistics(tw_pe * me, tw_statistics * s) { - if(MPI_Reduce(&(s->s_max_run_time), - &me->stats.s_max_run_time, - 1, - MPI_DOUBLE, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if(MPI_Reduce(&(s->s_net_events), - &me->stats.s_net_events, - 17, - MPI_UNSIGNED_LONG_LONG, - MPI_SUM, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if(MPI_Reduce(&s->s_min_detected_offset, - &me->stats.s_min_detected_offset, - 1, - MPI_DOUBLE, - MPI_MIN, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); - - if(MPI_Reduce(&(s->s_total), - &me->stats.s_total, - 16, - MPI_UNSIGNED_LONG_LONG, - MPI_MAX, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); + if(MPI_Reduce(&(s->s_max_run_time), + &me->stats.s_max_run_time, + 1, + MPI_DOUBLE, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_net_events), + &me->stats.s_net_events, + 17, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&s->s_min_detected_offset, + &me->stats.s_min_detected_offset, + 1, + MPI_DOUBLE, + MPI_MIN, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_total), + &me->stats.s_total, + 16, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); if (MPI_Reduce(&s->s_events_past_end, - &me->stats.s_events_past_end, - 3, - MPI_UNSIGNED_LONG_LONG, - MPI_SUM, - (int)g_tw_masternode, - MPI_COMM_ROSS) != MPI_SUCCESS) - tw_error(TW_LOC, "Unable to reduce statistics!"); + &me->stats.s_events_past_end, + 3, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); #ifdef USE_RIO if (MPI_Reduce(&s->s_rio_load, @@ -932,5 +937,5 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) tw_error(TW_LOC, "Unable to reduce statistics!"); #endif - return &me->stats; + return &me->stats; } From 0bc64a90cc292ae64a9e6d0384c7c9e808977e22 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Thu, 16 Aug 2018 18:52:58 -0400 Subject: [PATCH 10/24] Update avl_tree.c added delete line in insert function --- core/avl_tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/core/avl_tree.c b/core/avl_tree.c index 20c2d88a4..76a2af07b 100644 --- a/core/avl_tree.c +++ b/core/avl_tree.c @@ -5,6 +5,7 @@ /* Copied and modified from http://pine.cs.yale.edu/pinewiki/C/AvlTree google cache */ #include "avl_tree.h" +#include "ross.h" /* implementation of an AVL tree with explicit heights */ @@ -190,6 +191,20 @@ avlInsert(AvlTree *t, tw_event *key) if (key->send_pe == (*t)->key->send_pe) { // This shouldn't happen but we'll allow it tw_printf(TW_LOC, "The events are identical!!!\n"); + if((key->state.cancel_q == 1 && (*t)->key->state.cancel_q == 0)||(key->state.cancel_q == 0 && (*t)->key->state.cancel_q 
== 1)) + { + tw_printf(TW_LOC, "Annihilation imminent \n"); + avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key); + } + else + { + tw_printf(TW_LOC, "what.\n"); + + } +// avlDelete(&(*t), (*t)->key); +// avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key);//works? +// avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key);// alter this one + } avlInsert(&(*t)->child[key->send_pe > (*t)->key->send_pe], key); avlRebalance(t); From 5191113595bd9e351bea988dd66341002e3b2430 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Thu, 16 Aug 2018 18:55:53 -0400 Subject: [PATCH 11/24] Update network-mpi.c commented out entire recv_finish cancel if statement, within which is an else statement attached to the if(cancel!=null) for insertion --- core/network-mpi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 21c0b40d0..685dc663a 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -487,6 +487,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) // if cancel event, retrieve and flush // else, store in hash table +/* if(e->state.cancel_q) { tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); @@ -504,12 +505,17 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) cancel->cancel_next = dest_pe->cancel_q; dest_pe->cancel_q = cancel; } + else + { + tw_hash_insert(me->hash_t, e, e->send_pe); + + } tw_event_free(me, e); return; } - +*/ if (g_tw_synchronization_protocol == OPTIMISTIC || g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { @@ -596,7 +602,7 @@ send_begin(tw_pe *me) // posted_sends.cur; //fixed this line #if ROSS_MEMORY - tw_event *tmp_prev = NULL; + tw_event *tmp_prev = NULL; tw_lp *tmp_lp = NULL; From 3bf12a38a43e0b8d0f8465da3bda28afe8f2b921 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Wed, 29 Aug 2018 16:14:21 -0400 Subject: [PATCH 12/24] Update network-mpi.c Filter anti-messages version --- core/network-mpi.c | 77 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 685dc663a..84ff1250a 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -1,5 +1,6 @@ #include #include +#include "ross.h" MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; int custom_communicator = 0; @@ -15,7 +16,7 @@ struct act_q int *idx_list; MPI_Status *status_list; int *free_idx_list;//add, que of free indices - + int *overflow_anti; #if ROSS_MEMORY char **buffers; @@ -107,9 +108,9 @@ static const tw_optdef mpi_opts[] = { // Forward declarations of functions used in MPI network message processing static int recv_begin(tw_pe *me); -static void recv_finish(tw_pe *me, tw_event *e, char * buffer); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl); static int send_begin(tw_pe *me); -static void send_finish(tw_pe *me, tw_event *e, char * buffer); +static void send_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl); // Start of implmentation of network processing routines/functions void tw_comm_set(MPI_Comm comm) @@ -156,9 +157,9 @@ init_q(struct act_q *q, const char *name) q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); - q->free_idx_list = (int *) 
tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue + q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue q->front = 0;// front of queue q->coda = 0;// end of queue q->size_of_fr_q=n+1;// for wraparound @@ -171,6 +172,7 @@ init_q(struct act_q *q, const char *name) i++; } + q->overflow_anti[0]=1; q->num_in_fr_q = n; // printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q,q->coda, q->front ); @@ -303,7 +305,7 @@ static int test_q( struct act_q *q, tw_pe *me, - void (*finish)(tw_pe *, tw_event *, char *)) + void (*finish)(tw_pe *, tw_event *, char *, int**)) { int ready, i, n; @@ -336,18 +338,42 @@ test_q( for (i = 0; i < ready; i++) { + tw_event *e; n = q->idx_list[i]; e = q->event_list[n]; - q->event_list[n] = NULL; - fr_q_aq(q,n);//add n onto queue + if(e->state.cancel_q == 0) + { + q->event_list[n] = NULL; + fr_q_aq(q, n);//add n onto queue #if ROSS_MEMORY - finish(me, e, q->buffers[n]); + finish(me, e, q->buffers[n],q->overflow_anti); #else - finish(me, e, NULL); + finish(me, e, NULL, &q->overflow_anti); #endif + } + } + + for (i = 0; i < ready; i++) + { + + tw_event *e; + + n = q->idx_list[i]; + e = q->event_list[n]; + if(e != NULL) + { + q->event_list[n] = NULL; + fr_q_aq(q, n);//add n onto queue + +#if ROSS_MEMORY + finish(me, e, q->buffers[n],q->overflow_anti); +#else + finish(me, e, NULL, &q->overflow_anti); +#endif + } } q->num_in_fr_q+=ready; @@ -439,7 +465,7 @@ recv_begin(tw_pe *me) } static void -recv_finish(tw_pe *me, tw_event *e, char * buffer) +recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl) { tw_pe *dest_pe; tw_clock start; @@ -487,9 +513,30 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) // if cancel event, retrieve and flush // else, store in hash table + if(e->state.cancel_q) + { + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. 
+ + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + + tw_event_free(me, e); + + return; + } + /* if(e->state.cancel_q) { + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); // NOTE: it is possible to cancel the event we @@ -504,18 +551,20 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer) cancel->cancel_next = dest_pe->cancel_q; dest_pe->cancel_q = cancel; + tw_event_free(me, e); } else { - tw_hash_insert(me->hash_t, e, e->send_pe); + } - tw_event_free(me, e); return; } */ + + if (g_tw_synchronization_protocol == OPTIMISTIC || g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { @@ -602,7 +651,7 @@ send_begin(tw_pe *me) // posted_sends.cur; //fixed this line #if ROSS_MEMORY - tw_event *tmp_prev = NULL; + tw_event *tmp_prev = NULL; tw_lp *tmp_lp = NULL; @@ -722,7 +771,7 @@ send_begin(tw_pe *me) } static void -send_finish(tw_pe *me, tw_event *e, char * buffer) +send_finish(tw_pe *me, tw_event *e, char * buffer, int ** overflow) { me->stats.s_nsend_network++; // instrumentation From 426d8b9c40ca6a8a991ff8099ba2bbceddfe8919 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Wed, 29 Aug 2018 16:23:43 -0400 Subject: [PATCH 13/24] Update avl_tree.c rolled back to original copy of all_tree.c --- core/avl_tree.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/core/avl_tree.c b/core/avl_tree.c index 76a2af07b..6b12bd037 100644 --- a/core/avl_tree.c +++ b/core/avl_tree.c @@ -5,7 +5,6 @@ /* Copied and modified from http://pine.cs.yale.edu/pinewiki/C/AvlTree google cache */ #include "avl_tree.h" -#include "ross.h" /* implementation of an AVL tree with explicit heights */ @@ -191,20 +190,6 @@ avlInsert(AvlTree *t, tw_event *key) if (key->send_pe == (*t)->key->send_pe) { // This shouldn't happen but we'll allow it tw_printf(TW_LOC, "The events are identical!!!\n"); - if((key->state.cancel_q == 1 && (*t)->key->state.cancel_q == 0)||(key->state.cancel_q == 0 && (*t)->key->state.cancel_q == 1)) - { - tw_printf(TW_LOC, "Annihilation imminent \n"); - avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key); - } - else - { - tw_printf(TW_LOC, "what.\n"); - - } -// avlDelete(&(*t), (*t)->key); -// avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key);//works? 
-// avlDelete(&(*t)->child[key->recv_ts > (*t)->key->recv_ts], key);// alter this one - } avlInsert(&(*t)->child[key->send_pe > (*t)->key->send_pe], key); avlRebalance(t); @@ -269,7 +254,7 @@ avlDelete(AvlTree *t, tw_event *key) AvlTree oldroot; if (*t == AVL_EMPTY) { -// tw_error(TW_LOC, "We never look for non-existent events!"); + tw_error(TW_LOC, "We never look for non-existent events!"); return target; } From 3ff0b843f07e54d95d1fc023430a59b1b7b64dab Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 4 Sep 2018 15:25:47 -0400 Subject: [PATCH 14/24] Update network-mpi.c updated to segfaulting exception code --- core/network-mpi.c | 110 ++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 84ff1250a..622fc8c5a 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -108,9 +108,9 @@ static const tw_optdef mpi_opts[] = { // Forward declarations of functions used in MPI network message processing static int recv_begin(tw_pe *me); -static void recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); static int send_begin(tw_pe *me); -static void send_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl); +static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); // Start of implmentation of network processing routines/functions void tw_comm_set(MPI_Comm comm) @@ -303,11 +303,13 @@ tw_net_minimum(tw_pe *me) static int test_q( - struct act_q *q, + struct act_q * q, tw_pe *me, - void (*finish)(tw_pe *, tw_event *, char *, int**)) + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) { - int ready, i, n; + int ready, i, n, indicator; + q->overflow_anti[0]=1; // + indicator = 1; #if ROSS_MEMORY char *tmp; @@ -336,44 +338,51 @@ test_q( if (1 > ready) return 0; - for (i = 0; i < ready; i++) - { + for (i = 0; i < ready; i++) { tw_event *e; n = q->idx_list[i]; e = q->event_list[n]; - if(e->state.cancel_q == 0) - { - q->event_list[n] = NULL; - fr_q_aq(q, n);//add n onto queue + fr_q_aq(q, n);//add n onto queue #if ROSS_MEMORY - finish(me, e, q->buffers[n],q->overflow_anti); + finish(me, e, q->buffers[n], q, n); #else - finish(me, e, NULL, &q->overflow_anti); + finish(me, e, NULL, q, n); #endif + if (indicator != q->overflow_anti[0]) + { + printf("indicator was %d, is now %d\n", indicator, q->overflow_anti[0]); + indicator= q->overflow_anti[0]; } + else + { + q->event_list[n] = NULL; + } + + } - for (i = 0; i < ready; i++) +// printf("getting to it\n"); + + for (i = 1; i < q->overflow_anti[0]; i++) { tw_event *e; - - n = q->idx_list[i]; + printf("in loop\n"); + n = q->overflow_anti[i]; e = q->event_list[n]; - if(e != NULL) - { - q->event_list[n] = NULL; - fr_q_aq(q, n);//add n onto queue + q->event_list[n] = NULL; + printf("about to finish\n"); +// fr_q_aq(q, n);//add n onto queue #if ROSS_MEMORY - finish(me, e, q->buffers[n],q->overflow_anti); + finish(me, e, q->buffers[n], q, n); #else - finish(me, e, NULL, &q->overflow_anti); + finish(me, e, NULL, q, n); #endif - } + } q->num_in_fr_q+=ready; @@ -465,7 +474,7 @@ recv_begin(tw_pe *me) } static void -recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl) +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) { tw_pe *dest_pe; tw_clock start; @@ -488,20 +497,20 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl) 
// printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + tw_lp * temp_lp = (e->dest_lp); + tw_pe * temp_pe = (e->dest_lp->pe); - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); - dest_pe = e->dest_lp->pe; + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp);//->gid);// check here + dest_pe = e->dest_lp->pe;// check here // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; - if(e->send_pe > tw_nnodes()-1) tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); - e->cancel_next = NULL; - e->caused_by_me = NULL; - e->cause_next = NULL; - +// e->cancel_next = NULL; +// e->caused_by_me = NULL; +// e->cause_next = NULL; if(e->recv_ts < me->GVT) @@ -513,27 +522,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl) // if cancel event, retrieve and flush // else, store in hash table - if(e->state.cancel_q) - { - tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); - // NOTE: it is possible to cancel the event we - // are currently processing at this PE since this - // MPI module lets me read cancel events during - // event sends over the network. - - cancel->state.cancel_q = 1; - cancel->state.remote = 0; - - cancel->cancel_next = dest_pe->cancel_q; - dest_pe->cancel_q = cancel; - - tw_event_free(me, e); - - return; - } - -/* if(e->state.cancel_q) { @@ -546,24 +535,35 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, int** overfl) if(cancel!=NULL) // Temporary, for performance testing { +// printf("made it to proper cancel\n"); + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; cancel->state.cancel_q = 1; cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; dest_pe->cancel_q = cancel; tw_event_free(me, e); } else { - + printf("hitting overflow, storing %d in index %d\n",id, q->overflow_anti[0]); + e->dest_lp = temp_lp; + e->dest_lp->pe = temp_pe; + q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later + q->overflow_anti[0]++; } return; } -*/ + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; if (g_tw_synchronization_protocol == OPTIMISTIC || g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || @@ -651,7 +651,7 @@ send_begin(tw_pe *me) // posted_sends.cur; //fixed this line #if ROSS_MEMORY - tw_event *tmp_prev = NULL; + tw_event *tmp_prev = NULL; tw_lp *tmp_lp = NULL; @@ -771,7 +771,7 @@ send_begin(tw_pe *me) } static void -send_finish(tw_pe *me, tw_event *e, char * buffer, int ** overflow) +send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id) { me->stats.s_nsend_network++; // instrumentation From df0e564099551bdfdec9b0dcf9725fb97005901e Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 4 Sep 2018 15:28:28 -0400 Subject: [PATCH 15/24] Update avl_tree.c removed error for nonexistent events. 
--- core/avl_tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/avl_tree.c b/core/avl_tree.c index 6b12bd037..468a2afd6 100644 --- a/core/avl_tree.c +++ b/core/avl_tree.c @@ -5,6 +5,7 @@ /* Copied and modified from http://pine.cs.yale.edu/pinewiki/C/AvlTree google cache */ #include "avl_tree.h" +#include "ross.h" /* implementation of an AVL tree with explicit heights */ @@ -254,7 +255,7 @@ avlDelete(AvlTree *t, tw_event *key) AvlTree oldroot; if (*t == AVL_EMPTY) { - tw_error(TW_LOC, "We never look for non-existent events!"); +// tw_error(TW_LOC, "We never look for non-existent events!"); return target; } From 370affca8d39aca1a0417e5d17bc9a23698ce0d9 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Mon, 14 Jan 2019 17:14:56 -0500 Subject: [PATCH 16/24] Update network-mpi.c Exception code written and implemented, minor errors with all tree size with read_buffer size of 5000. Code is functional, but could use some optimization tweaking. --- core/network-mpi.c | 84 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 622fc8c5a..ff4d364fc 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -109,6 +109,7 @@ static const tw_optdef mpi_opts[] = { // Forward declarations of functions used in MPI network message processing static int recv_begin(tw_pe *me); static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); static int send_begin(tw_pe *me); static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); @@ -164,7 +165,6 @@ init_q(struct act_q *q, const char *name) q->coda = 0;// end of queue q->size_of_fr_q=n+1;// for wraparound q->num_in_fr_q= 0;// number of elements in queue - int i = 0; while(ioverflow_anti[0]) { - printf("indicator was %d, is now %d\n", indicator, q->overflow_anti[0]); +// printf("indicator was %d, is now %d\n", indicator, q->overflow_anti[0]); indicator= q->overflow_anti[0]; } else @@ -365,23 +365,22 @@ test_q( } // printf("getting to it\n"); - - for (i = 1; i < q->overflow_anti[0]; i++) + i = 1; + while (i < q->overflow_anti[0]) { tw_event *e; - printf("in loop\n"); n = q->overflow_anti[i]; e = q->event_list[n]; q->event_list[n] = NULL; - printf("about to finish\n"); // fr_q_aq(q, n);//add n onto queue #if ROSS_MEMORY finish(me, e, q->buffers[n], q, n); #else - finish(me, e, NULL, q, n); + late_recv_finish(me, e, NULL, q, n); #endif + i++; } @@ -473,8 +472,9 @@ recv_begin(tw_pe *me) return changed; } + static void -recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) { tw_pe *dest_pe; tw_clock start; @@ -492,16 +492,70 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) position += g_tw_event_msg_sz; #endif + dest_pe = e->dest_lp->pe; + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + + 
tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. + + +// printf("made it to proper cancel\n"); + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + + return; + +} + +static void +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + tw_pe *dest_pe; + tw_clock start; + +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif me->stats.s_nread_network++; me->s_nwhite_recv++; - // printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", - // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); - tw_lp * temp_lp = (e->dest_lp); - tw_pe * temp_pe = (e->dest_lp->pe); - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp);//->gid);// check here dest_pe = e->dest_lp->pe;// check here +// printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", +// e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; @@ -549,9 +603,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) } else { - printf("hitting overflow, storing %d in index %d\n",id, q->overflow_anti[0]); - e->dest_lp = temp_lp; - e->dest_lp->pe = temp_pe; +// printf("hitting overflow, storing %d in index %d\n",id, q->overflow_anti[0]); q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later q->overflow_anti[0]++; From 1036df6e915e34f308ff300a19d91d3330449af3 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 15 Jan 2019 18:36:59 -0500 Subject: [PATCH 17/24] Simplified and improved statistics adds one AVL tree element after out of order event is detected to offset accessing tw_hash_remove() twice. Removed various debug statements in commented code. --- core/network-mpi.c | 67 ++++++---------------------------------------- 1 file changed, 8 insertions(+), 59 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index ff4d364fc..497d05a0c 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -175,14 +175,6 @@ init_q(struct act_q *q, const char *name) q->overflow_anti[0]=1; q->num_in_fr_q = n; -// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q,q->coda, q->front ); -// printf("dequeue twice, requeue those elements\n"); -// fr_q_dq(q); -// fr_q_dq(q); -// fr_q_aq(q,0); -// fr_q_aq(q,1); -// printf("sizeofq = %d, numinq = %d, coda = %d, front = %d\n",q->size_of_fr_q, q->num_in_fr_q, q->coda, q->front ); -// printf("check: num in q = %d, size of q = %d\n",q->num_in_fr_q,q->size_of_fr_q); #if ROSS_MEMORY q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); @@ -315,8 +307,6 @@ test_q( char *tmp; #endif -// if ( !q->cur || q->num_in_fr_q == ((q->size_of_fr_q)-1) ) //fixed this line (?) 
if queue is full, no elements are being processed -// return 0; if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) return 0; @@ -353,7 +343,6 @@ test_q( #endif if (indicator != q->overflow_anti[0]) { -// printf("indicator was %d, is now %d\n", indicator, q->overflow_anti[0]); indicator= q->overflow_anti[0]; } else @@ -364,7 +353,6 @@ test_q( } -// printf("getting to it\n"); i = 1; while (i < q->overflow_anti[0]) { @@ -373,48 +361,16 @@ test_q( n = q->overflow_anti[i]; e = q->event_list[n]; q->event_list[n] = NULL; -// fr_q_aq(q, n);//add n onto queue -#if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); -#else - late_recv_finish(me, e, NULL, q, n); -#endif + late_recv_finish(me, e, NULL, q, n); + //might need an augmented version for ROSS_MEMORY? + i++; } q->num_in_fr_q+=ready; - - /* Collapse the lists to remove any holes we left. */ - /* - for (i = 0, n = 0; i < q->cur; i++)//fix these lines - { - if (q->event_list[i]) - { - if (i != n) - { - // swap the event pointers - q->event_list[n] = q->event_list[i]; - // copy the request handles - memcpy( - &q->req_list[n], - &q->req_list[i], - sizeof(q->req_list[0])); - - #if ROSS_MEMORY - // swap the buffers - tmp = q->buffers[n]; - q->buffers[n] = q->buffers[i]; - q->buffers[i] = tmp; - #endif - } // endif (i != n) - n++; - } // endif (q->event_list[i]) - } - q->cur -= ready;//fix this line - */ return 1; } @@ -465,7 +421,6 @@ recv_begin(tw_pe *me) posted_recvs.event_list[id] = e; deal_with_cur(&posted_recvs); - // fixed in fr_q_dq //posted_recvs.cur++; //fix this line changed = 1; } @@ -512,14 +467,13 @@ late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id // else, store in hash table tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + g_tw_pe[0]->avl_tree_size++; // NOTE: it is possible to cancel the event we // are currently processing at this PE since this // MPI module lets me read cancel events during // event sends over the network. - -// printf("made it to proper cancel\n"); cancel->state.cancel_q = 1; cancel->state.remote = 0; cancel->cancel_next = dest_pe->cancel_q; @@ -553,19 +507,15 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp);//->gid);// check here dest_pe = e->dest_lp->pe;// check here -// printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", -// e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; + if(e->send_pe > tw_nnodes()-1) tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); -// e->cancel_next = NULL; -// e->caused_by_me = NULL; -// e->cause_next = NULL; - if(e->recv_ts < me->GVT) tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", @@ -587,9 +537,9 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) // MPI module lets me read cancel events during // event sends over the network. 
- if(cancel!=NULL) // Temporary, for performance testing + if(cancel!=NULL) { -// printf("made it to proper cancel\n"); + e->cancel_next = NULL; e->caused_by_me = NULL; e->cause_next = NULL; @@ -603,7 +553,6 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) } else { -// printf("hitting overflow, storing %d in index %d\n",id, q->overflow_anti[0]); q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later q->overflow_anti[0]++; From 8c845f5eb4ff59acd4fd14a11d10f61425249ff6 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Fri, 29 Mar 2019 17:27:56 -0400 Subject: [PATCH 18/24] Added reset queue capability Added reset queue capability and additional new reset queue capability --- core/network-mpi.c | 287 ++++++++++++++++++++++++++++++++------------- 1 file changed, 207 insertions(+), 80 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index 497d05a0c..d0ca178e0 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -21,77 +21,153 @@ struct act_q #if ROSS_MEMORY char **buffers; #endif - unsigned int cur; int front;//add, front of queue int coda;//add, back of queue but back is already a variable somewhere int size_of_fr_q;//add, size of queue array int num_in_fr_q;//add, number of elements in queue - +// int reset_var;// a variable determining if the queue has been reset lately. // Deal with filling queue, then plateauing }; -int deal_with_cur(struct act_q *q)// try this +#define EVENT_TAG 1 + +#if ROSS_MEMORY +#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE +#else +#define EVENT_SIZE(e) g_tw_event_msg_sz +#endif + +static struct act_q posted_sends; +static struct act_q posted_recvs; +static tw_eventq outq; + +static unsigned int read_buffer = 16; +static unsigned int send_buffer = 1024; +static int world_size = 1; + +int deal_with_cur(struct act_q *q)// this is for MPI_testsome input { - if(q->cur < (q->size_of_fr_q-1)) + if(q->cur < (q->size_of_fr_q-1))//checks to see if { q->cur++; return 1; } else - { + { //not sure if best placement is here or in fr_q_chq. probably fr_q_chq. 
+ //q->reset_var = 0; return 1; } } +void fr_q_reset(struct act_q *q)//experimental +{ + if(q->num_in_fr_q == (q->size_of_fr_q-1)) //if the queue is filled with free + { + int i = 0; + while( i < q->cur)//resets all entries in the array so that they are properly ordered from one to cur + { + q->free_idx_list[i] = i; + i++; + } + //resets queue to default positions + q->front = 0; + q->coda = 0; + q->cur = 0; + + return; + } + else + { + return; + } + +} + +void new_fr_q_reset(struct act_q *q)//experimental +{ + if(q->num_in_fr_q == (q->size_of_fr_q-1))//if the queue is filled with free + { +// q->reset_var = 1; + //resets queue to default positions + + q->front = 0; + q->coda = 0; + q->cur = 0; + + return; + } + else + { + return; + } + +} + -int fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que +void fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que { if(*frontOrCoda != q->size_of_fr_q)//don't mess with queue { - return 0;// return probably not necessary + return;// return probably not necessary } else//mess with queue { *frontOrCoda = 0; - return 0; + return; } } void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue { + q->free_idx_list[q->coda] = ele; q->coda++; -// q->num_in_fr_q++; //fixed in function fr_q_chq(q,&q->coda);//wraps the queue array around } + + int fr_q_dq(struct act_q *q) // free index queue; dequeue { - int rv =q->free_idx_list[q->front]; + int rv; + rv = q->free_idx_list[q->front]; q->front++; - q->num_in_fr_q--; - fr_q_chq(q,&q->front);// wraps the queue array around + q->num_in_fr_q--;//can we get rid of this here? + fr_q_chq(q, &q->front);// wraps the queue array around return rv; } -#define EVENT_TAG 1 -#if ROSS_MEMORY -#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE -#else -#define EVENT_SIZE(e) g_tw_event_msg_sz -#endif +int new_send_fr_q_dq(struct act_q *q) // free index queue; dequeue +{ +/* + int rv; -static struct act_q posted_sends; -static struct act_q posted_recvs; -static tw_eventq outq; + if(q->reset_var != 1) // if not recently reset, take from front of queue + { + rv =q->free_idx_list[q->front]; + q->front++; + q->num_in_fr_q--; + fr_q_chq(q, &q->front);// wraps the queue array around + } + else //if recently reset, return q->front. This works because front will just give the numbers sequentially so we + //don't need an explicit assignment + { + rv = q->front; + q->front++; + q->num_in_fr_q--; + fr_q_chq(q,&q->front);// wraps the queue array around + + } + + return rv; + +*/ +} -static unsigned int read_buffer = 16; -static unsigned int send_buffer = 1024; -static int world_size = 1; static const tw_optdef mpi_opts[] = { TWOPT_GROUP("ROSS MPI Kernel"), @@ -159,21 +235,23 @@ init_q(struct act_q *q, const char *name) q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + //can I shrink this initialization? 
q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue - q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue + q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue q->front = 0;// front of queue q->coda = 0;// end of queue q->size_of_fr_q=n+1;// for wraparound - q->num_in_fr_q= 0;// number of elements in queue + // q->cur = 0; +// q->num_in_fr_q = 0; + int i = 0; while(ioverflow_anti[0]=1; - q->num_in_fr_q = n; + q->num_in_fr_q = n;// number of elements in queue #if ROSS_MEMORY @@ -294,13 +372,12 @@ tw_net_minimum(tw_pe *me) } static int -test_q( +test_q_recv( struct act_q * q, tw_pe *me, void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) { - int ready, i, n, indicator; - q->overflow_anti[0]=1; // + int ready, i=0, n, indicator; indicator = 1; #if ROSS_MEMORY @@ -324,52 +401,113 @@ test_q( q->name); } - if (1 > ready) return 0; - for (i = 0; i < ready; i++) { + q->overflow_anti[0]=1; + q->num_in_fr_q+=ready; - tw_event *e; + while ( i < ready) + { + tw_event *e; n = q->idx_list[i]; e = q->event_list[n]; - fr_q_aq(q, n);//add n onto queue + fr_q_aq(q, n); //add n onto queue #if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); + finish(me, e, q->buffers[n], q, n); #else - finish(me, e, NULL, q, n); + finish(me, e, NULL, q, n); #endif if (indicator != q->overflow_anti[0]) { - indicator= q->overflow_anti[0]; + indicator = q->overflow_anti[0]; } else { q->event_list[n] = NULL; } - - + i++; } - i = 1; - while (i < q->overflow_anti[0]) + i = 1; // first element of q->overflow_anti is the number of + + while (i < q->overflow_anti[0])//takes care of out of order messages { tw_event *e; n = q->overflow_anti[i]; e = q->event_list[n]; q->event_list[n] = NULL; - late_recv_finish(me, e, NULL, q, n); - //might need an augmented version for ROSS_MEMORY? + //might need an augmented version for ROSS_MEMORY? + i++; + } + +// V breaks everything. +// fr_q_reset(q); + + + return 1; +} + + +static int +test_q_send( + struct act_q * q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i=0, n; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) + return 0; + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + + while(i < ready) + { + + tw_event *e; + n = q->idx_list[i]; + e = q->event_list[n]; + fr_q_aq(q, n);//add n onto queue + q->event_list[n] = NULL; i++; +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif } + q->num_in_fr_q+=ready; +// after all elements are removed from queue, checks if queue is empty and resets ordering if so. 
+ fr_q_reset(q); return 1; } @@ -387,7 +525,6 @@ recv_begin(tw_pe *me) while (0 < posted_recvs.num_in_fr_q)//fix these lines { - if(!(e = tw_event_grab(me))) { if(tw_gvt_inprogress(me)) @@ -431,9 +568,7 @@ recv_begin(tw_pe *me) static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) { - tw_pe *dest_pe; - tw_clock start; - + /* #if ROSS_MEMORY tw_memory *memory; tw_memory *last; @@ -446,15 +581,12 @@ late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id memcpy(e, buffer, g_tw_event_msg_sz); position += g_tw_event_msg_sz; #endif + /* - dest_pe = e->dest_lp->pe; + /* if(e->send_pe > tw_nnodes()-1) tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); - e->cancel_next = NULL; - e->caused_by_me = NULL; - e->cause_next = NULL; - if(e->recv_ts < me->GVT) tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", @@ -465,15 +597,18 @@ late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id // if cancel event, retrieve and flush // else, store in hash table + */ + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. + tw_pe *dest_pe; + dest_pe = e->dest_lp->pe; tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); g_tw_pe[0]->avl_tree_size++; - // NOTE: it is possible to cancel the event we - // are currently processing at this PE since this - // MPI module lets me read cancel events during - // event sends over the network. - cancel->state.cancel_q = 1; cancel->state.remote = 0; cancel->cancel_next = dest_pe->cancel_q; @@ -504,15 +639,18 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) #endif me->stats.s_nread_network++; me->s_nwhite_recv++; - - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp);//->gid);// check here - dest_pe = e->dest_lp->pe;// check here + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + if(e->send_pe > tw_nnodes()-1) tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); @@ -540,13 +678,8 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) if(cancel!=NULL) { - e->cancel_next = NULL; - e->caused_by_me = NULL; - e->cause_next = NULL; cancel->state.cancel_q = 1; cancel->state.remote = 0; - - cancel->cancel_next = dest_pe->cancel_q; dest_pe->cancel_q = cancel; tw_event_free(me, e); @@ -555,16 +688,12 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) { q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later q->overflow_anti[0]++; - - } return; } - e->cancel_next = NULL; - e->caused_by_me = NULL; - e->cause_next = NULL; + if (g_tw_synchronization_protocol == OPTIMISTIC || g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || @@ -635,7 +764,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) */ tw_error( TW_LOC, - "Event recived by PE %u but meant for PE %u", + "Event received by PE %u but meant for PE %u", me->id, dest_pe->id); } @@ -645,11 +774,10 @@ send_begin(tw_pe *me) { int changed = 0; - while (0 < posted_sends.num_in_fr_q)//fixed these line (hopefully) + while (0 < posted_sends.num_in_fr_q) { - tw_event *e = tw_eventq_peek(&outq);//next event? 
+ tw_event *e = tw_eventq_peek(&outq);//next event tw_node *dest_node = NULL; - // posted_sends.cur; //fixed this line #if ROSS_MEMORY tw_event *tmp_prev = NULL; @@ -763,9 +891,7 @@ send_begin(tw_pe *me) posted_sends.event_list[id] = e; deal_with_cur(&posted_sends); - // fixed in fr_q_dq //posted_sends.cur++;//fix this line me->s_nwhite_sent++; - changed = 1; } return changed; @@ -829,8 +955,8 @@ service_queues(tw_pe *me) { int changed; do { - changed = test_q(&posted_recvs, me, recv_finish); - changed |= test_q(&posted_sends, me, send_finish); + changed = test_q_recv(&posted_recvs, me, recv_finish); + changed |= test_q_send(&posted_sends, me, send_finish); changed |= recv_begin(me); changed |= send_begin(me); } while (changed); @@ -860,7 +986,7 @@ tw_net_send(tw_event *e) do { - changed = test_q(&posted_sends, me, send_finish); + changed = test_q_send(&posted_sends, me, send_finish); changed |= send_begin(me); } while (changed); } @@ -995,3 +1121,4 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) return &me->stats; } + From c2ed60874d87fdee411e5367b75228b7de600803 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Sat, 30 Mar 2019 03:00:37 -0400 Subject: [PATCH 19/24] Added new reset --- core/network-mpi.c | 59 ++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/core/network-mpi.c b/core/network-mpi.c index d0ca178e0..cd74232c6 100644 --- a/core/network-mpi.c +++ b/core/network-mpi.c @@ -26,8 +26,7 @@ struct act_q int coda;//add, back of queue but back is already a variable somewhere int size_of_fr_q;//add, size of queue array int num_in_fr_q;//add, number of elements in queue -// int reset_var;// a variable determining if the queue has been reset lately. -// Deal with filling queue, then plateauing + int reset_var;// a variable determining if the queue has been reset lately. }; @@ -47,21 +46,20 @@ static unsigned int read_buffer = 16; static unsigned int send_buffer = 1024; static int world_size = 1; -int deal_with_cur(struct act_q *q)// this is for MPI_testsome input +void deal_with_cur(struct act_q *q)// this is for MPI_testsome input { if(q->cur < (q->size_of_fr_q-1))//checks to see if { q->cur++; - return 1; + return; } else { //not sure if best placement is here or in fr_q_chq. probably fr_q_chq. - //q->reset_var = 0; - return 1; + return; } } -void fr_q_reset(struct act_q *q)//experimental +void old_fr_q_reset(struct act_q *q)//experimental { if(q->num_in_fr_q == (q->size_of_fr_q-1)) //if the queue is filled with free { @@ -85,13 +83,12 @@ void fr_q_reset(struct act_q *q)//experimental } -void new_fr_q_reset(struct act_q *q)//experimental +void fr_q_reset(struct act_q *q)//experimental { if(q->num_in_fr_q == (q->size_of_fr_q-1))//if the queue is filled with free { -// q->reset_var = 1; //resets queue to default positions - + q->reset_var = 1; q->front = 0; q->coda = 0; q->cur = 0; @@ -114,6 +111,7 @@ void fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for m } else//mess with queue { + q->reset_var = 0; *frontOrCoda = 0; return; } @@ -124,6 +122,7 @@ void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue q->free_idx_list[q->coda] = ele; q->coda++; + q->num_in_fr_q++; fr_q_chq(q,&q->coda);//wraps the queue array around } @@ -137,16 +136,16 @@ int fr_q_dq(struct act_q *q) // free index queue; dequeue q->front++; q->num_in_fr_q--;//can we get rid of this here? 
fr_q_chq(q, &q->front);// wraps the queue array around - return rv; + } -int new_send_fr_q_dq(struct act_q *q) // free index queue; dequeue +int send_fr_q_dq(struct act_q *q) // free index queue; dequeue { -/* + int rv; - if(q->reset_var != 1) // if not recently reset, take from front of queue + if(q->reset_var == 0) // if not recently reset, take from front of queue { rv =q->free_idx_list[q->front]; q->front++; @@ -154,18 +153,16 @@ int new_send_fr_q_dq(struct act_q *q) // free index queue; dequeue fr_q_chq(q, &q->front);// wraps the queue array around } else //if recently reset, return q->front. This works because front will just give the numbers sequentially so we - //don't need an explicit assignment - { + { //don't need an explicit assignment + rv = q->front; q->front++; q->num_in_fr_q--; - fr_q_chq(q,&q->front);// wraps the queue array around + fr_q_chq( q, &q->front );// wraps the queue array around } - return rv; -*/ } @@ -226,10 +223,16 @@ init_q(struct act_q *q, const char *name) #endif if(q == &posted_sends) + { + q->reset_var = 1; n = send_buffer; + } else - n = read_buffer; + { + q->reset_var = 0; + n = read_buffer; + } q->name = name; q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); @@ -240,9 +243,9 @@ init_q(struct act_q *q, const char *name) q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue q->front = 0;// front of queue q->coda = 0;// end of queue - q->size_of_fr_q=n+1;// for wraparound - // q->cur = 0; -// q->num_in_fr_q = 0; + q->size_of_fr_q = n+1;// for wraparound + q->reset_var = 1; + q->num_in_fr_q = 0; // number of elements in queue int i = 0; while(ioverflow_anti[0]=1; - q->num_in_fr_q = n;// number of elements in queue +// q->num_in_fr_q = n;// number of elements in queue #if ROSS_MEMORY @@ -405,7 +408,7 @@ test_q_recv( return 0; q->overflow_anti[0]=1; - q->num_in_fr_q+=ready; +// q->num_in_fr_q+=ready; while ( i < ready) { @@ -505,7 +508,7 @@ test_q_send( } - q->num_in_fr_q+=ready; + // q->num_in_fr_q+=ready; // after all elements are removed from queue, checks if queue is empty and resets ordering if so. fr_q_reset(q); @@ -560,7 +563,7 @@ recv_begin(tw_pe *me) deal_with_cur(&posted_recvs); changed = 1; } - + posted_recvs.num_in_fr_q = 0; return changed; } @@ -800,7 +803,7 @@ send_begin(tw_pe *me) if(e == me->abort_event) tw_error(TW_LOC, "Sending abort event!"); - int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element + int id = send_fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element dest_node = tw_net_onnode((*e->src_lp->type->map) ((tw_lpid) e->dest_lp)); From 820c694238237e6e44ae9501f99245b25722224c Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Mon, 18 Nov 2019 19:02:51 -0500 Subject: [PATCH 20/24] Create hybrid-mpi-layer.c This is the network file that uses both the methods of filling holes in the MPI_requests by moving events on the far side of the array inward (send event) and keeping track of events that have arrived and overwriting them(recv event). File still has ROSS MEMORY and debugging statements unfortunately. 
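The send-side hole filling described above can be sketched in isolation as follows. This is illustrative only, not the hybrid layer itself: a plain int array stands in for the paired event_list/req_list entries, and compact(), EMPTY, and NSLOTS are hypothetical names. The idea is that each slot freed by MPI_Testsome is overwritten with the highest-indexed still-active entry, so the live entries stay packed in [0, cur) and nothing scans dead slots.

    /* Sketch of the send-side "fill holes from the back" compaction.
     * Illustrative only; an int per slot stands in for the event/request pair. */
    #include <stdio.h>

    #define NSLOTS 8
    #define EMPTY  (-1)

    static int compact(int *slots, int cur, const int *done, int ready)
    {
        int back = cur - 1;

        for (int i = 0; i < ready; i++)
            slots[done[i]] = EMPTY;           /* mark completed slots as holes */

        for (int i = 0; i < ready; i++) {
            int hole = done[i];
            while (back > hole && slots[back] == EMPTY)
                back--;                        /* skip holes at the tail */
            if (back > hole) {
                slots[hole] = slots[back];     /* in the real queue this also
                                                  copies the MPI_Request */
                slots[back] = EMPTY;
                back--;
            }
        }
        return cur - ready;                    /* new count of active slots */
    }

    int main(void)
    {
        int slots[NSLOTS] = { 10, 11, 12, 13, 14, 15, EMPTY, EMPTY };
        int done[] = { 1, 4 };                 /* indices reported completed */
        int cur = compact(slots, 6, done, 2);
        for (int i = 0; i < cur; i++)
            printf("%d ", slots[i]);           /* prints: 10 15 12 13 */
        printf("\n");
        return 0;
    }

The receive side keeps the free-index stack instead because received events must stay at the slot their MPI_Irecv was posted to, so they can only be overwritten in place, not moved.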
--- core/hybrid-mpi-layer.c | 1066 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1066 insertions(+) create mode 100644 core/hybrid-mpi-layer.c diff --git a/core/hybrid-mpi-layer.c b/core/hybrid-mpi-layer.c new file mode 100644 index 000000000..468e5d733 --- /dev/null +++ b/core/hybrid-mpi-layer.c @@ -0,0 +1,1066 @@ +#include +#include +#include "ross.h" + +//This is the hybrid version of + +MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; +int custom_communicator = 0; + +static long id_tmp; + +struct act_q +{ + const char *name; + + tw_event **event_list; + MPI_Request *req_list; + int *idx_list; + MPI_Status *status_list; + int *free_idx_list;//add, que of free indices + int *overflow_anti; + +#if ROSS_MEMORY + char **buffers; +#endif + unsigned int cur; + unsigned int front;//add, front of queue + //int coda;//add, back of queue but back is already a variable somewhere + int size_of_fr_q;//add, size of queue array + int num_in_fr_q;//add, number of elements in queue + +}; + +#define EVENT_TAG 1 + +#if ROSS_MEMORY +#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE +#else +#define EVENT_SIZE(e) g_tw_event_msg_sz +#endif + +static struct act_q posted_sends; +static struct act_q posted_recvs; +static tw_eventq outq; + +static unsigned int read_buffer = 16; +static unsigned int send_buffer = 1024; +static int world_size = 1; + +void deal_with_cur(struct act_q *q)// this is for MPI_testsome input +{ +// printf("cur = %d, front %d\n", q->cur,q->front); + if(q->cur < (q->front))//(q->front))//checks to see if + { + q->cur++; +// printf("%s: CUR IS %d\n",q->name, q->cur); + + return; + } + else + { + return; + } +} +/* +void cur_reduction(struct act_q * q,int ele) +{ + if(ele == q->cur && q->cur!=1) + { + q->cur--; + printf("%s: WOO! CUR REDUCED TO %d\n",q->name, q->cur); + } + return; + +} +*/ +void fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que +{ + if(*frontOrCoda != q->size_of_fr_q)//don't mess with queue + { + return; + } + else//mess with queue + { + *frontOrCoda = 0; + return; + } +} + +void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue +{ + q->front--; + q->free_idx_list[q->front] = ele; +// cur_reduction(q,ele); + return; +} + +int fr_q_dq(struct act_q *q) // free index queue; dequeue +{ + int rv = q->free_idx_list[q->front]; + q->front++; + q->num_in_fr_q--; + return rv; +} + + +static const tw_optdef mpi_opts[] = { + TWOPT_GROUP("ROSS MPI Kernel"), + TWOPT_UINT( + "read-buffer", + read_buffer, + "network read buffer size in # of events"), + TWOPT_UINT( + "send-buffer", + send_buffer, + "network send buffer size in # of events"), + TWOPT_END() +}; + +// Forward declarations of functions used in MPI network message processing +static int recv_begin(tw_pe *me); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static int send_begin(tw_pe *me); +static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); + +// Start of implmentation of network processing routines/functions +void tw_comm_set(MPI_Comm comm) +{ + MPI_COMM_ROSS = comm; + custom_communicator = 1; +} + +const tw_optdef * +tw_net_init(int *argc, char ***argv) +{ + int my_rank; + + int initialized; + MPI_Initialized(&initialized); + + if (!initialized) { + if (MPI_Init(argc, argv) != MPI_SUCCESS) + tw_error(TW_LOC, "MPI_Init failed."); + } + if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) 
!= MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); + + g_tw_masternode = 0; + g_tw_mynode = my_rank; + + return mpi_opts; +} + +static void +init_q(struct act_q *q, const char *name) +{ + unsigned int n; +#if ROSS_MEMORY + unsigned int i; +#endif + + if(q == &posted_sends) + { + n = send_buffer; + } + else + { + n = read_buffer; + } + q->name = name; + q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); + q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); + q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + //can I shrink this initialization? + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue + q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue + q->front = 0;// front of queue + q->size_of_fr_q = n+1;// for wraparound + q->num_in_fr_q = 0; // number of elements in queue + + int i = 0; + while(ifree_idx_list[i] = i; +// fr_q_aq(q,i); + i++; + } +// q->front = 1; +// printf("front = %d\n",q->front); + q->overflow_anti[0]=1; + q->num_in_fr_q = n;// number of elements in queue + + +#if ROSS_MEMORY + q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); + + for(i = 0; i < n; i++) + q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); +#endif +} + +tw_node * tw_net_onnode(tw_peid gid) { + id_tmp = gid; + return &id_tmp; +} + +unsigned int +tw_nnodes(void) +{ + return world_size; +} + +void +tw_net_start(void) +{ + if (MPI_Comm_size(MPI_COMM_ROSS, &world_size) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_size(MPI_COMM_ROSS)"); + + if( g_tw_mynode == 0) + { + printf("tw_net_start: Found world size to be %d \n", world_size ); + } + + // Check after tw_nnodes is defined + if(tw_nnodes() == 1) { + // force the setting of SEQUENTIAL protocol + if (g_tw_synchronization_protocol == NO_SYNCH) { + g_tw_synchronization_protocol = SEQUENTIAL; + } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { + g_tw_synchronization_protocol = SEQUENTIAL; + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); + } + } + +// tw_pe_create(1); + tw_pe_init(); + + //If we're in (some variation of) optimistic mode, we need this hash + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) { + g_tw_pe->hash_t = tw_hash_create(); + } else { + g_tw_pe->hash_t = NULL; + } + + if (send_buffer < 1) + tw_error(TW_LOC, "network send buffer must be >= 1"); + if (read_buffer < 1) + tw_error(TW_LOC, "network read buffer must be >= 1"); + + init_q(&posted_sends, "MPI send queue"); + init_q(&posted_recvs, "MPI recv queue"); + + g_tw_net_device_size = read_buffer; + + // pre-post all the Irecv operations + recv_begin( g_tw_pe ); +} + +void +tw_net_abort(void) +{ + MPI_Abort(MPI_COMM_ROSS, 1); + exit(1); +} + +void +tw_net_stop(void) +{ + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } +} + +void +tw_net_barrier(void) +{ + if (MPI_Barrier(MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to wait for MPI_Barrier"); +} + +tw_stime +tw_net_minimum(void) +{ + 
tw_stime m = DBL_MAX; + tw_event *e; + int i; + + e = outq.head; + while (e) { + if (m > e->recv_ts) + m = e->recv_ts; + e = e->next; + } + + for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) + e = posted_sends.event_list[i]; + if(e == NULL) + {} + else if(m > e->recv_ts) + m = e->recv_ts; + else + {} + } + + return m; +} + +static int +test_q_recv( + struct act_q * q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i, n, indicator; + indicator = 1; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) + return 0; + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + q->overflow_anti[0]=1; + q->num_in_fr_q+=ready; + i = ready-1; + while ( -1 != i) + { + + tw_event *e; + n = q->idx_list[i]; + e = q->event_list[n]; + fr_q_aq(q, n); //add n onto queue +// cur_reduction(q,n); + + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + if (indicator != q->overflow_anti[0]) + { + indicator = q->overflow_anti[0]; + } + else + { + q->event_list[n] = NULL; + } + i--; + } + + i = 1; // first element of q->overflow_anti is the number of + + while (i < q->overflow_anti[0])//takes care of out of order messages + { + + tw_event *e; + n = q->overflow_anti[i]; + e = q->event_list[n]; + q->event_list[n] = NULL; + late_recv_finish(me, e, NULL, q, n); + //might need an augmented version for ROSS_MEMORY? + i++; + } + + + return 1; +} + +void check_b_ind( int * b_index, struct act_q * q) +{ + while(0 <= *b_index && *b_index<=q->size_of_fr_q) + { + + if(q->event_list[*b_index]== NULL) + { + *b_index = *b_index-1; + } + else + { + return; + } + + } + *b_index = 0; + return; + +} + +static int +test_q_send( + struct act_q * q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i, n; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if (!q->cur) + { + return 0; + } + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + i = ready-1; + + while(-1 != i) + { + + tw_event *e; + + n = q->idx_list[i]; + e = q->event_list[n]; + q->event_list[n] = NULL; + i--; + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + } + + + i = 0; + + int b_index; + b_index = q->cur-1; + + while(iidx_list[i]; + + check_b_ind(&b_index, q); + if (n < b_index) + { + q->event_list[n] = q->event_list[b_index]; + memcpy(&q->req_list[n],&q->req_list[b_index],sizeof(q->req_list[0])); + b_index--; + } + i++; + + } + + q->cur -= ready; + + return 1; +} + +static int +recv_begin(tw_pe *me) +{ + MPI_Status status; + + tw_event *e = NULL; + + int flag = 0; + int changed = 0; + + while (0 < posted_recvs.num_in_fr_q)//fix these lines + { + + if(!(e = tw_event_grab(me))) + { + if(tw_gvt_inprogress(me)) + tw_error(TW_LOC, "Out of events in GVT! 
Consider increasing --extramem"); + return changed; + } + + int id = fr_q_dq(&posted_recvs); + +#if ROSS_MEMORY + if( MPI_Irecv(posted_recvs.buffers[id], + EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#else + if( MPI_Irecv(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#endif + { + tw_event_free(me, e); + return changed; + } + + posted_recvs.event_list[id] = e; + deal_with_cur(&posted_recvs); + changed = 1; + } + return changed; +} + + +static void +late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + + tw_pe *dest_pe; + dest_pe = e->dest_lp->pe; + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + g_tw_pe->avl_tree_size++; + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + + return; + +} + +static void +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + tw_pe *dest_pe; + tw_clock start; + +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif + me->stats.s_nread_network++; + me->s_nwhite_recv++; + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; + + + // instrumentation + e->dest_lp->kp->kp_stats->s_nread_network++; + e->dest_lp->lp_stats->s_nread_network++; + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + + if(e->state.cancel_q) + { + + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. + + if(cancel!=NULL) + { + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + } + else + { + q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later + q->overflow_anti[0]++; + } + + return; + } + + + + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { + tw_hash_insert(me->hash_t, e, e->send_pe); + e->state.remote = 1; + } + +#if ROSS_MEMORY + mem_size = (size_t) e->memory; + mem_fd = (tw_fd) e->prev; + + last = NULL; + while(mem_size) + { + memory = tw_memory_alloc(e->dest_lp, mem_fd); + + if(last) + last->next = memory; + else + e->memory = memory; + + memcpy(memory, &buffer[position], mem_size); + position += mem_size; + + memory->fd = mem_fd; + memory->nrefs = 1; + + mem_size = (size_t) memory->next; + mem_fd = memory->fd; + + last = memory; + } +#endif + + /* NOTE: the final check in the if conditional below was added to make sure + * that we do not execute the fast case unless the cancellation queue is + * empty on the destination PE. 
Otherwise we need to invoke the normal + * scheduling routines to make sure that a forward event doesn't bypass a + * cancellation event with an earlier timestamp. This is helpful for + * stateful models that produce incorrect results when presented with + * duplicate messages with no rollback between them. + */ + if(me == dest_pe && e->dest_lp->kp->last_time <= e->recv_ts && !dest_pe->cancel_q) { + /* Fast case, we are sending to our own PE and + * there is no rollback caused by this send. + */ + start = tw_clock_read(); + tw_pq_enqueue(dest_pe->pq, e); + dest_pe->stats.s_pq += tw_clock_read() - start; + return; + } + + if (me->id == dest_pe->id) { + /* Slower, but still local send, so put into top + * of dest_pe->event_q. + */ + e->state.owner = TW_pe_event_q; + tw_eventq_push(&dest_pe->event_q, e); + return; + } + + /* Never should happen; MPI should have gotten the + * message to the correct node without needing us + * to redirect the message there for it. This is + * probably a serious bug with the event headers + * not being formatted right. + */ + tw_error( + TW_LOC, + "Event received by PE %u but meant for PE %u", + me->id, + dest_pe->id); +} + +static int +send_begin(tw_pe *me) +{ + int changed = 0; + + while (posted_sends.cur < send_buffer) + { + tw_event *e = tw_eventq_peek(&outq);//next event + tw_node *dest_node = NULL; + +#if ROSS_MEMORY + tw_event *tmp_prev = NULL; + + tw_lp *tmp_lp = NULL; + + tw_memory *memory = NULL; + tw_memory *m = NULL; + + char *buffer = NULL; + + size_t mem_size = 0; + + unsigned position = 0; +#endif + + if (!e) + break; + + if(e == me->abort_event) + tw_error(TW_LOC, "Sending abort event!"); + + unsigned id = posted_sends.cur; + dest_node = tw_net_onnode((*e->src_lp->type->map) + ((tw_lpid) e->dest_lp)); + + //if(!e->state.cancel_q) + //e->event_id = (tw_eventid) ++me->seq_num; + + e->send_pe = (tw_peid) g_tw_mynode; + e->send_lp = e->src_lp->gid; + +#if ROSS_MEMORY + // pack pointers + tmp_prev = e->prev; + tmp_lp = e->src_lp; + + // delete when working + e->src_lp = NULL; + + memory = NULL; + if(e->memory) + { + memory = e->memory; + e->memory = (tw_memory *) tw_memory_getsize(me, memory->fd); + e->prev = (tw_event *) memory->fd; + mem_size = (size_t) e->memory; + } + + buffer = posted_sends.buffers[id]; + memcpy(&buffer[position], e, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; + + // restore pointers + e->prev = tmp_prev; + e->src_lp = tmp_lp; + + m = NULL; + while(memory) + { + m = memory->next; + + if(m) + { + memory->next = (tw_memory *) + tw_memory_getsize(me, m->fd); + memory->fd = m->fd; + } + + if(position + mem_size > TW_MEMORY_BUFFER_SIZE) + tw_error(TW_LOC, "Out of buffer space!"); + + memcpy(&buffer[position], memory, mem_size); + position += mem_size; + + memory->nrefs--; + tw_memory_unshift(e->src_lp, memory, memory->fd); + + if(NULL != (memory = m)) + mem_size = tw_memory_getsize(me, memory->fd); + } + + e->memory = NULL; + + if (MPI_Isend(buffer, + EVENT_SIZE(e), + MPI_BYTE, + *dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#else + if (MPI_Isend(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + (int)*dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#endif + + tw_eventq_pop(&outq); + e->state.owner = e->state.cancel_q + ? 
TW_net_acancel + : TW_net_asend; + + posted_sends.event_list[id] = e; + posted_sends.cur++; + me->s_nwhite_sent++; + changed = 1; + } + + return changed; +} + +static void +send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id) +{ + me->stats.s_nsend_network++; + // instrumentation + e->src_lp->kp->kp_stats->s_nsend_network++; + e->src_lp->lp_stats->s_nsend_network++; + + if (e->state.owner == TW_net_asend) { + if (e->state.cancel_asend) { + /* Event was cancelled during transmission. We must + * send another message to pass the cancel flag to + * the other node. + */ + e->state.cancel_asend = 0; + e->state.cancel_q = 1; + tw_eventq_push(&outq, e); + } else { + /* Event finished transmission and was not cancelled. + * Add to our sent event queue so we can retain the + * event in case we need to cancel it later. Note it + * is currently in remote format and must be converted + * back to local format for fossil collection. + */ + e->state.owner = TW_pe_sevent_q; + if( g_tw_synchronization_protocol == CONSERVATIVE ) + tw_event_free(me, e); + } + + return; + } + + if (e->state.owner == TW_net_acancel) { + /* We just finished sending the cancellation message + * for this event. We need to free the buffer and + * make it available for reuse. + */ + tw_event_free(me, e); + return; + } + + /* Never should happen, not unless we somehow broke this + * module's other functions related to sending an event. + */ + + tw_error( + TW_LOC, + "Don't know how to finish send of owner=%u, cancel_q=%d", + e->state.owner, + e->state.cancel_q); + +} + +static void +service_queues(tw_pe *me) +{ + int changed; + do { + changed = test_q_recv(&posted_recvs, me, recv_finish); + changed |= test_q_send(&posted_sends, me, send_finish); + changed |= recv_begin(me); + changed |= send_begin(me); + } while (changed); +} + +/* + * NOTE: Chris believes that this network layer is too aggressive at + * reading events out of the network.. so we are modifying the algorithm + * to only send events when tw_net_send it called, and only read events + * when tw_net_read is called. + */ +void +tw_net_read(tw_pe *me) +{ + service_queues(me); +} + +void +tw_net_send(tw_event *e) +{ + tw_pe * me = e->src_lp->pe; + int changed = 0; + + e->state.remote = 0; + e->state.owner = TW_net_outq; + tw_eventq_unshift(&outq, e); + + do + { + changed = test_q_send(&posted_sends, me, send_finish); + changed |= send_begin(me); + } while (changed); +} + +void +tw_net_cancel(tw_event *e) +{ + tw_pe *src_pe = e->src_lp->pe; + + switch (e->state.owner) { + case TW_net_outq: + /* Cancelled before we could transmit it. Do not + * transmit the event and instead just release the + * buffer back into our own free list. + */ + tw_eventq_delete_any(&outq, e); + tw_event_free(src_pe, e); + + return; + + break; + + case TW_net_asend: + /* Too late. We've already let MPI start to send + * this event over the network. We can't pull it + * back now without sending another message to do + * the cancel. + * + * Setting the cancel_q flag will signal us to do + * another message send once the current send of + * this message is completed. + */ + e->state.cancel_asend = 1; + break; + + case TW_pe_sevent_q: + /* Way late; the event was already sent and is in + * our sent event queue. Mark it as a cancel and + * place it at the front of the outq. + */ + e->state.cancel_q = 1; + tw_eventq_unshift(&outq, e); + break; + + default: + /* Huh? Where did you come from? Why are we being + * told about you? We did not send you so we cannot + * cancel you! 
+ */ + tw_error( + TW_LOC, + "Don't know how to cancel event owned by %u", + e->state.owner); + } + + service_queues(src_pe); +} + +/** + * tw_net_statistics + * @brief Function to output the statistics + * @attention Notice that the MPI_Reduce "count" parameter is greater than one. + * We are reducing on multiple variables *simultaneously* so if you change + * this function or the struct tw_statistics, you must update the other. + **/ +tw_statistics * +tw_net_statistics(tw_pe * me, tw_statistics * s) +{ + if(MPI_Reduce(&(s->s_max_run_time), + &me->stats.s_max_run_time, + 1, + MPI_DOUBLE, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_net_events), + &me->stats.s_net_events, + 17, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&s->s_min_detected_offset, + &me->stats.s_min_detected_offset, + 1, + MPI_DOUBLE, + MPI_MIN, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_total), + &me->stats.s_total, + 16, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if (MPI_Reduce(&s->s_events_past_end, + &me->stats.s_events_past_end, + 3, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + +#ifdef USE_RIO + if (MPI_Reduce(&s->s_rio_load, + &me->stats.s_rio_load, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + if (MPI_Reduce(&s->s_rio_lp_init, + &me->stats.s_rio_lp_init, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); +#endif + + return &me->stats; +} + + From 62037aee490585e67018704464d02f7a025d1c6e Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Mon, 18 Nov 2019 19:27:40 -0500 Subject: [PATCH 21/24] Create queue-network-mpi.c So, it says queue but I switched implementations to a stack. Still messy, should be cleaned up. Small performance issue because cur variable does not shrink, causing MPI and some loops dependent on cur to check more elements than it needs to. I have a fix for it that is not currently implemented. At the end of test_q, just iterate from cur backwards to the next cell of the array that is not null, then we would need to check if we place events farther in the array then cur and either throw larger value out and continue with the next value or set cur to that new value. 
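
A minimal sketch of the unimplemented fix described above, assuming it runs at the end of test_q and uses only the act_q fields already present in this patch (event_list, cur), and assuming completed slots have already been set to NULL in event_list as test_q_recv/test_q_send do in this file; the helper name shrink_cur is illustrative, not part of the commit:

    /* Trim trailing completed (NULL) slots so MPI_Testsome stops scanning
     * request entries that are known to be free. */
    static void
    shrink_cur(struct act_q *q)
    {
        while (q->cur > 0 && q->event_list[q->cur - 1] == NULL)
            q->cur--;
    }

As the message notes, a complete fix would also need the companion check for events placed into slots beyond cur, either discarding the larger value or moving cur up to it.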
--- core/queue-network-mpi.c | 1055 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1055 insertions(+) create mode 100644 core/queue-network-mpi.c diff --git a/core/queue-network-mpi.c b/core/queue-network-mpi.c new file mode 100644 index 000000000..9ebdb800a --- /dev/null +++ b/core/queue-network-mpi.c @@ -0,0 +1,1055 @@ +#include +#include +#include "ross.h" + +MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; +int custom_communicator = 0; + +static long id_tmp; + +struct act_q +{ + const char *name; + + tw_event **event_list; + MPI_Request *req_list; + int *idx_list; + MPI_Status *status_list; + int *free_idx_list;//add, que of free indices + int *overflow_anti; + +#if ROSS_MEMORY + char **buffers; +#endif + unsigned int cur; + unsigned int front;//add, front of queue + //int coda;//add, back of queue but back is already a variable somewhere + int size_of_fr_q;//add, size of queue array + int num_in_fr_q;//add, number of elements in queue + +}; + +#define EVENT_TAG 1 + +#if ROSS_MEMORY +#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE +#else +#define EVENT_SIZE(e) g_tw_event_msg_sz +#endif + +static struct act_q posted_sends; +static struct act_q posted_recvs; +static tw_eventq outq; + +static unsigned int read_buffer = 16; +static unsigned int send_buffer = 1024; +static int world_size = 1; + +void deal_with_cur(struct act_q *q)// this is for MPI_testsome input +{ +// printf("cur = %d, front %d\n", q->cur,q->front); + if(q->cur < (q->front))//(q->front))//checks to see if + { + q->cur++; +// printf("%s: CUR IS %d\n",q->name, q->cur); + + return; + } + else + { + return; + } +} + + + +void fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que +{ + if(*frontOrCoda != q->size_of_fr_q)//don't mess with queue + { + return; + } + else//mess with queue + { + *frontOrCoda = 0; + return; + } +} + +void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue +{ + q->front--; + q->free_idx_list[q->front] = ele; + return; +} + +int fr_q_dq(struct act_q *q) // free index queue; dequeue +{ + int rv = q->free_idx_list[q->front]; + q->front++; + q->num_in_fr_q--; + return rv; +} + +static const tw_optdef mpi_opts[] = { + TWOPT_GROUP("ROSS MPI Kernel"), + TWOPT_UINT( + "read-buffer", + read_buffer, + "network read buffer size in # of events"), + TWOPT_UINT( + "send-buffer", + send_buffer, + "network send buffer size in # of events"), + TWOPT_END() +}; + +// Forward declarations of functions used in MPI network message processing +static int recv_begin(tw_pe *me); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static int send_begin(tw_pe *me); +static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); + +// Start of implmentation of network processing routines/functions +void tw_comm_set(MPI_Comm comm) +{ + MPI_COMM_ROSS = comm; + custom_communicator = 1; +} + +const tw_optdef * +tw_net_init(int *argc, char ***argv) +{ + int my_rank; + + int initialized; + MPI_Initialized(&initialized); + + if (!initialized) { + if (MPI_Init(argc, argv) != MPI_SUCCESS) + tw_error(TW_LOC, "MPI_Init failed."); + } + if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); + + g_tw_masternode = 0; + g_tw_mynode = my_rank; + + return mpi_opts; +} + +static void +init_q(struct act_q *q, const char *name) +{ + unsigned int n; 
+#if ROSS_MEMORY + unsigned int i; +#endif + + if(q == &posted_sends) + { + n = send_buffer; + } + else + { + + n = read_buffer; + } + q->name = name; + q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); + q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); + q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + //can I shrink this initialization? + q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue + q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue + q->front = 0;// front of queue + q->size_of_fr_q = n+1;// for wraparound + q->num_in_fr_q = 0; // number of elements in queue + + int i = 0; + while(ifree_idx_list[i] = i; +// fr_q_aq(q,i); + i++; + } +// q->front = 1; +// printf("front = %d\n",q->front); + q->overflow_anti[0]=1; + q->num_in_fr_q = n;// number of elements in queue + + +#if ROSS_MEMORY + q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); + + for(i = 0; i < n; i++) + q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); +#endif +} + +tw_node * tw_net_onnode(tw_peid gid) { + id_tmp = gid; + return &id_tmp; +} + +unsigned int +tw_nnodes(void) +{ + return world_size; +} + +void +tw_net_start(void) +{ + if (MPI_Comm_size(MPI_COMM_ROSS, &world_size) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_size(MPI_COMM_ROSS)"); + + if( g_tw_mynode == 0) + { + printf("tw_net_start: Found world size to be %d \n", world_size ); + } + + // Check after tw_nnodes is defined + if(tw_nnodes() == 1) { + // force the setting of SEQUENTIAL protocol + if (g_tw_synchronization_protocol == NO_SYNCH) { + g_tw_synchronization_protocol = SEQUENTIAL; + } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { + g_tw_synchronization_protocol = SEQUENTIAL; + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); + } + } + +// tw_pe_create(1); + tw_pe_init(); + + //If we're in (some variation of) optimistic mode, we need this hash + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) { + g_tw_pe->hash_t = tw_hash_create(); + } else { + g_tw_pe->hash_t = NULL; + } + + if (send_buffer < 1) + tw_error(TW_LOC, "network send buffer must be >= 1"); + if (read_buffer < 1) + tw_error(TW_LOC, "network read buffer must be >= 1"); + + init_q(&posted_sends, "MPI send queue"); + init_q(&posted_recvs, "MPI recv queue"); + + g_tw_net_device_size = read_buffer; + + // pre-post all the Irecv operations + recv_begin( g_tw_pe ); +} + +void +tw_net_abort(void) +{ + MPI_Abort(MPI_COMM_ROSS, 1); + exit(1); +} + +void +tw_net_stop(void) +{ + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } +} + +void +tw_net_barrier(void) +{ + if (MPI_Barrier(MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to wait for MPI_Barrier"); +} + +tw_stime +tw_net_minimum(void) +{ + tw_stime m = DBL_MAX; + tw_event *e; + int i; + + e = outq.head; + while (e) { + if (m > e->recv_ts) + m = e->recv_ts; + e = e->next; + } + + for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) 
+ e = posted_sends.event_list[i]; + if(e == NULL) + {} + else if(m > e->recv_ts) + m = e->recv_ts; + else + {} + } + + return m; +} + +static int +test_q_recv( + struct act_q * q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i, n, indicator; + indicator = 1; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) + return 0; + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + q->overflow_anti[0]=1; + q->num_in_fr_q+=ready; + i = ready-1; + while ( -1 != i) + { + + tw_event *e; + n = q->idx_list[i]; + e = q->event_list[n]; + fr_q_aq(q, n); //add n onto queue +// cur_reduction(q,n); + + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + if (indicator != q->overflow_anti[0]) + { + indicator = q->overflow_anti[0]; + } + else + { + q->event_list[n] = NULL; + } + i--; + } + + i = 1; // first element of q->overflow_anti is the number of + + while (i < q->overflow_anti[0])//takes care of out of order messages + { + + tw_event *e; + n = q->overflow_anti[i]; + e = q->event_list[n]; + q->event_list[n] = NULL; + late_recv_finish(me, e, NULL, q, n); + //might need an augmented version for ROSS_MEMORY? + i++; + } + +// V breaks everything. +// alter_cur(q); + + return 1; +} + + +static int +test_q_send( + struct act_q * q, + tw_pe *me, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i, n; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) + return 0; + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + i = ready-1; + + while(-1 != i) + { + + tw_event *e; + + n = q->idx_list[i]; + e = q->event_list[n]; + fr_q_aq(q, n);//add n onto queue +// cur_reduction(q,n); + q->event_list[n] = NULL; + i--; + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + } + + + q->num_in_fr_q+=ready; +// alter_cur(q); + + return 1; +} + + + + +static int +recv_begin(tw_pe *me) +{ + MPI_Status status; + + tw_event *e = NULL; + + int flag = 0; + int changed = 0; + + while (0 < posted_recvs.num_in_fr_q)//fix these lines + { + + if(!(e = tw_event_grab(me))) + { + if(tw_gvt_inprogress(me)) + tw_error(TW_LOC, "Out of events in GVT! 
Consider increasing --extramem"); + return changed; + } + + int id = fr_q_dq(&posted_recvs); + +#if ROSS_MEMORY + if( MPI_Irecv(posted_recvs.buffers[id], + EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#else + if( MPI_Irecv(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#endif + { + tw_event_free(me, e); + return changed; + } + + posted_recvs.event_list[id] = e; + deal_with_cur(&posted_recvs); + changed = 1; + } + return changed; +} + + +static void +late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + /* +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif + /* + + /* + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + */ + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. + + tw_pe *dest_pe; + dest_pe = e->dest_lp->pe; + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + g_tw_pe->avl_tree_size++; + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + + return; + +} + +static void +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + tw_pe *dest_pe; + tw_clock start; + +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif + me->stats.s_nread_network++; + me->s_nwhite_recv++; + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; + + + // instrumentation + e->dest_lp->kp->kp_stats->s_nread_network++; + e->dest_lp->lp_stats->s_nread_network++; + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + + if(e->state.cancel_q) + { + + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. 
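+
+       // If the lookup above did not find the event (the cancel arrived
+       // before the event it refers to within this batch), the else branch
+       // below records this slot's index in overflow_anti; test_q_recv later
+       // hands those deferred entries to late_recv_finish once the rest of
+       // the batch has been processed.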
+ + if(cancel!=NULL) + { + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + } + else + { + q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later + q->overflow_anti[0]++; + } + + return; + } + + + + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { + tw_hash_insert(me->hash_t, e, e->send_pe); + e->state.remote = 1; + } + +#if ROSS_MEMORY + mem_size = (size_t) e->memory; + mem_fd = (tw_fd) e->prev; + + last = NULL; + while(mem_size) + { + memory = tw_memory_alloc(e->dest_lp, mem_fd); + + if(last) + last->next = memory; + else + e->memory = memory; + + memcpy(memory, &buffer[position], mem_size); + position += mem_size; + + memory->fd = mem_fd; + memory->nrefs = 1; + + mem_size = (size_t) memory->next; + mem_fd = memory->fd; + + last = memory; + } +#endif + + /* NOTE: the final check in the if conditional below was added to make sure + * that we do not execute the fast case unless the cancellation queue is + * empty on the destination PE. Otherwise we need to invoke the normal + * scheduling routines to make sure that a forward event doesn't bypass a + * cancellation event with an earlier timestamp. This is helpful for + * stateful models that produce incorrect results when presented with + * duplicate messages with no rollback between them. + */ + if(me == dest_pe && e->dest_lp->kp->last_time <= e->recv_ts && !dest_pe->cancel_q) { + /* Fast case, we are sending to our own PE and + * there is no rollback caused by this send. + */ + start = tw_clock_read(); + tw_pq_enqueue(dest_pe->pq, e); + dest_pe->stats.s_pq += tw_clock_read() - start; + return; + } + + if (me->id == dest_pe->id) { + /* Slower, but still local send, so put into top + * of dest_pe->event_q. + */ + e->state.owner = TW_pe_event_q; + tw_eventq_push(&dest_pe->event_q, e); + return; + } + + /* Never should happen; MPI should have gotten the + * message to the correct node without needing us + * to redirect the message there for it. This is + * probably a serious bug with the event headers + * not being formatted right. 
+ */ + tw_error( + TW_LOC, + "Event received by PE %u but meant for PE %u", + me->id, + dest_pe->id); +} + +static int +send_begin(tw_pe *me) +{ + int changed = 0; + + while (0 < posted_sends.num_in_fr_q) + { + tw_event *e = tw_eventq_peek(&outq);//next event + tw_node *dest_node = NULL; + +#if ROSS_MEMORY + tw_event *tmp_prev = NULL; + + tw_lp *tmp_lp = NULL; + + tw_memory *memory = NULL; + tw_memory *m = NULL; + + char *buffer = NULL; + + size_t mem_size = 0; + + unsigned position = 0; +#endif + + if (!e) + break; + + if(e == me->abort_event) + tw_error(TW_LOC, "Sending abort event!"); + + int id = fr_q_dq(&posted_sends);// fixed, grabs from front of queue, moves front up one element + dest_node = tw_net_onnode((*e->src_lp->type->map) + ((tw_lpid) e->dest_lp)); + + //if(!e->state.cancel_q) + //e->event_id = (tw_eventid) ++me->seq_num; + + e->send_pe = (tw_peid) g_tw_mynode; + e->send_lp = e->src_lp->gid; + +#if ROSS_MEMORY + // pack pointers + tmp_prev = e->prev; + tmp_lp = e->src_lp; + + // delete when working + e->src_lp = NULL; + + memory = NULL; + if(e->memory) + { + memory = e->memory; + e->memory = (tw_memory *) tw_memory_getsize(me, memory->fd); + e->prev = (tw_event *) memory->fd; + mem_size = (size_t) e->memory; + } + + buffer = posted_sends.buffers[id]; + memcpy(&buffer[position], e, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; + + // restore pointers + e->prev = tmp_prev; + e->src_lp = tmp_lp; + + m = NULL; + while(memory) + { + m = memory->next; + + if(m) + { + memory->next = (tw_memory *) + tw_memory_getsize(me, m->fd); + memory->fd = m->fd; + } + + if(position + mem_size > TW_MEMORY_BUFFER_SIZE) + tw_error(TW_LOC, "Out of buffer space!"); + + memcpy(&buffer[position], memory, mem_size); + position += mem_size; + + memory->nrefs--; + tw_memory_unshift(e->src_lp, memory, memory->fd); + + if(NULL != (memory = m)) + mem_size = tw_memory_getsize(me, memory->fd); + } + + e->memory = NULL; + + if (MPI_Isend(buffer, + EVENT_SIZE(e), + MPI_BYTE, + *dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#else + if (MPI_Isend(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + (int)*dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#endif + + tw_eventq_pop(&outq); + e->state.owner = e->state.cancel_q + ? TW_net_acancel + : TW_net_asend; + + posted_sends.event_list[id] = e; + deal_with_cur(&posted_sends); + + me->s_nwhite_sent++; + changed = 1; + } + return changed; +} + +static void +send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id) +{ + me->stats.s_nsend_network++; + // instrumentation + e->src_lp->kp->kp_stats->s_nsend_network++; + e->src_lp->lp_stats->s_nsend_network++; + + if (e->state.owner == TW_net_asend) { + if (e->state.cancel_asend) { + /* Event was cancelled during transmission. We must + * send another message to pass the cancel flag to + * the other node. + */ + e->state.cancel_asend = 0; + e->state.cancel_q = 1; + tw_eventq_push(&outq, e); + } else { + /* Event finished transmission and was not cancelled. + * Add to our sent event queue so we can retain the + * event in case we need to cancel it later. Note it + * is currently in remote format and must be converted + * back to local format for fossil collection. 
+ */ + e->state.owner = TW_pe_sevent_q; + if( g_tw_synchronization_protocol == CONSERVATIVE ) + tw_event_free(me, e); + } + + return; + } + + if (e->state.owner == TW_net_acancel) { + /* We just finished sending the cancellation message + * for this event. We need to free the buffer and + * make it available for reuse. + */ + tw_event_free(me, e); + return; + } + + /* Never should happen, not unless we somehow broke this + * module's other functions related to sending an event. + */ + + tw_error( + TW_LOC, + "Don't know how to finish send of owner=%u, cancel_q=%d", + e->state.owner, + e->state.cancel_q); + +} + +static void +service_queues(tw_pe *me) +{ + int changed; + do { + changed = test_q_recv(&posted_recvs, me, recv_finish); + changed |= test_q_send(&posted_sends, me, send_finish); + changed |= recv_begin(me); + changed |= send_begin(me); + } while (changed); +} + +/* + * NOTE: Chris believes that this network layer is too aggressive at + * reading events out of the network.. so we are modifying the algorithm + * to only send events when tw_net_send it called, and only read events + * when tw_net_read is called. + */ +void +tw_net_read(tw_pe *me) +{ + service_queues(me); +} + +void +tw_net_send(tw_event *e) +{ + tw_pe * me = e->src_lp->pe; + int changed = 0; + + e->state.remote = 0; + e->state.owner = TW_net_outq; + tw_eventq_unshift(&outq, e); + + do + { + changed = test_q_send(&posted_sends, me, send_finish); + changed |= send_begin(me); + } while (changed); +} + +void +tw_net_cancel(tw_event *e) +{ + tw_pe *src_pe = e->src_lp->pe; + + switch (e->state.owner) { + case TW_net_outq: + /* Cancelled before we could transmit it. Do not + * transmit the event and instead just release the + * buffer back into our own free list. + */ + tw_eventq_delete_any(&outq, e); + tw_event_free(src_pe, e); + + return; + + break; + + case TW_net_asend: + /* Too late. We've already let MPI start to send + * this event over the network. We can't pull it + * back now without sending another message to do + * the cancel. + * + * Setting the cancel_q flag will signal us to do + * another message send once the current send of + * this message is completed. + */ + e->state.cancel_asend = 1; + break; + + case TW_pe_sevent_q: + /* Way late; the event was already sent and is in + * our sent event queue. Mark it as a cancel and + * place it at the front of the outq. + */ + e->state.cancel_q = 1; + tw_eventq_unshift(&outq, e); + break; + + default: + /* Huh? Where did you come from? Why are we being + * told about you? We did not send you so we cannot + * cancel you! + */ + tw_error( + TW_LOC, + "Don't know how to cancel event owned by %u", + e->state.owner); + } + + service_queues(src_pe); +} + +/** + * tw_net_statistics + * @brief Function to output the statistics + * @attention Notice that the MPI_Reduce "count" parameter is greater than one. + * We are reducing on multiple variables *simultaneously* so if you change + * this function or the struct tw_statistics, you must update the other. 
+ **/ +tw_statistics * +tw_net_statistics(tw_pe * me, tw_statistics * s) +{ + if(MPI_Reduce(&(s->s_max_run_time), + &me->stats.s_max_run_time, + 1, + MPI_DOUBLE, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_net_events), + &me->stats.s_net_events, + 17, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&s->s_min_detected_offset, + &me->stats.s_min_detected_offset, + 1, + MPI_DOUBLE, + MPI_MIN, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_total), + &me->stats.s_total, + 16, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if (MPI_Reduce(&s->s_events_past_end, + &me->stats.s_events_past_end, + 3, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + +#ifdef USE_RIO + if (MPI_Reduce(&s->s_rio_load, + &me->stats.s_rio_load, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + if (MPI_Reduce(&s->s_rio_lp_init, + &me->stats.s_rio_lp_init, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); +#endif + + return &me->stats; +} + From cb764321d5dd973b7922e1a2cb532726d6bcf899 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Mon, 18 Nov 2019 19:33:35 -0500 Subject: [PATCH 22/24] Create hole-network-mpi.c This is the version that fills holes in the MPI request array by taking events from the back of the array and places them in spots where the request have been satisfied. 
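
The hole-filling step this commit adds to test_q can be summarized with the sketch below. It follows the fields used in the patch (event_list, req_list, idx_list, cur), but the standalone helper fill_holes and its exact loop structure are an editorial illustration, not code from the commit:

    /* After MPI_Testsome reports `ready` completed indices in idx_list,
     * move still-pending entries from the back of the arrays into the
     * completed ("hole") slots so the active region [0, cur) stays dense. */
    static void
    fill_holes(struct act_q *q, int ready)
    {
        int i, n, back;

        for (i = 0; i < ready; i++)               /* mark completed slots */
            q->event_list[q->idx_list[i]] = NULL;

        back = q->cur - 1;
        for (i = 0; i < ready; i++) {
            n = q->idx_list[i];
            while (back > n && q->event_list[back] == NULL)
                back--;                           /* skip holes at the back */
            if (n < back) {                       /* pull a pending entry forward */
                q->event_list[n] = q->event_list[back];
                q->req_list[n]   = q->req_list[back];
                q->event_list[back] = NULL;
                back--;
            }
        }
        q->cur -= ready;                          /* active region shrinks */
    }

In the patch itself this logic is inlined at the end of test_q, with check_b_ind() doing the backward walk and memcpy() copying the MPI_Request; direct assignment works equally well for MPI_Request handles.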
--- core/hole-network-mpi.c | 1015 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1015 insertions(+) create mode 100644 core/hole-network-mpi.c diff --git a/core/hole-network-mpi.c b/core/hole-network-mpi.c new file mode 100644 index 000000000..5cd63d91b --- /dev/null +++ b/core/hole-network-mpi.c @@ -0,0 +1,1015 @@ +#include +#include + + +MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; +int custom_communicator = 0; + +static long id_tmp; + +struct act_q +{ + const char *name; + + tw_event **event_list; + MPI_Request *req_list; + int *idx_list; + MPI_Status *status_list; + int *overflow_anti; + unsigned int length; + +#if ROSS_MEMORY + char **buffers; +#endif + + unsigned int cur; +}; + +#define EVENT_TAG 1 + +#if ROSS_MEMORY +#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE +#else +#define EVENT_SIZE(e) g_tw_event_msg_sz +#endif + +static struct act_q posted_sends; +static struct act_q posted_recvs; +static tw_eventq outq; + +static unsigned int read_buffer = 16; +static unsigned int send_buffer = 1024; +static int world_size = 1; + +static const tw_optdef mpi_opts[] = { + TWOPT_GROUP("ROSS MPI Kernel"), + TWOPT_UINT( + "read-buffer", + read_buffer, + "network read buffer size in # of events"), + TWOPT_UINT( + "send-buffer", + send_buffer, + "network send buffer size in # of events"), + TWOPT_END() +}; + +// Forward declarations of functions used in MPI network message processing +static int recv_begin(tw_pe *me); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static int send_begin(tw_pe *me); +static void send_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id); + +// Start of implmentation of network processing routines/functions +void tw_comm_set(MPI_Comm comm) +{ + MPI_COMM_ROSS = comm; + custom_communicator = 1; +} + +const tw_optdef * +tw_net_init(int *argc, char ***argv) +{ + int my_rank; + int initialized; + MPI_Initialized(&initialized); + + if (!initialized) { + if (MPI_Init(argc, argv) != MPI_SUCCESS) + tw_error(TW_LOC, "MPI_Init failed."); + } + + if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) != MPI_SUCCESS) + tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); + + g_tw_masternode = 0; + g_tw_mynode = my_rank; + + return mpi_opts; +} + +static void +init_q(struct act_q *q, const char *name) +{ + unsigned int n; +#if ROSS_MEMORY + unsigned int i; +#endif + + if(q == &posted_sends) + n = send_buffer; + else + n = read_buffer; + + q->name = name; + q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); + q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); + q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); + q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); + q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue + q->overflow_anti[0]=1; +// q->cur = 0; + q->length = n; + +#if ROSS_MEMORY + q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); + + for(i = 0; i < n; i++) + q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); +#endif +} + +tw_node * tw_net_onnode(tw_peid gid) { + id_tmp = gid; + return &id_tmp; +} + +unsigned int +tw_nnodes(void) +{ + return world_size; +} + +void +tw_net_start(void) +{ + if (MPI_Comm_size(MPI_COMM_ROSS, &world_size) != MPI_SUCCESS) + 
tw_error(TW_LOC, "Cannot get MPI_Comm_size(MPI_COMM_ROSS)"); + + if( g_tw_mynode == 0) + { + printf("tw_net_start: Found world size to be %d \n", world_size ); + } + + // Check after tw_nnodes is defined + if(tw_nnodes() == 1) { + // force the setting of SEQUENTIAL protocol + if (g_tw_synchronization_protocol == NO_SYNCH) { + g_tw_synchronization_protocol = SEQUENTIAL; + } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { + g_tw_synchronization_protocol = SEQUENTIAL; + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); + } + } + +// tw_pe_create(1); + tw_pe_init(); + + //If we're in (some variation of) optimistic mode, we need this hash + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME) { + g_tw_pe->hash_t = tw_hash_create(); + } else { + g_tw_pe->hash_t = NULL; + } + + if (send_buffer < 1) + tw_error(TW_LOC, "network send buffer must be >= 1"); + if (read_buffer < 1) + tw_error(TW_LOC, "network read buffer must be >= 1"); + + init_q(&posted_sends, "MPI send queue"); + init_q(&posted_recvs, "MPI recv queue"); + + g_tw_net_device_size = read_buffer; + + // pre-post all the Irecv operations + recv_begin( g_tw_pe ); +} + +void +tw_net_abort(void) +{ + MPI_Abort(MPI_COMM_ROSS, 1); + exit(1); +} + +void +tw_net_stop(void) +{ +#ifdef USE_DAMARIS + if (g_st_damaris_enabled) + st_damaris_ross_finalize(); + else + { + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } + } +#else + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } +#endif +} + +void +tw_net_barrier(void) +{ + if (MPI_Barrier(MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to wait for MPI_Barrier"); +} + +tw_stime +tw_net_minimum(void) +{ + tw_stime m = DBL_MAX; + tw_event *e; + int i; + + e = outq.head; + while (e) { + if (m > e->recv_ts) + m = e->recv_ts; + e = e->next; + } + + for (i = 0; i < posted_sends.cur; i++) { + e = posted_sends.event_list[i]; + if (m > e->recv_ts) + m = e->recv_ts; + } + + return m; +} + +void check_b_ind( int * b_index, struct act_q * q) +{ + while(0 <= *b_index && *b_index<=q->length) + { + + if(q->event_list[*b_index]== NULL) + { + *b_index = *b_index-1; + } + else + { + return; + } + + + } + *b_index = 0; + return; + +} + + +static int +test_q( + struct act_q * q, + tw_pe *me, + int pick_queue, + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) +{ + int ready, i, n, indicator; + q->overflow_anti[0] = 1; // + indicator = 1; + +#if ROSS_MEMORY + char *tmp; +#endif + + + if (!q->cur) + { + return 0; + } + + if (MPI_Testsome( + q->cur, + q->req_list, + &ready, + q->idx_list, + q->status_list) != MPI_SUCCESS) { + tw_error( + TW_LOC, + "MPI_testsome failed with %u items in %s", + q->cur, + q->name); + } + + if (1 > ready) + return 0; + + if (pick_queue == 1) { + + for (i = 0; i < ready; i++) + { + + tw_event *e; + + n = q->idx_list[i]; + e = q->event_list[n]; + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + + if (indicator != q->overflow_anti[0]) + { + indicator = q->overflow_anti[0]; + } + else + { + q->event_list[n] = NULL; + } + } + i = 1; + while (i < q->overflow_anti[0]) { + + tw_event *e; + n = q->overflow_anti[i]; + e = q->event_list[n]; + late_recv_finish(me, e, NULL, q, n); + 
q->event_list[n] = NULL; + + //might need an augmented version for ROSS_MEMORY? + + i++; + + } + } + else + { + for (i = 0; i < ready; i++) + { + + tw_event *e; + + n = q->idx_list[i]; + e = q->event_list[n]; + q->event_list[n] = NULL; + + +#if ROSS_MEMORY + finish(me, e, q->buffers[n], q, n); +#else + finish(me, e, NULL, q, n); +#endif + } + + } + + i = 0; + + int b_index; + + b_index = q->cur-1; + + +// if(strcmp(q->name,"MPI send queue")==0) +// { +// printf("%s: %d %d %d %d \n", q->name, b_index, q->cur,ready, q->cur - ready); +// } + while(iidx_list[i]; + + check_b_ind(&b_index, q); + if (n < b_index) + { +// if(strcmp(q->name,"MPI send queue")==0) +// { +// printf("%s: putting away index %d\n",q->name, n); +// } +// printf("filling %d with event in %d\n",n , b_index) ; + q->event_list[n] = q->event_list[b_index]; + memcpy(&q->req_list[n],&q->req_list[b_index],sizeof(q->req_list[0])); + b_index--; + } + i++; + + } + q->cur -= ready; + + if(strcmp(q->name,"MPI send queue")!=0) + { +// printf("%s: returning with %d\n", q->name,q->cur); + } + return 1; +} + + +static int +recv_begin(tw_pe *me) +{ + MPI_Status status; + + tw_event *e = NULL; + + int flag = 0; + int changed = 0; + + while (posted_recvs.cur < read_buffer) + { + unsigned id = posted_recvs.cur; + + if(!(e = tw_event_grab(me))) + { + if(tw_gvt_inprogress(me)) + tw_error(TW_LOC, "Out of events in GVT! Consider increasing --extramem"); + return changed; + } + +#if ROSS_MEMORY + if( MPI_Irecv(posted_recvs.buffers[id], + EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#else + if( MPI_Irecv(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + MPI_ANY_SOURCE, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_recvs.req_list[id]) != MPI_SUCCESS) +#endif + { + tw_event_free(me, e); + return changed; + } + + + posted_recvs.event_list[id] = e; + posted_recvs.cur++; + changed = 1; + } + + return changed; +} + + +static void +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + tw_pe *dest_pe; + tw_clock start; + +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif + me->stats.s_nread_network++; + me->s_nwhite_recv++; + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; + + + // instrumentation + e->dest_lp->kp->kp_stats->s_nread_network++; + e->dest_lp->lp_stats->s_nread_network++; + + + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + + if(e->state.cancel_q) + { + + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. 
+ + if(cancel!=NULL) + { + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + } + else + { + q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later + q->overflow_anti[0]++; + } + + return; + } + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + if (g_tw_synchronization_protocol == OPTIMISTIC || + g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || + g_tw_synchronization_protocol == OPTIMISTIC_REALTIME ) { + tw_hash_insert(me->hash_t, e, e->send_pe); + e->state.remote = 1; + } + +#if ROSS_MEMORY + mem_size = (size_t) e->memory; + mem_fd = (tw_fd) e->prev; + + last = NULL; + while(mem_size) + { + memory = tw_memory_alloc(e->dest_lp, mem_fd); + + if(last) + last->next = memory; + else + e->memory = memory; + + memcpy(memory, &buffer[position], mem_size); + position += mem_size; + + memory->fd = mem_fd; + memory->nrefs = 1; + + mem_size = (size_t) memory->next; + mem_fd = memory->fd; + + last = memory; + } +#endif + + /* NOTE: the final check in the if conditional below was added to make sure + * that we do not execute the fast case unless the cancellation queue is + * empty on the destination PE. Otherwise we need to invoke the normal + * scheduling routines to make sure that a forward event doesn't bypass a + * cancellation event with an earlier timestamp. This is helpful for + * stateful models that produce incorrect results when presented with + * duplicate messages with no rollback between them. + */ + if(me == dest_pe && e->dest_lp->kp->last_time <= e->recv_ts && !dest_pe->cancel_q) { + /* Fast case, we are sending to our own PE and + * there is no rollback caused by this send. + */ + start = tw_clock_read(); + tw_pq_enqueue(dest_pe->pq, e); + dest_pe->stats.s_pq += tw_clock_read() - start; + return; + } + + if (me->id == dest_pe->id) { + /* Slower, but still local send, so put into top + * of dest_pe->event_q. + */ + e->state.owner = TW_pe_event_q; + tw_eventq_push(&dest_pe->event_q, e); + return; + } + + /* Never should happen; MPI should have gotten the + * message to the correct node without needing us + * to redirect the message there for it. This is + * probably a serious bug with the event headers + * not being formatted right. 
+ */ + tw_error( + TW_LOC, + "Event received by PE %u but meant for PE %u", + me->id, + dest_pe->id); +} + + +static void +late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +{ + tw_pe *dest_pe; + tw_clock start; + +#if ROSS_MEMORY + tw_memory *memory; + tw_memory *last; + tw_fd mem_fd; + + size_t mem_size; + + unsigned position = 0; + + memcpy(e, buffer, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; +#endif + + dest_pe = e->dest_lp->pe; + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + e->cancel_next = NULL; + e->caused_by_me = NULL; + e->cause_next = NULL; + + + if(e->recv_ts < me->GVT) + tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", + me->id, e->send_pe, e->recv_ts, e->state.cancel_q); + + if(tw_gvt_inprogress(me)) + me->trans_msg_ts = ROSS_MIN(me->trans_msg_ts, e->recv_ts); + + // if cancel event, retrieve and flush + // else, store in hash table + + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + g_tw_pe->avl_tree_size++; + + // NOTE: it is possible to cancel the event we + // are currently processing at this PE since this + // MPI module lets me read cancel events during + // event sends over the network. + + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); + + return; + +} + +static int +send_begin(tw_pe *me) +{ + int changed = 0; + + while (posted_sends.cur < send_buffer) + { + tw_event *e = tw_eventq_peek(&outq); + tw_node *dest_node = NULL; + + unsigned id = posted_sends.cur; + +#if ROSS_MEMORY + tw_event *tmp_prev = NULL; + + tw_lp *tmp_lp = NULL; + + tw_memory *memory = NULL; + tw_memory *m = NULL; + + char *buffer = NULL; + + size_t mem_size = 0; + + unsigned position = 0; +#endif + + if (!e) + break; + + if(e == me->abort_event) + tw_error(TW_LOC, "Sending abort event!"); + + dest_node = tw_net_onnode((*e->src_lp->type->map) + ((tw_lpid) e->dest_lp)); + + //if(!e->state.cancel_q) + //e->event_id = (tw_eventid) ++me->seq_num; + + e->send_pe = (tw_peid) g_tw_mynode; + e->send_lp = e->src_lp->gid; + +#if ROSS_MEMORY + // pack pointers + tmp_prev = e->prev; + tmp_lp = e->src_lp; + + // delete when working + e->src_lp = NULL; + + memory = NULL; + if(e->memory) + { + memory = e->memory; + e->memory = (tw_memory *) tw_memory_getsize(me, memory->fd); + e->prev = (tw_event *) memory->fd; + mem_size = (size_t) e->memory; + } + + buffer = posted_sends.buffers[id]; + memcpy(&buffer[position], e, g_tw_event_msg_sz); + position += g_tw_event_msg_sz; + + // restore pointers + e->prev = tmp_prev; + e->src_lp = tmp_lp; + + m = NULL; + while(memory) + { + m = memory->next; + + if(m) + { + memory->next = (tw_memory *) + tw_memory_getsize(me, m->fd); + memory->fd = m->fd; + } + + if(position + mem_size > TW_MEMORY_BUFFER_SIZE) + tw_error(TW_LOC, "Out of buffer space!"); + + memcpy(&buffer[position], memory, mem_size); + position += mem_size; + + memory->nrefs--; + tw_memory_unshift(e->src_lp, memory, memory->fd); + + if(NULL != (memory = m)) + mem_size = tw_memory_getsize(me, memory->fd); + } + + e->memory = NULL; + + if (MPI_Isend(buffer, + EVENT_SIZE(e), + MPI_BYTE, + *dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#else + if (MPI_Isend(e, + (int)EVENT_SIZE(e), + MPI_BYTE, + (int)*dest_node, + EVENT_TAG, + MPI_COMM_ROSS, + &posted_sends.req_list[id]) != MPI_SUCCESS) { + return changed; + } +#endif + + 
tw_eventq_pop(&outq); + e->state.owner = e->state.cancel_q + ? TW_net_acancel + : TW_net_asend; + + posted_sends.event_list[id] = e; + posted_sends.cur++; + me->s_nwhite_sent++; + + changed = 1; + } + return changed; +} + +static void +send_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id) +{ + me->stats.s_nsend_network++; + // instrumentation + e->src_lp->kp->kp_stats->s_nsend_network++; + e->src_lp->lp_stats->s_nsend_network++; + + if (e->state.owner == TW_net_asend) { + if (e->state.cancel_asend) { + /* Event was cancelled during transmission. We must + * send another message to pass the cancel flag to + * the other node. + */ + e->state.cancel_asend = 0; + e->state.cancel_q = 1; + tw_eventq_push(&outq, e); + } else { + /* Event finished transmission and was not cancelled. + * Add to our sent event queue so we can retain the + * event in case we need to cancel it later. Note it + * is currently in remote format and must be converted + * back to local format for fossil collection. + */ + e->state.owner = TW_pe_sevent_q; + if( g_tw_synchronization_protocol == CONSERVATIVE ) + tw_event_free(me, e); + } + + return; + } + + if (e->state.owner == TW_net_acancel) { + /* We just finished sending the cancellation message + * for this event. We need to free the buffer and + * make it available for reuse. + */ + tw_event_free(me, e); + return; + } + + /* Never should happen, not unless we somehow broke this + * module's other functions related to sending an event. + */ + + tw_error( + TW_LOC, + "Don't know how to finish send of owner=%u, cancel_q=%d", + e->state.owner, + e->state.cancel_q); + +} + +static void +service_queues(tw_pe *me) +{ + int changed; + do { + changed = test_q(&posted_recvs, me,1, recv_finish); + changed |= test_q(&posted_sends, me,0, send_finish); + changed |= recv_begin(me); + changed |= send_begin(me); + } while (changed); +} + +/* + * NOTE: Chris believes that this network layer is too aggressive at + * reading events out of the network.. so we are modifying the algorithm + * to only send events when tw_net_send it called, and only read events + * when tw_net_read is called. + */ +void +tw_net_read(tw_pe *me) +{ + service_queues(me); +} + +void +tw_net_send(tw_event *e) +{ + tw_pe * me = e->src_lp->pe; + int changed = 0; + + e->state.remote = 0; + e->state.owner = TW_net_outq; + tw_eventq_unshift(&outq, e); + + do + { + changed = test_q(&posted_sends, me, 0,send_finish); + changed |= send_begin(me); + } while (changed); +} + +void +tw_net_cancel(tw_event *e) +{ + tw_pe *src_pe = e->src_lp->pe; + + switch (e->state.owner) { + case TW_net_outq: + /* Cancelled before we could transmit it. Do not + * transmit the event and instead just release the + * buffer back into our own free list. + */ + tw_eventq_delete_any(&outq, e); + tw_event_free(src_pe, e); + + return; + + break; + + case TW_net_asend: + /* Too late. We've already let MPI start to send + * this event over the network. We can't pull it + * back now without sending another message to do + * the cancel. + * + * Setting the cancel_q flag will signal us to do + * another message send once the current send of + * this message is completed. + */ + e->state.cancel_asend = 1; + break; + + case TW_pe_sevent_q: + /* Way late; the event was already sent and is in + * our sent event queue. Mark it as a cancel and + * place it at the front of the outq. + */ + e->state.cancel_q = 1; + tw_eventq_unshift(&outq, e); + break; + + default: + /* Huh? Where did you come from? Why are we being + * told about you? 
We did not send you so we cannot + * cancel you! + */ + tw_error( + TW_LOC, + "Don't know how to cancel event owned by %u", + e->state.owner); + } + + service_queues(src_pe); +} + +/** + * tw_net_statistics + * @brief Function to output the statistics + * @attention Notice that the MPI_Reduce "count" parameter is greater than one. + * We are reducing on multiple variables *simultaneously* so if you change + * this function or the struct tw_statistics, you must update the other. + **/ +tw_statistics * +tw_net_statistics(tw_pe * me, tw_statistics * s) +{ + if(MPI_Reduce(&(s->s_max_run_time), + &me->stats.s_max_run_time, + 1, + MPI_DOUBLE, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_net_events), + &me->stats.s_net_events, + 17, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&s->s_min_detected_offset, + &me->stats.s_min_detected_offset, + 1, + MPI_DOUBLE, + MPI_MIN, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if(MPI_Reduce(&(s->s_total), + &me->stats.s_total, + 16, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + + if (MPI_Reduce(&s->s_events_past_end, + &me->stats.s_events_past_end, + 3, + MPI_UNSIGNED_LONG_LONG, + MPI_SUM, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + +#ifdef USE_RIO + if (MPI_Reduce(&s->s_rio_load, + &me->stats.s_rio_load, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); + if (MPI_Reduce(&s->s_rio_lp_init, + &me->stats.s_rio_lp_init, + 1, + MPI_UNSIGNED_LONG_LONG, + MPI_MAX, + (int)g_tw_masternode, + MPI_COMM_ROSS) != MPI_SUCCESS) + tw_error(TW_LOC, "Unable to reduce statistics!"); +#endif + + return &me->stats; +} From 21d7151d77872fe11b3431b5e25c72de39961b94 Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Tue, 19 Nov 2019 19:25:41 -0500 Subject: [PATCH 23/24] Update hybrid-mpi-layer.c Merged current files with my files. --- core/hybrid-mpi-layer.c | 481 +++++++++++++++------------------------- 1 file changed, 173 insertions(+), 308 deletions(-) diff --git a/core/hybrid-mpi-layer.c b/core/hybrid-mpi-layer.c index 468e5d733..bbd4c8122 100644 --- a/core/hybrid-mpi-layer.c +++ b/core/hybrid-mpi-layer.c @@ -1,43 +1,37 @@ #include #include -#include "ross.h" - -//This is the hybrid version of MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; int custom_communicator = 0; -static long id_tmp; +// This is the hybrid version, where send_q is filled from back, recv_q has completed events overwritten + +/** + * @struct act_q + * @brief Keeps track of posted send or recv operations. 
+ */ struct act_q { - const char *name; + const char *name; - tw_event **event_list; - MPI_Request *req_list; - int *idx_list; - MPI_Status *status_list; - int *free_idx_list;//add, que of free indices - int *overflow_anti; + tw_event **event_list; /**< list of event pointers in this queue */ + MPI_Request *req_list; /**< list of MPI request handles */ + int *idx_list; /**< indices in this queue of finished operations */ + MPI_Status *status_list; /**< list of MPI_Status handles */ + unsigned int cur; /**< index of first open spot in the queue */ -#if ROSS_MEMORY - char **buffers; -#endif - unsigned int cur; - unsigned int front;//add, front of queue - //int coda;//add, back of queue but back is already a variable somewhere - int size_of_fr_q;//add, size of queue array - int num_in_fr_q;//add, number of elements in queue + unsigned *free_idx_list;//add, stack of free indices + unsigned *overflow_anti; + unsigned int front;//add, top of stack + int size_of_fr_q;//add, size of stack array + int num_in_fr_q;//add, number of elements on stack }; #define EVENT_TAG 1 -#if ROSS_MEMORY -#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE -#else #define EVENT_SIZE(e) g_tw_event_msg_sz -#endif static struct act_q posted_sends; static struct act_q posted_recvs; @@ -47,14 +41,12 @@ static unsigned int read_buffer = 16; static unsigned int send_buffer = 1024; static int world_size = 1; + void deal_with_cur(struct act_q *q)// this is for MPI_testsome input { -// printf("cur = %d, front %d\n", q->cur,q->front); - if(q->cur < (q->front))//(q->front))//checks to see if + if(q->cur < (q->front)) { q->cur++; -// printf("%s: CUR IS %d\n",q->name, q->cur); - return; } else @@ -62,47 +54,42 @@ void deal_with_cur(struct act_q *q)// this is for MPI_testsome input return; } } -/* -void cur_reduction(struct act_q * q,int ele) -{ - if(ele == q->cur && q->cur!=1) - { - q->cur--; - printf("%s: WOO! 
CUR REDUCED TO %d\n",q->name, q->cur); - } - return; - -} -*/ -void fr_q_chq(struct act_q *q, int *frontOrCoda) //free index queue; check for modulating the front or back index of que -{ - if(*frontOrCoda != q->size_of_fr_q)//don't mess with queue - { - return; - } - else//mess with queue - { - *frontOrCoda = 0; - return; - } -} -void fr_q_aq(struct act_q *q, int ele) // free index queue; add to queue +void fr_q_aq(struct act_q *q, unsigned ele) // free index queue; add to queue { q->front--; q->free_idx_list[q->front] = ele; -// cur_reduction(q,ele); return; } -int fr_q_dq(struct act_q *q) // free index queue; dequeue +unsigned fr_q_dq(struct act_q *q) // free index queue; dequeue { - int rv = q->free_idx_list[q->front]; + unsigned rv = q->free_idx_list[q->front]; q->front++; q->num_in_fr_q--; return rv; } +void check_b_ind( int * b_index, struct act_q * q) +{ + while((0 <= *b_index) && (*b_index<=q->size_of_fr_q)) + { + + if(q->event_list[*b_index]== NULL) + { + *b_index = *b_index-1; + } + else + { + return; + } + + } + *b_index = 0; + return; + +} + static const tw_optdef mpi_opts[] = { TWOPT_GROUP("ROSS MPI Kernel"), @@ -119,10 +106,10 @@ static const tw_optdef mpi_opts[] = { // Forward declarations of functions used in MPI network message processing static int recv_begin(tw_pe *me); -static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); -static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); static int send_begin(tw_pe *me); -static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); +static void send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, unsigned id); +static void recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, unsigned id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, unsigned id); // Start of implmentation of network processing routines/functions void tw_comm_set(MPI_Comm comm) @@ -135,7 +122,6 @@ const tw_optdef * tw_net_init(int *argc, char ***argv) { int my_rank; - int initialized; MPI_Initialized(&initialized); @@ -143,6 +129,7 @@ tw_net_init(int *argc, char ***argv) if (MPI_Init(argc, argv) != MPI_SUCCESS) tw_error(TW_LOC, "MPI_Init failed."); } + if (MPI_Comm_rank(MPI_COMM_ROSS, &my_rank) != MPI_SUCCESS) tw_error(TW_LOC, "Cannot get MPI_Comm_rank(MPI_COMM_ROSS)"); @@ -152,58 +139,44 @@ tw_net_init(int *argc, char ***argv) return mpi_opts; } +/** + * @brief Initializes queues used for posted sends and receives + * + * @param[in] q pointer to the queue to be initialized + * @param[in] name name of the queue + */ + + + static void init_q(struct act_q *q, const char *name) { unsigned int n; -#if ROSS_MEMORY - unsigned int i; -#endif if(q == &posted_sends) - { n = send_buffer; - } else - { n = read_buffer; - } + q->name = name; q->event_list = (tw_event **) tw_calloc(TW_LOC, name, sizeof(*q->event_list), n); q->req_list = (MPI_Request *) tw_calloc(TW_LOC, name, sizeof(*q->req_list), n); q->idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n); q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); - //can I shrink this initialization? 
q->free_idx_list = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), n+1);// queue, n+1 is meant to prevent a full queue q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue q->front = 0;// front of queue q->size_of_fr_q = n+1;// for wraparound q->num_in_fr_q = 0; // number of elements in queue - int i = 0; + unsigned i = 0; while(ifree_idx_list[i] = i; -// fr_q_aq(q,i); i++; } -// q->front = 1; -// printf("front = %d\n",q->front); q->overflow_anti[0]=1; - q->num_in_fr_q = n;// number of elements in queue - - -#if ROSS_MEMORY - q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); - - for(i = 0; i < n; i++) - q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); -#endif -} - -tw_node * tw_net_onnode(tw_peid gid) { - id_tmp = gid; - return &id_tmp; + q->num_in_fr_q = n; } unsigned int @@ -230,11 +203,10 @@ tw_net_start(void) g_tw_synchronization_protocol = SEQUENTIAL; } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { g_tw_synchronization_protocol = SEQUENTIAL; - fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enough PEs defined.\n"); } } -// tw_pe_create(1); tw_pe_init(); //If we're in (some variation of) optimistic mode, we need this hash @@ -257,7 +229,7 @@ tw_net_start(void) g_tw_net_device_size = read_buffer; // pre-post all the Irecv operations - recv_begin( g_tw_pe ); + recv_begin(g_tw_pe); } void @@ -270,10 +242,22 @@ tw_net_abort(void) void tw_net_stop(void) { +#ifdef USE_DAMARIS + if (g_st_damaris_enabled) + st_damaris_ross_finalize(); + else + { + if (!custom_communicator) { + if (MPI_Finalize() != MPI_SUCCESS) + tw_error(TW_LOC, "Failed to finalize MPI"); + } + } +#else if (!custom_communicator) { if (MPI_Finalize() != MPI_SUCCESS) tw_error(TW_LOC, "Failed to finalize MPI"); } +#endif } void @@ -288,7 +272,7 @@ tw_net_minimum(void) { tw_stime m = DBL_MAX; tw_event *e; - int i; + unsigned int i; e = outq.head; while (e) { @@ -297,32 +281,37 @@ tw_net_minimum(void) e = e->next; } - for (i = 0; i < posted_sends.cur; i++) { //fix this line (?) + for (i = 0; i < posted_sends.cur; i++) { e = posted_sends.event_list[i]; - if(e == NULL) - {} - else if(m > e->recv_ts) + if (m > e->recv_ts) m = e->recv_ts; - else - {} } return m; } +/** + * @brief Calls MPI_Testsome on the provided queue, to check for finished operations. + * + * @param[in] q queue to check + * @param[in] me pointer to the PE + * @param[in] finish pointer to function that will perform the appropriate send/recv + * finish functionality + * + * @return 0 if MPI_Testsome did not return any finished operations, 1 otherwise. 
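+ *
+ * For each index that MPI_Testsome reports finished, the loop below does
+ * roughly the following (a simplified sketch; the real function afterwards
+ * also drains q->overflow_anti, where recv_finish() parks cancel events that
+ * arrived before the event they cancel, handing each one to
+ * late_recv_finish()):
+ *
+ *     n = q->idx_list[i];         // slot whose operation completed
+ *     e = q->event_list[n];       // event posted in that slot
+ *     fr_q_aq(q, n);              // push slot n back on the free-index stack
+ *     finish(me, e, NULL, q, n);  // hand the event to the finish callback
+ *
+ * If every slot is already free (q->num_in_fr_q == q->size_of_fr_q - 1),
+ * nothing is outstanding and the function returns 0 immediately.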
+ */ + + + static int test_q_recv( struct act_q * q, tw_pe *me, - void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, unsigned id)) { int ready, i, n, indicator; indicator = 1; -#if ROSS_MEMORY - char *tmp; -#endif - if( q->num_in_fr_q == ((q->size_of_fr_q)-1) ) return 0; @@ -353,14 +342,9 @@ test_q_recv( n = q->idx_list[i]; e = q->event_list[n]; fr_q_aq(q, n); //add n onto queue -// cur_reduction(q,n); - -#if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); -#else finish(me, e, NULL, q, n); -#endif + if (indicator != q->overflow_anti[0]) { indicator = q->overflow_anti[0]; @@ -372,7 +356,7 @@ test_q_recv( i--; } - i = 1; // first element of q->overflow_anti is the number of + i = 1; // first element of q->overflow_anti is the number of elements in the array while (i < q->overflow_anti[0])//takes care of out of order messages { @@ -386,42 +370,17 @@ test_q_recv( i++; } - return 1; } -void check_b_ind( int * b_index, struct act_q * q) -{ - while(0 <= *b_index && *b_index<=q->size_of_fr_q) - { - - if(q->event_list[*b_index]== NULL) - { - *b_index = *b_index-1; - } - else - { - return; - } - - } - *b_index = 0; - return; - -} - static int test_q_send( struct act_q * q, tw_pe *me, - void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, int id)) + void (*finish)(tw_pe *, tw_event *, char *, struct act_q*, unsigned id)) { int ready, i, n; -#if ROSS_MEMORY - char *tmp; -#endif - if (!q->cur) { @@ -456,11 +415,7 @@ test_q_send( q->event_list[n] = NULL; i--; -#if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); -#else finish(me, e, NULL, q, n); -#endif } @@ -489,17 +444,23 @@ test_q_send( return 1; } + +/** + * @brief If there are any openings in the posted_recvs queue, post more Irecvs. + * + * @param[in] me pointer to the PE + * @return 0 if no changes are made to the queue, 1 otherwise. + */ + static int recv_begin(tw_pe *me) { - MPI_Status status; tw_event *e = NULL; - int flag = 0; int changed = 0; - while (0 < posted_recvs.num_in_fr_q)//fix these lines + while (0 < posted_recvs.num_in_fr_q) { if(!(e = tw_event_grab(me))) @@ -509,17 +470,8 @@ recv_begin(tw_pe *me) return changed; } - int id = fr_q_dq(&posted_recvs); + unsigned id = fr_q_dq(&posted_recvs); -#if ROSS_MEMORY - if( MPI_Irecv(posted_recvs.buffers[id], - EVENT_SIZE(e), - MPI_BYTE, - MPI_ANY_SOURCE, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_recvs.req_list[id]) != MPI_SUCCESS) -#else if( MPI_Irecv(e, (int)EVENT_SIZE(e), MPI_BYTE, @@ -527,7 +479,6 @@ recv_begin(tw_pe *me) EVENT_TAG, MPI_COMM_ROSS, &posted_recvs.req_list[id]) != MPI_SUCCESS) -#endif { tw_event_free(me, e); return changed; @@ -541,59 +492,45 @@ recv_begin(tw_pe *me) } -static void -late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) -{ - tw_pe *dest_pe; - dest_pe = e->dest_lp->pe; - tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); - g_tw_pe->avl_tree_size++; - cancel->state.cancel_q = 1; - cancel->state.remote = 0; - cancel->cancel_next = dest_pe->cancel_q; - dest_pe->cancel_q = cancel; - tw_event_free(me, e); - return; +/** + * @brief Determines how to handle the newly received event. 
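+ *
+ * In outline, the cancel-handling path below works as follows (a simplified
+ * sketch; the non-cancel path instead marks e->state.remote under the
+ * optimistic protocols and delivers the event as usual):
+ *
+ *     if (e->state.cancel_q) {
+ *         cancel = tw_hash_remove(me->hash_t, e, e->send_pe);
+ *         if (cancel != NULL) {
+ *             // expected order: chain cancel onto dest_pe->cancel_q,
+ *             // then free e
+ *         } else {
+ *             // cancel arrived before the event it cancels: park this
+ *             // slot in q->overflow_anti for late_recv_finish()
+ *         }
+ *     }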
+ * + * @param[in] me pointer to PE + * @param[in] e pointer to event that we just received + * @param[in] buffer not currently used + */ + -} static void -recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, unsigned id ) { + (void) buffer; tw_pe *dest_pe; - tw_clock start; - -#if ROSS_MEMORY - tw_memory *memory; - tw_memory *last; - tw_fd mem_fd; - - size_t mem_size; + tw_clock start; - unsigned position = 0; - - memcpy(e, buffer, g_tw_event_msg_sz); - position += g_tw_event_msg_sz; -#endif me->stats.s_nread_network++; me->s_nwhite_recv++; - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); - dest_pe = e->dest_lp->pe; + // printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", + // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; + if(e->send_pe > tw_nnodes()-1) + tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + e->cancel_next = NULL; e->caused_by_me = NULL; e->cause_next = NULL; - if(e->send_pe > tw_nnodes()-1) - tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); if(e->recv_ts < me->GVT) @@ -605,10 +542,8 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) // if cancel event, retrieve and flush // else, store in hash table - if(e->state.cancel_q) { - tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); // NOTE: it is possible to cancel the event we @@ -616,7 +551,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) // MPI module lets me read cancel events during // event sends over the network. - if(cancel!=NULL) + if(cancel!=NULL)//if cancel is not null, then it's not out of order, do expected { cancel->state.cancel_q = 1; @@ -625,16 +560,15 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) dest_pe->cancel_q = cancel; tw_event_free(me, e); } - else + else // add out of order event to late queue { - q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later + q->overflow_anti[q->overflow_anti[0]] = id; q->overflow_anti[0]++; } return; - } - + } if (g_tw_synchronization_protocol == OPTIMISTIC || g_tw_synchronization_protocol == OPTIMISTIC_DEBUG || @@ -643,33 +577,6 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) e->state.remote = 1; } -#if ROSS_MEMORY - mem_size = (size_t) e->memory; - mem_fd = (tw_fd) e->prev; - - last = NULL; - while(mem_size) - { - memory = tw_memory_alloc(e->dest_lp, mem_fd); - - if(last) - last->next = memory; - else - e->memory = memory; - - memcpy(memory, &buffer[position], mem_size); - position += mem_size; - - memory->fd = mem_fd; - memory->nrefs = 1; - - mem_size = (size_t) memory->next; - mem_fd = memory->fd; - - last = memory; - } -#endif - /* NOTE: the final check in the if conditional below was added to make sure * that we do not execute the fast case unless the cancellation queue is * empty on the destination PE. 
Otherwise we need to invoke the normal @@ -705,35 +612,49 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) */ tw_error( TW_LOC, - "Event received by PE %u but meant for PE %u", + "Event recived by PE %u but meant for PE %u", me->id, dest_pe->id); } -static int -send_begin(tw_pe *me) +static void +late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, unsigned id ) { - int changed = 0; - while (posted_sends.cur < send_buffer) - { - tw_event *e = tw_eventq_peek(&outq);//next event - tw_node *dest_node = NULL; + tw_pe *dest_pe; + dest_pe = e->dest_lp->pe; + tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); + g_tw_pe->avl_tree_size++; -#if ROSS_MEMORY - tw_event *tmp_prev = NULL; + cancel->state.cancel_q = 1; + cancel->state.remote = 0; + cancel->cancel_next = dest_pe->cancel_q; + dest_pe->cancel_q = cancel; + tw_event_free(me, e); - tw_lp *tmp_lp = NULL; + return; - tw_memory *memory = NULL; - tw_memory *m = NULL; +} + +/** + * @brief If there are any openings in the posted_sends queue, start sends + * for events in the outgoing queue. + * + * @param[in] me pointer to the PE + * @return 0 if no changes are made to the posted_sends queue, 1 otherwise. + */ - char *buffer = NULL; +static int +send_begin(tw_pe *me) +{ + int changed = 0; - size_t mem_size = 0; + while (posted_sends.cur < send_buffer) + { + tw_event *e = tw_eventq_peek(&outq); + tw_peid dest_pe; - unsigned position = 0; -#endif + unsigned id = posted_sends.cur; if (!e) break; @@ -741,88 +662,20 @@ send_begin(tw_pe *me) if(e == me->abort_event) tw_error(TW_LOC, "Sending abort event!"); - unsigned id = posted_sends.cur; - dest_node = tw_net_onnode((*e->src_lp->type->map) - ((tw_lpid) e->dest_lp)); - - //if(!e->state.cancel_q) - //e->event_id = (tw_eventid) ++me->seq_num; + dest_pe = (*e->src_lp->type->map) ((tw_lpid) e->dest_lp); e->send_pe = (tw_peid) g_tw_mynode; e->send_lp = e->src_lp->gid; -#if ROSS_MEMORY - // pack pointers - tmp_prev = e->prev; - tmp_lp = e->src_lp; - - // delete when working - e->src_lp = NULL; - - memory = NULL; - if(e->memory) - { - memory = e->memory; - e->memory = (tw_memory *) tw_memory_getsize(me, memory->fd); - e->prev = (tw_event *) memory->fd; - mem_size = (size_t) e->memory; - } - - buffer = posted_sends.buffers[id]; - memcpy(&buffer[position], e, g_tw_event_msg_sz); - position += g_tw_event_msg_sz; - - // restore pointers - e->prev = tmp_prev; - e->src_lp = tmp_lp; - - m = NULL; - while(memory) - { - m = memory->next; - - if(m) - { - memory->next = (tw_memory *) - tw_memory_getsize(me, m->fd); - memory->fd = m->fd; - } - - if(position + mem_size > TW_MEMORY_BUFFER_SIZE) - tw_error(TW_LOC, "Out of buffer space!"); - - memcpy(&buffer[position], memory, mem_size); - position += mem_size; - - memory->nrefs--; - tw_memory_unshift(e->src_lp, memory, memory->fd); - - if(NULL != (memory = m)) - mem_size = tw_memory_getsize(me, memory->fd); - } - - e->memory = NULL; - - if (MPI_Isend(buffer, - EVENT_SIZE(e), - MPI_BYTE, - *dest_node, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_sends.req_list[id]) != MPI_SUCCESS) { - return changed; - } -#else if (MPI_Isend(e, (int)EVENT_SIZE(e), MPI_BYTE, - (int)*dest_node, + (int)dest_pe, EVENT_TAG, MPI_COMM_ROSS, &posted_sends.req_list[id]) != MPI_SUCCESS) { return changed; } -#endif tw_eventq_pop(&outq); e->state.owner = e->state.cancel_q @@ -832,15 +685,24 @@ send_begin(tw_pe *me) posted_sends.event_list[id] = e; posted_sends.cur++; me->s_nwhite_sent++; + changed = 1; } - return changed; } +/** + * @brief 
Determines how to handle the buffer of event whose send operation + * just finished. + * + * @param[in] me pointer to PE + * @param[in] e pointer to event that we just received + * @param[in] buffer not currently used + */ static void -send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id) +send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, unsigned id) { + (void) buffer; me->stats.s_nsend_network++; // instrumentation e->src_lp->kp->kp_stats->s_nsend_network++; @@ -891,6 +753,11 @@ send_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id) } +/** + * @brief Start checks for finished operations in send/recv queues, + * and post new sends/recvs if possible. + * @param[in] me pointer to PE + */ static void service_queues(tw_pe *me) { @@ -1062,5 +929,3 @@ tw_net_statistics(tw_pe * me, tw_statistics * s) return &me->stats; } - - From 2d65f87130048546c76f780820dfb073296fcbad Mon Sep 17 00:00:00 2001 From: yaciud <32393009+yaciud@users.noreply.github.com> Date: Wed, 20 Nov 2019 10:06:26 -0500 Subject: [PATCH 24/24] Update hole-network-mpi.c Updated old code to more easily integrate with new code --- core/hole-network-mpi.c | 353 +++++++++++++--------------------------- 1 file changed, 113 insertions(+), 240 deletions(-) diff --git a/core/hole-network-mpi.c b/core/hole-network-mpi.c index 5cd63d91b..2e7a64d39 100644 --- a/core/hole-network-mpi.c +++ b/core/hole-network-mpi.c @@ -1,37 +1,30 @@ #include #include - MPI_Comm MPI_COMM_ROSS = MPI_COMM_WORLD; int custom_communicator = 0; -static long id_tmp; - +/** + * @struct act_q + * @brief Keeps track of posted send or recv operations. + */ struct act_q { - const char *name; - - tw_event **event_list; - MPI_Request *req_list; - int *idx_list; - MPI_Status *status_list; - int *overflow_anti; - unsigned int length; + const char *name; -#if ROSS_MEMORY - char **buffers; -#endif + tw_event **event_list; /**< list of event pointers in this queue */ + MPI_Request *req_list; /**< list of MPI request handles */ + int *idx_list; /**< indices in this queue of finished operations */ + MPI_Status *status_list; /**< list of MPI_Status handles */ + unsigned int cur; /**< index of first open spot in the queue */ + int *overflow_anti; + unsigned int length; - unsigned int cur; }; #define EVENT_TAG 1 -#if ROSS_MEMORY -#define EVENT_SIZE(e) TW_MEMORY_BUFFER_SIZE -#else #define EVENT_SIZE(e) g_tw_event_msg_sz -#endif static struct act_q posted_sends; static struct act_q posted_recvs; @@ -57,9 +50,9 @@ static const tw_optdef mpi_opts[] = { // Forward declarations of functions used in MPI network message processing static int recv_begin(tw_pe *me); static void recv_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id); -static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); static int send_begin(tw_pe *me); static void send_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id); +static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q* q, int id); // Start of implmentation of network processing routines/functions void tw_comm_set(MPI_Comm comm) @@ -89,13 +82,16 @@ tw_net_init(int *argc, char ***argv) return mpi_opts; } +/** + * @brief Initializes queues used for posted sends and receives + * + * @param[in] q pointer to the queue to be initialized + * @param[in] name name of the queue + */ static void init_q(struct act_q *q, const char *name) { unsigned int n; -#if ROSS_MEMORY - unsigned int i; -#endif if(q == 
&posted_sends) n = send_buffer; @@ -109,22 +105,32 @@ init_q(struct act_q *q, const char *name) q->status_list = (MPI_Status *) tw_calloc(TW_LOC, name, sizeof(*q->status_list), n); q->overflow_anti = (int *) tw_calloc(TW_LOC, name, sizeof(*q->idx_list), (n/2)+2);// queue, at most (n/2) can be out of order, first element is # of elements in queue q->overflow_anti[0]=1; -// q->cur = 0; q->length = n; -#if ROSS_MEMORY - q->buffers = tw_calloc(TW_LOC, name, sizeof(*q->buffers), n); - - for(i = 0; i < n; i++) - q->buffers[i] = tw_calloc(TW_LOC, "", TW_MEMORY_BUFFER_SIZE, 1); -#endif } -tw_node * tw_net_onnode(tw_peid gid) { - id_tmp = gid; - return &id_tmp; +void check_b_ind( int * b_index, struct act_q * q) +{ + while(0 <= *b_index && *b_index<=q->length) + { + + if(q->event_list[*b_index]== NULL) + { + *b_index = *b_index-1; + } + else + { + return; + } + + + } + *b_index = 0; + return; + } + unsigned int tw_nnodes(void) { @@ -149,11 +155,10 @@ tw_net_start(void) g_tw_synchronization_protocol = SEQUENTIAL; } else if(g_tw_synchronization_protocol == CONSERVATIVE || g_tw_synchronization_protocol == OPTIMISTIC) { g_tw_synchronization_protocol = SEQUENTIAL; - fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enought PEs defined.\n"); + fprintf(stderr, "Warning: Defaulting to Sequential Simulation, not enough PEs defined.\n"); } } -// tw_pe_create(1); tw_pe_init(); //If we're in (some variation of) optimistic mode, we need this hash @@ -176,7 +181,7 @@ tw_net_start(void) g_tw_net_device_size = read_buffer; // pre-post all the Irecv operations - recv_begin( g_tw_pe ); + recv_begin(g_tw_pe); } void @@ -219,7 +224,7 @@ tw_net_minimum(void) { tw_stime m = DBL_MAX; tw_event *e; - int i; + unsigned int i; e = outq.head; while (e) { @@ -237,28 +242,17 @@ tw_net_minimum(void) return m; } -void check_b_ind( int * b_index, struct act_q * q) -{ - while(0 <= *b_index && *b_index<=q->length) - { - - if(q->event_list[*b_index]== NULL) - { - *b_index = *b_index-1; - } - else - { - return; - } - - - } - *b_index = 0; - return; - -} - - +/** + * @brief Calls MPI_Testsome on the provided queue, to check for finished operations. + * + * @param[in] q queue to check + * @param[in] me pointer to the PE + * @param[in] finish pointer to function that will perform the appropriate send/recv + * finish functionality + * + * @return 0 if MPI_Testsome did not return any finished operations, 1 otherwise. + */ +//Here, I did not split test_q into two functions for each, instead opting for a boolean indicator static int test_q( struct act_q * q, @@ -270,10 +264,6 @@ test_q( q->overflow_anti[0] = 1; // indicator = 1; -#if ROSS_MEMORY - char *tmp; -#endif - if (!q->cur) { @@ -296,7 +286,8 @@ test_q( if (1 > ready) return 0; - if (pick_queue == 1) { + if (pick_queue == 1) + { for (i = 0; i < ready; i++) { @@ -306,11 +297,8 @@ test_q( n = q->idx_list[i]; e = q->event_list[n]; -#if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); -#else + finish(me, e, NULL, q, n); -#endif if (indicator != q->overflow_anti[0]) { @@ -330,8 +318,6 @@ test_q( late_recv_finish(me, e, NULL, q, n); q->event_list[n] = NULL; - //might need an augmented version for ROSS_MEMORY? 
- i++; } @@ -348,11 +334,9 @@ test_q( q->event_list[n] = NULL; -#if ROSS_MEMORY - finish(me, e, q->buffers[n], q, n); -#else + finish(me, e, NULL, q, n); -#endif + } } @@ -364,10 +348,6 @@ test_q( b_index = q->cur-1; -// if(strcmp(q->name,"MPI send queue")==0) -// { -// printf("%s: %d %d %d %d \n", q->name, b_index, q->cur,ready, q->cur - ready); -// } while(iidx_list[i]; @@ -375,36 +355,31 @@ test_q( check_b_ind(&b_index, q); if (n < b_index) { -// if(strcmp(q->name,"MPI send queue")==0) -// { -// printf("%s: putting away index %d\n",q->name, n); -// } -// printf("filling %d with event in %d\n",n , b_index) ; + q->event_list[n] = q->event_list[b_index]; memcpy(&q->req_list[n],&q->req_list[b_index],sizeof(q->req_list[0])); b_index--; + } i++; } q->cur -= ready; - if(strcmp(q->name,"MPI send queue")!=0) - { -// printf("%s: returning with %d\n", q->name,q->cur); - } return 1; } - +/** + * @brief If there are any openings in the posted_recvs queue, post more Irecvs. + * + * @param[in] me pointer to the PE + * @return 0 if no changes are made to the queue, 1 otherwise. + */ static int recv_begin(tw_pe *me) { - MPI_Status status; - tw_event *e = NULL; - int flag = 0; int changed = 0; while (posted_recvs.cur < read_buffer) @@ -418,15 +393,6 @@ recv_begin(tw_pe *me) return changed; } -#if ROSS_MEMORY - if( MPI_Irecv(posted_recvs.buffers[id], - EVENT_SIZE(e), - MPI_BYTE, - MPI_ANY_SOURCE, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_recvs.req_list[id]) != MPI_SUCCESS) -#else if( MPI_Irecv(e, (int)EVENT_SIZE(e), MPI_BYTE, @@ -434,13 +400,11 @@ recv_begin(tw_pe *me) EVENT_TAG, MPI_COMM_ROSS, &posted_recvs.req_list[id]) != MPI_SUCCESS) -#endif { tw_event_free(me, e); return changed; } - posted_recvs.event_list[id] = e; posted_recvs.cur++; changed = 1; @@ -449,40 +413,37 @@ recv_begin(tw_pe *me) return changed; } - -static void -recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) +/** + * @brief Determines how to handle the newly received event. 
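+ *
+ * Cancel events that arrive before the event they cancel are not processed
+ * here; they are parked in q->overflow_anti and handled later by
+ * late_recv_finish().  overflow_anti[0] starts at 1 and serves as the next
+ * write index, so recording here and draining in test_q() look roughly like:
+ *
+ *     q->overflow_anti[q->overflow_anti[0]] = id;   // park this slot
+ *     q->overflow_anti[0]++;
+ *     ...
+ *     for (i = 1; i < q->overflow_anti[0]; i++)
+ *         late_recv_finish(me, q->event_list[q->overflow_anti[i]],
+ *                          NULL, q, q->overflow_anti[i]);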
+ * + * @param[in] me pointer to PE + * @param[in] e pointer to event that we just received + * @param[in] buffer not currently used + */ +static void recv_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id) { + (void) buffer; tw_pe *dest_pe; - tw_clock start; - -#if ROSS_MEMORY - tw_memory *memory; - tw_memory *last; - tw_fd mem_fd; - - size_t mem_size; + tw_clock start; - unsigned position = 0; - - memcpy(e, buffer, g_tw_event_msg_sz); - position += g_tw_event_msg_sz; -#endif me->stats.s_nread_network++; me->s_nwhite_recv++; - e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); - dest_pe = e->dest_lp->pe; + // printf("recv_finish: remote event [cancel %u] FROM: LP %lu, PE %lu, TO: LP %lu, PE %lu at TS %lf \n", + // e->state.cancel_q, (tw_lpid)e->src_lp, e->send_pe, (tw_lpid)e->dest_lp, me->id, e->recv_ts); + e->dest_lp = tw_getlocal_lp((tw_lpid) e->dest_lp); + dest_pe = e->dest_lp->pe; // instrumentation e->dest_lp->kp->kp_stats->s_nread_network++; e->dest_lp->lp_stats->s_nread_network++; - if(e->send_pe > tw_nnodes()-1) tw_error(TW_LOC, "bad sendpe_id: %d", e->send_pe); + + if(e->recv_ts < me->GVT) tw_error(TW_LOC, "%d: Received straggler from %d: %lf (%d)", me->id, e->send_pe, e->recv_ts, e->state.cancel_q); @@ -492,10 +453,8 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) // if cancel event, retrieve and flush // else, store in hash table - if(e->state.cancel_q) { - tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); // NOTE: it is possible to cancel the event we @@ -503,7 +462,7 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) // MPI module lets me read cancel events during // event sends over the network. - if(cancel!=NULL) + if(cancel!=NULL) // if correct order, do what is needed { e->cancel_next = NULL; @@ -516,12 +475,13 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) dest_pe->cancel_q = cancel; tw_event_free(me, e); } - else + else //if out of order, process later { q->overflow_anti[q->overflow_anti[0]] = id; //add id stuff later q->overflow_anti[0]++; } + return; } @@ -536,33 +496,6 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) e->state.remote = 1; } -#if ROSS_MEMORY - mem_size = (size_t) e->memory; - mem_fd = (tw_fd) e->prev; - - last = NULL; - while(mem_size) - { - memory = tw_memory_alloc(e->dest_lp, mem_fd); - - if(last) - last->next = memory; - else - e->memory = memory; - - memcpy(memory, &buffer[position], mem_size); - position += mem_size; - - memory->fd = mem_fd; - memory->nrefs = 1; - - mem_size = (size_t) memory->next; - mem_fd = memory->fd; - - last = memory; - } -#endif - /* NOTE: the final check in the if conditional below was added to make sure * that we do not execute the fast case unless the cancellation queue is * empty on the destination PE. 
Otherwise we need to invoke the normal @@ -598,12 +531,13 @@ recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) */ tw_error( TW_LOC, - "Event received by PE %u but meant for PE %u", + "Event recived by PE %u but meant for PE %u", me->id, dest_pe->id); } + static void late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id ) { @@ -643,7 +577,7 @@ late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id // else, store in hash table tw_event *cancel = tw_hash_remove(me->hash_t, e, e->send_pe); - g_tw_pe->avl_tree_size++; + g_tw_pe->avl_tree_size++;//needed because tw_hash_remove is called twice, decrementing this var // NOTE: it is possible to cancel the event we // are currently processing at this PE since this @@ -660,6 +594,13 @@ late_recv_finish(tw_pe *me, tw_event *e, char * buffer, struct act_q * q, int id } +/** + * @brief If there are any openings in the posted_sends queue, start sends + * for events in the outgoing queue. + * + * @param[in] me pointer to the PE + * @return 0 if no changes are made to the posted_sends queue, 1 otherwise. + */ static int send_begin(tw_pe *me) { @@ -668,112 +609,30 @@ send_begin(tw_pe *me) while (posted_sends.cur < send_buffer) { tw_event *e = tw_eventq_peek(&outq); - tw_node *dest_node = NULL; + tw_peid dest_pe; unsigned id = posted_sends.cur; -#if ROSS_MEMORY - tw_event *tmp_prev = NULL; - - tw_lp *tmp_lp = NULL; - - tw_memory *memory = NULL; - tw_memory *m = NULL; - - char *buffer = NULL; - - size_t mem_size = 0; - - unsigned position = 0; -#endif - if (!e) break; if(e == me->abort_event) tw_error(TW_LOC, "Sending abort event!"); - dest_node = tw_net_onnode((*e->src_lp->type->map) - ((tw_lpid) e->dest_lp)); - - //if(!e->state.cancel_q) - //e->event_id = (tw_eventid) ++me->seq_num; + dest_pe = (*e->src_lp->type->map) ((tw_lpid) e->dest_lp); e->send_pe = (tw_peid) g_tw_mynode; e->send_lp = e->src_lp->gid; -#if ROSS_MEMORY - // pack pointers - tmp_prev = e->prev; - tmp_lp = e->src_lp; - - // delete when working - e->src_lp = NULL; - - memory = NULL; - if(e->memory) - { - memory = e->memory; - e->memory = (tw_memory *) tw_memory_getsize(me, memory->fd); - e->prev = (tw_event *) memory->fd; - mem_size = (size_t) e->memory; - } - - buffer = posted_sends.buffers[id]; - memcpy(&buffer[position], e, g_tw_event_msg_sz); - position += g_tw_event_msg_sz; - - // restore pointers - e->prev = tmp_prev; - e->src_lp = tmp_lp; - - m = NULL; - while(memory) - { - m = memory->next; - - if(m) - { - memory->next = (tw_memory *) - tw_memory_getsize(me, m->fd); - memory->fd = m->fd; - } - - if(position + mem_size > TW_MEMORY_BUFFER_SIZE) - tw_error(TW_LOC, "Out of buffer space!"); - - memcpy(&buffer[position], memory, mem_size); - position += mem_size; - - memory->nrefs--; - tw_memory_unshift(e->src_lp, memory, memory->fd); - - if(NULL != (memory = m)) - mem_size = tw_memory_getsize(me, memory->fd); - } - - e->memory = NULL; - - if (MPI_Isend(buffer, - EVENT_SIZE(e), - MPI_BYTE, - *dest_node, - EVENT_TAG, - MPI_COMM_ROSS, - &posted_sends.req_list[id]) != MPI_SUCCESS) { - return changed; - } -#else if (MPI_Isend(e, (int)EVENT_SIZE(e), MPI_BYTE, - (int)*dest_node, + (int)dest_pe, EVENT_TAG, MPI_COMM_ROSS, &posted_sends.req_list[id]) != MPI_SUCCESS) { return changed; } -#endif tw_eventq_pop(&outq); e->state.owner = e->state.cancel_q @@ -789,9 +648,18 @@ send_begin(tw_pe *me) return changed; } +/** + * @brief Determines how to handle the buffer of event whose send operation + * just finished. 
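+ *
+ * Note that send_begin() tagged the event's owner as either TW_net_asend or
+ * TW_net_acancel when the Isend was posted; the body below is expected to
+ * branch on that flag when deciding how to release the buffer.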
+ *
+ * @param[in] me pointer to PE
+ * @param[in] e pointer to the event whose send operation just completed
+ * @param[in] buffer not currently used
+ */
 static void
 send_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id)
 {
+    (void) buffer;
     me->stats.s_nsend_network++;
     // instrumentation
     e->src_lp->kp->kp_stats->s_nsend_network++;
@@ -842,6 +710,11 @@ send_finish(tw_pe *me, tw_event *e, char * buffer,struct act_q * q, int id)
 
 }
 
+/**
+ * @brief Start checks for finished operations in send/recv queues,
+ * and post new sends/recvs if possible.
+ * @param[in] me pointer to PE
+ */
 static void
 service_queues(tw_pe *me)
 {
@@ -878,7 +751,7 @@ tw_net_send(tw_event *e)
 
 	do
 	{
-		changed = test_q(&posted_sends, me, 0,send_finish);
+		changed = test_q(&posted_sends, me, 0, send_finish);
 		changed |= send_begin(me);
 	} while (changed);
 }
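For reference, the slot-recycling idea these patches build around struct act_q
boils down to the standalone sketch below. It is an illustration only: the
names slot_pool, pool_take and pool_give_back are invented here, and the code
mirrors (but is not) the free_idx_list / fr_q_dq / fr_q_aq bookkeeping in
network-mpi.c. The point of the stack is that a completed slot can be reused
in O(1) without compacting event_list/req_list the way the original test_q()
collapse loop did.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct slot_pool {
    unsigned *free_idx_list; /* stack of free slot indices             */
    unsigned  front;         /* top of stack (moves down when pushing) */
    unsigned  num_free;      /* number of indices currently on stack   */
    unsigned  n;             /* total number of slots                  */
};

static void pool_init(struct slot_pool *p, unsigned n)
{
    p->free_idx_list = malloc(n * sizeof(*p->free_idx_list));
    assert(p->free_idx_list != NULL);
    p->n = n;
    p->num_free = n;
    p->front = 0;
    for (unsigned i = 0; i < n; i++)   /* initially every slot is free */
        p->free_idx_list[i] = i;
}

/* analogous to fr_q_dq(): take a free slot before posting an Isend/Irecv */
static unsigned pool_take(struct slot_pool *p)
{
    assert(p->num_free > 0);
    unsigned idx = p->free_idx_list[p->front++];
    p->num_free--;
    return idx;
}

/* analogous to fr_q_aq(): return a slot once MPI_Testsome reports it done */
static void pool_give_back(struct slot_pool *p, unsigned idx)
{
    assert(p->front > 0);
    p->free_idx_list[--p->front] = idx;
    p->num_free++;
}

int main(void)
{
    struct slot_pool pool;
    pool_init(&pool, 4);

    unsigned a = pool_take(&pool);   /* slot used for one posted operation */
    unsigned b = pool_take(&pool);   /* a second outstanding operation     */
    printf("posted into slots %u and %u, %u slots still free\n",
           a, b, pool.num_free);

    pool_give_back(&pool, a);        /* first operation completed */
    printf("slot %u recycled, %u slots free\n", a, pool.num_free);

    pool_give_back(&pool, b);
    free(pool.free_idx_list);
    return 0;
}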