/* * Copyright (c) 2018 Sandia National Laboratories. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include #ifndef __CUSTOM_MATCH #define __CUSTOM_MATCH #include "../pml_ob1_recvreq.h" #include "../pml_ob1_recvfrag.h" #define CUSTOM_MATCH_DEBUG 0 #define CUSTOM_MATCH_DEBUG_VERBOSE 0 typedef struct custom_match_prq_node { __m512i tags; __m512i tmask; __m512i srcs; __m512i smask; struct custom_match_prq_node* next; int start, end; void* value[16]; } custom_match_prq_node; typedef struct custom_match_prq { custom_match_prq_node* head; custom_match_prq_node* tail; custom_match_prq_node* pool; int size; } custom_match_prq; static inline int custom_match_prq_cancel(custom_match_prq* list, void* req) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_cancel - list: %x req: %x\n", list, req); #endif custom_match_prq_node* prev = 0; custom_match_prq_node* elem = list->head; int i; while(elem) { for(i = elem->start; i <= elem->end; i++) { if(elem->value[i] == req) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("Canceled!");// %x %x %x\n", req, req->req_tag, req->req_peer); #endif void* payload = elem->value[i]; ((int*)(&(elem->tags)))[i] = ~0; ((int*)(&(elem->tmask)))[i] = ~0; ((int*)(&(elem->srcs)))[i] = ~0; ((int*)(&(elem->smask)))[i] = ~0; elem->value[i] = 0; if(i == elem->start || i == elem->end) { while((elem->start <= elem->end) && (!(elem->value[elem->start]))) elem->start++; while((elem->start <= elem->end) && (!(elem->value[elem->end]))) elem->end--; if(elem->start > elem->end) { if(prev) { prev->next = elem->next; } else { list->head = elem->next; } if(!elem->next) { list->tail = prev; } elem->next = list->pool; list->pool = elem; } } list->size--; return 1; } } prev = elem; elem = elem->next; } return 0; } static inline void* custom_match_prq_find_verify(custom_match_prq* list, int tag, int peer) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_find_verify list: %x tag: %x peer: %x\n", list, tag, peer); #endif __mmask16 result = 0; custom_match_prq_node* elem = list->head; int i; __m512i tsearch = _mm512_set1_epi32(tag); __m512i ssearch = _mm512_set1_epi32(peer); while(elem) { result = _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->tags, elem->tmask), _mm512_and_epi32(tsearch, elem->tmask)) & _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->srcs, elem->smask), _mm512_and_epi32(ssearch, elem->smask)); if(result) { for(i = elem->start; i <= elem->end; i++) { if((0x1 << i & result) && elem->value[i]) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("Found list: %x tag: %x peer: %x\n", list, req->req_tag, req->req_peer); #endif return elem->value[i]; } } } elem = elem->next; } return 0; } static inline void* custom_match_prq_find_dequeue_verify(custom_match_prq* list, int tag, int peer) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_find_dequeue_verify list: %x:%d tag: %x peer: %x\n", list, list->size, tag, peer); #endif __mmask16 result = 0; custom_match_prq_node* prev = 0; custom_match_prq_node* elem = list->head; int i; __m512i tsearch = _mm512_set1_epi32(tag); __m512i ssearch = _mm512_set1_epi32(peer); while(elem) { #if CUSTOM_MATCH_DEBUG_VERBOSE for(int iter = elem->start; iter <= elem->end; iter++) { //printf("Search = %x, Element Key = %x, Element mask = %x", ((int32_t*) &search)[iter], ((int32_t*) &elem->keys)[iter], ((int32_t*) &elem->mask)[iter]); } #endif result = _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->tags, elem->tmask), _mm512_and_epi32(tsearch, elem->tmask)) & _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->srcs, elem->smask), _mm512_and_epi32(ssearch, elem->smask)); if(result) { for(i = elem->start; i <= elem->end; i++) { mca_pml_base_request_t *req = (mca_pml_base_request_t *)elem->value[i]; if((0x1 << i & result) && req && ((req->req_peer == peer || req->req_peer == OMPI_ANY_SOURCE) && (req->req_tag == tag || req->req_tag == OMPI_ANY_TAG))) { void* payload = elem->value[i]; ((int*)(&(elem->tags)))[i] = ~0; ((int*)(&(elem->tmask)))[i] = ~0; ((int*)(&(elem->srcs)))[i] = ~0; ((int*)(&(elem->smask)))[i] = ~0; elem->value[i] = 0; if(i == elem->start || i == elem->end) { while((elem->start <= elem->end) && (!(elem->value[elem->start]))) elem->start++; while((elem->start <= elem->end) && (!(elem->value[elem->end]))) elem->end--; if(elem->start > elem->end) { if(prev) { prev->next = elem->next; } else { list->head = elem->next; } if(!elem->next) { list->tail = prev; } elem->next = list->pool; list->pool = elem; } } list->size--; #if CUSTOM_MATCH_DEBUG_VERBOSE printf("Found list: %x tag: %x peer: %x\n", list, req->req_tag, req->req_peer); #endif return payload; } } } prev = elem; elem = elem->next; } return 0; } static inline void custom_match_prq_append(custom_match_prq* list, void* payload, int tag, int source) { int32_t mask_tag, mask_src; if(source == OMPI_ANY_SOURCE) { mask_src = 0; } else { mask_src = ~0; } if(tag == OMPI_ANY_TAG) { mask_tag = 0; } else { mask_tag = ~0; } mca_pml_base_request_t *req = (mca_pml_base_request_t *)payload; #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_append list: %x key: %x mask: %x tag: %x peer: %x\n", list, key, mask, req->req_tag, req->req_peer); #endif int i; custom_match_prq_node* elem; if((!list->tail) || list->tail->end == 15) { if(list->pool) { elem = list->pool; list->pool = list->pool->next; } else { elem = _mm_malloc(sizeof(custom_match_prq_node),64); //if(!elem) //{ // printf("Error: Couldn't create memory\n"); //} } elem->tags = _mm512_set1_epi32(~0); // TODO: we only have to do this type of initialization for freshly malloc'd entries. elem->tmask = _mm512_set1_epi32(~0); elem->srcs = _mm512_set1_epi32(~0); elem->smask = _mm512_set1_epi32(~0); elem->next = 0; elem->start = 0; elem->end = -1; // we don't have an element yet for(i = 0; i < 16; i++) elem->value[i] = 0; if(list->tail) { list->tail->next = elem; list->tail = elem; } else { list->head = elem; list->tail = elem; } } elem = list->tail; elem->end++; ((int*)(&(elem->tags)))[elem->end] = tag; ((int*)(&(elem->tmask)))[elem->end] = mask_tag; ((int*)(&(elem->srcs)))[elem->end] = source; ((int*)(&(elem->smask)))[elem->end] = mask_src; elem->value[elem->end] = payload; list->size++; #if CUSTOM_MATCH_DEBUG_VERBOSE printf("Exiting custom_match_prq_append\n"); #endif } static inline int custom_match_prq_size(custom_match_prq* list) { return list->size; } static inline custom_match_prq* custom_match_prq_init() { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_init\n"); #endif custom_match_prq* list = _mm_malloc(sizeof(custom_match_prq),64); list->head = 0; list->tail = 0; list->pool = 0; list->size = 0; return list; } static inline void custom_match_prq_destroy(custom_match_prq* list) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_prq_destroy\n"); #endif custom_match_prq_node* elem; while(list->head) { elem = list->head; list->head = list->head->next; _mm_free(elem); } while(list->pool) { elem = list->pool; list->pool = list->pool->next; _mm_free(elem); } _mm_free(list); } static inline void custom_match_print(custom_match_prq* list) { custom_match_prq_node* elem; int i = 0; int j = 0; printf("Elements in the list (this is currenly only partialy implemented):\n"); for(elem = list->head; elem; elem = elem->next) { printf("This is the %d linked list element\n", ++i); for(j = 0; j < 16; j++) { printf("%d:%d The key is %d, the mask is %d, the value is %ld\n", i, j, ((int*)(&(elem->tags)))[j], ((int*)(&(elem->tmask)))[j], elem->value[j]); } i++; } } static inline void custom_match_prq_dump(custom_match_prq* list) { opal_list_item_t* item; char cpeer[64], ctag[64]; custom_match_prq_node* elem; int i = 0; int j = 0; printf("Elements in the list:\n"); for(elem = list->head; elem; elem = elem->next) { printf("This is the %d linked list element\n", ++i); for(j = 0; j < 16; j++) { if(elem->value[j]) { mca_pml_base_request_t *req = (mca_pml_base_request_t *)elem->value[j]; if( OMPI_ANY_SOURCE == req->req_peer ) snprintf(cpeer, 64, "%s", "ANY_SOURCE"); else snprintf(cpeer, 64, "%d", req->req_peer); if( OMPI_ANY_TAG == req->req_tag ) snprintf(ctag, 64, "%s", "ANY_TAG"); else snprintf(ctag, 64, "%d", req->req_tag); opal_output(0, "req %p peer %s tag %s addr %p count %lu datatype %s [%p] [%s %s] req_seq %" PRIu64, (void*) req, cpeer, ctag, (void*) req->req_addr, req->req_count, (0 != req->req_count ? req->req_datatype->name : "N/A"), (void*) req->req_datatype, (req->req_pml_complete ? "pml_complete" : ""), (req->req_free_called ? "freed" : ""), req->req_sequence); } } } } // UMQ below. typedef struct custom_match_umq_node { __m512i tags; __m512i srcs; struct custom_match_umq_node* next; int start, end; void* value[16]; } custom_match_umq_node; typedef struct custom_match_umq { custom_match_umq_node* head; custom_match_umq_node* tail; custom_match_umq_node* pool; int size; } custom_match_umq; static inline void custom_match_umq_dump(custom_match_umq* list); static inline void* custom_match_umq_find_verify_hold(custom_match_umq* list, int tag, int peer, custom_match_umq_node** hold_prev, custom_match_umq_node** hold_elem, int* hold_index) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_umq_find_verify_hold list: %x:%d tag: %x peer: %x\n", list, list->size, tag, peer); custom_match_umq_dump(list); #endif __mmask16 result = 0; custom_match_umq_node* prev = 0; custom_match_umq_node* elem = list->head; int i; __m512i tsearch = _mm512_set1_epi32(tag); __m512i ssearch = _mm512_set1_epi32(peer); int tmask = ~0; int smask = ~0; if(peer == OMPI_ANY_SOURCE) { smask = 0; } if(tag == OMPI_ANY_TAG) { tmask = 0; } __m512i tmasks = _mm512_set1_epi32(tmask); __m512i smasks = _mm512_set1_epi32(smask); tsearch = _mm512_and_epi32(tsearch, tmasks); ssearch = _mm512_and_epi32(ssearch, smasks); while(elem) { result = _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->tags,tmasks), tsearch) & _mm512_cmpeq_epi32_mask(_mm512_and_epi32(elem->srcs,smasks), ssearch); if(result) { for(i = elem->start; i <= elem->end; i++) { if((0x1 << i & result) && elem->value[i]) { *hold_prev = prev; *hold_elem = elem; *hold_index = i; return elem->value[i]; } } } prev = elem; elem = elem->next; } return 0; } static inline void custom_match_umq_remove_hold(custom_match_umq* list, custom_match_umq_node* prev, custom_match_umq_node* elem, int i) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_umq_find_remove_hold %x %x %x\n", prev, elem, i); #endif ((int*)(&(elem->tags)))[i] = ~0; ((int*)(&(elem->srcs)))[i] = ~0; elem->value[i] = 0; if(i == elem->start || i == elem->end) { while((elem->start <= elem->end) && (!(elem->value[elem->start]))) elem->start++; while((elem->start <= elem->end) && (!(elem->value[elem->end]))) elem->end--; if(elem->start > elem->end) { if(prev) { prev->next = elem->next; } else { list->head = elem->next; } if(!elem->next) { list->tail = prev; } elem->next = list->pool; list->pool = elem; } } list->size--; } static inline void custom_match_umq_append(custom_match_umq* list, int tag, int source, void* payload) { #if CUSTOM_MATCH_DEBUG_VERBOSE int32_t key = source; ((int8_t*)&key)[3] = (int8_t) tag; // MGFD TODO verify this set higer order bits... #endif mca_pml_ob1_recv_frag_t *req = (mca_pml_ob1_recv_frag_t *)payload; #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_umq_append list: %x payload: %x tag: %d src: %d\n", list, payload, req->hdr.hdr_match.hdr_tag, req->hdr.hdr_match.hdr_src); #endif int i; custom_match_umq_node* elem; list->size++; if((!list->tail) || list->tail->end == 15) { if(list->pool) { elem = list->pool; list->pool = list->pool->next; } else { elem = _mm_malloc(sizeof(custom_match_umq_node),64); } elem->tags = _mm512_set1_epi32(~0); // TODO: we only have to do this type of initialization for freshly malloc'd entries. elem->srcs = _mm512_set1_epi32(~0); elem->next = 0; elem->start = 0; elem->end = -1; // we don't have an element yet for(i = 0; i < 16; i++) elem->value[i] = 0; if(list->tail) { list->tail->next = elem; list->tail = elem; } else { list->head = elem; list->tail = elem; } } elem = list->tail; elem->end++; ((int*)(&(elem->tags)))[elem->end] = tag; ((int*)(&(elem->srcs)))[elem->end] = source; elem->value[elem->end] = payload; #if CUSTOM_MATCH_DEBUG_VERBOSE custom_match_umq_dump(list); #endif } static inline custom_match_umq* custom_match_umq_init() { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_umq_init\n"); #endif custom_match_umq* list = _mm_malloc(sizeof(custom_match_umq),64); list->head = 0; list->tail = 0; list->pool = 0; list->size = 0; return list; } static inline void custom_match_umq_destroy(custom_match_umq* list) { #if CUSTOM_MATCH_DEBUG_VERBOSE printf("custom_match_umq_destroy\n"); #endif custom_match_umq_node* elem; while(list->head) { elem = list->head; list->head = list->head->next; _mm_free(elem); } while(list->pool) { elem = list->pool; list->pool = list->pool->next; _mm_free(elem); } _mm_free(list); } static inline int custom_match_umq_size(custom_match_umq* list) { return list->size; } static inline void custom_match_umq_dump(custom_match_umq* list) { char cpeer[64], ctag[64]; custom_match_umq_node* elem; int i = 0; int j = 0; //printf("Elements in the list:\n"); for(elem = list->head; elem; elem = elem->next) { //printf("This is the %d linked list element\n", ++i); for(j = 0; j < 16; j++) { if(elem->value[j]) { mca_pml_ob1_recv_frag_t *req = (mca_pml_ob1_recv_frag_t *)elem->value[j]; //printf("%x %x %x\n", elem->value[j], req->hdr.hdr_match.hdr_tag, req->hdr.hdr_match.hdr_src); if( OMPI_ANY_SOURCE == req->hdr.hdr_match.hdr_src ) snprintf(cpeer, 64, "%s", "ANY_SOURCE"); else snprintf(cpeer, 64, "%d", req->hdr.hdr_match.hdr_src); if( OMPI_ANY_TAG == req->hdr.hdr_match.hdr_tag ) snprintf(ctag, 64, "%s", "ANY_TAG"); else snprintf(ctag, 64, "%d", req->hdr.hdr_match.hdr_tag); // opal_output(0, "peer %s tag %s",// addr %p count %lu datatype %s [%p] [%s %s] req_seq %" PRIu64, // /*(void*) req,*/ cpeer, ctag, //(void*) req->req_addr, req->req_count, //(0 != req->req_count ? req->req_datatype->name : "N/A"), //(void*) req->req_datatype, //(req->req_pml_complete ? "pml_complete" : ""), //(req->req_free_called ? "freed" : ""), //req->req_sequence); // ); } } } } #endif