00001
00023 #include "config.h"
00024 #ifdef GS_SMART_GRIDSOLVE
00025 #include "gs_smart_netpm.h"
00026 #include "comm_encode.h"
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 int gs_smart_netpm_construct_netpm(gs_smart_netpm * netpm,
00040 gs_server_t ** server_list, int nb_servers){
00041
00042 if((!netpm) || (!server_list)) return -1;
00043
00044 if(gs_smart_netpm_aggregate_ping_info(server_list, nb_servers)<0){
00045 ERRPRINTF("SMART: Error agrgregating ping info\n");
00046 return -1;
00047 }
00048
00049 if(gs_smart_netpm_measure_client_links_perf(server_list, nb_servers)<0){
00050 ERRPRINTF("SMART: Error measuring client links of network pm\n");
00051 return -1;
00052 }
00053
00054 if(gs_smart_netpm_construct_nodes(netpm, server_list, nb_servers)<0){
00055 ERRPRINTF("SMART: Error constructing nodes of network performance model\n");
00056 return -1;
00057 }
00058
00059 if(gs_smart_netpm_construct_links(netpm, server_list, nb_servers)<0){
00060 ERRPRINTF("SMART: Error constructing links of network performance model\n");
00061 return -1;
00062 }
00063
00064 if(gs_smart_netpm_add_bw_to_links(netpm, server_list, nb_servers)<0){
00065 ERRPRINTF("SMART: Error adding bandwidth links of network pm\n");
00066 return -1;
00067 }
00068
00069 return 0;
00070 }
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083 int gs_smart_netpm_construct_nodes(gs_smart_netpm * netpm,
00084 gs_server_t ** server_list, int nb_servers){
00085
00086 gs_server_t * this_server=NULL;
00087 gs_smart_netpm_node * netpm_node;
00088 int i;
00089
00090 if((!netpm) || (!server_list) || (nb_servers<=0)) return -1;
00091
00092 netpm->nb_nodes=nb_servers;
00093
00094
00095
00096
00097 netpm->netpm_clnt_node=(gs_smart_netpm_node *)
00098 calloc(1, sizeof(gs_smart_netpm_node ));
00099
00100 if(!netpm->netpm_clnt_node) return -1;
00101
00102 netpm->netpm_clnt_node->node_type=GS_SMART_NETPM_CLIENT_NODE;
00103
00104
00105
00106
00107 netpm->netpm_nodes=(gs_smart_netpm_node **)
00108 calloc(netpm->nb_nodes, sizeof(gs_smart_netpm_node *));
00109
00110 if(!netpm->netpm_nodes) return -1;
00111
00112
00113 for(i=0; i<netpm->nb_nodes; i++){
00114
00115
00116
00117
00118 netpm->netpm_nodes[i]=(gs_smart_netpm_node *)
00119 calloc(1, sizeof(gs_smart_netpm_node));
00120
00121 if(!netpm->netpm_nodes[i]) return -1;
00122
00123 netpm_node=netpm->netpm_nodes[i];
00124 netpm_node->id=i;
00125 netpm_node->node_type=GS_SMART_NETPM_SERVER_NODE;
00126
00127 if(!server_list[i]) return -1;
00128
00129 netpm_node->server=server_list[i];
00130 this_server = netpm_node->server;
00131 }
00132
00133 return 0;
00134 }
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152 int gs_smart_netpm_construct_links(gs_smart_netpm * netpm,
00153 gs_server_t ** server_list, int nb_servers){
00154
00155 gs_smart_netpm_node * netpm_node, * netpm_clnt_node, * dest_netpm_node;
00156 gs_server_t * this_server=NULL;
00157 int i, j;
00158 int alloc_links=0;
00159
00160 if((!netpm) || (!server_list) || (nb_servers<=0)) return -1;
00161 if(!netpm->netpm_clnt_node) return -1;
00162
00163
00164
00165
00166
00167
00168
00169 netpm_clnt_node=netpm->netpm_clnt_node;
00170 netpm_clnt_node->nb_links=nb_servers;
00171 netpm_clnt_node->netpm_links=(gs_smart_netpm_link **)
00172 calloc(nb_servers, sizeof(gs_smart_netpm_link *));
00173
00174 if(!netpm_clnt_node->netpm_links) return -1;
00175
00176
00177
00178
00179 for(i=0; i<netpm->nb_nodes;i++){
00180 if(!netpm->netpm_nodes[i]) return -1;
00181 netpm_node=netpm->netpm_nodes[i];
00182
00183 if(!netpm_node->server) return -1;
00184 this_server = netpm_node->server;
00185
00186
00187
00188 if(this_server->smart==1){
00189
00190
00191
00192
00193
00194 netpm_node->nb_links=this_server->nb_smart_servers;
00195 netpm_node->netpm_links=(gs_smart_netpm_link **)
00196 calloc(netpm_node->nb_links, sizeof(gs_smart_netpm_link *));
00197
00198 if(!netpm_node->netpm_links) return -1;
00199
00200
00201
00202
00203 netpm_node->netpm_links[0]=(gs_smart_netpm_link *)
00204 calloc(1, sizeof(gs_smart_netpm_link ));
00205
00206 if(!netpm_node->netpm_links[0]) return -1;
00207
00208
00209
00210
00211
00212
00213 int link_node=0;
00214 for(j=0; j<alloc_links; j++){
00215 if(!netpm->netpm_nodes[j]) return -1;
00216 if(!netpm->netpm_nodes[j]->server) return -1;
00217
00218 while(netpm->netpm_nodes[link_node]->server->smart==0){
00219 if(link_node>netpm->nb_nodes) return -1;
00220 link_node++;
00221 }
00222
00223 if(!netpm->netpm_nodes[link_node]) return -1;
00224 dest_netpm_node=netpm->netpm_nodes[link_node];
00225
00226 netpm_node->netpm_links[j+1]=(gs_smart_netpm_link *)
00227 calloc(1, sizeof(gs_smart_netpm_link));
00228
00229 if(!netpm_node->netpm_links[j+1]) return -1;
00230
00231
00232
00233
00234
00235 netpm_node->netpm_links[j+1]->linked_nodeA=netpm_node;
00236 netpm_node->netpm_links[j+1]->linked_nodeB=dest_netpm_node;
00237
00238
00239
00240
00241
00242 dest_netpm_node->netpm_links[alloc_links]=netpm_node->netpm_links[j+1];
00243 link_node++;
00244 }
00245 alloc_links++;
00246 }
00247 else{
00248
00249
00250
00251
00252
00253 netpm_node->nb_links=1;
00254 netpm_node->netpm_links=(gs_smart_netpm_link **)
00255 calloc(1, sizeof(gs_smart_netpm_link *));
00256
00257 if(!netpm_node->netpm_links) return -1;
00258
00259 netpm_node->netpm_links[0]=(gs_smart_netpm_link *)
00260 calloc(1, sizeof(gs_smart_netpm_link));
00261
00262 if(!netpm_node->netpm_links[0]) return -1;
00263 }
00264
00265
00266
00267
00268
00269 netpm_node->netpm_links[0]->linked_nodeA=netpm_node;
00270 netpm_node->netpm_links[0]->linked_nodeB=netpm_clnt_node;
00271
00272
00273
00274
00275
00276 netpm_clnt_node->netpm_links[i]=netpm_node->netpm_links[0];
00277 }
00278
00279 return 0;
00280 }
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295 int gs_smart_netpm_add_bw_to_links(gs_smart_netpm * netpm,
00296 gs_server_t ** server_list, int nb_servers){
00297 gs_server_t * this_server=NULL, * search_server=NULL;
00298 gs_smart_netpm_node * netpm_node;
00299 int i, j, k;
00300 int link_num=0;
00301 double ping_byte_size, bw;
00302 double avg_s_bw, avg_c_bw, s_sum, c_sum;
00303 int nnzs, nnzc;
00304 int nb_smart_servers=0;
00305
00306 if((!netpm) || (!server_list) || (nb_servers<=0)) return -1;
00307
00308 s_sum = c_sum = avg_c_bw = avg_s_bw = 0.0;
00309 nnzs = nnzc = 0;
00310
00311 ping_byte_size=(32*1024);
00312 for(i=0; i<netpm->nb_nodes;i++){
00313
00314 if(!netpm->netpm_nodes[i]) return -1;
00315 netpm_node=netpm->netpm_nodes[i];
00316
00317 if(!netpm_node->server) return -1;
00318 this_server = netpm_node->server;
00319
00320
00321
00322
00323
00324 if(!netpm_node->netpm_links[0]) return -1;
00325 netpm_node->netpm_links[0]->bw=this_server->comm_bw;
00326
00327 if(this_server->comm_bw != 0.0) {
00328 c_sum += this_server->comm_bw;
00329 nnzc++;
00330 }
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341 if(this_server->smart==1){
00342 for(j=0;j<this_server->nb_my_pings ;j++){
00343 link_num=1;
00344 for(k=0; k<netpm->nb_nodes; k++){
00345 if(!netpm->netpm_nodes[k]) return -1;
00346 if(!netpm->netpm_nodes[k]->server) return -1;
00347
00348 search_server=netpm->netpm_nodes[k]->server;
00349
00350 if(!this_server->my_pings[j]) return -1;
00351 if((!search_server->componentid) ||
00352 (!this_server->my_pings[j]->componentid) ) return -1;
00353
00354 if(memcmp(search_server->componentid,
00355 this_server->my_pings[j]->componentid, CID_LEN)==0){
00356
00357 bw =(ping_byte_size/this_server->my_pings[j]->comm_time);
00358 if(bw<0) bw=0.0;
00359
00360
00361 if(!netpm_node->netpm_links[link_num]) return -1;
00362
00363 netpm_node->netpm_links[link_num]->bw=bw;
00364 if(bw != 0.0){
00365 s_sum+=bw;
00366 nnzs++;
00367 }
00368
00369
00370
00371 }
00372 if((search_server->smart==1) && (i!=k)) link_num++;
00373 }
00374 }
00375 nb_smart_servers++;
00376 }
00377 }
00378 if(nb_smart_servers<2){
00379 netpm->avg_s_bw=0.0;
00380 }
00381 else{
00382 if(nnzs > 0){
00383 netpm->avg_s_bw=s_sum/(double) nnzs;
00384 }
00385 if(netpm->avg_s_bw == 0.0)
00386 netpm->avg_s_bw = 20000.0;
00387 }
00388
00389 if(nnzc > 0){
00390 netpm->avg_c_bw=c_sum/(double) nnzc;
00391 }
00392 if(netpm->avg_c_bw == 0.0)
00393 netpm->avg_c_bw = 20000.0;
00394
00395 return 0;
00396 }
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419 int gs_smart_netpm_aggregate_ping_info(gs_server_t ** server_list,
00420 int nb_servers){
00421 gs_server_pings_t * ping_info1, * ping_info2;
00422 int nb_smart_servers=0, i, j,k,l;
00423 int arr_cnt;
00424
00425 if((!server_list) || (nb_servers<=0)){
00426 ERRPRINTF("SMART: Server List is NULL\n");
00427 return -1;
00428 }
00429
00430
00431
00432
00433
00434
00435 for(i=0;i<nb_servers;i++){
00436 if(!server_list[i]) return -1;
00437 if(server_list[i]->smart==1){
00438
00439 if(!server_list[i]->my_ping_str) return -1;
00440
00441 if(gs_smart_decode_ping_info(server_list[i]->my_ping_str,
00442 &server_list[i]->my_pings,&server_list[i]->nb_my_pings)<0){
00443 ERRPRINTF("SMART: Error decoding smart info\n");
00444 return -1;
00445 }
00446 if(server_list[i]->my_ping_str) free(server_list[i]->my_ping_str);
00447 nb_smart_servers++;
00448 }
00449 }
00450
00451
00452
00453
00454
00455
00456
00457 for(i=0;i<nb_servers;i++){
00458 server_list[i]->nb_smart_servers=nb_smart_servers;
00459 server_list[i]->all_pings=(gs_server_pings_t **)
00460 calloc( nb_smart_servers, sizeof(gs_server_pings_t *));
00461
00462 if(!server_list[i]->all_pings){
00463 ERRPRINTF("SMART: Error allocating all ping\n");
00464 return -1;
00465 }
00466
00467 arr_cnt=0;
00468 for(j=0;j<nb_servers;j++){
00469 if(server_list[j]->smart==1){
00470 server_list[i]->all_pings[arr_cnt]=(gs_server_pings_t *)
00471 calloc(1, sizeof(gs_server_pings_t));
00472
00473 if (!server_list[i]->all_pings[arr_cnt]) {
00474 ERRPRINTF("SMART: Cannot malloc all ping %d\n", i);
00475 return -1;
00476 }
00477
00478 if(memcpy( server_list[i]->all_pings[arr_cnt]->componentid,
00479 server_list[j]->componentid, CID_LEN)<0){
00480 ERRPRINTF("SMART: Error copying component ids\n");
00481 return -1;
00482 }
00483
00484 if(!server_list[i]->all_pings[arr_cnt]->componentid){
00485 ERRPRINTF("SMART: Error copying component id data to ping data\n");
00486 return -1;
00487 }
00488
00489 server_list[i]->all_pings[arr_cnt]->comm_time=0.0;
00490 arr_cnt++;
00491 }
00492 }
00493 }
00494
00495
00496
00497
00498
00499
00500
00501 for(i=0;i<nb_servers;i++){
00502 if(!server_list[i]) return -1;
00503 if( server_list[i]->smart==1){
00504 for(j=0;j< server_list[i]->nb_my_pings;j++){
00505 for(k=0;k<nb_smart_servers;k++){
00506
00507 if((!server_list[i]->all_pings[k]) ||
00508 (!server_list[i]->my_pings[j])) return -1;
00509
00510 if((!server_list[i]->all_pings[k]->componentid) ||
00511 (!server_list[i]->my_pings[j]->componentid)) return -1;
00512
00513 if(memcmp(server_list[i]->all_pings[k]->componentid,
00514 server_list[i]->my_pings[j]->componentid, CID_LEN)==0){
00515 server_list[i]->all_pings[k]->comm_time=server_list[i]->my_pings[j]->comm_time;
00516 }
00517 }
00518 }
00519 }
00520 }
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530 for(i=0;i<nb_servers;i++){
00531 if(!server_list[i]) return -1;
00532 if( server_list[i]->smart==1){
00533 for(j=0;j< server_list[i]->nb_my_pings;j++){
00534
00535 for(k=0;k<nb_servers;k++){
00536
00537 if(!server_list[k]) return -1;
00538
00539 if( server_list[k]->smart==1){
00540
00541 if((!server_list[k]->componentid) ||
00542 (!server_list[i]->my_pings[j])) return -1;
00543
00544 if(!(server_list[i]->my_pings[j]->componentid)) return -1;
00545
00546 if(memcmp(server_list[k]->componentid,
00547 server_list[i]->my_pings[j]->componentid, CID_LEN)==0){
00548
00549
00550 for(l=0;l<nb_smart_servers;l++){
00551
00552 if((!server_list[i]->componentid) ||
00553 (!server_list[k]->all_pings[l])) return -1;
00554
00555 if(!server_list[k]->all_pings[l]->componentid) return -1;
00556
00557 if(memcmp(server_list[i]->componentid,
00558 server_list[k]->all_pings[l]->componentid, CID_LEN)==0){
00559 ping_info1=server_list[k]->all_pings[l];
00560 ping_info2=server_list[i]->my_pings[j];
00561 ping_info1->comm_time=ping_info2->comm_time;
00562 }
00563 }
00564 }
00565 }
00566 }
00567 }
00568 }
00569 }
00570 return 0;
00571 }
00572
00573
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585 int
00586 gs_smart_netpm_measure_client_links_perf(gs_server_t ** server_list, int nb_servers)
00587 {
00588
00589 int i, len, ns, nping;
00590 char *msg;
00591 int gs_max_ping=5;
00592 int grpc_measure_comm_num_servers = gs_max_ping;
00593 double elapsed_time, *comm_bw;
00594 char *srv_cid;
00595 char *cid_dup;
00596 srv_cid=(char *)calloc(2*CID_LEN+1, sizeof(char));
00597 icl_hash_t *grpc_comm_cache = NULL;
00598 ns = MIN(nb_servers, grpc_measure_comm_num_servers);
00599 len = 32 * 1024;
00600
00601 msg = (char *)calloc(len, 1);
00602
00603 if(!msg)
00604 return -1;
00605
00606 nping = 0;
00607
00608 for(i=0;i<nb_servers;i++) {
00609
00610 proxy_cid_to_str(srv_cid, server_list[i]->componentid);
00611
00612 if(grpc_comm_cache)
00613 comm_bw = (double *) icl_hash_find(grpc_comm_cache, srv_cid);
00614 else
00615 comm_bw = NULL;
00616
00617 if(comm_bw) {
00618 server_list[i]->comm_bw = *comm_bw;
00619 }
00620 else {
00621 if(gs_do_ping(server_list[i], msg, len, &elapsed_time) < 0) {
00622 server_list[i]->comm_bw = 0.0;
00623 }
00624 else {
00625 server_list[i]->comm_bw = len / elapsed_time;
00626
00627 if(grpc_comm_cache) {
00628 comm_bw = (double *)malloc(sizeof(double));
00629
00630 if(comm_bw) {
00631
00632 cid_dup = strdup(srv_cid);
00633
00634 if(cid_dup) {
00635 free(comm_bw);
00636 }
00637 else {
00638 *comm_bw = server_list[i]->comm_bw;
00639
00640 icl_hash_insert(grpc_comm_cache, cid_dup, comm_bw);
00641 }
00642 }
00643 }
00644 }
00645
00646 nping++;
00647 }
00648
00649 if(nping == ns)
00650 break;
00651 }
00652
00653 if(grpc_comm_cache) free(grpc_comm_cache);
00654 if(msg) free(msg);
00655 if(srv_cid) free(srv_cid);
00656 return 0;
00657
00658 }
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682 int
00683 gs_smart_net_pm_print_to_dotgraph(gs_smart_netpm * netpm, char * filename){
00684 gs_smart_netpm_node * netpm_node;
00685 gs_server_t * server=NULL;
00686 FILE *fp;
00687 int i, j;
00688
00689
00690 char *gridsolve_root;
00691 if((!netpm) || (!filename)) return -1;
00692
00693 if((gridsolve_root = getenv("GRIDSOLVE_ROOT")) == NULL)
00694 gridsolve_root = GRIDSOLVE_TOP_BUILD_DIR;
00695 if(!gridsolve_root) {
00696 ERRPRINTF("Warning: GRIDSOLVE_ROOT unknown, assuming cwd.\n");
00697 gridsolve_root = strdup(".");
00698 }
00699
00700 fp = fopen(filename, "w");
00701 if(fp<0){
00702 ERRPRINTF("SMART: Error opening file\n");
00703 return -1;
00704 }
00705
00706 fprintf(fp, "digraph Net_PM {\n");
00707 fprintf(fp, " client [shape=\"diamond\", style=\"filled\", fillcolor=\"#ADD8E6\" fontsize=\"15.0\"];\n");
00708 for(i=0; i<netpm->nb_nodes;i++){
00709
00710 if(!netpm->netpm_nodes[i]) return -1;
00711 netpm_node=netpm->netpm_nodes[i];
00712
00713 if(!netpm_node->server) return -1;
00714
00715 server=netpm_node->server;
00716 if(server->smart==1){
00717 fprintf(fp, " \"%s_%d\" [shape=box,label=\"%s\\nport=%d\\nkflops=%d\\nworkload=%d\\nsmart=%d\", style=filled,fillcolor=\"firebrick1\",fontsize=15.0];\n", server->hostname, server->port, server->hostname, server->port, server->kflops, server->workload, server->smart);
00718
00719 for(j=1;j<netpm_node->nb_links;j++){
00720
00721 if(!netpm_node->netpm_links[j]) return -1;
00722 if((!netpm_node->netpm_links[j]->linked_nodeB) ||
00723 (!netpm_node->netpm_links[j]->linked_nodeA)) return -1;
00724
00725 if(!netpm_node->netpm_links[j]->linked_nodeA->server) return -1;
00726
00727 if(!netpm_node->netpm_links[j]->linked_nodeA->server->hostname) return -1;
00728
00729
00730
00731 if( netpm_node->netpm_links[j]->linked_nodeB->id== netpm_node->id){
00732 fprintf(fp, " \"%s_%d\" -> \"%s_%d\" [label=\"bw=%f\", color=\"red\"];\n", server->hostname, server->port, netpm_node->netpm_links[j]->linked_nodeA->server->hostname, netpm_node->netpm_links[j]->linked_nodeA->server->port, netpm_node->netpm_links[j]->bw);
00733 }
00734 }
00735 }
00736 else{
00737 if(!server->hostname) return -1;
00738 fprintf(fp, " \"%s_%d\" [shape=box,label=\"%s\\nport=%d\\nkflops=%d\\nworkload=%d\\nsmart=%d\", style=filled,fillcolor=\"deepskyblue\",fontsize=15.0];\n", server->hostname, server->port, server->hostname, server->port, server->kflops, server->workload, server->smart);
00739
00740
00741 }
00742 if(!server->hostname) return -1;
00743 if(!netpm_node->netpm_links[0]) return -1;
00744
00745 fprintf(fp, " client -> \"%s_%d\" [label=\"bw=%f\"];\n", server->hostname, server->port, netpm_node->netpm_links[0]->bw);
00746
00747 }
00748
00749 fprintf(fp, "\n}\n");
00750 fclose(fp);
00751 return 0;
00752 }
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775 int gs_smart_netpm_free(gs_smart_netpm * netpm){
00776 int i, j;
00777 gs_smart_netpm_node * netpm_node;
00778 if(!netpm) return -1;
00779
00780
00781
00782
00783 if(netpm->netpm_clnt_node){
00784 netpm_node=netpm->netpm_clnt_node;
00785
00786 if(!netpm_node->netpm_links) return -1;
00787 free(netpm_node->netpm_links);
00788 free(netpm_node);
00789 }
00790
00791
00792
00793
00794
00795
00796 for(i=0; i<netpm->nb_nodes;i++){
00797 if(!netpm->netpm_nodes[i]) return -1;
00798 netpm_node=netpm->netpm_nodes[i];
00799 for(j=0; j<netpm_node->nb_links;j++){
00800 if(!netpm_node->netpm_links[j]) return -1;
00801 free(netpm_node->netpm_links[j]);
00802 }
00803 if(!netpm_node->netpm_links) return -1;
00804 free(netpm_node->netpm_links);
00805 free(netpm_node);
00806 }
00807
00808 free(netpm);
00809
00810
00811 return 0;
00812 }
00813
00814
00815
00816
00817 #endif
00818