memset
# numademo 1G memset 2 nodes available Cannot determine CPU cache size memory with no policy memset Avg 26927.56 MB/s Max 26989.29 MB/s Min 26822.09 MB/s local memory memset Avg 26884.54 MB/s Max 26954.06 MB/s Min 26847.57 MB/s memory interleaved on all nodes memset Avg 13196.72 MB/s Max 14128.55 MB/s Min 11570.00 MB/s memory on node 1 memset Avg 26610.44 MB/s Max 26687.42 MB/s Min 26518.69 MB/s memory on node 3 memset Avg 10000.71 MB/s Max 10138.34 MB/s Min 9965.12 MB/s memory interleaved on 1 memset Avg 26867.66 MB/s Max 26927.02 MB/s Min 26774.60 MB/s setting preferred node to 1 memory without policy memset Avg 26890.20 MB/s Max 26921.62 MB/s Min 26841.53 MB/s setting preferred node to 3 memory without policy memset Avg 9969.95 MB/s Max 9975.68 MB/s Min 9965.49 MB/s manual interleaving to all nodes memset Avg 13600.29 MB/s Max 13647.29 MB/s Min 13560.09 MB/s manual interleaving on node 0/1 memset Avg 26916.15 MB/s Max 26973.02 MB/s Min 26859.66 MB/s current interleave node 1 running on node 1, preferred node 0 local memory memset Avg 26955.48 MB/s Max 26981.83 MB/s Min 26919.59 MB/s memory interleaved on all nodes memset Avg 13709.41 MB/s Max 13858.49 MB/s Min 13631.35 MB/s memory interleaved on node 0/1 memset Avg 25967.22 MB/s Max 26047.15 MB/s Min 25860.83 MB/s alloc on node 3 memset Avg 9561.23 MB/s Max 9575.35 MB/s Min 9549.13 MB/s local allocation memset Avg 25924.52 MB/s Max 26121.29 MB/s Min 25348.61 MB/s setting wrong preferred node memset Avg 9747.35 MB/s Max 9861.43 MB/s Min 9727.86 MB/s setting correct preferred node memset Avg 26019.13 MB/s Max 26105.41 MB/s Min 25932.66 MB/s running on node 3, preferred node 0 local memory memset Avg 28741.57 MB/s Max 29209.52 MB/s Min 28157.81 MB/s memory interleaved on all nodes memset Avg 13450.03 MB/s Max 14381.18 MB/s Min 11495.43 MB/s memory interleaved on node 0/1 memset Avg 9307.51 MB/s Max 9323.27 MB/s Min 9279.75 MB/s alloc on node 1 memset Avg 9314.34 MB/s Max 9323.75 MB/s Min 9301.86 MB/s local 
allocation memset Avg 28251.23 MB/s Max 28531.92 MB/s Min 28154.12 MB/s setting wrong preferred node memset Avg 9461.36 MB/s Max 9470.13 MB/s Min 9434.68 MB/s setting correct preferred node memset Avg 28129.26 MB/s Max 28158.55 MB/s Min 28111.37 MB/s
memcpy
# numademo 1G memcpy 2 nodes available Cannot determine CPU cache size memory with no policy memcpy Avg 12365.35 MB/s Max 13271.64 MB/s Min 9305.17 MB/s local memory memcpy Avg 47384.48 MB/s Max 47550.68 MB/s Min 47007.35 MB/s memory interleaved on all nodes memcpy Avg 26573.56 MB/s Max 26950.00 MB/s Min 26323.01 MB/s memory on node 1 memcpy Avg 38779.62 MB/s Max 39570.36 MB/s Min 34569.92 MB/s memory on node 3 memcpy Avg 22022.94 MB/s Max 22042.66 MB/s Min 21980.84 MB/s memory interleaved on 1 memcpy Avg 39356.43 MB/s Max 39516.48 MB/s Min 39244.95 MB/s setting preferred node to 1 memory without policy memcpy Avg 11652.55 MB/s Max 11743.23 MB/s Min 11595.48 MB/s setting preferred node to 3 memory without policy memcpy Avg 6007.48 MB/s Max 6028.94 MB/s Min 5976.79 MB/s manual interleaving to all nodes memcpy Avg 8116.78 MB/s Max 8228.73 MB/s Min 8056.17 MB/s manual interleaving on node 0/1 memcpy Avg 11643.77 MB/s Max 11685.97 MB/s Min 11598.99 MB/s current interleave node 1 running on node 1, preferred node 0 local memory memcpy Avg 47246.87 MB/s Max 47351.47 MB/s Min 47120.81 MB/s memory interleaved on all nodes memcpy Avg 25628.93 MB/s Max 25777.64 MB/s Min 25511.22 MB/s memory interleaved on node 0/1 memcpy Avg 47260.18 MB/s Max 47326.42 MB/s Min 47191.22 MB/s alloc on node 3 memcpy Avg 5492.49 MB/s Max 5505.41 MB/s Min 5476.18 MB/s local allocation memcpy Avg 13273.64 MB/s Max 13317.07 MB/s Min 13228.96 MB/s setting wrong preferred node memcpy Avg 5534.01 MB/s Max 5570.10 MB/s Min 5471.49 MB/s setting correct preferred node memcpy Avg 13328.18 MB/s Max 13360.32 MB/s Min 13280.17 MB/s running on node 3, preferred node 0 local memory memcpy Avg 51844.04 MB/s Max 52146.17 MB/s Min 51577.57 MB/s memory interleaved on all nodes memcpy Avg 25641.60 MB/s Max 25901.38 MB/s Min 25498.50 MB/s memory interleaved on node 0/1 memcpy Avg 16918.19 MB/s Max 17028.65 MB/s Min 16322.23 MB/s alloc on node 1 memcpy Avg 5488.25 MB/s Max 5529.25 MB/s Min 5456.53 MB/s local 
allocation memcpy Avg 13455.33 MB/s Max 13481.26 MB/s Min 13429.83 MB/s setting wrong preferred node memcpy Avg 5479.94 MB/s Max 5521.80 MB/s Min 5437.69 MB/s setting correct preferred node memcpy Avg 13438.35 MB/s Max 13476.86 MB/s Min 13372.96 MB/s
forward
# numademo 1G forward 2 nodes available Cannot determine CPU cache size memory with no policy forward Avg 10345.84 MB/s Max 10364.20 MB/s Min 10335.87 MB/s local memory forward Avg 8701.35 MB/s Max 10354.61 MB/s Min 5448.94 MB/s memory interleaved on all nodes forward Avg 7890.15 MB/s Max 7908.36 MB/s Min 7879.34 MB/s memory on node 1 forward Avg 11143.32 MB/s Max 11190.99 MB/s Min 10864.86 MB/s memory on node 3 forward Avg 5448.25 MB/s Max 5450.66 MB/s Min 5443.20 MB/s memory interleaved on 1 forward Avg 10441.04 MB/s Max 11178.99 MB/s Min 7101.42 MB/s setting preferred node to 1 memory without policy forward Avg 11136.77 MB/s Max 11183.65 MB/s Min 10795.06 MB/s setting preferred node to 3 memory without policy forward Avg 5425.09 MB/s Max 5445.08 MB/s Min 5372.20 MB/s manual interleaving to all nodes forward Avg 7882.31 MB/s Max 7902.42 MB/s Min 7788.24 MB/s manual interleaving on node 0/1 forward Avg 11173.43 MB/s Max 11185.74 MB/s Min 11162.95 MB/s current interleave node 1 running on node 1, preferred node 0 local memory forward Avg 10676.51 MB/s Max 10698.90 MB/s Min 10579.68 MB/s memory interleaved on all nodes forward Avg 7929.68 MB/s Max 7932.08 MB/s Min 7925.70 MB/s memory interleaved on node 0/1 forward Avg 10683.54 MB/s Max 10690.70 MB/s Min 10666.70 MB/s alloc on node 3 forward Avg 5154.34 MB/s Max 5185.58 MB/s Min 5111.26 MB/s local allocation forward Avg 10693.20 MB/s Max 10704.13 MB/s Min 10680.60 MB/s setting wrong preferred node forward Avg 5197.30 MB/s Max 5242.39 MB/s Min 5063.46 MB/s setting correct preferred node forward Avg 10321.32 MB/s Max 10564.79 MB/s Min 9863.69 MB/s running on node 3, preferred node 0 local memory forward Avg 10257.98 MB/s Max 10415.48 MB/s Min 10195.43 MB/s memory interleaved on all nodes forward Avg 7922.00 MB/s Max 7931.32 MB/s Min 7886.46 MB/s memory interleaved on node 0/1 forward Avg 5256.75 MB/s Max 5259.03 MB/s Min 5254.40 MB/s alloc on node 1 forward Avg 5254.91 MB/s Max 5257.49 MB/s Min 5252.52 MB/s local 
allocation forward Avg 10398.64 MB/s Max 10429.44 MB/s Min 10316.11 MB/s setting wrong preferred node forward Avg 5251.63 MB/s Max 5259.11 MB/s Min 5240.17 MB/s setting correct preferred node forward Avg 10210.21 MB/s Max 10321.66 MB/s Min 10062.71 MB/s
backward
# numademo 1G backward 2 nodes available Cannot determine CPU cache size memory with no policy backward Avg 11374.24 MB/s Max 11405.31 MB/s Min 11307.31 MB/s local memory backward Avg 9176.80 MB/s Max 11528.26 MB/s Min 6193.14 MB/s memory interleaved on all nodes backward Avg 8696.17 MB/s Max 8740.55 MB/s Min 8539.59 MB/s memory on node 1 backward Avg 12086.99 MB/s Max 12101.50 MB/s Min 12063.97 MB/s memory on node 3 backward Avg 6256.84 MB/s Max 6259.53 MB/s Min 6252.50 MB/s memory interleaved on 1 backward Avg 12051.88 MB/s Max 12107.91 MB/s Min 11955.04 MB/s setting preferred node to 1 memory without policy backward Avg 11744.63 MB/s Max 11985.20 MB/s Min 10492.32 MB/s setting preferred node to 3 memory without policy backward Avg 6228.39 MB/s Max 6232.18 MB/s Min 6222.82 MB/s manual interleaving to all nodes backward Avg 8369.83 MB/s Max 8728.12 MB/s Min 7170.37 MB/s manual interleaving on node 0/1 backward Avg 11720.50 MB/s Max 11970.77 MB/s Min 11542.38 MB/s current interleave node 1 running on node 1, preferred node 0 local memory backward Avg 11772.67 MB/s Max 11956.77 MB/s Min 11313.14 MB/s memory interleaved on all nodes backward Avg 8859.78 MB/s Max 8872.21 MB/s Min 8839.27 MB/s memory interleaved on node 0/1 backward Avg 11917.91 MB/s Max 11927.55 MB/s Min 11907.58 MB/s alloc on node 3 backward Avg 5936.62 MB/s Max 5963.34 MB/s Min 5930.01 MB/s local allocation backward Avg 12051.57 MB/s Max 12081.21 MB/s Min 11919.74 MB/s setting wrong preferred node backward Avg 5958.82 MB/s Max 5974.86 MB/s Min 5930.21 MB/s setting correct preferred node backward Avg 12053.91 MB/s Max 12066.28 MB/s Min 12040.84 MB/s running on node 3, preferred node 0 local memory backward Avg 10952.35 MB/s Max 11505.03 MB/s Min 9706.58 MB/s memory interleaved on all nodes backward Avg 8835.80 MB/s Max 8863.35 MB/s Min 8802.89 MB/s memory interleaved on node 0/1 backward Avg 6034.12 MB/s Max 6048.23 MB/s Min 6029.31 MB/s alloc on node 1 backward Avg 6021.08 MB/s Max 6036.02 MB/s Min 
5938.25 MB/s local allocation backward Avg 11489.88 MB/s Max 11546.23 MB/s Min 11413.92 MB/s setting wrong preferred node backward Avg 6027.94 MB/s Max 6032.97 MB/s Min 6010.58 MB/s setting correct preferred node backward Avg 11558.66 MB/s Max 11563.39 MB/s Min 11549.21 MB/s
stream
# numademo 1G stream 2 nodes available Cannot determine CPU cache size memory with no policy STREAM Copy 12495.89 MB/s Scale 12377.05 MB/s Add 13299.27 MB/s Triad 13203.44 MB/s local memory STREAM Copy 12554.67 MB/s Scale 12572.49 MB/s Add 13398.67 MB/s Triad 13337.08 MB/s memory interleaved on all nodes STREAM Copy 8636.82 MB/s Scale 8579.04 MB/s Add 8451.73 MB/s Triad 8439.11 MB/s memory on node 1 STREAM Copy 12487.00 MB/s Scale 12441.83 MB/s Add 13297.11 MB/s Triad 13227.63 MB/s memory on node 3 STREAM Copy 5664.64 MB/s Scale 5788.39 MB/s Add 6105.62 MB/s Triad 6123.11 MB/s memory interleaved on 1 STREAM Copy 12485.44 MB/s Scale 12490.22 MB/s Add 13292.99 MB/s Triad 13227.32 MB/s setting preferred node to 1 memory without policy STREAM Copy 12499.84 MB/s Scale 12449.00 MB/s Add 13306.70 MB/s Triad 13209.59 MB/s setting preferred node to 3 memory without policy STREAM Copy 5768.75 MB/s Scale 5820.65 MB/s Add 6029.41 MB/s Triad 6084.35 MB/s manual interleaving to all nodes STREAM Copy 8630.78 MB/s Scale 8631.60 MB/s Add 8462.45 MB/s Triad 8465.79 MB/s manual interleaving on node 0/1 STREAM Copy 12532.87 MB/s Scale 12471.96 MB/s Add 13354.17 MB/s Triad 13242.34 MB/s current interleave node 1 running on node 1, preferred node 0 local memory STREAM Copy 12459.18 MB/s Scale 12490.69 MB/s Add 13322.21 MB/s Triad 13247.24 MB/s memory interleaved on all nodes STREAM Copy 8701.27 MB/s Scale 8631.83 MB/s Add 8433.28 MB/s Triad 8436.93 MB/s memory interleaved on node 0/1 STREAM Copy 12528.69 MB/s Scale 12526.07 MB/s Add 13376.82 MB/s Triad 13308.31 MB/s alloc on node 3 STREAM Copy 5600.20 MB/s Scale 5767.46 MB/s Add 6017.36 MB/s Triad 6008.52 MB/s local allocation STREAM Copy 12436.62 MB/s Scale 12364.92 MB/s Add 13224.22 MB/s Triad 13100.31 MB/s setting wrong preferred node STREAM Copy 5686.68 MB/s Scale 5796.37 MB/s Add 6031.99 MB/s Triad 6039.48 MB/s setting correct preferred node STREAM Copy 12469.58 MB/s Scale 12422.37 MB/s Add 13309.45 MB/s Triad 13198.06 MB/s running 
on node 3, preferred node 0 local memory STREAM Copy 12308.35 MB/s Scale 12295.39 MB/s Add 13311.81 MB/s Triad 13232.22 MB/s memory interleaved on all nodes STREAM Copy 8744.53 MB/s Scale 8660.44 MB/s Add 8469.46 MB/s Triad 8487.08 MB/s memory interleaved on node 0/1 STREAM Copy 5601.28 MB/s Scale 5716.38 MB/s Add 6033.88 MB/s Triad 6054.60 MB/s alloc on node 1 STREAM Copy 5670.15 MB/s Scale 5778.17 MB/s Add 6057.92 MB/s Triad 6069.18 MB/s local allocation STREAM Copy 12488.51 MB/s Scale 12474.60 MB/s Add 13312.29 MB/s Triad 13225.07 MB/s setting wrong preferred node STREAM Copy 5579.89 MB/s Scale 5641.59 MB/s Add 5976.38 MB/s Triad 5986.72 MB/s setting correct preferred node STREAM Copy 12473.25 MB/s Scale 12468.49 MB/s Add 13258.20 MB/s Triad 13157.95 MB/s
random2
# numademo 1m random2 2 nodes available Cannot determine CPU cache size memory with no policy random2 Avg 408.63 MB/s Max 415.94 MB/s Min 405.01 MB/s local memory random2 Avg 367.33 MB/s Max 369.74 MB/s Min 365.87 MB/s memory interleaved on all nodes random2 Avg 395.76 MB/s Max 397.19 MB/s Min 392.87 MB/s memory on node 1 random2 Avg 491.92 MB/s Max 493.68 MB/s Min 486.58 MB/s memory on node 3 random2 Avg 358.87 MB/s Max 360.71 MB/s Min 356.42 MB/s memory interleaved on 1 random2 Avg 492.75 MB/s Max 495.08 MB/s Min 487.94 MB/s setting preferred node to 1 memory without policy random2 Avg 492.75 MB/s Max 494.15 MB/s Min 491.14 MB/s setting preferred node to 3 memory without policy random2 Avg 364.29 MB/s Max 382.97 MB/s Min 354.97 MB/s manual interleaving to all nodes random2 Avg 393.17 MB/s Max 420.61 MB/s Min 353.17 MB/s manual interleaving on node 0/1 random2 Avg 449.09 MB/s Max 489.99 MB/s Min 392.87 MB/s current interleave node 1 running on node 1, preferred node 0 local memory random2 Avg 568.40 MB/s Max 633.58 MB/s Min 299.34 MB/s memory interleaved on all nodes random2 Avg 489.83 MB/s Max 491.14 MB/s Min 487.71 MB/s memory interleaved on node 0/1 random2 Avg 630.23 MB/s Max 634.73 MB/s Min 620.83 MB/s alloc on node 3 random2 Avg 429.55 MB/s Max 436.54 MB/s Min 408.01 MB/s local allocation random2 Avg 631.94 MB/s Max 636.66 MB/s Min 630.53 MB/s setting wrong preferred node random2 Avg 433.30 MB/s Max 435.46 MB/s Min 431.34 MB/s setting correct preferred node random2 Avg 633.20 MB/s Max 636.27 MB/s Min 627.14 MB/s running on node 3, preferred node 0 local memory random2 Avg 621.53 MB/s Max 624.90 MB/s Min 609.28 MB/s memory interleaved on all nodes random2 Avg 499.49 MB/s Max 501.47 MB/s Min 491.60 MB/s memory interleaved on node 0/1 random2 Avg 447.48 MB/s Max 449.07 MB/s Min 442.06 MB/s alloc on node 1 random2 Avg 445.18 MB/s Max 447.54 MB/s Min 441.51 MB/s local allocation random2 Avg 619.84 MB/s Max 624.90 MB/s Min 615.36 MB/s setting wrong preferred node 
random2 Avg 447.08 MB/s Max 449.84 MB/s Min 444.12 MB/s setting correct preferred node random2 Avg 620.64 MB/s Max 623.78 MB/s Min 616.81 MB/s
ptrchase
# numademo 1m ptrchase 2 nodes available Cannot determine CPU cache size memory with no policy ptrchase Avg 111550.64 MB/s Max 116508.44 MB/s Min 104857.60 MB/s local memory ptrchase Avg 109226.67 MB/s Max 116508.44 MB/s Min 95325.09 MB/s memory interleaved on all nodes ptrchase Avg 151967.54 MB/s Max 174762.67 MB/s Min 149796.57 MB/s memory on node 1 ptrchase Avg 218453.33 MB/s Max 262144.00 MB/s Min 209715.20 MB/s memory on node 3 ptrchase Avg 104857.60 MB/s Max 116508.44 MB/s Min 95325.09 MB/s memory interleaved on 1 ptrchase Avg 209715.20 MB/s Max 209715.20 MB/s Min 209715.20 MB/s setting preferred node to 1 memory without policy ptrchase Avg 209715.20 MB/s Max 262144.00 MB/s Min 174762.67 MB/s setting preferred node to 3 memory without policy ptrchase Avg 183960.70 MB/s Max 262144.00 MB/s Min 74898.29 MB/s manual interleaving to all nodes ptrchase Avg 151967.54 MB/s Max 174762.67 MB/s Min 149796.57 MB/s manual interleaving on node 0/1 ptrchase Avg 132731.14 MB/s Max 149796.57 MB/s Min 116508.44 MB/s current interleave node 1 running on node 1, preferred node 0 local memory ptrchase Avg 183960.70 MB/s Max 209715.20 MB/s Min 174762.67 MB/s memory interleaved on all nodes ptrchase Avg 194180.74 MB/s Max 209715.20 MB/s Min 174762.67 MB/s memory interleaved on node 0/1 ptrchase Avg 177724.75 MB/s Max 209715.20 MB/s Min 149796.57 MB/s alloc on node 3 ptrchase Avg 213995.10 MB/s Max 262144.00 MB/s Min 209715.20 MB/s local allocation ptrchase Avg 180788.97 MB/s Max 209715.20 MB/s Min 174762.67 MB/s setting wrong preferred node ptrchase Avg 213995.10 MB/s Max 262144.00 MB/s Min 209715.20 MB/s setting correct preferred node ptrchase Avg 177724.75 MB/s Max 209715.20 MB/s Min 174762.67 MB/s running on node 3, preferred node 0 local memory ptrchase Avg 174762.67 MB/s Max 174762.67 MB/s Min 174762.67 MB/s memory interleaved on all nodes ptrchase Avg 205603.14 MB/s Max 209715.20 MB/s Min 174762.67 MB/s memory interleaved on node 0/1 ptrchase Avg 238312.73 MB/s Max 262144.00 
MB/s Min 209715.20 MB/s alloc on node 1 ptrchase Avg 197844.53 MB/s Max 262144.00 MB/s Min 104857.60 MB/s local allocation ptrchase Avg 190650.18 MB/s Max 209715.20 MB/s Min 174762.67 MB/s setting wrong preferred node ptrchase Avg 227951.30 MB/s Max 262144.00 MB/s Min 209715.20 MB/s setting correct preferred node ptrchase Avg 190650.18 MB/s Max 209715.20 MB/s Min 174762.67 MB/s
附:
stream关键源码及算法说明(stream_lib.c)
Copy算法
/*
 * Copy kernel, trial k: store the start timestamp in times[0][k], copy
 * a[] into c[] one element at a time over N elements, then overwrite the
 * stored timestamp with the elapsed time (end - start).
 * NOTE(review): mysecond() is not shown here; presumably it returns the
 * current time in seconds -- confirm against stream_lib.c.
 */
times[0][k] = mysecond();
for (j = 0; j < N; j++)
c[j] = a[j];
times[0][k] = mysecond() - times[0][k];
Scale算法
/*
 * Scale kernel, trial k: store the start timestamp in times[1][k], write
 * scalar * c[j] into b[j] for each of the N elements, then overwrite the
 * stored timestamp with the elapsed time (end - start).
 */
times[1][k] = mysecond();
for (j = 0; j < N; j++)
b[j] = scalar * c[j];
times[1][k] = mysecond() - times[1][k];
Add算法
/*
 * Add kernel, trial k: store the start timestamp in times[2][k], write
 * a[j] + b[j] into c[j] for each of the N elements, then overwrite the
 * stored timestamp with the elapsed time (end - start).
 */
times[2][k] = mysecond();
for (j = 0; j < N; j++)
c[j] = a[j] + b[j];
times[2][k] = mysecond() - times[2][k];
Triad算法
/*
 * Triad kernel, trial k: store the start timestamp in times[3][k], write
 * b[j] + scalar * c[j] into a[j] for each of the N elements, then
 * overwrite the stored timestamp with the elapsed time (end - start).
 */
times[3][k] = mysecond();
for (j = 0; j < N; j++)
a[j] = b[j] + scalar * c[j];
times[3][k] = mysecond() - times[3][k];
numademo.c
#define _GNU_SOURCE 1 #include#include #include #include #include #include "numa.h" #ifdef HAVE_STREAM_LIB #include "stream_lib.h" #endif #ifdef HAVE_MT #include "mt.h" #endif #ifdef HAVE_CLEAR_CACHE #include "clearcache.h" #else static inline void clearcache(void *a, unsigned size) {} #endif #define FRACT_NODES 8 #define FRACT_MASKS 32 int fract_nodes; int *node_to_use; unsigned long msize; enum { CACHELINESIZE = 64, }; enum test { MEMSET = 0, MEMCPY, FORWARD, BACKWARD, STREAM, RANDOM2, PTRCHASE, } thistest; char *delim = " "; int force; int regression_testing=0; char *testname[] = { "memset", "memcpy", "forward", "backward", #ifdef HAVE_STREAM_LIB "stream", #endif #ifdef HAVE_MT "random2", #endif "ptrchase", NULL, }; void output(char *title, char *result) { if (!isspace(delim[0])) printf("%s%s%sn", title,delim, result); else printf("%-42s%sn", title, result); } #ifdef HAVE_STREAM_LIB void do_stream(char *name, unsigned char *mem) { int i; char title[100], buf[100]; double res[STREAM_NRESULTS]; stream_verbose = 0; clearcache(mem, msize); stream_init(mem); stream_test(res); sprintf(title, "%s%s%s", name, delim, "STREAM"); buf[0] = ' '; for (i = 0; i < STREAM_NRESULTS; i++) { if (buf[0]) strcat(buf,delim); sprintf(buf+strlen(buf), "%s%s%.2f%sMB/s", stream_names[i], delim, res[i], delim); } output(title, buf); clearcache(mem, msize); } #endif union node { union node *next; struct { unsigned nexti; unsigned val; }; }; static int cmp_node(const void *ap, const void *bp) { union node *a = (union node *)ap; union node *b = (union node *)bp; return a->val - b->val; } void **ptrchase_init(unsigned char *mem) { long i; union node *nodes = (union node *)mem; long nmemb = msize / sizeof(union node); srand(1234); for (i = 0; i < nmemb; i++) { nodes[i].val = rand(); nodes[i].nexti = i + 1; } qsort(nodes, nmemb, sizeof(union node), cmp_node); for (i = 0; i < nmemb; i++) { union node *n = &nodes[i]; n->next = n->nexti >= nmemb ? 
NULL : &nodes[n->nexti]; } return (void **)nodes; } static inline unsigned long long timerfold(struct timeval *tv) { return tv->tv_sec * 1000000ULL + tv->tv_usec; } #define LOOPS 10 void memtest(char *name, unsigned char *mem) { long k; struct timeval start, end, res; unsigned long long max, min, sum, r; int i; char title[128], result[128]; if (!mem) { fprintf(stderr, "Failed to allocate %lu bytes of memory. Test "%s" exits.n", msize, name); return; } #ifdef HAVE_STREAM_LIB if (thistest == STREAM) { do_stream(name, mem); goto out; } #endif max = 0; min = ~0UL; sum = 0; for (i = 0; i < LOOPS+1; i++) { clearcache(mem, msize); switch (thistest) { case PTRCHASE: { void **ptr; ptr = ptrchase_init(mem); gettimeofday(&start,NULL); while (*ptr) ptr = (void **)*ptr; gettimeofday(&end,NULL); *ptr = "bla"; break; } case MEMSET: gettimeofday(&start,NULL); memset(mem, 0xff, msize); gettimeofday(&end,NULL); break; case MEMCPY: gettimeofday(&start,NULL); memcpy(mem, mem + msize/2, msize/2); gettimeofday(&end,NULL); break; case FORWARD: gettimeofday(&start,NULL); for (k = 0; k < msize; k+=CACHELINESIZE) mem[k]++; gettimeofday(&end,NULL); break; case BACKWARD: gettimeofday(&start,NULL); for (k = msize-5; k > 0; k-=CACHELINESIZE) mem[k]--; gettimeofday(&end,NULL); break; #ifdef HAVE_MT case RANDOM2: { unsigned * __restrict m = (unsigned *)mem; unsigned max = msize / sizeof(unsigned); unsigned mask; mt_init(); mask = 1; while (mask < max) mask = (mask << 1) | 1; gettimeofday(&start,NULL); for (k = 0; k < max; k++) { unsigned idx = mt_random() & mask; if (idx >= max) idx -= max; m[idx]++; } gettimeofday(&end,NULL); } #endif default: break; } if (!i) continue; timersub(&end, &start, &res); r = timerfold(&res); if (r > max) max = r; if (r < min) min = r; sum += r; } sprintf(title, "%s%s%s", name, delim, testname[thistest]); #define H(t) (((double)msize) / ((double)t)) #define D3 delim,delim,delim sprintf(result, "Avg%s%.2f%sMB/s%sMax%s%.2f%sMB/s%sMin%s%.2f%sMB/s", delim, H(sum/LOOPS), 
D3, H(min), D3, H(max), delim); #undef H #undef D3 output(title,result); #ifdef HAVE_STREAM_LIB out: #endif clearcache(mem, msize); numa_free(mem, msize); } int popcnt(unsigned long val) { int i = 0, cnt = 0; while (val >> i) { if ((1UL << i) & val) cnt++; i++; } return cnt; } int max_node, numnodes; int get_node_list(void) { int a, got_nodes = 0; long long free_node_sizes; numnodes = numa_num_configured_nodes(); node_to_use = (int *)malloc(numnodes * sizeof(int)); max_node = numa_max_node(); for (a = 0; a <= max_node; a++) { if (numa_node_size(a, &free_node_sizes) > 0) node_to_use[got_nodes++] = a; } if(got_nodes != numnodes) return -1; return 0; } void test(enum test type) { unsigned long mask; int i, k; char buf[512]; struct bitmask *nodes; nodes = numa_allocate_nodemask(); thistest = type; if (regression_testing) { printf("nTest %s doing 1 of %d nodes and 1 of %d masks.n", testname[thistest], fract_nodes, FRACT_MASKS); } memtest("memory with no policy", numa_alloc(msize)); memtest("local memory", numa_alloc_local(msize)); memtest("memory interleaved on all nodes", numa_alloc_interleaved(msize)); for (i = 0; i < numnodes; i++) { if (regression_testing && (node_to_use[i] % fract_nodes)) { continue; } sprintf(buf, "memory on node %d", node_to_use[i]); memtest(buf, numa_alloc_onnode(msize, node_to_use[i])); } for (mask = 1, i = 0; mask < (1UL< 50)) { break; } if (regression_testing && (i % FRACT_MASKS)) { continue; } numa_bitmask_clearall(nodes); for (w = 0; mask >> w; w++) { if ((mask >> w) & 1) numa_bitmask_setbit(nodes, w); } sprintf(buf, "memory interleaved on"); for (k = 0; k < numnodes; k++) if ((1UL< 0) { numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, 0); numa_bitmask_setbit(nodes, 1); numa_set_interleave_mask(nodes); memtest("manual interleaving on node 0/1", numa_alloc(msize)); printf("current interleave node %dn", numa_get_interleave_node()); } numa_bitmask_free(nodes); numa_set_interleave_mask(numa_no_nodes_ptr); nodes = 
numa_allocate_nodemask(); for (i = 0; i < numnodes; i++) { int oldhn = numa_preferred(); if (regression_testing && (node_to_use[i] % fract_nodes)) { continue; } numa_run_on_node(node_to_use[i]); printf("running on node %d, preferred node %dn",node_to_use[i], oldhn); memtest("local memory", numa_alloc_local(msize)); memtest("memory interleaved on all nodes", numa_alloc_interleaved(msize)); if (numnodes >= 2) { numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, 0); numa_bitmask_setbit(nodes, 1); memtest("memory interleaved on node 0/1", numa_alloc_interleaved_subset(msize, nodes)); } for (k = 0; k < numnodes; k++) { if (node_to_use[k] == node_to_use[i]) continue; if (regression_testing && (node_to_use[k] % fract_nodes)) { continue; } sprintf(buf, "alloc on node %d", node_to_use[k]); numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, node_to_use[k]); numa_set_membind(nodes); memtest(buf, numa_alloc(msize)); numa_set_membind(numa_all_nodes_ptr); } numa_set_localalloc(); memtest("local allocation", numa_alloc(msize)); numa_set_preferred(node_to_use[(i + 1) % numnodes]); memtest("setting wrong preferred node", numa_alloc(msize)); numa_set_preferred(node_to_use[i]); memtest("setting correct preferred node", numa_alloc(msize)); numa_set_preferred(-1); if (!delim[0]) printf("nnn"); } numa_bitmask_free(nodes); } void usage(void) { int i; printf("usage: numademo [-S] [-f] [-c] [-e] [-t] msize[kmg] {tests}nNo tests means run all.n"); printf("-c output CSV data. -f run even without NUMA API. -S run stupid tests. 
-e exit on errorn"); printf("-t regression test; do not run all node combinationsn"); printf("valid tests:"); for (i = 0; testname[i]; i++) printf(" %s", testname[i]); putchar('n'); exit(1); } long memsize(char *s) { char *end; long length = strtoul(s,&end,0); switch (toupper(*end)) { case 'G': length *= 1024; case 'M': length *= 1024; case 'K': length *= 1024; break; } return length; } int main(int ac, char **av) { int simple_tests = 0; while (av[1] && av[1][0] == '-') { ac--; switch (av[1][1]) { case 'c': delim = ","; break; case 'f': force = 1; break; case 'S': simple_tests = 1; break; case 'e': numa_exit_on_error = 1; numa_exit_on_warn = 1; break; case 't': regression_testing = 1; break; default: usage(); break; } ++av; } if (!av[1]) usage(); if (numa_available() < 0) { printf("your system does not support the numa API.n"); if (!force) exit(1); } if(get_node_list()){ fprintf(stderr, "Configured Nodes does not match available memory nodesn"); exit(1); } printf("%d nodes availablen", numnodes); fract_nodes = (((numnodes-1)/8)*2) + FRACT_NODES; if (numnodes <= 3) regression_testing = 0; msize = memsize(av[1]); if (!msize) usage(); #ifdef HAVE_STREAM_LIB stream_setmem(msize); #endif if (av[2] == NULL) { test(MEMSET); test(MEMCPY); if (simple_tests) { test(FORWARD); test(BACKWARD); } #ifdef HAVE_MT test(RANDOM2); #endif #ifdef HAVE_STREAM_LIB test(STREAM); #endif if (msize >= sizeof(union node)) { test(PTRCHASE); } else { fprintf(stderr, "You must set msize at least %lu bytes for ptrchase test.n", sizeof(union node)); exit(1); } } else { int k; for (k = 2; k < ac; k++) { int i; int found = 0; for (i = 0; testname[i]; i++) { if (!strcmp(testname[i],av[k])) { test(i); found = 1; break; } } if (!found) { fprintf(stderr,"unknown test `%s'n", av[k]); usage(); } } } free(node_to_use); return 0; }



