// gcc -lgearman -O3 -DGEARMAN -msse2 -o check4sse2mmap-gear check4sse2mmap.c // -- exec setuidgid mogile ./check4sse2mmap-gear nbid_slab1 rgb4x4.slab1.vec rgb4x4.slab1.ids // gcc -fprofile-generate -O3 -DBENCHMARK -msse2 -o check4sse2mmap-bench check4sse2mmap.c // ./check4sse2mmap-bench rgb4x4.vec 10 100 // gcc -fprofile-use -O3 -DBENCHMARK -msse2 -o check4sse2mmap-bench check4sse2mmap.c // ./check4sse2mmap-bench rgb4x4.vec 10 100 /* * SSE2 vector distance processor * * by now only vaguely related to: * NBIDSearch 3.0, a image vector searcher * Copyright (C) 2006 Philip D. Bober * using some commie license */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include "rbtree.h" struct _res { RB_ENTRY(_res) rb; struct _res *side; unsigned long int dist; unsigned long int slot; }; int res_cmp(const struct _res *p1, const struct _res *p2); RB_HEAD(res, _res); RB_GENERATE_HDRS(res, _res, rb, res_cmp); #define MAXCMP 1000 #define MAXRES 10000 #ifdef STATS struct t_stats{ unsigned long int bank1; unsigned long int bank2; unsigned long int bank3; unsigned long int insert; } g_stats; #endif struct rgb4x4{ unsigned char rgb[48]; }; struct rgb4x4 mcompare[MAXCMP]; struct rgb4x4_sse{ __m128i ca __attribute__ ((aligned (16))); __m128i cb __attribute__ ((aligned (16))); __m128i cc __attribute__ ((aligned (16))); }; struct rgb4x4_sse mcompare_sse[MAXCMP]; unsigned int resultswanted=0; struct _res mreslist[MAXCMP][MAXRES]; struct _res *mresfree[MAXCMP]; struct res mtree[MAXCMP]; unsigned int mcmp = 0; #define MAXBANK 3 char *g_buffer[MAXBANK]; unsigned long int g_count = 0; unsigned long int g_banks = 0; int g_fd[MAXBANK]; struct stat g_stat[MAXBANK]; char *tl_buffer = NULL; unsigned long int tl_size = 0; int tl_fd; struct stat tl_stat; RB_GENERATE(res, _res, rb, res_cmp); unsigned char hexprint(char *a, int count, int spacer){ int i; char h[] = "0123456789abcdef"; for (i=0;i>4)&0x0f],h[a[i]&0x0f]); } } unsigned char hexdigit(char a){ int b=0; b=a-48; if (b>9){ b=a-65+10; } if (b>9){ b=a-97+10; } // printf("in %c, out %i\n", a, b); return b; } unsigned char hex(const char *a){ return hexdigit(a[0])*16+hexdigit(a[1]); } void compare_reset(){ mcmp = 0; } void compare_add_sse(){ mcompare_sse[mcmp].ca=_mm_loadu_si128((__m128i*)&mcompare[mcmp].rgb[0]); mcompare_sse[mcmp].cb=_mm_loadu_si128((__m128i*)&mcompare[mcmp].rgb[16]); mcompare_sse[mcmp].cc=_mm_loadu_si128((__m128i*)&mcompare[mcmp].rgb[32]); } int compare_add_hex(const char *raw){ int i; for(i=0;i<48;i++){ mcompare[mcmp].rgb[i]=hex(raw+i*2); } compare_add_sse(); return mcmp++; } int compare_add_bin(const char *raw){ int i; for(i=0;i<48;i++){ mcompare[mcmp].rgb[i]=raw[i]; } compare_add_sse(); return mcmp++; } int res_cmp(const struct _res *p1, const struct _res *p2) { return p1->dist - p2->dist; } void results_init(int maxres) { if (maxres > MAXRES) { perror("too big!"); exit(1); } int i,j; for (j=0;jside; return(have); } max = RB_MAX(res, &mtree[cmp]); if (max->side) { have = max->side; max->side = have->side; return(have); } RB_REMOVE(res, &mtree[cmp], max); return(max); } unsigned long int results_add_and_return_maxdist(unsigned int cmp, unsigned long int slot, unsigned long int dist) { struct _res *ins = results_getfree(cmp); struct _res *have = NULL; ins->side = NULL; ins->slot = slot; ins->dist = dist; // printf("DEB: INSERT cmp %i, slot %i, dist %i\n", cmp, slot, dist); have = RB_INSERT(res, &mtree[cmp], ins); if (have) { ins->side = have->side; have->side = ins; } if (mresfree[cmp]) { return(0xffffffff); } else { have = RB_MAX(res, &mtree[cmp]); return(have->dist); } } void results_dump(){ struct _res *cur = NULL; int j; for (j=0;jslot,elem->dist); elem = elem->side; } } } } char* results_pack(unsigned long int *size){ struct _res *cur = NULL; int vsize = 4; if (tl_buffer) { vsize = tl_size; } *size = mcmp * resultswanted * (4+vsize); char *buf = malloc(*size); if (!buf) { perror("malloc fail"); exit(1); } int j; for (j=0;j= resultswanted) { printf("res: %i >= %lu\n", i, resultswanted); //perror("result overflow"); //exit(1); } //printf("%010lu: %lu\n",elem->slot,elem->dist); unsigned long int *sbuf = (unsigned long int *)&mbuf[i * (4+4)]; sbuf[0] = htonl(elem->dist); if (tl_buffer) { int n; for (n=0;nslot*tl_size)+n]; } } else { sbuf[1] = htonl(elem->slot); } i++; elem = elem->side; } } } return(buf); } void perf_print_do(char * pref, unsigned long long int ticks, unsigned long long int count) { unsigned long long int tps = sysconf(_SC_CLK_TCK); double sec = (0.0+ticks) / tps; unsigned long long int cps; if (!sec) sec++; cps = count/sec; if (cps > 1000000) printf("%s: %llucmp in %.3fsec, %lluMcps\n", pref, count, sec, (cps/1000000)); else if (cps > 1000) printf("%s: %llucmp in %.3fsec, %llukcps\n", pref, count, sec, cps/1000); else printf("%s: %llucmp in %.3fsec, %llucps\n", pref, count, sec, cps); } clock_t start_real, start_virtual, stop_real, stop_virtual; unsigned long long int have_real, have_virtual; unsigned long long int ops; void perf_reset() { ops = 0; have_real = 0; have_virtual = 0; start_real = 0; stop_real = 0; } void perf_start() { struct tms buf; if ((start_real) || (stop_real)) { printf("ERR: redundant start\n"); exit(255); } // dicksize comparison start_real = times(&buf); start_virtual = buf.tms_utime + buf.tms_stime; } void perf_stop() { struct tms buf; if ((!start_real) || (stop_real)) { printf("ERR: stop without start\n"); exit(255); } stop_real = times(&buf); stop_virtual = buf.tms_utime + buf.tms_stime; have_real += (stop_real-start_real); have_virtual += (stop_virtual-start_virtual); start_real = 0; stop_real = 0; } void perf_print() { struct tms buf; if ((start_real) || (stop_real)) { printf("ERR: failprint\n"); exit(255); } perf_print_do("Real", have_real, ops); perf_print_do("Virt", have_virtual, ops); } char* load_and_map(const char *filename, unsigned long int *psize, int *pfd, struct stat *info) { char * buffer; int fd; unsigned long int count; // if (g_buffer) { // perror("reinit not implemented"); // exit(1); // } fd=open(filename,O_RDONLY); //|O_NOATIME); if(fd==-1){ printf("Can't open '%s'!\n", filename); exit(1); } if(fstat(fd,info)!=0){ printf("Couldn't stat!\n"); close(fd); exit(1); } buffer=(char *)mmap(NULL,info->st_size,PROT_READ,MAP_SHARED,fd,0); //madvise((void*)buffer,count*sizeof(struct rgb4x4),MADV_SEQUENTIAL); *psize = info->st_size; *pfd = fd; return(buffer); } register __m128i ca asm ("xmm7"); register __m128i cb asm ("xmm6"); register __m128i cc asm ("xmm5"); register __m128i ea asm ("xmm4"); void results_generate_single(unsigned int start, unsigned int end, __m128i **buffer) { //register __m128i ca __attribute__ ((aligned (64))); //register __m128i cb __attribute__ ((aligned (16))); //register __m128i cc __attribute__ ((aligned (16))); //register __m128i ea __attribute__ ((aligned (16))); union rbuf { __m128i xmm; uint32_t i[4]; } ex __attribute__ ((aligned (16))); //__m128i ea __attribute__ ((aligned (16))); unsigned int i=0; register unsigned int maxdist = 0xffffffff; register unsigned long int dist; if (mcmp > 1) { printf("WARNING: calling single with multiple targets avail\n"); } #ifdef STATS g_stats.bank1=0; g_stats.bank2=0; g_stats.bank3=0; g_stats.insert=0; #endif ca = _mm_load_si128((__m128i*)&mcompare_sse[0].ca); cb = _mm_load_si128((__m128i*)&mcompare_sse[0].cb); cc = _mm_load_si128((__m128i*)&mcompare_sse[0].cc); for(i=start;imaxdist){ // continue; // } //ea = _mm_add_epi64( _mm_sad_epu8(buffer[0][i],ca), _mm_sad_epu8(buffer[1][i],cb)); ea = _mm_add_epi64( ea, _mm_sad_epu8(buffer[1][i],cb)); // dist += ((unsigned long int *)&ea)[0]+((unsigned long int *)&ea)[2]; // _mm_mfence(); //dist += ((uint16_t*)&ea)[0]+((uint16_t*)&ea)[4]; // dist += ea.i[0]+ea.i[2]; //dist += ((uint32_t*)&ea)[0]+((uint32_t*)&ea)[2]; // printf("\nSINGLE2DIST %lu\n", dist); #ifdef STATS g_stats.bank2++; #endif _mm_store_si128(&ex.xmm,ea); dist = ex.i[0]+ex.i[2]; if(dist>maxdist){ continue; } ex.xmm = _mm_sad_epu8(buffer[2][i],cc); //ea = _mm_add_epi64( ea, _mm_sad_epu8(buffer[2][i],cc)); //ea = _mm_sad_epu8(buffer[2][i],cc); // dist += ((unsigned long int *)&ea)[0]+((unsigned long int *)&ea)[2]; // _mm_mfence(); //dist += ((uint16_t*)&ea)[0]+((uint16_t*)&ea)[4]; //_mm_store_si128(&ex.xmm,ea); dist += ex.i[0]+ex.i[2]; //dist += ((uint64_t*)&ea)[0]+((uint64_t*)&ea)[1]; // printf("SINGLE3DIST %lu\n", dist); #ifdef STATS g_stats.bank3++; #endif if(dist 10) { // exit(i); // } } #ifdef STATS printf("STATS: %lu bank1, %lu bank2 (%.3f%%), %lu bank3 (%.3f%%), %lu inserts\n", g_stats.bank1, g_stats.bank2, (float)(100.0*g_stats.bank2/g_stats.bank1), g_stats.bank3, (float)(100.0*g_stats.bank3/g_stats.bank1), g_stats.insert); #endif } void results_generate_multi(unsigned int start, unsigned int end, __m128i **buffer) { //register __m128i ca __attribute__ ((aligned (64))); //register __m128i cb __attribute__ ((aligned (16))); //register __m128i cc __attribute__ ((aligned (16))); //__m128i ea __attribute__ ((aligned (16))); union rbuf { __m128i xmm; uint32_t i[4]; } ex __attribute__ ((aligned (16))); char cbl, ccl; unsigned int i=0; unsigned int j=0; unsigned int maxdist[MAXCMP]; register unsigned long int dist; for(i=0;imaxdist[j]){ // continue; // } ea = _mm_add_epi64( ea, _mm_sad_epu8(cb,mcompare_sse[j].cb)); #ifdef STATS g_stats.bank2++; #endif _mm_store_si128(&ex.xmm,ea); dist = ex.i[0]+ex.i[2]; if(dist>maxdist[j]){ continue; } ex.xmm = _mm_sad_epu8(cc,mcompare_sse[j].cc); //dist += ((unsigned long int *)&ea)[0]+((unsigned long int *)&ea)[2]; dist += ex.i[0]+ex.i[2]; #ifdef STATS g_stats.bank3++; #endif if(dist 1) { results_generate_multi(start,end,buffer); } else { results_generate_single(start,end,buffer); } } #ifdef GEARMAN #include #include void *do_work(gearman_job_st *job, void *cb_arg, size_t *result_size, gearman_return_t *ret_ptr) { char *buf = (char *)gearman_job_workload(job); unsigned long int size = gearman_job_workload_size(job); unsigned int startposition = ntohl(*(long int *)&buf[0]); unsigned int endposition = ntohl(*(long int *)&buf[4]); resultswanted = ntohl(*(long int *)&buf[8]); compare_reset(); unsigned int j; for (j=0;12+(j*48)count){ endposition=count; } int toread=count-(count-endposition)-startposition; printf("Checking %i of %i entries, %i targets, returning %i results\n",toread,count,mcmp,resultswanted); unsigned int totalread=startposition*sizeof(struct rgb4x4); results_init(resultswanted); perf_start(); ops += toread*mcmp; results_generate(startposition,endposition,(__m128i**)&g_buffer); perf_stop(); perf_print(); perf_reset(); buf = results_pack(&size); *result_size = size; *ret_ptr = GEARMAN_SUCCESS; return buf; } #include void die(int sig) { printf("\nInterrupted..\n"); exit(sig); } void do_remap(int sig) { struct stat info; unsigned int i; for (i=0;i tl_stat.st_size) { size_t osize = tl_stat.st_size; if(fstat(tl_fd,&tl_stat)!=0){ printf("Couldn't stat2!\n"); exit(1); } tl_buffer = mremap(tl_buffer, osize, tl_stat.st_size, MREMAP_MAYMOVE); } printf("\nremapped to %lu bytes, %lu vectors\n", g_stat[0].st_size, g_count); } int main(int argc,char*argv[]){ (void) signal(SIGINT,die); (void) signal(SIGHUP,do_remap); if(argc<2){ printf("Usage: %s [translationfile]\n",argv[0]); return -1; } printf("Loading VEC data..."); unsigned long int size; int banks = MAXBANK; unsigned int i,j; for (i=0;icount){ endposition=count; } int toread=count-(count-endposition)-startposition; unsigned int totalread=startposition*sizeof(struct rgb4x4); resultswanted = atoi(argv[2]); int multi = atoi(argv[3]); char ref[48]; srand(5); // for (i=0;i<10;i++) { compare_reset(); for (j=0;j<48;j++) { ref[j]= (char)(rand() & 0xff); } compare_add_bin((const char *)&ref); // printf("=== CACHEWARMING%i === Checking %i of %i entries, %i targets, returning %i results\n",i,toread,count,mcmp,resultswanted); results_init(resultswanted); // perf_start(); ops += toread*mcmp; results_generate_single(startposition,endposition,(__m128i**)&g_buffer); // perf_print(); // } srand(23); perf_reset(); for (i=0;i [rgb4x4lchex...]\n",argv[0]); return -1; } resultswanted=atoi(argv[2]); startposition=atoi(argv[3]); endposition=atoi(argv[4]); count=0; printf("Loading VEC data..."); unsigned long int size; int banks = 3; for (i=0;icount){ endposition=count; } for (j=5;j11 && rgb.compare(0,11,"urn:rgb4x4:")==0){ // rgb=string(rgb,11); // } // if(rgb.length()!=48*2){ // printf("Got %s, expected 48 byte hexstring\n",rgb.c_str()); // return -1; // } compare_add_hex(rgb); } int toread=count-(count-endposition)-startposition; printf("Ready to start, checking %i of %i entries, returning %i results\n",toread,count,resultswanted); unsigned int totalread=startposition*sizeof(struct rgb4x4); results_init(resultswanted); perf_reset(); perf_start(); ops += toread*mcmp; results_generate(startposition,endposition,(__m128i**)&g_buffer); perf_stop(); perf_print(); // printf("Done. %u swaps. Read %u bytes\n",swaps,toread*sizeof(struct rgb4x4)); results_dump(); // } //munmap(buffer,count*sizeof(struct rgb4x4)); //close(fd); } #endif