diff --git a/configure b/configure index fe177ec10..8f7e577b5 100755 --- a/configure +++ b/configure @@ -2453,7 +2453,7 @@ for arg in "$@"; do dwarfdir="" shift ;; - + -elfutils=download) elfutils=yes download_elfutils=yes @@ -3242,7 +3242,7 @@ for arg in "$@"; do fi shift ;; - + -rocprofv2) fixmakeargs="$fixmakeargs ROCPROFILERV2" rocprofv2=yes @@ -3258,7 +3258,7 @@ for arg in "$@"; do pthread=yes shift ;; - + -rocprofsdk=*) fixmakeargs="$fixmakeargs ROCPROFILERSDK" rocprofsdk=yes @@ -3374,7 +3374,8 @@ for arg in "$@"; do shift ;; -opencl=*) - openclinclude=`echo $arg | sed -e 's/-opencl=//' -e 's/ /#/g'`/include + opencldir=`echo $arg | sed -e 's/-opencl=//' -e 's/ /#/g'` + openclinclude=$opencldir/include if [ ! -d $openclinclude ] ; then echo "Error: Cannot access GPU include directory $openclinclude" if [ `uname -s ` = "Darwin" ]; then @@ -3396,8 +3397,21 @@ for arg in "$@"; do use_opencl=yes else echo "Error: Cannot find GPU headers, TAU currently supports either CUDA or OpenCL" + exit 0 fi fi + opencllib="libOpenCL.so" + for d in "OpenCL" "amdocl64" ; do + libname="lib${d}.so" + echo "Looking for ${libname} in ${opencldir}" + exists=`find ${opencldir} -name ${libname} | head -n 1` + if [ -r "${exists}" ]; then + echo "found $exists" + opencllib=${exists} + fixmakeargs="$fixmakeargs OPENCL opencllib=${opencllib}" + break + fi + done shift ;; @@ -3738,9 +3752,8 @@ fi if [ "x$use_opencl" = "xyes" -a "x$openclinclude" = "x" ]; then ld_lib_path=`echo $LD_LIBRARY_PATH | sed -e "s@:@ @g" ` - for d in $ld_lib_path - do - echo "OPENCL: CHECKING $d" + for d in $ld_lib_path ; do + echo "OPENCL: CHECKING $d" if [ -r $d/libOpenCL.so ]; then echo "checking $d" openclinclude=`echo $d | sed -e "s@/lib64@@g" -e "s@loader@headers@g" `/include @@ -3757,6 +3770,7 @@ if [ "x$use_opencl" = "xyes" -a "x$openclinclude" = "x" ]; then fixmakeargs="$fixmakeargs OPENCL TAU_USE_GPU openclinclude=$openclinclude/sycl" else echo "Error: Cannot find GPU headers, TAU currently supports either CUDA or OpenCL" + exit 0 fi fi fi @@ -11003,7 +11017,7 @@ if [ "x$download_elfutils" = xyes ] ; then predowndir=`pwd` elfutilsdir=$libelfutilsdir/elfutils - + if [ -r "$elfutilsdir/lib/libdw.so" -a -r "$elfutilsdir/include/elfutils/libdw.h" ]; then echo "Found elfutils" echo "elfutils download skipping" @@ -11037,7 +11051,7 @@ if [ "$elfutils" = "yes" ]; then echo "Could not find elfutils $elfutilsdir/lib/libdw.so" exit 1 fi - + fixmakeargs="$fixmakeargs elfutilsincdir=$elfutilsinc" fixmakeargs="$fixmakeargs elfutilslibdir=$elfutilslib" diff --git a/examples/gpu/python_opencl/Makefile b/examples/gpu/python_opencl/Makefile index 0b448c395..162d8ebb2 100644 --- a/examples/gpu/python_opencl/Makefile +++ b/examples/gpu/python_opencl/Makefile @@ -1,19 +1,21 @@ include ../../../include/Makefile #TAU_OPENCL_INC=/opt/intel/oneapi/compiler/2021.1-beta10/linux/include/sycl +TAU_OPENCL_INC=${ROCM_PATH}/include -TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -lOpenCL +#TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -lOpenCL #TAU_OPENCL_LIB=-L/opt/intel/oneapi/compiler/2021.1-beta10/linux/lib -lOpenCL #TAU_CXX=clang++ +TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -L${ROCM_PATH}/lib -Wl,-rpath,${ROCM_PATH}/lib -lamdocl64 all: libmatmult.so -libmatmult.so: matmult.o +libmatmult.so: matmult.o Makefile $(TAU_CXX) -g -o $@ $< $(TAU_OPENCL_LIB) -shared matmult.o: matmult.cpp $(TAU_CXX) -I$(TAU_OPENCL_INC) -g -c $< -o $@ -fPIC -clean: +clean: rm -rf libmatmult.so matmult.o profile.* run: diff --git a/examples/gpu/python_opencl/matmult.cpp b/examples/gpu/python_opencl/matmult.cpp index d22149e44..e8a7e24e4 100644 --- a/examples/gpu/python_opencl/matmult.cpp +++ b/examples/gpu/python_opencl/matmult.cpp @@ -107,8 +107,8 @@ extern "C" int entry(int argc, char**argv) block_mult = ceil(SIZE_OF_MATRIX / ((float) SIZE_OF_BLOCK)); else block_mult = 1; - - + + number_of_blocks = SIZE_OF_BLOCK * block_mult; unsigned int matsize = SIZE_OF_MATRIX*SIZE_OF_MATRIX*sizeof(float); @@ -143,18 +143,17 @@ extern "C" int entry(int argc, char**argv) cl_uint nDevices, count; cl_device_id *cdDevices = NULL; - ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &count); - + ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &count); + cdDevices = (cl_device_id *)malloc(count * sizeof(cl_device_id)); - //ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, count, cdDevices, NULL); - ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, cdDevices, NULL); + ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, count, cdDevices, NULL); CHECK_CL_ERROR(ci); cout << count << " devices found." << endl; string device_list(""); int number_of_iterations = 1; - + int opt = getopt(argc, argv, "d:i:"); while(opt != -1) { stringstream str; @@ -206,7 +205,7 @@ extern "C" int entry(int argc, char**argv) } //cout << "finnished mapping devices." << endl; - //cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ci); + //cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ci); cl_context GPUContext = clCreateContext(0, nDevices, devices, NULL, NULL, &ci); CHECK_CL_ERROR(ci); @@ -217,7 +216,7 @@ extern "C" int entry(int argc, char**argv) char name[256]; clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(name), &name, NULL); cout << "Using device name: " << name << endl; - + cqCommandQueue[d] = clCreateCommandQueue(GPUContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &ci); CHECK_CL_ERROR(ci); @@ -236,23 +235,23 @@ extern "C" int entry(int argc, char**argv) 60000, log, NULL); CHECK_CL_ERROR(ci); - + //printf("build log: %s\n", log); //cout << log << endl; size_t thread_size[] = {number_of_threads, number_of_threads}; size_t block_size[] = {number_of_blocks, number_of_blocks}; - /* + /* cl_mem sub_a = clCreateBuffer(GPUContext, CL_MEM_ALLOC_HOST_PTR, submatsize, NULL, NULL); cl_mem sub_b = clCreateBuffer(GPUContext, CL_MEM_ALLOC_HOST_PTR, submatsize, NULL, NULL); - + cl_kernel OpenCL_multiply_matrices_shared_blocks = clCreateKernel(OpenCLProgram, "multiply_matrices_shared_blocks", &ci); - + CHECK_CL_ERROR(ci); - + ci = clSetKernelArg(OpenCL_multiply_matrices_shared_blocks, 0, sizeof(cl_mem), (void *) &d_a); CHECK_CL_ERROR(ci); ci = clSetKernelArg(OpenCL_multiply_matrices_shared_blocks, 1, sizeof(cl_mem), (void *) &d_b); @@ -305,14 +304,14 @@ extern "C" int entry(int argc, char**argv) clEnqueueWriteBuffer(cCQ, d_a, CL_TRUE, 0, matsize, a, 0, NULL, &event_mem); clEnqueueWriteBuffer(cCQ, d_b, CL_TRUE, 0, matsize, b, 0, NULL, &event_mem); clWaitForEvents(1, &event_mem); - + event = clCreateUserEvent(GPUContext, &ci); CHECK_CL_ERROR(ci); ci = clEnqueueNDRangeKernel(cCQ, OpenCL_multiply_matrices, 2, NULL, block_size, thread_size, 0, NULL, &event); CHECK_CL_ERROR(ci); - + //clWaitForEvents(1, &shared_event); clWaitForEvents(1, &event); CHECK_CL_ERROR(ci); @@ -324,7 +323,7 @@ extern "C" int entry(int argc, char**argv) //clFinish(cCQ); } - + cout << "Finished " << number_of_iterations << " iterations on " << nDevices << " devices." << endl; /* std::cout << " results: " << std::endl; @@ -334,7 +333,7 @@ extern "C" int entry(int argc, char**argv) } std::cout << std::endl; } - */ + */ free(a); free(b); diff --git a/include/Makefile.skel b/include/Makefile.skel index 27e903fd3..413892247 100644 --- a/include/Makefile.skel +++ b/include/Makefile.skel @@ -202,6 +202,7 @@ TAU_LLVM_SRC_DIR= TAU_LLVM_CXX= TAU_LLVM_CC= TAU_STARPU_DIR= +TAU_OPENCL_LIBRARY= #MPC#TAU_CC_FE=$(FULL_CC)#ENDIF# #MPC#CONFIG_CC=$(FULL_CC)#ENDIF# #MPC#TAU_CXX_FE=$(FULL_CXX)#ENDIF# @@ -784,6 +785,7 @@ JDKBINDIR = $(JDKDIR)/bin #ROCMSMI#TAU_ROCM_SMI_INCLUDE_FLAGS = -DTAU_ROCM_SMI -I$(TAU_ROCM_SMI_INC) #ENDIF# #SUPPRESS_PTHREAD_CREATE_WRAPPER#PROFILEOPT117 = -DTAU_SUPPRESS_PTHREAD_CREATE_WRAPPER #ENDIF# #STARPU#PROFILEOPT118 = -I$(TAU_STARPU_DIR) -DTAU_STARPU #ENDIF# +#OPENCL#PROFILEOPT120 = -DTAU_OPENCL_LIBRARY=\"$(TAU_OPENCL_LIBRARY)\" #ENDIF# #GNU_GFORTRAN#TAU_ALLOW_ARG_MISMATCH=-fallow-argument-mismatch#ENDIF# MRNET_ROOT= @@ -1433,7 +1435,7 @@ PROFILEOPTS = $(PROFILEOPT1) $(PROFILEOPT2) $(PROFILEOPT3) $(PROFILEOPT4) \ $(PROFILEOPT108) $(PROFILEOPT109) $(PROFILEOPT110) \ $(PROFILEOPT111) $(PROFILEOPT112) $(PROFILEOPT113) $(PROFILEOPT114) \ $(PROFILEOPT115) $(PROFILEOPT116) $(PROFILEOPT117) $(PROFILEOPT118) \ - $(PROFILEOPT119) $(TRACEOPT) \ + $(PROFILEOPT119) $(PROFILEOPT120) $(TRACEOPT) \ $(TAU_SOS_INCLUDE_OPTS) $(TAU_ADIOS_INCLUDE_OPTS) \ $(TAU_OTF2_INCLUDE_OPTS) $(TAU_CALIPER_INCLUDE_OPTS) \ $(TAU_CORESYMBOLICATION_INCLUDE_OPTS) $(TAU_ELF_BFD_PROFILEOPT) \ diff --git a/src/Profile/TauGpuAdapterOpenCL.cpp b/src/Profile/TauGpuAdapterOpenCL.cpp index 3b4d0a8ff..7b5aa08cc 100644 --- a/src/Profile/TauGpuAdapterOpenCL.cpp +++ b/src/Profile/TauGpuAdapterOpenCL.cpp @@ -146,9 +146,9 @@ cl_int clGetEventProfilingInfo_noinst(cl_event a1, cl_profiling_info a2, size_t } cl_int clEnqueueWriteBuffer_noinst(cl_command_queue a1, cl_mem a2, cl_bool a3, size_t a4, size_t a5, const void * a6, - cl_uint a7, const cl_event * a8, cl_event * a9) + cl_uint a7, const cl_event * a8, cl_event * a9) { - HANDLE(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, + HANDLE(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); return clEnqueueWriteBuffer_h(a1, a2, a3, a4, a5, a6, a7, a8, a9); } @@ -182,7 +182,7 @@ static double Tau_opencl_get_gpu_timestamp(cl_command_queue commandQueue, cl_con if (err == CL_INVALID_CONTEXT) { printf("Invalid context.\n"); } - abort(); + abort(); } struct timeval tp; @@ -200,7 +200,7 @@ static double Tau_opencl_get_gpu_timestamp(cl_command_queue commandQueue, cl_con err = clGetEventProfilingInfo_noinst(sync_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gpu_timestamp, NULL); if (err != CL_SUCCESS) { printf("Cannot get end time for Sync event: %s\n", clGetErrorString(err)); - abort(); + abort(); } return gpu_timestamp; @@ -226,7 +226,7 @@ static double Tau_opencl_sync_clocks(cl_command_queue commandQueue, cl_context c if (err == CL_INVALID_CONTEXT) { printf("Invalid context.\n"); } - abort(); + abort(); } double cpu_timestamp; @@ -251,7 +251,7 @@ static double Tau_opencl_sync_clocks(cl_command_queue commandQueue, cl_context c abort(); } - //printf("SYNC: CPU= %f GPU= %f.\n", cpu_timestamp, ((double)gpu_timestamp/1e3)); + //printf("SYNC: CPU= %f GPU= %f.\n", cpu_timestamp, ((double)gpu_timestamp/1e3)); return cpu_timestamp - (((double)gpu_timestamp)/1e3); } @@ -260,21 +260,21 @@ void * Tau_opencl_get_handle(char const * fnc_name) #ifdef __APPLE__ static char const * libname = "/System/Library/Frameworks/OpenCL.framework/OpenCL"; #else - static char const * libname = "libOpenCL.so"; + static char const * libname = TAU_OPENCL_LIBRARY; #endif /* __APPLE__ */ static void * handle = NULL; if (!handle) { - handle = (void *)dlopen(libname, RTLD_NOW); + handle = (void *)dlopen(libname, RTLD_NOW); } if (!handle) { - perror("Error opening library in dlopen call"); + perror("Error opening library in dlopen call"); return NULL; } void * fnc_sym = dlsym(handle, fnc_name); if (!fnc_sym) { - perror("Error obtaining symbol info from dlopen'ed lib"); + perror("Error obtaining symbol info from dlopen'ed lib"); return NULL; } return fnc_sym; @@ -299,8 +299,8 @@ OpenCLGpuEvent * Tau_opencl_retrieve_gpu(cl_command_queue q) cl_uint vendor; err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(cl_device_id), &id, NULL); if (err != CL_SUCCESS) - { - printf("error in clGetCommandQueueInfo DEVICE.\n"); + { + printf("error in clGetCommandQueueInfo DEVICE.\n"); if (err == CL_INVALID_COMMAND_QUEUE) printf("invalid command queue.\n"); } @@ -320,7 +320,7 @@ OpenCLGpuEvent * Tau_opencl_retrieve_gpu(cl_command_queue q) int taskid = TAU_CREATE_TASK(taskid); double cpu_timestamp = Tau_opencl_get_cpu_timestamp(); metric_set_gpu_timestamp(taskid, cpu_timestamp); - Tau_create_top_level_timer_if_necessary_task(taskid); + Tau_create_top_level_timer_if_necessary_task(taskid); OpenCLGpuEvent *gId = new OpenCLGpuEvent(id, (x_uint64) q, sync_offset, taskid); #ifdef TAU_DEBUG_OPENCL fprintf(stderr, "Created OpenCLGpuEvent with taskid %d\n", taskid); @@ -341,7 +341,7 @@ OpenCLGpuEvent * Tau_opencl_new_gpu_event(cl_command_queue queue, char const * n Profiler * p = TauInternal_CurrentProfiler(RtsLayer::myThread()); if (p) { OpenCLGpuEvent * gpu_event = Tau_opencl_retrieve_gpu(queue)->getCopy(); - gpu_event->name = name; + gpu_event->name = name; gpu_event->event = NULL; gpu_event->callingSite = p->CallPathFunction; gpu_event->memcpy_type = memcpy_type; @@ -369,7 +369,7 @@ void Tau_opencl_register_memcpy_event(OpenCLGpuEvent *evId, double start, double { Tau_gpu_register_memcpy_event(evId, start/1e3, stop/1e3, transferSize, MemcpyType, MESSAGE_UNKNOWN); } - + void Tau_opencl_enqueue_event(OpenCLGpuEvent * event) { KernelBuffer().push(event); @@ -409,26 +409,26 @@ void Tau_opencl_register_sync_event() sizeof(cl_ulong), &queuedTime, NULL); if (err != CL_SUCCESS) { printf("Cannot get queued time for Kernel event.\n"); - abort(); + abort(); } err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submitTime, NULL); if (err != CL_SUCCESS) { printf("Cannot get submit time for Kernel event.\n"); - abort(); + abort(); } err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); if (err != CL_SUCCESS) { printf("Cannot get start time for Kernel event.\n"); - abort(); + abort(); } err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); if (err != CL_SUCCESS) { printf("Cannot get end time for Kernel event.\n"); - abort(); + abort(); } //Add context events to gpu event. diff --git a/utils/FixMakefile b/utils/FixMakefile index 7ade88266..0ef7d46f2 100755 --- a/utils/FixMakefile +++ b/utils/FixMakefile @@ -1139,6 +1139,10 @@ case $1 in starpudir=`echo $1 | sed -e 's/^starpudir=//' -e 's/#/ /g'` echo "s@^TAU_STARPU_DIR=\(.*\)@TAU_STARPU_DIR=$starpudir@g" >> $sedout ;; + opencllib=*) + opencllib=`echo $1 | sed -e 's/^opencllib=//' -e 's/#/ /g'` + echo "s@^TAU_OPENCL_LIBRARY=\(.*\)@TAU_OPENCL_LIBRARY=$opencllib@g" >> $sedout + ;; ############################################################### # Set the TAU_GPU variables. ############################################################### @@ -1225,6 +1229,10 @@ case $1 in echo "NOTE: Using TAU's StarPU Options" echo "s/#$1#\(.*\)/$bs\1#$1#/g" >> $sedout ;; + OPENCL) + echo "NOTE: Using the TAU's OPENCL Profiling Interface " + echo "s/#$1#\(.*\)/$bs\1#$1#/g" >> $sedout + ;; cudainclude=*) cudainclude=`echo $1 | sed -e 's/^cudainclude=//' -e 's/#/ /g'`