diff --git a/configure b/configure
index fe177ec10..8f7e577b5 100755
--- a/configure
+++ b/configure
@@ -2453,7 +2453,7 @@ for arg in "$@"; do
             dwarfdir=""
             shift
             ;;
-        
+
         -elfutils=download)
             elfutils=yes
             download_elfutils=yes
@@ -3242,7 +3242,7 @@ for arg in "$@"; do
     	    fi
     	    shift
     	    ;;
-    
+
     	-rocprofv2)
             fixmakeargs="$fixmakeargs ROCPROFILERV2"
             rocprofv2=yes
@@ -3258,7 +3258,7 @@ for arg in "$@"; do
             pthread=yes
             shift
             ;;
-        
+
         -rocprofsdk=*)
             fixmakeargs="$fixmakeargs ROCPROFILERSDK"
             rocprofsdk=yes
@@ -3374,7 +3374,8 @@ for arg in "$@"; do
             shift
             ;;
         -opencl=*)
-            openclinclude=`echo $arg | sed -e 's/-opencl=//' -e 's/ /#/g'`/include
+            opencldir=`echo $arg | sed -e 's/-opencl=//' -e 's/ /#/g'`
+            openclinclude=$opencldir/include
             if [ ! -d $openclinclude ] ; then
                 echo "Error: Cannot access GPU include directory $openclinclude"
                 if [ `uname -s ` = "Darwin" ];  then
@@ -3396,8 +3397,21 @@ for arg in "$@"; do
                     use_opencl=yes
                 else
                     echo "Error: Cannot find GPU headers, TAU currently supports either CUDA or OpenCL"
+                    exit 0
                 fi
             fi
+            opencllib="libOpenCL.so"
+            for d in "OpenCL" "amdocl64" ; do
+                libname="lib${d}.so"
+                echo "Looking for ${libname} in ${opencldir}"
+                exists=`find ${opencldir} -name ${libname} | head -n 1`
+                if [ -r "${exists}" ]; then
+                    echo "found $exists"
+                    opencllib=${exists}
+                    fixmakeargs="$fixmakeargs OPENCL opencllib=${opencllib}"
+                    break
+                fi
+            done
             shift
             ;;
 
@@ -3738,9 +3752,8 @@ fi
 
 if [ "x$use_opencl" = "xyes" -a "x$openclinclude" = "x" ]; then
   ld_lib_path=`echo $LD_LIBRARY_PATH | sed -e "s@:@ @g" `
-  for d in $ld_lib_path
-  do
-          echo "OPENCL: CHECKING $d"
+  for d in $ld_lib_path ; do
+    echo "OPENCL: CHECKING $d"
     if [ -r $d/libOpenCL.so ]; then
       echo "checking $d"
       openclinclude=`echo $d | sed -e "s@/lib64@@g" -e "s@loader@headers@g" `/include
@@ -3757,6 +3770,7 @@ if [ "x$use_opencl" = "xyes" -a "x$openclinclude" = "x" ]; then
           fixmakeargs="$fixmakeargs OPENCL TAU_USE_GPU openclinclude=$openclinclude/sycl"
         else
             echo "Error: Cannot find GPU headers, TAU currently supports either CUDA or OpenCL"
+            exit 0
         fi
       fi
     fi
@@ -11003,7 +11017,7 @@ if [ "x$download_elfutils" = xyes ] ; then
 
     predowndir=`pwd`
     elfutilsdir=$libelfutilsdir/elfutils
-    
+
     if [ -r "$elfutilsdir/lib/libdw.so" -a -r "$elfutilsdir/include/elfutils/libdw.h" ]; then
         echo "Found elfutils"
         echo "elfutils download skipping"
@@ -11037,7 +11051,7 @@ if [ "$elfutils" = "yes" ]; then
         echo "Could not find elfutils $elfutilsdir/lib/libdw.so"
         exit 1
     fi
-    
+
 
     fixmakeargs="$fixmakeargs elfutilsincdir=$elfutilsinc"
     fixmakeargs="$fixmakeargs elfutilslibdir=$elfutilslib"
diff --git a/examples/gpu/python_opencl/Makefile b/examples/gpu/python_opencl/Makefile
index 0b448c395..162d8ebb2 100644
--- a/examples/gpu/python_opencl/Makefile
+++ b/examples/gpu/python_opencl/Makefile
@@ -1,19 +1,21 @@
 include ../../../include/Makefile
 #TAU_OPENCL_INC=/opt/intel/oneapi/compiler/2021.1-beta10/linux/include/sycl
+TAU_OPENCL_INC=${ROCM_PATH}/include
 
-TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -lOpenCL
+#TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -lOpenCL
 #TAU_OPENCL_LIB=-L/opt/intel/oneapi/compiler/2021.1-beta10/linux/lib -lOpenCL
 #TAU_CXX=clang++
+TAU_OPENCL_LIB=-L$(TAU_OPENCL_INC)/../lib/x86_64 -L$(TAU_OPENCL_INC)/../lib -L${ROCM_PATH}/lib -Wl,-rpath,${ROCM_PATH}/lib -lamdocl64
 
 all: libmatmult.so
 
-libmatmult.so: matmult.o
+libmatmult.so: matmult.o Makefile
 	$(TAU_CXX) -g -o $@ $< $(TAU_OPENCL_LIB) -shared
 
 matmult.o: matmult.cpp
 	$(TAU_CXX) -I$(TAU_OPENCL_INC) -g -c $< -o $@ -fPIC
 
-clean: 
+clean:
 	rm -rf libmatmult.so matmult.o profile.*
 
 run:
diff --git a/examples/gpu/python_opencl/matmult.cpp b/examples/gpu/python_opencl/matmult.cpp
index d22149e44..e8a7e24e4 100644
--- a/examples/gpu/python_opencl/matmult.cpp
+++ b/examples/gpu/python_opencl/matmult.cpp
@@ -107,8 +107,8 @@ extern "C" int entry(int argc, char**argv)
 		block_mult = ceil(SIZE_OF_MATRIX / ((float) SIZE_OF_BLOCK));
 	else
 		 block_mult = 1;
- 	 
-	
+
+
 	number_of_blocks = SIZE_OF_BLOCK * block_mult;
 
 	unsigned int matsize = SIZE_OF_MATRIX*SIZE_OF_MATRIX*sizeof(float);
@@ -143,18 +143,17 @@ extern "C" int entry(int argc, char**argv)
 
 	cl_uint nDevices, count;
 	cl_device_id *cdDevices = NULL;
-	ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &count);
-	
+	ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &count);
+
 	cdDevices = (cl_device_id *)malloc(count * sizeof(cl_device_id));
-	//ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, count, cdDevices, NULL);
-	ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, cdDevices, NULL);
+	ci = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, count, cdDevices, NULL);
 	CHECK_CL_ERROR(ci);
 
 	cout << count << " devices found." << endl;
 
 	string device_list("");
 	int number_of_iterations = 1;
-	
+
 	int opt = getopt(argc, argv, "d:i:");
 	while(opt != -1) {
 		stringstream str;
@@ -206,7 +205,7 @@ extern "C" int entry(int argc, char**argv)
 	}
 	//cout << "finnished mapping devices." << endl;
 
-	//cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ci);
+	//cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ci);
 	cl_context GPUContext = clCreateContext(0, nDevices, devices, NULL, NULL, &ci);
 	CHECK_CL_ERROR(ci);
 
@@ -217,7 +216,7 @@ extern "C" int entry(int argc, char**argv)
 		char name[256];
 		clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(name), &name, NULL);
 		cout << "Using device name: " << name << endl;
-	
+
 		cqCommandQueue[d] = clCreateCommandQueue(GPUContext, devices[0], CL_QUEUE_PROFILING_ENABLE, &ci);
 		CHECK_CL_ERROR(ci);
 
@@ -236,23 +235,23 @@ extern "C" int entry(int argc, char**argv)
 	60000, log, NULL);
 
 	CHECK_CL_ERROR(ci);
-	
+
 	//printf("build log: %s\n", log);
 	//cout << log << endl;
 
 	size_t thread_size[] = {number_of_threads, number_of_threads};
 	size_t block_size[] = {number_of_blocks, number_of_blocks};
-  /*	
+  /*
 	cl_mem sub_a = clCreateBuffer(GPUContext, CL_MEM_ALLOC_HOST_PTR, submatsize,
 	NULL, NULL);
 	cl_mem sub_b = clCreateBuffer(GPUContext, CL_MEM_ALLOC_HOST_PTR, submatsize,
 	NULL, NULL);
-	
+
 	cl_kernel OpenCL_multiply_matrices_shared_blocks = clCreateKernel(OpenCLProgram,
 	"multiply_matrices_shared_blocks", &ci);
-	
+
 	CHECK_CL_ERROR(ci);
-	
+
 	ci = clSetKernelArg(OpenCL_multiply_matrices_shared_blocks, 0, sizeof(cl_mem), (void *) &d_a);
 	CHECK_CL_ERROR(ci);
 	ci = clSetKernelArg(OpenCL_multiply_matrices_shared_blocks, 1, sizeof(cl_mem), (void *) &d_b);
@@ -305,14 +304,14 @@ extern "C" int entry(int argc, char**argv)
 		clEnqueueWriteBuffer(cCQ, d_a, CL_TRUE, 0, matsize, a, 0, NULL, &event_mem);
 		clEnqueueWriteBuffer(cCQ, d_b, CL_TRUE, 0, matsize, b, 0, NULL, &event_mem);
 		clWaitForEvents(1, &event_mem);
-		
+
 		event = clCreateUserEvent(GPUContext, &ci);
 		CHECK_CL_ERROR(ci);
 
 		ci = clEnqueueNDRangeKernel(cCQ, OpenCL_multiply_matrices, 2, NULL,
 		block_size, thread_size, 0, NULL, &event);
 		CHECK_CL_ERROR(ci);
-		
+
 		//clWaitForEvents(1, &shared_event);
 		clWaitForEvents(1, &event);
 		CHECK_CL_ERROR(ci);
@@ -324,7 +323,7 @@ extern "C" int entry(int argc, char**argv)
 		//clFinish(cCQ);
 
 	}
-	
+
 	cout << "Finished " << number_of_iterations << " iterations on " << nDevices << " devices." << endl;
 	/*
 	std::cout << " results: " << std::endl;
@@ -334,7 +333,7 @@ extern "C" int entry(int argc, char**argv)
 		}
 		std::cout << std::endl;
 	}
-	*/	
+	*/
 
 	free(a);
 	free(b);
diff --git a/include/Makefile.skel b/include/Makefile.skel
index 27e903fd3..413892247 100644
--- a/include/Makefile.skel
+++ b/include/Makefile.skel
@@ -202,6 +202,7 @@ TAU_LLVM_SRC_DIR=
 TAU_LLVM_CXX=
 TAU_LLVM_CC=
 TAU_STARPU_DIR=
+TAU_OPENCL_LIBRARY=
 #MPC#TAU_CC_FE=$(FULL_CC)#ENDIF#
 #MPC#CONFIG_CC=$(FULL_CC)#ENDIF#
 #MPC#TAU_CXX_FE=$(FULL_CXX)#ENDIF#
@@ -784,6 +785,7 @@ JDKBINDIR = $(JDKDIR)/bin
 #ROCMSMI#TAU_ROCM_SMI_INCLUDE_FLAGS = -DTAU_ROCM_SMI -I$(TAU_ROCM_SMI_INC) #ENDIF#
 #SUPPRESS_PTHREAD_CREATE_WRAPPER#PROFILEOPT117 = -DTAU_SUPPRESS_PTHREAD_CREATE_WRAPPER #ENDIF#
 #STARPU#PROFILEOPT118 = -I$(TAU_STARPU_DIR) -DTAU_STARPU #ENDIF#
+#OPENCL#PROFILEOPT120 = -DTAU_OPENCL_LIBRARY=\"$(TAU_OPENCL_LIBRARY)\" #ENDIF#
 #GNU_GFORTRAN#TAU_ALLOW_ARG_MISMATCH=-fallow-argument-mismatch#ENDIF#
 
 MRNET_ROOT=
@@ -1433,7 +1435,7 @@ PROFILEOPTS = $(PROFILEOPT1)  $(PROFILEOPT2)  $(PROFILEOPT3)  $(PROFILEOPT4)  \
               $(PROFILEOPT108) $(PROFILEOPT109) $(PROFILEOPT110) \
               $(PROFILEOPT111) $(PROFILEOPT112) $(PROFILEOPT113) $(PROFILEOPT114) \
               $(PROFILEOPT115) $(PROFILEOPT116) $(PROFILEOPT117) $(PROFILEOPT118) \
-	      $(PROFILEOPT119) $(TRACEOPT) \
+	      $(PROFILEOPT119) $(PROFILEOPT120) $(TRACEOPT) \
 		  $(TAU_SOS_INCLUDE_OPTS) $(TAU_ADIOS_INCLUDE_OPTS) \
 		  $(TAU_OTF2_INCLUDE_OPTS) $(TAU_CALIPER_INCLUDE_OPTS) \
                   $(TAU_CORESYMBOLICATION_INCLUDE_OPTS) $(TAU_ELF_BFD_PROFILEOPT) \
diff --git a/src/Profile/TauGpuAdapterOpenCL.cpp b/src/Profile/TauGpuAdapterOpenCL.cpp
index 3b4d0a8ff..7b5aa08cc 100644
--- a/src/Profile/TauGpuAdapterOpenCL.cpp
+++ b/src/Profile/TauGpuAdapterOpenCL.cpp
@@ -146,9 +146,9 @@ cl_int clGetEventProfilingInfo_noinst(cl_event a1, cl_profiling_info a2, size_t
 }
 
 cl_int clEnqueueWriteBuffer_noinst(cl_command_queue a1, cl_mem a2, cl_bool a3, size_t a4, size_t a5, const void * a6,
-                                   cl_uint a7, const cl_event * a8, cl_event * a9) 
+                                   cl_uint a7, const cl_event * a8, cl_event * a9)
 {
-  HANDLE(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, 
+  HANDLE(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint,
          const cl_event *, cl_event *);
   return clEnqueueWriteBuffer_h(a1,  a2,  a3,  a4,  a5,  a6,  a7,  a8,  a9);
 }
@@ -182,7 +182,7 @@ static double Tau_opencl_get_gpu_timestamp(cl_command_queue commandQueue, cl_con
     if (err == CL_INVALID_CONTEXT) {
       printf("Invalid context.\n");
     }
-    abort();	
+    abort();
   }
 
   struct timeval tp;
@@ -200,7 +200,7 @@ static double Tau_opencl_get_gpu_timestamp(cl_command_queue commandQueue, cl_con
   err = clGetEventProfilingInfo_noinst(sync_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gpu_timestamp, NULL);
   if (err != CL_SUCCESS) {
     printf("Cannot get end time for Sync event: %s\n", clGetErrorString(err));
-    abort();	
+    abort();
   }
 
   return gpu_timestamp;
@@ -226,7 +226,7 @@ static double Tau_opencl_sync_clocks(cl_command_queue commandQueue, cl_context c
     if (err == CL_INVALID_CONTEXT) {
       printf("Invalid context.\n");
     }
-    abort();	
+    abort();
   }
 
   double cpu_timestamp;
@@ -251,7 +251,7 @@ static double Tau_opencl_sync_clocks(cl_command_queue commandQueue, cl_context c
     abort();
   }
 
-  //printf("SYNC: CPU= %f GPU= %f.\n", cpu_timestamp, ((double)gpu_timestamp/1e3)); 
+  //printf("SYNC: CPU= %f GPU= %f.\n", cpu_timestamp, ((double)gpu_timestamp/1e3));
   return cpu_timestamp - (((double)gpu_timestamp)/1e3);
 }
 
@@ -260,21 +260,21 @@ void * Tau_opencl_get_handle(char const * fnc_name)
 #ifdef __APPLE__
   static char const * libname = "/System/Library/Frameworks/OpenCL.framework/OpenCL";
 #else
-  static char const * libname = "libOpenCL.so";
+  static char const * libname = TAU_OPENCL_LIBRARY;
 #endif /* __APPLE__ */
 
   static void * handle = NULL;
   if (!handle) {
-    handle = (void *)dlopen(libname, RTLD_NOW); 
+    handle = (void *)dlopen(libname, RTLD_NOW);
   }
   if (!handle) {
-    perror("Error opening library in dlopen call"); 
+    perror("Error opening library in dlopen call");
     return NULL;
   }
 
   void * fnc_sym = dlsym(handle, fnc_name);
   if (!fnc_sym) {
-    perror("Error obtaining symbol info from dlopen'ed lib"); 
+    perror("Error obtaining symbol info from dlopen'ed lib");
     return NULL;
   }
   return fnc_sym;
@@ -299,8 +299,8 @@ OpenCLGpuEvent * Tau_opencl_retrieve_gpu(cl_command_queue q)
   cl_uint vendor;
   err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(cl_device_id), &id, NULL);
   if (err != CL_SUCCESS)
-  {	
-    printf("error in clGetCommandQueueInfo DEVICE.\n"); 
+  {
+    printf("error in clGetCommandQueueInfo DEVICE.\n");
     if (err == CL_INVALID_COMMAND_QUEUE)
       printf("invalid command queue.\n");
   }
@@ -320,7 +320,7 @@ OpenCLGpuEvent * Tau_opencl_retrieve_gpu(cl_command_queue q)
   int taskid = TAU_CREATE_TASK(taskid);
   double cpu_timestamp = Tau_opencl_get_cpu_timestamp();
   metric_set_gpu_timestamp(taskid, cpu_timestamp);
-  Tau_create_top_level_timer_if_necessary_task(taskid); 
+  Tau_create_top_level_timer_if_necessary_task(taskid);
   OpenCLGpuEvent *gId = new OpenCLGpuEvent(id, (x_uint64) q, sync_offset, taskid);
 #ifdef TAU_DEBUG_OPENCL
   fprintf(stderr, "Created OpenCLGpuEvent with taskid %d\n", taskid);
@@ -341,7 +341,7 @@ OpenCLGpuEvent * Tau_opencl_new_gpu_event(cl_command_queue queue, char const * n
   Profiler * p = TauInternal_CurrentProfiler(RtsLayer::myThread());
   if (p) {
     OpenCLGpuEvent * gpu_event = Tau_opencl_retrieve_gpu(queue)->getCopy();
-    gpu_event->name = name;                     
+    gpu_event->name = name;
     gpu_event->event = NULL;
     gpu_event->callingSite = p->CallPathFunction;
     gpu_event->memcpy_type = memcpy_type;
@@ -369,7 +369,7 @@ void Tau_opencl_register_memcpy_event(OpenCLGpuEvent *evId, double start, double
 {
   Tau_gpu_register_memcpy_event(evId, start/1e3, stop/1e3, transferSize, MemcpyType, MESSAGE_UNKNOWN);
 }
- 
+
 void Tau_opencl_enqueue_event(OpenCLGpuEvent * event)
 {
   KernelBuffer().push(event);
@@ -409,26 +409,26 @@ void Tau_opencl_register_sync_event()
         sizeof(cl_ulong), &queuedTime, NULL);
     if (err != CL_SUCCESS) {
       printf("Cannot get queued time for Kernel event.\n");
-      abort();	
+      abort();
     }
     err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_SUBMIT,
         sizeof(cl_ulong), &submitTime, NULL);
     if (err != CL_SUCCESS) {
       printf("Cannot get submit time for Kernel event.\n");
-      abort();	
+      abort();
     }
     err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_START,
         sizeof(cl_ulong), &startTime, NULL);
     if (err != CL_SUCCESS) {
       printf("Cannot get start time for Kernel event.\n");
-      abort();	
+      abort();
     }
 
     err = clGetEventProfilingInfo_noinst(kernel_data->event, CL_PROFILING_COMMAND_END,
         sizeof(cl_ulong), &endTime, NULL);
     if (err != CL_SUCCESS) {
       printf("Cannot get end time for Kernel event.\n");
-      abort();	
+      abort();
     }
 
     //Add context events to gpu event.
diff --git a/utils/FixMakefile b/utils/FixMakefile
index 7ade88266..0ef7d46f2 100755
--- a/utils/FixMakefile
+++ b/utils/FixMakefile
@@ -1139,6 +1139,10 @@ case $1 in
         starpudir=`echo $1 | sed -e 's/^starpudir=//' -e 's/#/ /g'`
 	echo "s@^TAU_STARPU_DIR=\(.*\)@TAU_STARPU_DIR=$starpudir@g" >> $sedout
         ;;
+     opencllib=*)
+        opencllib=`echo $1 | sed -e 's/^opencllib=//' -e 's/#/ /g'`
+		echo "s@^TAU_OPENCL_LIBRARY=\(.*\)@TAU_OPENCL_LIBRARY=$opencllib@g" >> $sedout
+        ;;
 ###############################################################
 # Set the TAU_GPU variables.
 ###############################################################
@@ -1225,6 +1229,10 @@ case $1 in
         echo "NOTE: Using TAU's StarPU Options"
         echo "s/#$1#\(.*\)/$bs\1#$1#/g" >> $sedout
         ;;
+     OPENCL)
+        echo "NOTE: Using the TAU's OPENCL Profiling Interface "
+        echo "s/#$1#\(.*\)/$bs\1#$1#/g" >> $sedout
+        ;;
 
      cudainclude=*)
         cudainclude=`echo $1 | sed -e 's/^cudainclude=//' -e 's/#/ /g'`