diff --git a/com.amd.aparapi.jni/build.xml b/com.amd.aparapi.jni/build.xml index 1e4912b8..44516cc9 100644 --- a/com.amd.aparapi.jni/build.xml +++ b/com.amd.aparapi.jni/build.xml @@ -479,6 +479,8 @@ First consider editing the properties in build.properties + + @@ -540,6 +542,7 @@ First consider editing the properties in build.properties + @@ -564,6 +567,7 @@ First consider editing the properties in build.properties + @@ -575,6 +579,7 @@ First consider editing the properties in build.properties + @@ -601,6 +606,7 @@ First consider editing the properties in build.properties + @@ -614,6 +620,7 @@ First consider editing the properties in build.properties + @@ -657,6 +664,7 @@ First consider editing the properties in build.properties + diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp index 142fb19c..277cbe94 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp +++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.cpp @@ -45,6 +45,7 @@ #include "Config.h" #include "ProfileInfo.h" #include "ArrayBuffer.h" +#include "AparapiBuffer.h" #include "CLHelper.h" #include "List.h" #include @@ -52,7 +53,8 @@ //compiler dependant code /** - * calls either clEnqueueMarker or clEnqueueMarkerWithWaitList depending on the version of OpenCL installed. + * calls either clEnqueueMarker or clEnqueueMarkerWithWaitList + * depending on the version of OpenCL installed. * conveiniece function so we don't have to have #ifdefs all over the code */ int enqueueMarker(cl_command_queue commandQueue, cl_event* firstEvent) { @@ -240,7 +242,10 @@ jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniCon if (config->isVerbose()){ fprintf(stderr, "got type for %s: %08x\n", arg->name, arg->type); } - if (!arg->isPrimitive()) { + + //this won't be a problem with the aparapi buffers because + //we need to copy them every time anyway + if (!arg->isPrimitive() && !arg->isAparapiBuffer()) { // Following used for all primitive arrays, object arrays and nio Buffers jarray newRef = (jarray)jenv->GetObjectField(arg->javaArg, KernelArg::javaArrayFieldID); if (config->isVerbose()){ @@ -326,19 +331,8 @@ void profileFirstRun(JNIContext* jniContext) { } } -/** - * create a new variable in OpenCL for the object, and sets the kernel arguement. - * currently the only objects supported are arrays. - * - * @param jenv the java environment - * @param jniContext the context we got from java - * @param arg the argument we're passing to opencl - * @param argPos out: the position of arg in the opencl argument list - * @param argIdx the position of arg in the argument array - * - * @throws CLException - */ -void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + +void updateArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { cl_int status = CL_SUCCESS; // if either this is the first run or user changed input array @@ -373,6 +367,7 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg // Add the array length if needed if (arg->usesArrayLength()) { + argPos++; arg->syncJavaArrayLength(jenv); status = clSetKernelArg(jniContext->kernel, argPos, sizeof(jint), &(arg->arrayBuffer->length)); @@ -381,10 +376,52 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg if (config->isVerbose()){ fprintf(stderr, "runKernel arg %d %s, length = %d\n", argIdx, arg->name, arg->arrayBuffer->length); } - argPos++; } } +void updateBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + + AparapiBuffer* buffer = arg->aparapiBuffer; + cl_int status = CL_SUCCESS; + cl_uint mask = CL_MEM_USE_HOST_PTR; + if (arg->isReadByKernel() && arg->isMutableByKernel()) mask |= CL_MEM_READ_WRITE; + else if (arg->isReadByKernel() && !arg->isMutableByKernel()) mask |= CL_MEM_READ_ONLY; + else if (arg->isMutableByKernel()) mask |= CL_MEM_WRITE_ONLY; + buffer->memMask = mask; + + buffer->mem = clCreateBuffer(jniContext->context, buffer->memMask, + buffer->lengthInBytes, buffer->data, &status); + + if(status != CL_SUCCESS) throw CLException(status,"clCreateBuffer"); + + if (config->isTrackingOpenCLResources()){ + memList.add(buffer->mem, __LINE__, __FILE__); + } + + status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_mem), (void *)&(buffer->mem)); + if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer)"); + + // Add the array length if needed + if (arg->usesArrayLength()) { + + for(int i = 0; i < buffer->numDims; i++) { + argPos++; + status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->lens[i])); + if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer length)"); + if (config->isVerbose()){ + fprintf(stderr, "runKernel arg %d %s, length = %d\n", argIdx, arg->name, buffer->lens[i]); + } + argPos++; + status = clSetKernelArg(jniContext->kernel, argPos, sizeof(cl_uint), &(buffer->dims[i])); + if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (buffer dimension)"); + if (config->isVerbose()){ + fprintf(stderr, "runKernel arg %d %s, dim = %d\n", argIdx, arg->name, buffer->dims[i]); + } + } + } +} + + /** * manages the memory of KernelArgs that are object. i.e. handels pinning, and moved objects. * currently the only objects supported are arrays. @@ -398,6 +435,14 @@ void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg * @throws CLException */ void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + if(arg->isArray()) { + processArray(jenv, jniContext, arg, argPos, argIdx); + } else if(arg->isAparapiBuffer()) { + processBuffer(jenv, jniContext, arg, argPos, argIdx); + } +} + +void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { cl_int status = CL_SUCCESS; @@ -452,7 +497,7 @@ void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& ar arg->arrayBuffer->mem = (cl_mem)0; } - updateObject(jenv, jniContext, arg, argPos, argIdx); + updateArray(jenv, jniContext, arg, argPos, argIdx); } else { // Keep the arg position in sync if no updates were required @@ -463,6 +508,50 @@ void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& ar } +void processBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + + cl_int status = CL_SUCCESS; + + if (config->isProfilingEnabled()){ + arg->aparapiBuffer->read.valid = false; + arg->aparapiBuffer->write.valid = false; + } + + if (config->isVerbose()) { + fprintf(stderr, "runKernel: arrayOrBuf addr=%p, ref.mem=%p\n", + arg->aparapiBuffer->data, + arg->aparapiBuffer->mem); + fprintf(stderr, "at memory addr %p, contents: ", arg->aparapiBuffer->data); + unsigned char *pb = (unsigned char *) arg->aparapiBuffer->data; + for (int k=0; k<8; k++) { + fprintf(stderr, "%02x ", pb[k]); + } + fprintf(stderr, "\n" ); + } + + if (config->isVerbose()){ + if (arg->isExplicit() && arg->isExplicitWrite()){ + fprintf(stderr, "explicit write of %s\n", arg->name); + } + } + + if (arg->aparapiBuffer->mem != 0) { + if (config->isTrackingOpenCLResources()) { + memList.remove((cl_mem)arg->aparapiBuffer->mem, __LINE__, __FILE__); + } + status = clReleaseMemObject((cl_mem)arg->aparapiBuffer->mem); + //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->aparapiBuffer->mem); + + //this needs to be reported, but we can still keep going + CLException::checkCLError(status, "clReleaseMemObject()"); + + arg->aparapiBuffer->mem = (cl_mem)0; + } + + updateBuffer(jenv, jniContext, arg, argPos, argIdx); + +} + /** * keeps track of write events for KernelArgs. @@ -487,8 +576,13 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int jniContext->writeEventArgs[writeEventCount] = argIdx; } - status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, + if(arg->isArray()) { + status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, arg->arrayBuffer->lengthInBytes, arg->arrayBuffer->addr, 0, NULL, &(jniContext->writeEvents[writeEventCount])); + } else if(arg->isAparapiBuffer()) { + status = clEnqueueWriteBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, CL_FALSE, 0, + arg->aparapiBuffer->lengthInBytes, arg->aparapiBuffer->data, 0, NULL, &(jniContext->writeEvents[writeEventCount])); + } if(status != CL_SUCCESS) throw CLException(status,"clEnqueueWriteBuffer"); if (config->isTrackingOpenCLResources()){ @@ -503,6 +597,7 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int } } + /** * sets the opencl kernel arguement for local args. * @@ -514,7 +609,7 @@ void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int * * @throws CLException */ -void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { +void processLocalArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { cl_int status = CL_SUCCESS; // what if local buffer size has changed? We need a check for resize here. @@ -534,7 +629,6 @@ void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (array length)"); - argPos++; } } else { // Keep the arg position in sync if no updates were required @@ -544,6 +638,53 @@ void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& arg } } +/** + * sets the opencl kernel arguement for local args. + * + * @param jenv the java envrionment + * @param jniContext the context we got from java + * @param arg the KernelArg to create a write event for + * @param argPos out: the position of arg in the opencl argument list + * @param argIdx the position of arg in the argument array + * + * @throws CLException + */ +void processLocalBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + + cl_int status = CL_SUCCESS; + // what if local buffer size has changed? We need a check for resize here. + if (jniContext->firstRun) { + status = arg->setLocalBufferArg(jenv, argIdx, argPos, config->isVerbose()); + if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg() (local)"); + + // Add the array length if needed + if (arg->usesArrayLength()) { + arg->syncJavaArrayLength(jenv); + + for(int i = 0; i < arg->aparapiBuffer->numDims; i++) + { + int length = arg->aparapiBuffer->lens[i]; + status = clSetKernelArg(jniContext->kernel, argPos, sizeof(jint), &length); + if (config->isVerbose()){ + fprintf(stderr, "runKernel arg %d %s, javaArrayLength = %d\n", argIdx, arg->name, length); + } + if(status != CL_SUCCESS) throw CLException(status,"clSetKernelArg (array length)"); + } + } + } else { + // Keep the arg position in sync if no updates were required + if (arg->usesArrayLength()) { + argPos += arg->aparapiBuffer->numDims; + } + } +} + +void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx) { + if(arg->isArray()) processLocalArray(jenv,jniContext,arg,argPos,argIdx); + if(arg->isAparapiBuffer()) processLocalBuffer(jenv,jniContext,arg,argPos,argIdx); +} + + /** * processes all of the arguments for the OpenCL Kernel that we got from the JNIContext * @@ -755,7 +896,7 @@ void enqueueKernel(JNIContext* jniContext, Range& range, int passes, int argPos, * * @throws CLException */ -int getReadEvents(JNIContext* jniContext) { +int getReadEvents(JNIEnv* jenv, JNIContext* jniContext) { int readEventCount = 0; @@ -774,8 +915,17 @@ int getReadEvents(JNIContext* jniContext) { fprintf(stderr, "reading buffer %d %s\n", i, arg->name); } - status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, CL_FALSE, 0, - arg->arrayBuffer->lengthInBytes,arg->arrayBuffer->addr , 1, jniContext->executeEvents, &(jniContext->readEvents[readEventCount])); + if(arg->isArray()) { + status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, + CL_FALSE, 0, arg->arrayBuffer->lengthInBytes, arg->arrayBuffer->addr, 1, + jniContext->executeEvents, &(jniContext->readEvents[readEventCount])); + } else if(arg->isAparapiBuffer()) { + status = clEnqueueReadBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, + CL_TRUE, 0, arg->aparapiBuffer->lengthInBytes, arg->aparapiBuffer->data, 1, + jniContext->executeEvents, &(jniContext->readEvents[readEventCount])); + arg->aparapiBuffer->inflate(jenv, arg); + } + if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); if (config->isTrackingOpenCLResources()){ @@ -932,7 +1082,7 @@ JNI_JAVA(jint, KernelRunnerJNI, runKernelJNI) int writeEventCount = 0; processArgs(jenv, jniContext, argPos, writeEventCount); enqueueKernel(jniContext, range, passes, argPos, writeEventCount); - int readEventCount = getReadEvents(jniContext); + int readEventCount = getReadEvents(jenv, jniContext); waitForReadEvents(jniContext, readEventCount, passes); checkEvents(jenv, jniContext, writeEventCount); } @@ -1144,7 +1294,7 @@ KernelArg* getArgForBuffer(JNIEnv* jenv, JNIContext* jniContext, jobject buffer) if (jniContext != NULL){ for (jint i = 0; returnArg == NULL && i < jniContext->argc; i++){ KernelArg *arg = jniContext->args[i]; - if (arg->isArray()){ + if (arg->isArray()) { jboolean isSame = jenv->IsSameObject(buffer, arg->arrayBuffer->javaArray); if (isSame){ if (config->isVerbose()){ @@ -1156,6 +1306,18 @@ KernelArg* getArgForBuffer(JNIEnv* jenv, JNIContext* jniContext, jobject buffer) fprintf(stderr, "unmatched arg '%s'\n", arg->name); } } + } else if(arg->isAparapiBuffer()) { + jboolean isSame = jenv->IsSameObject(buffer, arg->aparapiBuffer->getJavaObject(jenv,arg)); + if (isSame) { + if (config->isVerbose()) { + fprintf(stderr, "matched arg '%s'\n", arg->name); + } + returnArg = arg; + } else { + if (config->isVerbose()) { + fprintf(stderr, "unmatched arg '%s'\n", arg->name); + } + } } } if (returnArg == NULL){ @@ -1181,44 +1343,79 @@ JNI_JAVA(jint, KernelRunnerJNI, getJNI) if (config->isVerbose()){ fprintf(stderr, "explicitly reading buffer %s\n", arg->name); } - arg->pin(jenv); - - try { - status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, - CL_FALSE, 0, - arg->arrayBuffer->lengthInBytes, - arg->arrayBuffer->addr , 0, NULL, - &jniContext->readEvents[0]); - if (config->isVerbose()){ - fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", - arg->name, (unsigned long)arg->arrayBuffer->addr, - arg->arrayBuffer->lengthInBytes ); - } - if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); + if(arg->isArray()) { + arg->pin(jenv); + + try { + status = clEnqueueReadBuffer(jniContext->commandQueue, arg->arrayBuffer->mem, + CL_FALSE, 0, + arg->arrayBuffer->lengthInBytes, + arg->arrayBuffer->addr , 0, NULL, + &jniContext->readEvents[0]); + if (config->isVerbose()){ + fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", + arg->name, (unsigned long)arg->arrayBuffer->addr, + arg->arrayBuffer->lengthInBytes ); + } + if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); + + status = clWaitForEvents(1, jniContext->readEvents); + if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents"); + + if (config->isProfilingEnabled()) { + status = profile(&arg->arrayBuffer->read, &jniContext->readEvents[0], 0, + arg->name, jniContext->profileBaseTime); + if (status != CL_SUCCESS) throw CLException(status, "profile "); + } - status = clWaitForEvents(1, jniContext->readEvents); - if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents"); + status = clReleaseEvent(jniContext->readEvents[0]); + if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event"); - if (config->isProfilingEnabled()) { - status = profile(&arg->arrayBuffer->read, &jniContext->readEvents[0], 0, - arg->name, jniContext->profileBaseTime); - if (status != CL_SUCCESS) throw CLException(status, "profile "); + // since this is an explicit buffer get, + // we expect the buffer to have changed so we commit + arg->unpin(jenv); // was unpinCommit + + //something went wrong print the error and exit + } catch(CLException& cle) { + cle.printError(); + return status; } + } else if(arg->isAparapiBuffer()) { + + try { + status = clEnqueueReadBuffer(jniContext->commandQueue, arg->aparapiBuffer->mem, + CL_FALSE, 0, + arg->aparapiBuffer->lengthInBytes, + arg->aparapiBuffer->data, 0, NULL, + &jniContext->readEvents[0]); + if (config->isVerbose()){ + fprintf(stderr, "explicitly read %s ptr=%lx len=%d\n", + arg->name, (unsigned long)arg->aparapiBuffer->data, + arg->aparapiBuffer->lengthInBytes ); + } + if (status != CL_SUCCESS) throw CLException(status, "clEnqueueReadBuffer()"); + + status = clWaitForEvents(1, jniContext->readEvents); + if (status != CL_SUCCESS) throw CLException(status, "clWaitForEvents"); - status = clReleaseEvent(jniContext->readEvents[0]); - if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event"); + if (config->isProfilingEnabled()) { + status = profile(&arg->aparapiBuffer->read, &jniContext->readEvents[0], 0, + arg->name, jniContext->profileBaseTime); + if (status != CL_SUCCESS) throw CLException(status, "profile "); + } - // since this is an explicit buffer get, - // we expect the buffer to have changed so we commit - arg->unpin(jenv); // was unpinCommit + status = clReleaseEvent(jniContext->readEvents[0]); + if (status != CL_SUCCESS) throw CLException(status, "clReleaseEvent() read event"); - //something went wrong print the error and exit - } catch(CLException& cle) { - cle.printError(); - return status; - } + arg->aparapiBuffer->inflate(jenv,arg); - }else{ + //something went wrong print the error and exit + } catch(CLException& cle) { + cle.printError(); + return status; + } + } + } else { if (config->isVerbose()){ fprintf(stderr, "attempt to request to get a buffer that does not appear to be referenced from kernel\n"); } diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h index 3b4d40e0..dc174134 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h +++ b/com.amd.aparapi.jni/src/cpp/runKernel/Aparapi.h @@ -65,13 +65,18 @@ jint updateNonPrimitiveReferences(JNIEnv *jenv, jobject jobj, JNIContext* jniCon void profileFirstRun(JNIContext* jniContext); -void updateObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void updateArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void updateBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); void processObject(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void processArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void processBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); void updateWriteEvents(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int argIdx, int& writeEventCount); void processLocal(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void processLocalArray(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); +void processLocalBuffer(JNIEnv* jenv, JNIContext* jniContext, KernelArg* arg, int& argPos, int argIdx); int processArgs(JNIEnv* jenv, JNIContext* jniContext, int& argPos, int& writeEventCount); diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp new file mode 100644 index 00000000..f5fcb19c --- /dev/null +++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.cpp @@ -0,0 +1,1557 @@ +/* + Copyright (c) 2010-2011, Advanced Micro Devices, Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following + disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided with the distribution. + + Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export + laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 + through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of + the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce + Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export + Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in + Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) + export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced + direct product is subject to national security controls as identified on the Commerce Control List (currently + found in Supplement 1 to Part 774 of EAR). For the most current Country Group listings, or for additional + information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry + and Security?s website at http://www.bis.doc.gov/. + */ +#define APARAPIBUFFER_SOURCE +#include "AparapiBuffer.h" +#include "KernelArg.h" + +AparapiBuffer::AparapiBuffer(): + javaObject((jobject) 0), + numDims(0), + dims(NULL), + lengthInBytes(0), + mem((cl_mem) 0), + data(NULL), + memMask((cl_uint)0) { + } + +AparapiBuffer::AparapiBuffer(void* _data, cl_uint* _lens, cl_uint _numDims, long _lengthInBytes, jobject _javaObject) : + data(_data), + lens(_lens), + numDims(_numDims), + lengthInBytes(_lengthInBytes), + javaObject(_javaObject), + mem((cl_mem) 0), + memMask((cl_uint)0) +{ + dims = new cl_uint[_numDims]; + for(int i = 0; i < _numDims; i++) { + dims[i] = 1; + for(int j = i+1; j < _numDims; j++) { + dims[i] *= lens[j]; + } + } +} + +jobject AparapiBuffer::getJavaObject(JNIEnv* env, KernelArg* arg) { + return JNIHelper::getInstanceField(env, arg->javaArg, "javaBuffer", ObjectClassArg); +} + + +AparapiBuffer* AparapiBuffer::flatten(JNIEnv* env, jobject arg, int type) { + int numDims = JNIHelper::getInstanceField(env, arg, "numDims", IntArg); + if(numDims == 2 && isBoolean(type)) { + return AparapiBuffer::flattenBoolean2D(env,arg); + } else if(numDims == 2 && isByte(type)) { + return AparapiBuffer::flattenByte2D(env,arg); + } else if(numDims == 2 && isShort(type)) { + return AparapiBuffer::flattenShort2D(env,arg); + } else if(numDims == 2 && isInt(type)) { + return AparapiBuffer::flattenInt2D(env,arg); + } else if(numDims == 2 && isLong(type)) { + return AparapiBuffer::flattenLong2D(env,arg); + } else if(numDims == 2 && isFloat(type)) { + return AparapiBuffer::flattenFloat2D(env,arg); + } else if(numDims == 2 && isDouble(type)) { + return AparapiBuffer::flattenDouble2D(env,arg); + } else if(numDims == 3 && isBoolean(type)) { + return AparapiBuffer::flattenBoolean3D(env,arg); + } else if(numDims == 3 && isByte(type)) { + return AparapiBuffer::flattenByte3D(env,arg); + } else if(numDims == 3 && isShort(type)) { + return AparapiBuffer::flattenShort3D(env,arg); + } else if(numDims == 3 && isInt(type)) { + return AparapiBuffer::flattenInt3D(env,arg); + } else if(numDims == 3 && isLong(type)) { + return AparapiBuffer::flattenLong3D(env,arg); + } else if(numDims == 3 && isFloat(type)) { + return AparapiBuffer::flattenFloat3D(env,arg); + } else if(numDims == 3 && isDouble(type)) { + return AparapiBuffer::flattenDouble3D(env,arg); + } + return new AparapiBuffer(); +} + + +AparapiBuffer* AparapiBuffer::flattenBoolean2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jboolean); + + jboolean* array = new jboolean[totalSize]; + /* + jbooleanArray* jArray = new jbooleanArray[dims[0]]; + jboolean** elems = new jboolean*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetBooleanArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseBooleanArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jboolean* elems = env->GetBooleanArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseBooleanArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenByte2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jbyte); + + jbyte* array = new jbyte[totalSize]; + /* + jbyteArray* jArray = new jbyteArray[dims[0]]; + jbyte** elems = new jbyte*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetByteArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseByteArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jbyte* elems = env->GetByteArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseByteArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenShort2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jshort); + + jshort* array = new jshort[totalSize]; + /* + jshortArray* jArray = new jshortArray[dims[0]]; + jshort** elems = new jshort*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetShortArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseShortArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jshortArray jArray = (jshortArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jshort* elems = env->GetShortArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseShortArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenInt2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jint); + + jint* array = new jint[totalSize]; + /* + jintArray* jArray = new jintArray[dims[0]]; + jint** elems = new jint*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetIntArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseIntArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jintArray jArray = (jintArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jint* elems = env->GetIntArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseIntArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenLong2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jlong); + + jlong* array = new jlong[totalSize]; + /* + jlongArray* jArray = new jlongArray[dims[0]]; + jlong** elems = new jlong*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetLongArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseLongArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jlongArray jArray = (jlongArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jlong* elems = env->GetLongArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseLongArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenFloat2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jfloat); + + jfloat* array = new jfloat[totalSize]; + /* + jfloatArray* jArray = new jfloatArray[dims[0]]; + jfloat** elems = new jfloat*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetFloatArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseFloatArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jfloat* elems = env->GetFloatArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseFloatArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenDouble2D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[2]; + dims[0] = env->GetArrayLength((jobjectArray)javaBuffer); + dims[1] = env->GetArrayLength((jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, 0)); + int totalSize = dims[0] * dims[1]; + long bitSize = totalSize * sizeof(jdouble); + + jdouble* array = new jdouble[totalSize]; + /* + jdoubleArray* jArray = new jdoubleArray[dims[0]]; + jdouble** elems = new jdouble*[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = (jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + elems[i] = env->GetDoubleArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[i][j]; + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + env->ReleaseDoubleArrayElements(jArray[i], elems[i], 0); + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + jdouble* elems = env->GetDoubleArrayElements(jArray,0); + + for(int j = 0; j < (int)dims[1]; j++) { + array[i*dims[1] + j] = elems[j]; + } + env->ReleaseDoubleArrayElements(jArray, elems, 0); + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 2, bitSize, javaBuffer); +} + + +AparapiBuffer* AparapiBuffer::flattenBoolean3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jbooleanArray j2 = (jbooleanArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jboolean); + + jboolean* array = new jboolean[totalSize]; + /* + jbooleanArray** jArray = new jbooleanArray*[dims[0]]; + jboolean*** elems = new jboolean**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jbooleanArray[dims[1]]; + elems[i] = new jboolean*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jbooleanArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetBooleanArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseBooleanArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(jrow, j); + jboolean* elems = env->GetBooleanArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseBooleanArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenByte3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jbyteArray j2 = (jbyteArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jbyte); + + jbyte* array = new jbyte[totalSize]; + /* + jbyteArray** jArray = new jbyteArray*[dims[0]]; + jbyte*** elems = new jbyte**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jbyteArray[dims[1]]; + elems[i] = new jbyte*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jbyteArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetByteArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseByteArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(jrow, j); + jbyte* elems = env->GetByteArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseByteArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenShort3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jshortArray j2 = (jshortArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jshort); + + jshort* array = new jshort[totalSize]; + /* + jshortArray** jArray = new jshortArray*[dims[0]]; + jshort*** elems = new jshort**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jshortArray[dims[1]]; + elems[i] = new jshort*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jshortArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetShortArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseShortArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jshortArray jArray = (jshortArray)env->GetObjectArrayElement(jrow, j); + jshort* elems = env->GetShortArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseShortArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenInt3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jintArray j2 = (jintArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jint); + + jint* array = new jint[totalSize]; + /* + jintArray** jArray = new jintArray*[dims[0]]; + jint*** elems = new jint**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jintArray[dims[1]]; + elems[i] = new jint*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jintArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetIntArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseIntArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jintArray jArray = (jintArray)env->GetObjectArrayElement(jrow, j); + jint* elems = env->GetIntArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseIntArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenLong3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jlongArray j2 = (jlongArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + jlong bitSize = totalSize * sizeof(jlong); + + jlong* array = new jlong[totalSize]; + /* + jlongArray** jArray = new jlongArray*[dims[0]]; + jlong*** elems = new jlong**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jlongArray[dims[1]]; + elems[i] = new jlong*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jlongArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetLongArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseLongArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jlongArray jArray = (jlongArray)env->GetObjectArrayElement(jrow, j); + jlong* elems = env->GetLongArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseLongArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenFloat3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jfloatArray j2 = (jfloatArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jfloat); + + jfloat* array = new jfloat[totalSize]; + /* + jfloatArray** jArray = new jfloatArray*[dims[0]]; + jfloat*** elems = new jfloat**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jfloatArray[dims[1]]; + elems[i] = new jfloat*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jfloatArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetFloatArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseFloatArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(jrow, j); + jfloat* elems = env->GetFloatArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[k]; + } + env->ReleaseFloatArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + +AparapiBuffer* AparapiBuffer::flattenDouble3D(JNIEnv* env, jobject arg) { + + jobject javaBuffer = JNIHelper::getInstanceField(env, arg, "javaBuffer", ObjectClassArg); + cl_uint* dims = new cl_uint[3]; + jobjectArray j0 = (jobjectArray)javaBuffer; + jobjectArray j1 = (jobjectArray)env->GetObjectArrayElement(j0, 0); + jdoubleArray j2 = (jdoubleArray)env->GetObjectArrayElement(j1, 0); + dims[0] = env->GetArrayLength(j0); + dims[1] = env->GetArrayLength(j1); + dims[2] = env->GetArrayLength(j2); + + int totalSize = dims[0] * dims[1] * dims[2]; + long bitSize = totalSize * sizeof(jdouble); + + jdouble* array = new jdouble[totalSize]; + /* + jdoubleArray** jArray = new jdoubleArray*[dims[0]]; + jdouble*** elems = new jdouble**[dims[0]]; + + for(int i = 0; i < (int)dims[0]; i++) { + jArray[i] = new jdoubleArray[dims[1]]; + elems[i] = new jdouble*[dims[1]]; + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jArray[i][j] = (jdoubleArray)env->GetObjectArrayElement(jrow, j); + elems[i][j] = env->GetDoubleArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[2] + k] = elems[i][j][k]; + } + } + } + + for(int i = 0; i < (int)dims[0]; i++) { + for(int j = 0; j < (int)dims[1]; j++) { + env->ReleaseDoubleArrayElements(jArray[i][j], elems[i][j], 0); + } + delete[] jArray[i]; + delete[] elems[i]; + } + delete[] jArray; + delete[] elems; + */ + + for(int i = 0; i < (int)dims[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement((jobjectArray)javaBuffer, i); + for(int j = 0; j < (int)dims[1]; j++) { + jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(jrow, j); + jdouble* elems = env->GetDoubleArrayElements(jArray,0); + for(int k = 0; k < (int)dims[2]; k++) { + array[i*dims[1]*dims[2] + j*dims[1] + k] = elems[k]; + } + env->ReleaseDoubleArrayElements(jArray, elems, 0); + } + } + + return new AparapiBuffer((void*)array, (cl_uint*)dims, 3, bitSize, javaBuffer); +} + + + +void AparapiBuffer::inflate(JNIEnv* env, KernelArg* arg) { + javaObject = JNIHelper::getInstanceField(env, arg->javaArg, "javaBuffer", ObjectClassArg); + if(numDims == 2 && arg->isBoolean()) { + AparapiBuffer::inflateBoolean2D(env, arg); + } else if(numDims == 2 && arg->isByte()) { + AparapiBuffer::inflateByte2D(env, arg); + } else if(numDims == 2 && arg->isShort()) { + AparapiBuffer::inflateShort2D(env, arg); + } else if(numDims == 2 && arg->isInt()) { + AparapiBuffer::inflateInt2D(env, arg); + } else if(numDims == 2 && arg->isLong()) { + AparapiBuffer::inflateLong2D(env, arg); + } else if(numDims == 2 && arg->isFloat()) { + AparapiBuffer::inflateFloat2D(env, arg); + } else if(numDims == 2 && arg->isDouble()) { + AparapiBuffer::inflateDouble2D(env, arg); + } else if(numDims == 3 && arg->isBoolean()) { + AparapiBuffer::inflateBoolean3D(env, arg); + } else if(numDims == 3 && arg->isByte()) { + AparapiBuffer::inflateByte3D(env, arg); + } else if(numDims == 3 && arg->isShort()) { + AparapiBuffer::inflateShort3D(env, arg); + } else if(numDims == 3 && arg->isInt()) { + AparapiBuffer::inflateInt3D(env, arg); + } else if(numDims == 3 && arg->isLong()) { + AparapiBuffer::inflateLong3D(env, arg); + } else if(numDims == 3 && arg->isFloat()) { + AparapiBuffer::inflateFloat3D(env, arg); + } else if(numDims == 3 && arg->isDouble()) { + AparapiBuffer::inflateDouble3D(env, arg); + } else { + return; + } + + deleteBuffer(arg); +} + + +void AparapiBuffer::inflateBoolean2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jboolean* array = (jboolean*)data; + /* + jbooleanArray* jArray = new jbooleanArray[lens[0]]; + jboolean** body = new jboolean*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jbooleanArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetBooleanArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseBooleanArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(buffer, i); + jboolean* body = env->GetBooleanArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseBooleanArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateByte2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jbyte* array = (jbyte*)data; + /* + jbyteArray* jArray = new jbyteArray[lens[0]]; + jbyte** body = new jbyte*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jbyteArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetByteArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseByteArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(buffer, i); + jbyte* body = env->GetByteArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseByteArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateShort2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jshort* array = (jshort*)data; + /* + jshortArray* jArray = new jshortArray[lens[0]]; + jshort** body = new jshort*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jshortArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetShortArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseShortArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jshortArray jArray = (jshortArray)env->GetObjectArrayElement(buffer, i); + jshort* body = env->GetShortArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseShortArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateInt2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jint* array = (jint*)data; + /* + jintArray* jArray = new jintArray[lens[0]]; + jint** body = new jint*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jintArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetIntArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseIntArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jintArray jArray = (jintArray)env->GetObjectArrayElement(buffer, i); + jint* body = env->GetIntArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseIntArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateLong2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jlong* array = (jlong*)data; + /* + jlongArray* jArray = new jlongArray[lens[0]]; + jlong** body = new jlong*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jlongArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetLongArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseLongArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jlongArray jArray = (jlongArray)env->GetObjectArrayElement(buffer, i); + jlong* body = env->GetLongArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseLongArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateFloat2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jfloat* array = (jfloat*)data; + /* + jfloatArray* jArray = new jfloatArray[lens[0]]; + jfloat** body = new jfloat*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jfloatArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetFloatArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseFloatArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(buffer, i); + jfloat* body = env->GetFloatArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseFloatArrayElements(jArray, body, 0); + } +} + +void AparapiBuffer::inflateDouble2D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jdouble* array = (jdouble*)data; + /* + jdoubleArray* jArray = new jdoubleArray[lens[0]]; + jdouble** body = new jdouble*[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jArray[i] = (jdoubleArray)env->GetObjectArrayElement(buffer, i); + body[i] = env->GetDoubleArrayElements(jArray[i],0); + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + body[i][j] = array[i*dims[0] + j]; + } + } + + for(int i = 0; i < lens[0]; i++) { + env->ReleaseDoubleArrayElements(jArray[i], body[i], 0); + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(buffer, i); + jdouble* body = env->GetDoubleArrayElements(jArray,0); + for(int j = 0; j < lens[1]; j++) { + body[j] = array[i*dims[0] + j]; + } + env->ReleaseDoubleArrayElements(jArray, body, 0); + } +} + + +void AparapiBuffer::inflateBoolean3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jboolean* array = (jboolean*)data; + /* + jbooleanArray** jArray = new jbooleanArray*[lens[0]]; + jboolean*** body = new jboolean**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jbooleanArray[lens[0]]; + body[i] = new jboolean*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jbooleanArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetBooleanArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseBooleanArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jbooleanArray jArray = (jbooleanArray)env->GetObjectArrayElement(jrow, j); + jboolean* body = env->GetBooleanArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseBooleanArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateByte3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jbyte* array = (jbyte*)data; + /* + jbyteArray** jArray = new jbyteArray*[lens[0]]; + jbyte*** body = new jbyte**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jbyteArray[lens[0]]; + body[i] = new jbyte*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jbyteArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetByteArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseByteArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jbyteArray jArray = (jbyteArray)env->GetObjectArrayElement(jrow, j); + jbyte* body = env->GetByteArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseByteArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateShort3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jshort* array = (jshort*)data; + /* + jshortArray** jArray = new jshortArray*[lens[0]]; + jshort*** body = new jshort**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jshortArray[lens[0]]; + body[i] = new jshort*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jshortArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetShortArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseShortArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jshortArray jArray = (jshortArray)env->GetObjectArrayElement(jrow, j); + jshort* body = env->GetShortArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseShortArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateInt3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jint* array = (jint*)data; + /* + jintArray** jArray = new jintArray*[lens[0]]; + jint*** body = new jint**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jintArray[lens[0]]; + body[i] = new jint*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jintArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetIntArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseIntArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jintArray jArray = (jintArray)env->GetObjectArrayElement(jrow, j); + jint* body = env->GetIntArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseIntArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateLong3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jlong* array = (jlong*)data; + /* + jlongArray** jArray = new jlongArray*[lens[0]]; + jlong*** body = new jlong**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jlongArray[lens[0]]; + body[i] = new jlong*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jlongArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetLongArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseLongArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jlongArray jArray = (jlongArray)env->GetObjectArrayElement(jrow, j); + jlong* body = env->GetLongArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseLongArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateFloat3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jfloat* array = (jfloat*)data; + /* + jfloatArray** jArray = new jfloatArray*[lens[0]]; + jfloat*** body = new jfloat**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jfloatArray[lens[0]]; + body[i] = new jfloat*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jfloatArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetFloatArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseFloatArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jfloatArray jArray = (jfloatArray)env->GetObjectArrayElement(jrow, j); + jfloat* body = env->GetFloatArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseFloatArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::inflateDouble3D(JNIEnv *env, KernelArg* arg) { + + jobjectArray buffer = (jobjectArray)javaObject; + jdouble* array = (jdouble*)data; + /* + jdoubleArray** jArray = new jdoubleArray*[lens[0]]; + jdouble*** body = new jdouble**[lens[0]]; + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + jArray[i] = new jdoubleArray[lens[0]]; + body[i] = new jdouble*[lens[0]]; + for(int j = 0; j < lens[1]; j++) { + jArray[i][j] = (jdoubleArray)env->GetObjectArrayElement(jrow, j); + body[i][j] = env->GetDoubleArrayElements(jArray[i][j],0); + } + } + + #pragma omp parallel for + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + for(int k = 0; k < lens[2]; k++) { + body[i][j][k] = array[i*dims[0] + j*dims[1] + k]; + } + } + } + + for(int i = 0; i < lens[0]; i++) { + for(int j = 0; j < lens[1]; j++) { + env->ReleaseDoubleArrayElements(jArray[i][j], body[i][j], 0); + } + delete[] jArray[i]; + delete[] body[i]; + } + delete[] jArray; + delete[] body; + */ + + for(int i = 0; i < lens[0]; i++) { + jobjectArray jrow = (jobjectArray)env->GetObjectArrayElement(buffer, i); + for(int j = 0; j < lens[1]; j++) { + jdoubleArray jArray = (jdoubleArray)env->GetObjectArrayElement(jrow, j); + jdouble* body = env->GetDoubleArrayElements(jArray,0); + for(int k = 0; k < lens[2]; k++) { + body[k] = array[i*dims[0] + j*dims[1] + k]; + } + env->ReleaseDoubleArrayElements(jArray, body, 0); + } + } +} + +void AparapiBuffer::deleteBuffer(KernelArg* arg) +{ + delete[] dims; + delete[] lens; + if(arg->isBoolean()) { + delete[] (jboolean*)data; + } else if(arg->isByte()) { + delete[] (jbyte*)data; + } else if(arg->isShort()) { + delete[] (jshort*)data; + } else if(arg->isInt()) { + delete[] (jint*)data; + } else if(arg->isLong()) { + delete[] (jlong*)data; + } else if(arg->isFloat()) { + delete[] (jfloat*)data; + } else if(arg->isDouble()) { + delete[] (jdouble*)data; + } +} diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h new file mode 100644 index 00000000..b5a05d82 --- /dev/null +++ b/com.amd.aparapi.jni/src/cpp/runKernel/AparapiBuffer.h @@ -0,0 +1,133 @@ +/* + Copyright (c) 2010-2011, Advanced Micro Devices, Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the + following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following + disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided with the distribution. + + Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + If you use the software (in whole or in part), you shall adhere to all applicable U.S., European, and other export + laws, including but not limited to the U.S. Export Administration Regulations ("EAR"), (15 C.F.R. Sections 730 + through 774), and E.U. Council Regulation (EC) No 1334/2000 of 22 June 2000. Further, pursuant to Section 740.6 of + the EAR, you hereby certify that, except pursuant to a license granted by the United States Department of Commerce + Bureau of Industry and Security or as otherwise permitted pursuant to a License Exception under the U.S. Export + Administration Regulations ("EAR"), you will not (1) export, re-export or release to a national of a country in + Country Groups D:1, E:1 or E:2 any restricted technology, software, or source code you receive hereunder, or (2) + export to Country Groups D:1, E:1 or E:2 the direct product of such technology or software, if such foreign produced + direct product is subject to national security controls as identified on the Commerce Control List (currently + found in Supplement 1 to Part 774 of EAR). For the most current Country Group listings, or for additional + information about the EAR or your obligations under those regulations, please refer to the U.S. Bureau of Industry + and Security?s website at http://www.bis.doc.gov/. + */ + +#ifndef APARAPIBUFFER_H +#define APARAPIBUFFER_H +#include "Common.h" +#include "ProfileInfo.h" +#include "com_amd_aparapi_internal_jni_KernelRunnerJNI.h" + +class KernelArg; + +class AparapiBuffer{ + +private: + + static int isFloat(int type){ + return(type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_FLOAT); + } + static int isLong(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_LONG); + } + static int isInt(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_INT); + } + static int isDouble(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_DOUBLE); + } + static int isBoolean(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_BOOLEAN); + } + static int isByte(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_BYTE); + } + static int isShort(int type){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_SHORT); + } + +public: + jobject javaObject; // The java array that this arg is mapped to + cl_uint numDims; // sizes of dimensions of the object (array lengths for ND arrays) + cl_uint* dims; // sizes of offsets of the object (first element offset in ND arrays) + cl_uint* lens; // sizes of dimensions of the object (array lengths for ND arrays) + jint lengthInBytes; // bytes in the array or directBuf + cl_mem mem; // the opencl buffer + void *data; // a copy of the object itself (this is what we pass to OpenCL) + cl_uint memMask; // the mask used for createBuffer + ProfileInfo read; + ProfileInfo write; + + AparapiBuffer(); + AparapiBuffer(void* _data, cl_uint* _dims, cl_uint _numDims, long _lengthInBytes, jobject _javaObject); + + void deleteBuffer(KernelArg* arg); + + static AparapiBuffer* flatten(JNIEnv *env, jobject arg, int type); + + static AparapiBuffer* flattenBoolean2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenChar2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenByte2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenShort2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenInt2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenLong2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenFloat2D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenDouble2D(JNIEnv *env, jobject arg); + + static AparapiBuffer* flattenBoolean3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenChar3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenByte3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenShort3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenInt3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenLong3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenFloat3D(JNIEnv *env, jobject arg); + static AparapiBuffer* flattenDouble3D(JNIEnv *env, jobject arg); + + void inflate(JNIEnv *env, KernelArg* arg); + + void inflateBoolean2D(JNIEnv *env, KernelArg* arg); + void inflateChar2D(JNIEnv *env, KernelArg* arg); + void inflateByte2D(JNIEnv *env, KernelArg* arg); + void inflateShort2D(JNIEnv *env, KernelArg* arg); + void inflateInt2D(JNIEnv *env, KernelArg* arg); + void inflateLong2D(JNIEnv *env, KernelArg* arg); + void inflateFloat2D(JNIEnv *env, KernelArg* arg); + void inflateDouble2D(JNIEnv *env, KernelArg* arg); + + void inflateBoolean3D(JNIEnv *env, KernelArg* arg); + void inflateChar3D(JNIEnv *env, KernelArg* arg); + void inflateByte3D(JNIEnv *env, KernelArg* arg); + void inflateShort3D(JNIEnv *env, KernelArg* arg); + void inflateInt3D(JNIEnv *env, KernelArg* arg); + void inflateLong3D(JNIEnv *env, KernelArg* arg); + void inflateFloat3D(JNIEnv *env, KernelArg* arg); + void inflateDouble3D(JNIEnv *env, KernelArg* arg); + + jobject getJavaObject(JNIEnv* env, KernelArg* arg); +}; + +#endif // ARRAYBUFFER_H diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp index c7e918b9..0f401cea 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp +++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.cpp @@ -34,6 +34,8 @@ KernelArg::KernelArg(JNIEnv *jenv, JNIContext *jniContext, jobject argObj): jenv->ReleaseStringUTFChars(nameString, nameChars); if (isArray()){ arrayBuffer = new ArrayBuffer(); + } else if(isAparapiBuffer()) { + aparapiBuffer = AparapiBuffer::flatten(jenv, argObj, type); } } @@ -44,6 +46,13 @@ cl_int KernelArg::setLocalBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool v return(clSetKernelArg(jniContext->kernel, argPos, (int)arrayBuffer->lengthInBytes, NULL)); } +cl_int KernelArg::setLocalAparapiBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose) { + if (verbose){ + fprintf(stderr, "ISLOCAL, clSetKernelArg(jniContext->kernel, %d, %d, NULL);\n", argIdx, (int) aparapiBuffer->lengthInBytes); + } + return(clSetKernelArg(jniContext->kernel, argPos, (int)aparapiBuffer->lengthInBytes, NULL)); +} + const char* KernelArg::getTypeName() { string s = ""; if(isStatic()) { diff --git a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h index 75c7a5ca..a54e411e 100644 --- a/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h +++ b/com.amd.aparapi.jni/src/cpp/runKernel/KernelArg.h @@ -5,6 +5,7 @@ #include "Common.h" #include "JNIHelper.h" #include "ArrayBuffer.h" +#include "AparapiBuffer.h" #include "com_amd_aparapi_internal_jni_KernelRunnerJNI.h" #include "Config.h" #include @@ -69,6 +70,7 @@ class KernelArg{ jint type; // a bit mask determining the type of this arg ArrayBuffer *arrayBuffer; + AparapiBuffer *aparapiBuffer; // Uses JNIContext so cant inline here see below KernelArg(JNIEnv *jenv, JNIContext *jniContext, jobject argObj); @@ -157,14 +159,14 @@ class KernelArg{ int isConstant(){ return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_CONSTANT); } - int isAparapiBuf(){ - return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_APARAPI_BUF); + int isAparapiBuffer(){ + return (type&com_amd_aparapi_internal_jni_KernelRunnerJNI_ARG_APARAPI_BUFFER); } int isBackedByArray(){ return ( (isArray() && (isGlobal() || isConstant()))); } int needToEnqueueRead(){ - return(((isArray() && isGlobal()) || ((isAparapiBuf()&&isGlobal()))) && (isImplicit()&&isMutableByKernel())); + return(((isArray() && isGlobal()) || ((isAparapiBuffer()&&isGlobal()))) && (isImplicit()&&isMutableByKernel())); } int needToEnqueueWrite(){ return ((isImplicit()&&isReadByKernel())||(isExplicit()&&isExplicitWrite())); @@ -188,6 +190,7 @@ class KernelArg{ // Uses JNIContext so can't inline here we below. cl_int setLocalBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose); + cl_int setLocalAparapiBufferArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose); // Uses JNIContext so can't inline here we below. cl_int setPrimitiveArg(JNIEnv *jenv, int argIdx, int argPos, bool verbose); }; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java index 19d297f2..270321ff 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelArgJNI.java @@ -47,6 +47,11 @@ public abstract class KernelArgJNI{ */ @UsedByJNICode protected Object javaArray; + /** + * If this field represents an aparapi buffer then the instance will be captured here + */ + @UsedByJNICode protected Object javaBuffer; + /** * If this is an array or a buffer then the size (in bytes) is held here */ @@ -57,6 +62,18 @@ public abstract class KernelArgJNI{ */ @UsedByJNICode protected int numElements; + + /** + * If this is an multidimensional array then the number of dimensions is stored here + */ + @UsedByJNICode protected int numDims; + + + /** + * If this is an multidimensional array then the dimensions are stored here + */ + @UsedByJNICode protected int[] dims; + /** * If this is an array buffer then the number of elements is stored here. * @@ -69,6 +86,8 @@ public abstract class KernelArgJNI{ */ @UsedByJNICode protected Object array; + @UsedByJNICode protected Object buffer; + /** * Field in Kernel class corresponding to this arg */ diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java index 76aae813..a672093f 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/jni/KernelRunnerJNI.java @@ -178,7 +178,7 @@ public abstract class KernelRunnerJNI{ * * @author gfrost */ - @UsedByJNICode protected static final int ARG_APARAPI_BUF = 1 << 15; + @UsedByJNICode protected static final int ARG_APARAPI_BUFFER = 1 << 15; /** * This 'bit' indicates that the arg has been explicitly marked for reading @@ -207,23 +207,6 @@ public abstract class KernelRunnerJNI{ */ @UsedByJNICode protected static final int ARG_OBJ_ARRAY_STRUCT = 1 << 18; - /** - * TODO: - * - * @see com.amd.aparapi.annotations.UsedByJNICode - * - * @author gfrost - */ - // @UsedByJNICode protected static final int ARG_APARAPI_BUF_HAS_ARRAY = 1 << 19; - - /** - * TODO: - * - * @see com.amd.aparapi.annotations.UsedByJNICode - * - * @author gfrost - */ - // @UsedByJNICode protected static final int ARG_APARAPI_BUF_IS_DIRECT = 1 << 20; /** * This 'bit' indicates that a particular KernelArg represents a char type (array or primitive). diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java index 29fe3fc8..9599bf04 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelArg.java @@ -218,4 +218,46 @@ protected Field getField() { protected void setField(Field field) { this.field = field; } + + /** + * @return the buffer + */ + protected Object getJavaBuffer() { + return javaBuffer; + } + + /** + * @param buffer the buffer to set + */ + protected void setJavaBuffer(Object buffer) { + this.javaBuffer = buffer; + } + + /** + * @return the number of dimensions to buffer + */ + protected int getNumDims() { + return numDims; + } + + /** + * @param numDims the number of dimensions for the buffer + */ + protected void setNumDims(int numDims) { + this.numDims = numDims; + } + + /** + * @return the dimensions for the buffer + */ + protected int[] getDims() { + return dims; + } + + /** + * @param dims the dimsensions for the buffer + */ + protected void setDims(int[] dims) { + this.dims = dims; + } } diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java index 75d37b2a..b036829c 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/kernel/KernelRunner.java @@ -1056,14 +1056,9 @@ && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() } else { args[i].setType(args[i].getType() | ARG_GLOBAL); } - - args[i].setArray(null); // will get updated in updateKernelArrayRefs - args[i].setType(args[i].getType() | ARG_ARRAY); - if (isExplicit()) { args[i].setType(args[i].getType() | ARG_EXPLICIT); } - // for now, treat all write arrays as read-write, see bugzilla issue 4859 // we might come up with a better solution later args[i].setType(args[i].getType() @@ -1071,26 +1066,51 @@ && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() args[i].setType(args[i].getType() | (entryPoint.getArrayFieldAccesses().contains(field.getName()) ? ARG_READ : 0)); // args[i].type |= ARG_GLOBAL; - args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0)); - args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); - - // arrays whose length is used will have an int arg holding - // the length as a kernel param - if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { - args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); - } + if (type.getName().startsWith("[L")) { - args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); + args[i].setType(args[i].getType() + | (ARG_OBJ_ARRAY_STRUCT | + ARG_WRITE | + ARG_READ | + ARG_APARAPI_BUFFER)); + if (logger.isLoggable(Level.FINE)) { logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); } + } else if (type.getName().startsWith("[[")) { + + try { + setMultiArrayType(args[i], type); + } catch(AparapiException e) { + return warnFallBackAndExecute(_entrypointName, _range, _passes, "failed to set kernel arguement " + args[i].getName() + ". Aparapi only supports 2D and 3D arrays."); + } + } else { + + args[i].setArray(null); // will get updated in updateKernelArrayRefs + args[i].setType(args[i].getType() | ARG_ARRAY); + + args[i].setType(args[i].getType() | (type.isAssignableFrom(float[].class) ? ARG_FLOAT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(int[].class) ? ARG_INT : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(boolean[].class) ? ARG_BOOLEAN : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(byte[].class) ? ARG_BYTE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(char[].class) ? ARG_CHAR : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(double[].class) ? ARG_DOUBLE : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(long[].class) ? ARG_LONG : 0)); + args[i].setType(args[i].getType() | (type.isAssignableFrom(short[].class) ? ARG_SHORT : 0)); + + // arrays whose length is used will have an int arg holding + // the length as a kernel param + if (entryPoint.getArrayFieldArrayLengthUsed().contains(args[i].getName())) { + args[i].setType(args[i].getType() | ARG_ARRAYLENGTH); + } + + if (type.getName().startsWith("[L")) { + args[i].setType(args[i].getType() | (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)); + if (logger.isLoggable(Level.FINE)) { + logger.fine("tagging " + args[i].getName() + " as (ARG_OBJ_ARRAY_STRUCT | ARG_WRITE | ARG_READ)"); + } + } } } else if (type.isAssignableFrom(float.class)) { args[i].setType(args[i].getType() | ARG_PRIMITIVE); @@ -1122,10 +1142,7 @@ && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() e.printStackTrace(); } - args[i].setPrimitiveSize(((args[i].getType() & ARG_FLOAT) != 0 ? 4 : (args[i].getType() & ARG_INT) != 0 ? 4 - : (args[i].getType() & ARG_BYTE) != 0 ? 1 : (args[i].getType() & ARG_CHAR) != 0 ? 2 - : (args[i].getType() & ARG_BOOLEAN) != 0 ? 1 : (args[i].getType() & ARG_SHORT) != 0 ? 2 : (args[i] - .getType() & ARG_LONG) != 0 ? 8 : (args[i].getType() & ARG_DOUBLE) != 0 ? 8 : 0)); + args[i].setPrimitiveSize(getPrimitiveSize(args[i].getType())); if (logger.isLoggable(Level.FINE)) { logger.fine("arg " + i + ", " + args[i].getName() + ", type=" + Integer.toHexString(args[i].getType()) @@ -1177,6 +1194,87 @@ && hasGlobalInt32ExtendedAtomicsSupport() && hasLocalInt32BaseAtomicsSupport() return kernel; } + + private int getPrimitiveSize(int type) { + if ((type & ARG_FLOAT) != 0) { + return 4; + } else if ((type & ARG_INT) != 0) { + return 4; + } else if ((type & ARG_BYTE) != 0) { + return 1; + } else if ((type & ARG_CHAR) != 0) { + return 2; + } else if ((type & ARG_BOOLEAN) != 0) { + return 1; + } else if ((type & ARG_SHORT) != 0) { + return 2; + } else if ((type & ARG_LONG) != 0) { + return 8; + } else if ((type & ARG_DOUBLE) != 0) { + return 8; + } + return 0; + } + + private void setMultiArrayType(KernelArg arg, Class type) throws AparapiException { + arg.setType(arg.getType() | (ARG_WRITE | ARG_READ | ARG_APARAPI_BUFFER)); + int numDims = 0; + while(type.getName().startsWith("[[[[")) { + throw new AparapiException("Aparapi only supports 2D and 3D arrays."); + } + arg.setType(arg.getType() | ARG_ARRAYLENGTH); + while(type.getName().charAt(numDims) == '[') { + numDims++; + } + Object buffer = new Object(); + try { + buffer = arg.getField().get(kernel); + } catch(IllegalAccessException e) { + e.printStackTrace(); + } + arg.setJavaBuffer(buffer); + arg.setNumDims(numDims); + Object subBuffer = buffer; + int[] dims = new int[numDims]; + for(int i = 0; i < numDims-1; i++) { + dims[i] = Array.getLength(subBuffer); + subBuffer = Array.get(subBuffer, 0); + } + dims[numDims-1] = Array.getLength(subBuffer); + arg.setDims(dims); + + if (subBuffer.getClass().isAssignableFrom(float[].class)) { + arg.setType(arg.getType() | ARG_FLOAT); + } + if (subBuffer.getClass().isAssignableFrom(int[].class)) { + arg.setType(arg.getType() | ARG_INT); + } + if (subBuffer.getClass().isAssignableFrom(boolean[].class)) { + arg.setType(arg.getType() | ARG_BOOLEAN); + } + if (subBuffer.getClass().isAssignableFrom(byte[].class)) { + arg.setType(arg.getType() | ARG_BYTE); + } + if (subBuffer.getClass().isAssignableFrom(char[].class)) { + arg.setType(arg.getType() | ARG_CHAR); + } + if (subBuffer.getClass().isAssignableFrom(double[].class)) { + arg.setType(arg.getType() | ARG_DOUBLE); + } + if (subBuffer.getClass().isAssignableFrom(long[].class)) { + arg.setType(arg.getType() | ARG_LONG); + } + if (subBuffer.getClass().isAssignableFrom(short[].class)) { + arg.setType(arg.getType() | ARG_SHORT); + } + int primitiveSize = getPrimitiveSize(arg.getType()); + int totalElements = 1; + for(int i = 0; i < numDims; i++) { + totalElements *= dims[i]; + } + arg.setSizeInBytes(totalElements * primitiveSize); + } + private final Set puts = new HashSet(); /** diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java index f17f57cd..425d5802 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/Entrypoint.java @@ -63,6 +63,7 @@ to national security controls as identified on the Commerce Control List (curren import com.amd.aparapi.internal.instruction.InstructionSet.AssignToArrayElement; import com.amd.aparapi.internal.instruction.InstructionSet.AssignToField; import com.amd.aparapi.internal.instruction.InstructionSet.I_ARRAYLENGTH; +import com.amd.aparapi.internal.instruction.InstructionSet.I_AALOAD; import com.amd.aparapi.internal.instruction.InstructionSet.I_GETFIELD; import com.amd.aparapi.internal.instruction.InstructionSet.I_INVOKESPECIAL; import com.amd.aparapi.internal.instruction.InstructionSet.I_INVOKESTATIC; @@ -591,11 +592,15 @@ public Entrypoint(ClassModel _classModel, MethodModel _methodModel, Object _k) t } } else if (instruction instanceof I_ARRAYLENGTH) { - if (!(instruction.getFirstChild() instanceof AccessField)) { + Instruction child = instruction.getFirstChild(); + while(child instanceof I_AALOAD) { + child = child.getFirstChild(); + } + if (!(child instanceof AccessField)) { throw new ClassParseException(ClassParseException.TYPE.LOCALARRAYLENGTHACCESS); } - final AccessField child = (AccessField) instruction.getFirstChild(); - final String arrayName = child.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + final AccessField childField = (AccessField) child; + final String arrayName = childField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); arrayFieldArrayLengthUsed.add(arrayName); if (logger.isLoggable(Level.FINE)) { logger.fine("Noted arraylength in " + methodModel.getName() + " on " + arrayName); diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/MethodModel.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/MethodModel.java index a798efa5..7bda9e2e 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/MethodModel.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/model/MethodModel.java @@ -1675,30 +1675,6 @@ private void init(ClassModelMethod _method) throws AparapiException { foldExpressions(); - // Attempt to detect accesses through multi-dimension arrays. - // This was issue 10 in open source release http://code.google.com/p/aparapi/issues/detail?id=10 - for (final Entry instructionEntry : pcMap.entrySet()) { - final Instruction instruction = instructionEntry.getValue(); - if (instruction instanceof AccessArrayElement) { - final AccessArrayElement accessArrayElement = (AccessArrayElement) instruction; - final Instruction accessed = accessArrayElement.getArrayRef(); - // System.out.println("accessed "+accessed); - if (accessed instanceof AccessArrayElement) { - throw new ClassParseException(ClassParseException.TYPE.MULTIDIMENSIONARRAYACCESS); - } - - } - if (instruction instanceof AssignToArrayElement) { - final AssignToArrayElement assignToArrayElement = (AssignToArrayElement) instruction; - final Instruction assigned = assignToArrayElement.getArrayRef(); - - // System.out.println("assigned "+assigned); - if (assigned instanceof AccessArrayElement) { - throw new ClassParseException(ClassParseException.TYPE.MULTIDIMENSIONARRAYASSIGN); - } - - } - } // Accessor conversion only works on member object arrays if ((entrypoint != null) && (_method.getClassModel() != entrypoint.getClassModel())) { if (logger.isLoggable(Level.FINE)) { diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java index 5d91b24b..7cf912fa 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/BlockWriter.java @@ -73,6 +73,7 @@ to national security controls as identified on the Commerce Control List (curren import com.amd.aparapi.internal.instruction.InstructionSet.FieldArrayElementAssign; import com.amd.aparapi.internal.instruction.InstructionSet.FieldArrayElementIncrement; import com.amd.aparapi.internal.instruction.InstructionSet.I_ALOAD_0; +import com.amd.aparapi.internal.instruction.InstructionSet.I_AALOAD; import com.amd.aparapi.internal.instruction.InstructionSet.I_ARRAYLENGTH; import com.amd.aparapi.internal.instruction.InstructionSet.I_IFNONNULL; import com.amd.aparapi.internal.instruction.InstructionSet.I_IFNULL; @@ -103,6 +104,7 @@ to national security controls as identified on the Commerce Control List (curren public abstract class BlockWriter{ public final static String arrayLengthMangleSuffix = "__javaArrayLength"; + public final static String arrayDimMangleSuffix = "__javaArrayDimension"; public abstract void write(String _string); @@ -434,11 +436,45 @@ public void writeInstruction(Instruction _instruction) throws CodeGenException { write(" = "); writeInstruction(arrayAssignmentInstruction.getValue()); } else if (_instruction instanceof AccessArrayElement) { + + //we're getting an element from an array + //if the array is a primitive then we just return the value + //so the generated code looks like + //arrayName[arrayIndex]; + //but if the array is an object, or multidimensional array, then we want to return + //a pointer to our index our position in the array. The code will look like + //&(arrayName[arrayIndex * this->arrayNameLen_dimension] + // final AccessArrayElement arrayLoadInstruction = (AccessArrayElement) _instruction; + + //object array, get address + if(arrayLoadInstruction instanceof I_AALOAD) { + write("(&"); + } writeInstruction(arrayLoadInstruction.getArrayRef()); write("["); writeInstruction(arrayLoadInstruction.getArrayIndex()); + + //object array, find the size of each object in the array + //for 2D arrays, this size is the size of a row. + if(arrayLoadInstruction instanceof I_AALOAD) { + int dim = 0; + Instruction load = arrayLoadInstruction.getArrayRef(); + while(load instanceof I_AALOAD) { + load = load.getFirstChild(); + dim++; + } + + String arrayName = ((AccessInstanceField)load).getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); + write(" * this->" + arrayName + arrayDimMangleSuffix+dim); + } + write("]"); + + //object array, close parentheses + if(arrayLoadInstruction instanceof I_AALOAD) { + write(")"); + } } else if (_instruction instanceof AccessField) { final AccessField accessField = (AccessField) _instruction; if (accessField instanceof AccessInstanceField) { @@ -456,9 +492,20 @@ public void writeInstruction(Instruction _instruction) throws CodeGenException { write(accessField.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8()); } else if (_instruction instanceof I_ARRAYLENGTH) { - final AccessInstanceField child = (AccessInstanceField) _instruction.getFirstChild(); + + //getting the length of an array. + //if this is a primitive array, then this is trivial + //if we're getting an object array, then we need to find what dimension + //we're looking at + int dim = 0; + Instruction load = _instruction.getFirstChild(); + while(load instanceof I_AALOAD) { + load = load.getFirstChild(); + dim++; + } + final AccessInstanceField child = (AccessInstanceField) load; final String arrayName = child.getConstantPoolFieldEntry().getNameAndTypeEntry().getNameUTF8Entry().getUTF8(); - write("this->" + arrayName + arrayLengthMangleSuffix); + write("this->" + arrayName + arrayLengthMangleSuffix + dim); } else if (_instruction instanceof AssignToField) { final AssignToField assignedField = (AssignToField) _instruction; diff --git a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java index 760fd2fc..05e0881d 100644 --- a/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java +++ b/com.amd.aparapi/src/java/com/amd/aparapi/internal/writer/KernelWriter.java @@ -304,6 +304,8 @@ public void writePragma(String _name, boolean _enable) { boolean isPointer = false; + int numDimensions = 0; + // check the suffix String type = field.getName().endsWith(Kernel.LOCAL_SUFFIX) ? __local : (field.getName().endsWith(Kernel.CONSTANT_SUFFIX) ? __constant : __global); @@ -320,10 +322,15 @@ public void writePragma(String _name, boolean _enable) { } } - if (signature.startsWith("[")) { - argLine.append(type + " "); - thisStructLine.append(type + " "); + //if we have a an array we want to mark the object as a pointer + //if we have a multiple diminational array we want to remember the number of dimensions + while (signature.startsWith("[")) { + if(isPointer == false) { + argLine.append(type + " "); + thisStructLine.append(type + " "); + } isPointer = true; + numDimensions++; signature = signature.substring(1); } @@ -362,23 +369,49 @@ public void writePragma(String _name, boolean _enable) { // Add int field into "this" struct for supporting java arraylength op // named like foo__javaArrayLength - if (isPointer && _entryPoint.getArrayFieldArrayLengthUsed().contains(field.getName())) { - final StringBuilder lenStructLine = new StringBuilder(); - final StringBuilder lenArgLine = new StringBuilder(); - final StringBuilder lenAssignLine = new StringBuilder(); + if (isPointer && _entryPoint.getArrayFieldArrayLengthUsed().contains(field.getName()) || + isPointer && numDimensions > 1) { + + for(int i = 0; i < numDimensions; i++) { + final StringBuilder lenStructLine = new StringBuilder(); + final StringBuilder lenArgLine = new StringBuilder(); + final StringBuilder lenAssignLine = new StringBuilder(); + final StringBuilder dimStructLine = new StringBuilder(); + final StringBuilder dimArgLine = new StringBuilder(); + final StringBuilder dimAssignLine = new StringBuilder(); + + String lenName = field.getName() + BlockWriter.arrayLengthMangleSuffix + + Integer.toString(i); + + lenStructLine.append("int " + lenName); + + lenAssignLine.append("this->"); + lenAssignLine.append(lenName); + lenAssignLine.append(" = "); + lenAssignLine.append(lenName); - lenStructLine.append("int " + field.getName() + BlockWriter.arrayLengthMangleSuffix); + lenArgLine.append("int " + lenName); - lenAssignLine.append("this->"); - lenAssignLine.append(field.getName() + BlockWriter.arrayLengthMangleSuffix); - lenAssignLine.append(" = "); - lenAssignLine.append(field.getName() + BlockWriter.arrayLengthMangleSuffix); + assigns.add(lenAssignLine.toString()); + argLines.add(lenArgLine.toString()); + thisStruct.add(lenStructLine.toString()); - lenArgLine.append("int " + field.getName() + BlockWriter.arrayLengthMangleSuffix); + String dimName = field.getName() + BlockWriter.arrayDimMangleSuffix + + Integer.toString(i); - assigns.add(lenAssignLine.toString()); - argLines.add(lenArgLine.toString()); - thisStruct.add(lenStructLine.toString()); + dimStructLine.append("int " + dimName); + + dimAssignLine.append("this->"); + dimAssignLine.append(dimName); + dimAssignLine.append(" = "); + dimAssignLine.append(dimName); + + dimArgLine.append("int " + dimName); + + assigns.add(dimAssignLine.toString()); + argLines.add(dimArgLine.toString()); + thisStruct.add(dimStructLine.toString()); + } } } @@ -653,9 +686,9 @@ public static String writeToString(Entrypoint _entrypoint) throws CodeGenExcepti openCLWriter.write(_entrypoint); } catch (final CodeGenException codeGenException) { throw codeGenException; - } catch (final Throwable t) { + }/* catch (final Throwable t) { throw new CodeGenException(t); - } + }*/ return (openCLStringBuilder.toString()); } diff --git a/samples/MDArray/.classpath b/samples/MDArray/.classpath new file mode 100644 index 00000000..43bd144c --- /dev/null +++ b/samples/MDArray/.classpath @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/samples/MDArray/.project b/samples/MDArray/.project new file mode 100644 index 00000000..2273fb14 --- /dev/null +++ b/samples/MDArray/.project @@ -0,0 +1,17 @@ + + + mdarray + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/samples/MDArray/build.xml b/samples/MDArray/build.xml new file mode 100644 index 00000000..fca33590 --- /dev/null +++ b/samples/MDArray/build.xml @@ -0,0 +1,118 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java new file mode 100644 index 00000000..4ffe6240 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class BMatMul1D extends Kernel{ + byte[] A; + + byte[] B; + + byte[] C; + + int N; + + public BMatMul1D(byte[] A, byte[] B, byte[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += (byte) (A[i * N + k] * B[k * N + j]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java new file mode 100644 index 00000000..1839d82b --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class BMatMul2D extends Kernel{ + byte[][] A; + + byte[][] B; + + byte[][] C; + + int N; + + public BMatMul2D(byte[][] A, byte[][] B, byte[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += (byte) (A[i][k] * B[k][j]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java new file mode 100644 index 00000000..eb27bf76 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/BMatMul3D.java @@ -0,0 +1,32 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class BMatMul3D extends Kernel{ + byte[][][] A; + + byte[][][] B; + + byte[][][] C; + + int N; + + public BMatMul3D(byte[][][] A, byte[][][] B, byte[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + int a0 = A.length; + int a1 = A[0].length; + int a2 = A[0][0].length; + for (int l = 0; l < N; l++) { + C[i][j][k] += (byte) (A[i][j][l] * B[l][j][k]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java new file mode 100644 index 00000000..f852a900 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class DMatMul1D extends Kernel{ + double[] A; + + double[] B; + + double[] C; + + int N; + + public DMatMul1D(double[] A, double[] B, double[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += A[i * N + k] * B[k * N + j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java new file mode 100644 index 00000000..e46e8f6e --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class DMatMul2D extends Kernel{ + double[][] A; + + double[][] B; + + double[][] C; + + int N; + + public DMatMul2D(double[][] A, double[][] B, double[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java new file mode 100644 index 00000000..8641f2ff --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/DMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class DMatMul3D extends Kernel{ + double[][][] A; + + double[][][] B; + + double[][][] C; + + int N; + + public DMatMul3D(double[][][] A, double[][][] B, double[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java new file mode 100644 index 00000000..15a2fa21 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class FMatMul1D extends Kernel{ + float[] A; + + float[] B; + + float[] C; + + int N; + + public FMatMul1D(float[] A, float[] B, float[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += A[i * N + k] * B[k * N + j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java new file mode 100644 index 00000000..21e66103 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class FMatMul2D extends Kernel{ + float[][] A; + + float[][] B; + + float[][] C; + + int N; + + public FMatMul2D(float[][] A, float[][] B, float[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java new file mode 100644 index 00000000..b5af1848 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/FMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class FMatMul3D extends Kernel{ + float[][][] A; + + float[][][] B; + + float[][][] C; + + int N; + + public FMatMul3D(float[][][] A, float[][][] B, float[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java new file mode 100644 index 00000000..4760c086 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class IMatMul1D extends Kernel{ + int[] A; + + int[] B; + + int[] C; + + int N; + + public IMatMul1D(int[] A, int[] B, int[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += A[i * N + k] * B[k * N + j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java new file mode 100644 index 00000000..4493633b --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class IMatMul2D extends Kernel{ + int[][] A; + + int[][] B; + + int[][] C; + + int N; + + public IMatMul2D(int[][] A, int[][] B, int[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java new file mode 100644 index 00000000..a4658b65 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/IMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class IMatMul3D extends Kernel{ + int[][][] A; + + int[][][] B; + + int[][][] C; + + int N; + + public IMatMul3D(int[][][] A, int[][][] B, int[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java new file mode 100644 index 00000000..9d81d1e8 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class LMatMul1D extends Kernel{ + long[] A; + + long[] B; + + long[] C; + + int N; + + public LMatMul1D(long[] A, long[] B, long[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += A[i * N + k] * B[k * N + j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java new file mode 100644 index 00000000..d8f8b8e5 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class LMatMul2D extends Kernel{ + long[][] A; + + long[][] B; + + long[][] C; + + int N; + + public LMatMul2D(long[][] A, long[][] B, long[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java new file mode 100644 index 00000000..46ec3c8f --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/LMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class LMatMul3D extends Kernel{ + long[][][] A; + + long[][][] B; + + long[][][] C; + + int N; + + public LMatMul3D(long[][][] A, long[][][] B, long[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java new file mode 100644 index 00000000..8e80c59e --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/MDArray.java @@ -0,0 +1,1281 @@ +package gov.pnnl.aparapi.sample.mdarray; + +import com.amd.aparapi.Kernel; + +class MDArray { + + static int N = 1 << 10; + + static int M = 1 << 5; + + public static void main(String[] args) { + System.out.println("boolean 1D"); + Zrun1D(); + System.out.println("byte 1D"); + Brun1D(); + System.out.println("short 1D"); + Srun1D(); + System.out.println("int 1D"); + Irun1D(); + System.out.println("long 1D"); + Lrun1D(); + System.out.println("float 1D"); + Frun1D(); + System.out.println("double 1D"); + Drun1D(); + System.out.println("boolean 2D"); + Zrun2D(); + System.out.println("byte 2D"); + Brun2D(); + System.out.println("short 2D"); + Srun2D(); + System.out.println("int 2D"); + Irun2D(); + System.out.println("long 2D"); + Lrun2D(); + System.out.println("float 2D"); + Frun2D(); + System.out.println("double 2D"); + Drun2D(); + System.out.println("boolean 3D"); + Zrun3D(); + System.out.println("byte 3D"); + Brun3D(); + System.out.println("short 3D"); + Srun3D(); + System.out.println("int 3D"); + Irun3D(); + System.out.println("long 3D"); + Lrun3D(); + System.out.println("float 3D"); + Frun3D(); + System.out.println("double 3D"); + Drun3D(); + } + + private static boolean[] matMull(boolean[] A, boolean[] B, int N) { + final boolean[] C = new boolean[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] ^= A[(i * N) + k] & B[(k * N) + j]; + } + } + } + return C; + } + + private static byte[] matMull(byte[] A, byte[] B, int N) { + final byte[] C = new byte[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += (byte) (A[(i * N) + k] * B[(k * N) + j]); + } + } + } + return C; + } + + private static short[] matMull(short[] A, short[] B, int N) { + final short[] C = new short[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += (short) (A[(i * N) + k] * B[(k * N) + j]); + } + } + } + return C; + } + + private static int[] matMull(int[] A, int[] B, int N) { + final int[] C = new int[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; + } + } + } + return C; + } + + private static long[] matMull(long[] A, long[] B, int N) { + final long[] C = new long[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; + } + } + } + return C; + } + + private static float[] matMull(float[] A, float[] B, int N) { + final float[] C = new float[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; + } + } + } + return C; + } + + private static double[] matMull(double[] A, double[] B, int N) { + final double[] C = new double[N * N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; + } + } + } + return C; + } + + private static boolean[][] matMull(boolean[][] A, boolean[][] B, int N) { + final boolean[][] C = new boolean[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] ^= A[i][k] & B[k][j]; + } + } + } + return C; + } + + private static byte[][] matMull(byte[][] A, byte[][] B, int N) { + final byte[][] C = new byte[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += (byte) (A[i][k] * B[k][j]); + } + } + } + return C; + } + + private static short[][] matMull(short[][] A, short[][] B, int N) { + final short[][] C = new short[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += (short) (A[i][k] * B[k][j]); + } + } + } + return C; + } + + private static int[][] matMull(int[][] A, int[][] B, int N) { + final int[][] C = new int[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } + return C; + } + + private static long[][] matMull(long[][] A, long[][] B, int N) { + final long[][] C = new long[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } + return C; + } + + private static float[][] matMull(float[][] A, float[][] B, int N) { + final float[][] C = new float[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } + return C; + } + + private static double[][] matMull(double[][] A, double[][] B, int N) { + final double[][] C = new double[N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } + return C; + } + + private static boolean[][][] matMull(boolean[][][] A, boolean[][][] B, int N) { + final boolean[][][] C = new boolean[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] ^= A[i][j][l] & B[l][j][k]; + } + } + } + } + return C; + } + + private static byte[][][] matMull(byte[][][] A, byte[][][] B, int N) { + final byte[][][] C = new byte[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += (byte) (A[i][j][l] * B[l][j][k]); + } + } + } + } + return C; + } + + private static short[][][] matMull(short[][][] A, short[][][] B, int N) { + final short[][][] C = new short[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += (short) (A[i][j][l] * B[l][j][k]); + } + } + } + } + return C; + } + + private static int[][][] matMull(int[][][] A, int[][][] B, int N) { + final int[][][] C = new int[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } + } + } + return C; + } + + private static long[][][] matMull(long[][][] A, long[][][] B, int N) { + final long[][][] C = new long[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } + } + } + return C; + } + + private static float[][][] matMull(float[][][] A, float[][][] B, int N) { + final float[][][] C = new float[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } + } + } + return C; + } + + private static double[][][] matMull(double[][][] A, double[][][] B, int N) { + final double[][][] C = new double[N][N][N]; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + for (int k = 0; k < N; k++) { + for (int l = 0; l < N; l++) { + C[i][j][k] += A[i][j][l] * B[l][j][k]; + } + } + } + } + return C; + } + + private static boolean checkResults(boolean[] cpu, boolean[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(byte[] cpu, byte[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(short[] cpu, short[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(int[] cpu, int[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(long[] cpu, long[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(float[] cpu, float[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(double[] cpu, double[] gpu) { + for (int i = 0; i < cpu.length; i++) { + if (cpu[i] != gpu[i]) { + return false; + } + } + return true; + } + + private static boolean checkResults(boolean[][] cpu, boolean[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(byte[][] cpu, byte[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(short[][] cpu, short[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(int[][] cpu, int[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(long[][] cpu, long[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(float[][] cpu, float[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(double[][] cpu, double[][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + if (cpu[i][j] != gpu[i][j]) { + return false; + } + } + } + return true; + } + + private static boolean checkResults(boolean[][][] cpu, boolean[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(byte[][][] cpu, byte[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(short[][][] cpu, short[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(int[][][] cpu, int[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(long[][][] cpu, long[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(float[][][] cpu, float[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + private static boolean checkResults(double[][][] cpu, double[][][] gpu) { + for (int i = 0; i < cpu.length; i++) { + for (int j = 0; j < cpu[i].length; j++) { + for (int k = 0; k < cpu[i][j].length; k++) { + if (cpu[i][j][k] != gpu[i][j][k]) { + return false; + } + } + } + } + return true; + } + + public static void Zrun1D() { + final boolean[] A = new boolean[N * N]; + final boolean[] B = new boolean[N * N]; + final boolean[] gpu = new boolean[N * N]; + boolean[] cpu = new boolean[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = ((i % 2) == 0) ^ ((j % 2) == 0); + B[(i * N) + j] = ((i % 2) == 0) & ((j % 2) == 0); + cpu[(i * N) + j] = false; + gpu[(i * N) + j] = false; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new ZMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Brun1D() { + final byte[] A = new byte[N * N]; + final byte[] B = new byte[N * N]; + final byte[] gpu = new byte[N * N]; + byte[] cpu = new byte[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = (byte) (i + j); + B[(i * N) + j] = (byte) (i - j); + cpu[(i * N) + j] = (byte) 0; + gpu[(i * N) + j] = (byte) 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new BMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Srun1D() { + final short[] A = new short[N * N]; + final short[] B = new short[N * N]; + final short[] gpu = new short[N * N]; + short[] cpu = new short[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = (short) (i + j); + B[(i * N) + j] = (short) (i - j); + cpu[(i * N) + j] = (short) 0; + gpu[(i * N) + j] = (short) 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new SMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Irun1D() { + final int[] A = new int[N * N]; + final int[] B = new int[N * N]; + final int[] gpu = new int[N * N]; + int[] cpu = new int[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = i + j; + B[(i * N) + j] = i - j; + cpu[(i * N) + j] = 0; + gpu[(i * N) + j] = 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new IMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Lrun1D() { + final long[] A = new long[N * N]; + final long[] B = new long[N * N]; + final long[] gpu = new long[N * N]; + long[] cpu = new long[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = i + j; + B[(i * N) + j] = i - j; + cpu[(i * N) + j] = 0l; + gpu[(i * N) + j] = 0l; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new LMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Frun1D() { + final float[] A = new float[N * N]; + final float[] B = new float[N * N]; + final float[] gpu = new float[N * N]; + float[] cpu = new float[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = i + j; + B[(i * N) + j] = i - j; + cpu[(i * N) + j] = 0.0f; + gpu[(i * N) + j] = 0.0f; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new FMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Drun1D() { + final double[] A = new double[N * N]; + final double[] B = new double[N * N]; + final double[] gpu = new double[N * N]; + double[] cpu = new double[N * N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[(i * N) + j] = i + j; + B[(i * N) + j] = i - j; + cpu[(i * N) + j] = 0.0; + gpu[(i * N) + j] = 0.0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new DMatMul1D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Zrun2D() { + final boolean[][] A = new boolean[N][N]; + final boolean[][] B = new boolean[N][N]; + final boolean[][] gpu = new boolean[N][N]; + boolean[][] cpu = new boolean[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = ((i % 2) == 0) ^ ((j % 2) == 0); + B[i][j] = ((i % 2) == 0) & ((j % 2) == 0); + cpu[i][j] = false; + gpu[i][j] = false; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new ZMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Brun2D() { + final byte[][] A = new byte[N][N]; + final byte[][] B = new byte[N][N]; + final byte[][] gpu = new byte[N][N]; + byte[][] cpu = new byte[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = (byte) (i + j); + B[i][j] = (byte) (i - j); + cpu[i][j] = (byte) 0; + gpu[i][j] = (byte) 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new BMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Srun2D() { + final short[][] A = new short[N][N]; + final short[][] B = new short[N][N]; + final short[][] gpu = new short[N][N]; + short[][] cpu = new short[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = (short) (i + j); + B[i][j] = (short) (i - j); + cpu[i][j] = (short) 0; + gpu[i][j] = (short) 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new SMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Irun2D() { + final int[][] A = new int[N][N]; + final int[][] B = new int[N][N]; + final int[][] gpu = new int[N][N]; + int[][] cpu = new int[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = i + j; + B[i][j] = i - j; + cpu[i][j] = 0; + gpu[i][j] = 0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new IMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Lrun2D() { + final long[][] A = new long[N][N]; + final long[][] B = new long[N][N]; + final long[][] gpu = new long[N][N]; + long[][] cpu = new long[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = i + j; + B[i][j] = i - j; + cpu[i][j] = 0l; + gpu[i][j] = 0l; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new LMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Frun2D() { + final float[][] A = new float[N][N]; + final float[][] B = new float[N][N]; + final float[][] gpu = new float[N][N]; + float[][] cpu = new float[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = i + j; + B[i][j] = i - j; + cpu[i][j] = 0.0f; + gpu[i][j] = 0.0f; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new FMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Drun2D() { + final double[][] A = new double[N][N]; + final double[][] B = new double[N][N]; + final double[][] gpu = new double[N][N]; + double[][] cpu = new double[N][N]; + + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + A[i][j] = i + j; + B[i][j] = i - j; + cpu[i][j] = 0.0; + gpu[i][j] = 0.0; + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new DMatMul2D(A, B, gpu, N); + kernel.execute(N * N); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, N); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Zrun3D() { + final boolean[][][] A = new boolean[M][M][M]; + final boolean[][][] B = new boolean[M][M][M]; + final boolean[][][] gpu = new boolean[M][M][M]; + boolean[][][] cpu = new boolean[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = ((i % 2) == 0) ^ (((j % 2) == 0) & ((k % 2) == 0)); + B[i][j][k] = (((i % 2) == 0) & ((j % 2) == 0)) ^ ((k % 2) == 0); + ; + cpu[i][j][k] = false; + gpu[i][j][k] = false; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new ZMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Brun3D() { + final byte[][][] A = new byte[M][M][M]; + final byte[][][] B = new byte[M][M][M]; + final byte[][][] gpu = new byte[M][M][M]; + byte[][][] cpu = new byte[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = (byte) (i + j + k); + B[i][j][k] = (byte) ((i - j) + k); + cpu[i][j][k] = (byte) 0; + gpu[i][j][k] = (byte) 0; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new BMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Srun3D() { + final short[][][] A = new short[M][M][M]; + final short[][][] B = new short[M][M][M]; + final short[][][] gpu = new short[M][M][M]; + short[][][] cpu = new short[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = (short) (i + j + k); + B[i][j][k] = (short) ((i - j) + k); + cpu[i][j][k] = (short) 0; + gpu[i][j][k] = (short) 0; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new SMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Irun3D() { + final int[][][] A = new int[M][M][M]; + final int[][][] B = new int[M][M][M]; + final int[][][] gpu = new int[M][M][M]; + int[][][] cpu = new int[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = i + j + k; + B[i][j][k] = (i - j) + k; + cpu[i][j][k] = 0; + gpu[i][j][k] = 0; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new IMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Lrun3D() { + final long[][][] A = new long[M][M][M]; + final long[][][] B = new long[M][M][M]; + final long[][][] gpu = new long[M][M][M]; + long[][][] cpu = new long[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = i + j + k; + B[i][j][k] = (i - j) + k; + cpu[i][j][k] = 0l; + gpu[i][j][k] = 0l; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new LMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Frun3D() { + final float[][][] A = new float[M][M][M]; + final float[][][] B = new float[M][M][M]; + final float[][][] gpu = new float[M][M][M]; + float[][][] cpu = new float[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = i + j + k; + B[i][j][k] = (i - j) + k; + cpu[i][j][k] = 0.0f; + gpu[i][j][k] = 0.0f; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new FMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } + + public static void Drun3D() { + final double[][][] A = new double[M][M][M]; + final double[][][] B = new double[M][M][M]; + final double[][][] gpu = new double[M][M][M]; + double[][][] cpu = new double[M][M][M]; + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) { + for (int k = 0; k < M; k++) { + A[i][j][k] = i + j + k; + B[i][j][k] = (i - j) + k; + cpu[i][j][k] = 0.0; + gpu[i][j][k] = 0.0; + } + } + } + + long gs = System.currentTimeMillis(); + final Kernel kernel = new DMatMul3D(A, B, gpu, M); + kernel.execute(M * M * M); + gs = System.currentTimeMillis() - gs; + + long cs = System.currentTimeMillis(); + cpu = matMull(A, B, M); + cs = System.currentTimeMillis() - cs; + + System.out.println("gpu time: " + gs + "\ncpu time: " + cs); + System.out.print("valid? "); + + if (checkResults(cpu, gpu)) { + System.out.println("yes"); + } else { + System.out.println("no"); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java new file mode 100644 index 00000000..858be09e --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class SMatMul1D extends Kernel{ + short[] A; + + short[] B; + + short[] C; + + int N; + + public SMatMul1D(short[] A, short[] B, short[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] += (short) (A[i * N + k] * B[k * N + j]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java new file mode 100644 index 00000000..2ed216e5 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class SMatMul2D extends Kernel{ + short[][] A; + + short[][] B; + + short[][] C; + + int N; + + public SMatMul2D(short[][] A, short[][] B, short[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] += (short) (A[i][k] * B[k][j]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java new file mode 100644 index 00000000..95de3a37 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/SMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class SMatMul3D extends Kernel{ + short[][][] A; + + short[][][] B; + + short[][][] C; + + int N; + + public SMatMul3D(short[][][] A, short[][][] B, short[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] += (short) (A[i][j][l] * B[l][j][k]); + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java new file mode 100644 index 00000000..f28db2f6 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul1D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class ZMatMul1D extends Kernel{ + boolean[] A; + + boolean[] B; + + boolean[] C; + + int N; + + public ZMatMul1D(boolean[] A, boolean[] B, boolean[] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i * N + j] ^= A[i * N + k] & B[k * N + j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java new file mode 100644 index 00000000..75304d54 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul2D.java @@ -0,0 +1,28 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class ZMatMul2D extends Kernel{ + boolean[][] A; + + boolean[][] B; + + boolean[][] C; + + int N; + + public ZMatMul2D(boolean[][] A, boolean[][] B, boolean[][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / N; + int j = id % N; + for (int k = 0; k < N; k++) { + C[i][j] ^= A[i][k] & B[k][j]; + } + } +} diff --git a/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java new file mode 100644 index 00000000..6d63dfb6 --- /dev/null +++ b/samples/MDArray/src/gov/pnnl/aparapi/sample/mdarray/ZMatMul3D.java @@ -0,0 +1,29 @@ +package gov.pnnl.aparapi.sample.mdarray; +import com.amd.aparapi.Kernel; + +class ZMatMul3D extends Kernel{ + boolean[][][] A; + + boolean[][][] B; + + boolean[][][] C; + + int N; + + public ZMatMul3D(boolean[][][] A, boolean[][][] B, boolean[][][] C, int N) { + this.A = A; + this.B = B; + this.C = C; + this.N = N; + } + + @Override public void run() { + int id = getGlobalId(); + int i = id / (N * N); + int j = (id / N) % N; + int k = id % N; + for (int l = 0; l < N; l++) { + C[i][j][k] ^= A[i][j][l] & B[l][j][k]; + } + } +}