__device__ void levenshteinDistance(char *str,int strStart,int strLength,char *patternRemoved,int patternRemovedStart,int patternRemovedLength,int *dXIndividual,int *dXFinal)
{
int indexA = blockIdx.x * blockDim.x + threadIdx.x;
int offsetStr = strStart;
int offsetPattern = patternRemovedStart;
if (indexA < patternRemovedLength) {
for (int i = offsetStr; i <= strLength; i++) {
if (i == 0)
dXIndividual[indexA * (strLength+1) + i] = 0;
else{
if (str[i-1] == patternRemoved[indexA+offsetPattern])
dXIndividual[indexA * (strLength+1) + i] = i;
else if (str[i-1] != patternRemoved[indexA+offsetPattern])
dXIndividual[indexA * (strLength+1) + i] = dXIndividual[indexA * (strLength+1) + i - 1];
}
}
__syncthreads();
}
dXFinal[0] = dXIndividual[(strLength+1) * (patternRemovedLength)];
}
extern "C"
__global__ void ComputationdXOnGPU(int numStr, char *str, int *strStartIndices, int *strIndividualLengths,int numPatternRemoved, char *patternRemoved, int *patternRemovedStartIndices,int *patternRemovedIndividualLengths, int *dXFinal)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
if (ix<numStr)
{
for (int i=0; i<numPatternRemoved; i++)
{
int strStart = strStartIndices[ix];
int strLength = strIndividualLengths[ix];
int patternStart = patternRemovedStartIndices[i];
int patternRemovedLength = patternRemovedIndividualLengths[i];
int size = (strLength+1) * patternRemovedLength;
int dXIndividual [size];
int *result = &dXFinal[ix * numStrings1 + i];
levenshteinDistance(str,strStart, strLength, patternRemoved, patternRemovedStart, patternRemovedLength, dXIndividual, dXFinal);
}
}
}
What I have tried:
In the device function, it runs every time from core0 to patternRemovedLength (if (indexA < patternRemovedLength) )
These will limit all token pairs to be run on a small range of cores. I need to say for(indexA = 3; indexA<patternRemovedLength;indexA++) for example in terms of cuda in order to specify for each token a certain range of cores to fully utilize the GPU.