Visual Studio – How do I obtain timing in a CUDA C++ kernel?


I finally created a GPU-powered low-MD5-hash finder, I am so happy! Here is the code, slightly adapted from another implementation:

https://github.com/EnesO226/md5zerofinder/blob/main/kernel.cu

However, when I try to use cudaEvents in the kernel, Visual Studio says that cudaEvents are not allowed in a kernel. I want to print the hashrate here, every time a new hash is found:

// Device-side MD5 of `data` (`size` bytes); the four 32-bit digest words
// are written to result[4] in the library's native (little-endian word)
// order. Relies on `transform`, `padding`, `block_size` (64), `byteswap`,
// `uchar`, and `uint`, all declared elsewhere in this file — not visible
// in this excerpt.
__device__ void md5(const uchar* data, const uint size, uint result[4])
{
    // Standard MD5 initial state words A/B/C/D (RFC 1321, section 3.3).
    uint state[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 }, i;

    // Process every complete block; `i` ends at the start of the tail.
    for (i = 0; i + block_size <= size; i += block_size)
    {
        transform(state, data + i);
    }

    // Message length in BITS, kept in 32 bits only. The high 32 bits of
    // the MD5 length field stay whatever `padding` put there (presumably
    // zero), so this only handles messages shorter than 2^29 bytes —
    // fine for the 12-byte candidates hashed by this program.
    uint size_in_bits = size << 3;
    uchar buffer[block_size];

    // Assemble the final padded block: tail bytes, then bytes from
    // `padding` (presumably 0x80 followed by zeros — declared elsewhere),
    // then the 32-bit bit-length at offset block_size - 8.
    // NOTE(review): assumes size - i <= block_size - 9 so one padded
    // block suffices and the length field is not clobbered; true for
    // size == 12, but NOT for arbitrary tail lengths — confirm before
    // reusing with other sizes.
    memcpy(buffer, data + i, size - i);
    memcpy(buffer + size - i, padding, block_size - (size - i));
    memcpy(buffer + block_size - (2 * sizeof(uint)), &size_in_bits, sizeof(uint));

    transform(state, buffer);

    // Copy the final state out as the digest.
    memcpy(result, state, 4 * sizeof(uint));
    // Report candidates whose digest starts with many zero bits:
    // result[0] == 0 gives 32 leading zero bits; byteswap(result[1])
    // < 0x0fffffff requires at least 4 more.
    if (result[0] == 0 && byteswap(result[1]) < 0x0fffffff) {
        printf("Hash found -------> %08x%08x%08x%08x\n", byteswap(result[0]), byteswap(result[1]), byteswap(result[2]), byteswap(result[3]));
        printf("For data   -------> ");
        for (int j = 0; j < size; j++) {
            printf("%02x", data[j]);
        }
        printf("\n\n");

    }
}

And the kernel which is actually run is this:

// Brute-force search kernel. Each thread fixes message bytes 0-3 to its
// global thread id (least-significant byte first) and then sweeps a
// 64-bit counter through bytes 4-11, hashing every 12-byte candidate.
// Launched as a 1-D grid; never terminates in practice (2^64 iterations).
__global__ void test() {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    uchar msg[12];
    uint digest[4];

    // Bytes 0-3: the thread id, packed little-endian.
    for (int b = 0; b < 4; ++b) {
        msg[b] = (uchar)((gid >> (8 * b)) & 0xff);
    }

    for (unsigned long long counter = 0; counter < 0xffffffffffffffff; counter++) {
        // Bytes 4-11: the running counter, packed little-endian.
        for (int b = 0; b < 8; ++b) {
            msg[4 + b] = (uchar)((counter >> (8 * b)) & 0xff);
        }
        md5(msg, 12, digest);
    }
}

// Host entry point: launches the brute-force search over 1024 blocks of
// 1024 threads (2^20 threads total), then waits for it.
int main()
{
    test << <1024, 1024 >> > ();

    // A kernel launch is asynchronous and returns no error itself.
    // Check for launch-configuration errors immediately...
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // ...then block until the kernel finishes. This also surfaces
    // in-kernel runtime errors and flushes device-side printf output,
    // which otherwise may never reach the console. NOTE(review): the
    // kernel loops for 2^64 iterations, so this call effectively waits
    // forever while found hashes are printed — which matches the
    // intended "run indefinitely" usage.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    system("pause");
    return 0;
}

How would I go about getting the hashrate here? I asked Google (and ChatGPT — yes, it is useful sometimes), but I only know how to measure the elapsed time after the kernel has already finished. I could lower the iteration count, but then the search would no longer run indefinitely, you understand? So that is my question: is there a way to obtain timing inside a CUDA __device__ or __global__ function?

IMPORTANT: my visual studio does not support atomicAdd and clock()

Leave a Reply

Your email address will not be published. Required fields are marked *