I finally created a GPU-powered finder for low MD5 hashes, and I am so happy! Here is the code, slightly adapted from another implementation:
https://github.com/EnesO226/md5zerofinder/blob/main/kernel.cu
However, when I try to use cudaEvents inside the kernel, Visual Studio reports that cudaEvents are not allowed in a kernel. I want to print the hash rate here, every time a new hash is found:
// Computes the MD5 state for the `size` bytes at `data` and copies the four
// 32-bit state words into `result`.
//
// Side effect: when result[0] == 0 and byteswap(result[1]) < 0x0fffffff, the
// hash and the input bytes are printed with device-side printf.
//
// Parameters:
//   data   - message bytes to hash (read only).
//   size   - number of bytes in `data`.
//   result - receives the four state words after the final block.
//
// Relies on `block_size`, `padding`, `transform` and `byteswap`, which are
// defined elsewhere in the file. NOTE(review): `padding` is assumed to be the
// standard MD5 pad (0x80 followed by zeros, at least block_size bytes long) --
// confirm against its definition.
__device__ void md5(const uchar* data, const uint size, uint result[4])
{
    uint state[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
    uint i;

    // Consume every complete block of the message.
    for (i = 0; i + block_size <= size; i += block_size)
    {
        transform(state, data + i);
    }

    // Bytes left over after the last complete block (0 .. block_size - 1).
    const uint tail = size - i;
    // The length field is 64 bits wide; widening before the shift keeps the
    // bit count from wrapping for large inputs.
    const unsigned long long size_in_bits = (unsigned long long)size << 3;
    // The length occupies the last 8 bytes of the final block.
    const uint length_offset = (uint)(block_size - (2 * sizeof(uint)));

    uchar buffer[block_size];
    memcpy(buffer, data + i, tail);
    memcpy(buffer + tail, padding, block_size - tail);

    // The tail plus the mandatory first padding byte must end before the
    // length field. If it does not, this block carries only data + padding
    // and the length goes into an additional, otherwise zero, block.
    if (tail + 1 > length_offset)
    {
        transform(state, buffer);
        memset(buffer, 0, block_size);
    }

    memcpy(buffer + length_offset, &size_in_bits, sizeof(size_in_bits));
    transform(state, buffer);
    memcpy(result, state, 4 * sizeof(uint));

    // Report hashes whose first word is zero and whose second word is small.
    if (result[0] == 0 && byteswap(result[1]) < 0x0fffffff) {
        printf("Hash found -------> %08x%08x%08x%08x\n", byteswap(result[0]), byteswap(result[1]), byteswap(result[2]), byteswap(result[3]));
        printf("For data -------> ");
        // Unsigned index avoids a signed/unsigned comparison with `size`.
        for (uint j = 0; j < size; j++) {
            printf("%02x", data[j]);
        }
        printf("\n\n");
    }
}
And the kernel which is actually run is this:
// Search kernel: every thread hashes 12-byte messages with md5().
//
// Message layout:
//   m[0..3]  - this thread's global index, least significant byte first.
//   m[4..11] - a per-thread 64-bit counter, least significant byte first.
//
// The counter starts at 0 and the loop runs while it is below
// 0xffffffffffffffff. The digest words land in `res` and are not used here;
// any reporting happens inside md5().
__global__ void test() {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    uchar m[12];
    uint res[4];

    // Fixed prefix: the thread index, one byte at a time.
    for (int b = 0; b < 4; ++b) {
        m[b] = (uchar)((tid >> (8 * b)) & 0x000000ff);
    }

    unsigned long long counter = 0;
    while (counter < 0xffffffffffffffff) {
        // Variable suffix: the current counter value, one byte at a time.
        for (int b = 0; b < 8; ++b) {
            m[4 + b] = (uchar)((counter >> (8 * b)) & 0x00000000000000ff);
        }
        md5(m, 12, res);
        ++counter;
    }
}
// Launches the search kernel on a grid of 1024 blocks x 1024 threads, then
// waits for a key press via system("pause").
//
// Returns 0 when the launch was accepted, 1 when the runtime rejected it.
int main()
{
    test<<<1024, 1024>>>();

    // A kernel launch returns nothing itself; a rejected configuration (for
    // example a block size the kernel's resource usage cannot support) only
    // shows up through cudaGetLastError().
    const cudaError_t launch_status = cudaGetLastError();
    int exit_code = 0;
    if (launch_status != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(launch_status));
        exit_code = 1;
    }

    system("pause");
    return exit_code;
}
How would I go about getting the hash rate here? I asked Google (and ChatGPT; yes, it is useful sometimes), but I only know how to measure the time once the kernel has already finished. For that I would have to set the iteration count much lower, and then the kernel would no longer run indefinitely. So my question is: is there a way to get timing inside a CUDA `__device__` or `__global__` function?
IMPORTANT: my Visual Studio does not support `atomicAdd` and `clock()`.