1. Switch to Nvidia driver in Additional driver tab and reboot
run nvidia-smi
and you should see that the driver version is 390
2. Install CUDA using package from Ubuntu, NOT from Nvidia
sudo apt install nvidia-cuda-toolkit
3. Install cuDNN. You need to register an Nvidia developer account, download cuDNN v7.1.2 Library for Linux, and use the tarball instead of the 'official' deb. Update LD_LIBRARY_PATH
so the libcudnn.so
file can be found. You need to open a new terminal so the export
takes effect.
tar -zxvf cudnn-9.1-linux-x64-v7.1.tgz
mv cuda ~/cuda
# add the following line to your ~/.bashrc or ~/.zshrc
export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH
4. Install Anaconda. If you want to use the built-in Python distribution instead, just skip this step; Anaconda ships with a lot of packages.
5. Install MXNet, NOTE we use mxnet-cu91 instead of mxnet-cu90
pip install mxnet-cu91
This is because Ubuntu ships with CUDA 9.1, so you will see the following error if you follow the official doc and install mxnet-cu90
OSError: libcudart.so.9.0: cannot open shared object file: No such file or directory
ls /usr/lib/x86_64-linux-gnu | grep libcu
and you will see .so files ending
with 9.1
instead of 9.0
6. Test everything is working
#!/usr/bin/env python3
"""Smoke test that MXNet can execute a simple computation on the GPU.

Allocates an array on the first GPU and forces evaluation; if CUDA, cuDNN,
or the mxnet-cu91 build is broken, this fails (e.g. with the libcudart
OSError described above) instead of silently passing.
"""
import mxnet as mx


def main():
    print('test mxnet gpu')
    # Create a 2x3 array of ones directly in GPU memory.
    a = mx.nd.ones((2, 3), mx.gpu())
    b = a * 2 + 1
    # asnumpy() blocks until the (lazy) computation finishes and copies
    # the result back to the host, so it actually exercises the GPU path.
    print(b.asnumpy())


if __name__ == '__main__':
    main()
A1. If you only want to test CUDA, use the following code. NOTE: Ubuntu 18.04 comes with GCC 7, but nvcc only supports up to GCC 6. When you install CUDA, the package manager will install GCC 6 along with it, but you still need to explicitly tell nvcc to use it when compiling, via -ccbin g++-6
/*
 * Example from Udacity Intro to Parallel Programming https://www.udacity.com/course/intro-to-parallel-programming--cs344
 * nvcc -ccbin g++-6 cube.cu
 */
#include <stdio.h>
#include <stdlib.h>

// Abort with file/line and a readable message if a CUDA API call fails.
// Every runtime call returns a cudaError_t; ignoring them hides failures.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Kernel: each thread cubes one element of d_in into d_out.
// Expected launch layout: a single 1-D block whose blockDim.x equals the
// array length, so threadIdx.x alone is a valid index (no bounds check
// needed as long as the launch below matches ARRAY_SIZE).
__global__ void cube(float * d_out, float * d_in)
{
    int idx = threadIdx.x;
    float f = d_in[idx];
    d_out[idx] = f * f * f;
}

int main(int argc, char ** argv)
{
    printf("I am a CUDA program for compute matrix\n");

    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // declare GPU memory pointers
    float * d_in;
    float * d_out;

    // allocate GPU memory
    CUDA_CHECK(cudaMalloc((void**) &d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void**) &d_out, ARRAY_BYTES));

    // transfer the array to the GPU
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // launch the kernel: one block of ARRAY_SIZE threads (see kernel comment)
    cube<<<1, ARRAY_SIZE>>>(d_out, d_in);
    // Launches return no status directly; this catches bad launch configs.
    CUDA_CHECK(cudaGetLastError());

    // copy back the result array to the CPU
    // (blocking cudaMemcpy also waits for the kernel to finish and surfaces
    // any asynchronous execution error)
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // print out the resulting array, four values per line
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    // release GPU memory
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    return 0;
}