ShangjinTang · September 14, 2024 03:10
diff --git a/#TF_Torch_GPU_Installation b/#TF_Torch_GPU_Installation
 Install GPU-enabled TensorFlow and PyTorch on a computer with an NVIDIA graphics card.
diff --git a/1_install_driver_cuda_cudnn.sh b/1_install_driver_cuda_cudnn.sh
 #!/bin/bash

 ### stage 1 ####
 # verify the system has a cuda-capable gpu
 # install the nvidia driver, the cuda 11.8 toolkit and cudnn 8.7
 # set up environment variables
 #
 # NOTE: `set -e` is intentionally not enabled. The clean-up commands below may
 # exit non-zero on a machine that has nothing to remove, and that must not
 # abort the installation.
 ###

 ### list NVIDIA PCI devices to check that a CUDA-capable GPU is present
 lspci | grep -i nvidia

 ### remove previous installation
 sudo apt purge '.*nvidia.*' '.*cuda.*' '.*cudnn.*'
 sudo apt remove '.*nvidia.*' '.*cuda.*' '.*cudnn.*'
 # -f: stay quiet when no CUDA apt source file exists
 sudo rm -f /etc/apt/sources.list.d/cuda*
 sudo apt-get autoremove && sudo apt-get autoclean
 sudo rm -rf /usr/local/cuda*

 ### do system upgrade
 sudo apt update && sudo apt upgrade -y
 sudo apt install -y g++ freeglut3-dev build-essential libx11-dev libxmu-dev libxi-dev libglu1-mesa libglu1-mesa-dev

 # install nvidia driver
 # -y: do not block on the confirmation prompt (matches the -y installs below)
 sudo add-apt-repository -y ppa:graphics-drivers/ppa
 sudo apt update
 sudo apt install -y libnvidia-common-535 libnvidia-gl-535 nvidia-driver-535

 # install cuda deb(network)
 # Reference: https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
 wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
 sudo dpkg -i cuda-keyring_1.0-1_all.deb
 sudo apt-get update
 sudo apt install -y cuda-11-8

 # persist the CUDA paths in the rc file of the current login shell
 # (~/.bashrc for bash, ~/.zshrc for zsh)
 SHELL_RC_FILE="${HOME}/.$(basename -- "${SHELL}")rc"
 CUDA_PATH_LINE='export PATH=/usr/local/cuda/bin:$PATH'
 CUDA_LIB_LINE='export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH'
 # append each line only when it is not already there, so that re-running this
 # script does not pile up duplicate exports
 grep -qxF -- "${CUDA_PATH_LINE}" "${SHELL_RC_FILE}" 2> /dev/null \
     || printf '%s\n' "${CUDA_PATH_LINE}" >> "${SHELL_RC_FILE}"
 grep -qxF -- "${CUDA_LIB_LINE}" "${SHELL_RC_FILE}" 2> /dev/null \
     || printf '%s\n' "${CUDA_LIB_LINE}" >> "${SHELL_RC_FILE}"

 # install cuDNN v8.7
 # Reference: https://developer.nvidia.com/cudnn
 CUDNN_DIR="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
 CUDNN_TAR_FILE="${CUDNN_DIR}.tar.xz"
 sudo wget "https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_TAR_FILE}"
 # extract exactly the archive named above; a glob could match several stale
 # archives and make tar fail
 sudo tar -xvf "${CUDNN_TAR_FILE}"
 sudo mv "${CUDNN_DIR}" cuda

 # copy the cuDNN headers and libraries into the cuda toolkit directory.
 # cuDNN 8 ships several headers (cudnn_version.h, cudnn_ops_infer.h, ...),
 # so copy all of cudnn*.h rather than cudnn.h alone.
 sudo cp -P cuda/include/cudnn*.h /usr/local/cuda/include
 sudo cp -P cuda/lib/libcudnn* /usr/local/cuda/lib64/
 sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*

 # reboot to solve "Failed to initialize NVML: Driver/library version mismatch"
 sudo reboot
diff --git a/2_install_miniforge_create_env_pyml.sh b/2_install_miniforge_create_env_pyml.sh
 #!/usr/bin/env bash

 ### stage 2 ####
 # install mamba (python 3.10) with miniforge
 # create the "pyml" environment if it does not exist yet
 ###

 # make a miniforge installation from an earlier run visible to this script
 export PATH="$HOME/miniforge-pypy3/bin:$PATH"

 # https://github.com/conda-forge/miniforge/releases/tag/24.3.0-0 is the latest release that supports python 3.10
 DOWNLOAD_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge-pypy3-Linux-x86_64.sh"
 DOWNLOAD_TMP_FILE="/tmp/$(basename -- "${DOWNLOAD_URL}")"
 ENV_NAME="pyml"

 # download and run the installer only when mamba is not already available
 if ! command -v mamba &> /dev/null; then

    echo "Downloading ${DOWNLOAD_URL} ..."

    curl -# -sSLf "${DOWNLOAD_URL}" -o "${DOWNLOAD_TMP_FILE}" \
        && echo "Installing ..." \
        && bash "${DOWNLOAD_TMP_FILE}" -b -f \
        && rm "${DOWNLOAD_TMP_FILE}"

 fi

 # stop here when the download or the installation did not produce a usable
 # mamba; every following step depends on it
 if ! command -v mamba &> /dev/null; then

    echo "error: mamba is not available; the Miniforge download or installation failed" >&2
    exit 1

 fi

 echo "mamba path: $(command -v mamba)"

 # compare the environment-name column exactly: a plain substring search would
 # also match names such as "pyml-old" or any path that contains "pyml"
 if ! mamba env list | awk -v env="${ENV_NAME}" '$1 == env { found = 1 } END { exit !found }'; then

    mamba create --yes -n "${ENV_NAME}" \
        'python==3.10' \
        'notebook' 'jupyterhub' 'jupyterlab' \
        'scipy' 'numpy' 'pandas' 'matplotlib'

 fi
diff --git a/3_install_tf_torch.sh b/3_install_tf_torch.sh
 #!/bin/bash

 ### stage 3 ####
 # verify the nvidia driver + cuda + cudnn installation
 # install TensorFlow and PyTorch into the "pyml" environment
 ###

 # verify the installation
 nvidia-smi
 nvcc -V

 # make the miniforge installation from stage 2 visible to this shell
 export PATH="$HOME/miniforge-pypy3/bin:$PATH"

 # activate the environment
 # A plain `mamba activate` needs shell initialisation that a non-interactive
 # script does not have, so load conda's shell hook and activate explicitly.
 # Abort on failure: otherwise pip would install into whatever python3 happens
 # to be first on PATH.
 if ! eval "$(conda shell.bash hook)" || ! conda activate pyml; then
     echo "error: could not activate the 'pyml' environment; run stage 2 first" >&2
     # `return` succeeds when this file is sourced, `exit` covers direct execution
     return 1 2> /dev/null || exit 1
 fi

 # install TensorFlow GPU
 python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.13.0

 # install PyTorch GPU
 python3 -m pip install torch==2.1.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
diff --git a/4_gpu_check.sh b/4_gpu_check.sh
 #!/bin/bash

 ### stage 4 ####
 # check if each framework is actually using the GPU
 ###

 # make the miniforge installation from stage 2 visible to this shell
 export PATH="$HOME/miniforge-pypy3/bin:$PATH"

 # A plain `mamba activate` needs shell initialisation that a non-interactive
 # script does not have, so load conda's shell hook and activate explicitly.
 # Abort on failure: otherwise the checks below would report on the wrong
 # python3 interpreter.
 if ! eval "$(conda shell.bash hook)" || ! conda activate pyml; then
     echo "error: could not activate the 'pyml' environment; run stage 2 first" >&2
     # `return` succeeds when this file is sourced, `exit` covers direct execution
     return 1 2> /dev/null || exit 1
 fi

 # print the TensorFlow version and whether it lists at least one GPU device
 python3 -c "import tensorflow as tf; print('TensorFlow: Version: ' + tf.__version__ + ', GPU Available: ', bool(len(tf.config.list_physical_devices('GPU'))))"

 # print the PyTorch version and the result of torch.cuda.is_available()
 python3 -c "import torch; print('PyTorch: Version: ' + torch.__version__ + ', GPU Available: ', torch.cuda.is_available())"
diff --git a/5_additional_packages.md b/5_additional_packages.md
diff --git a/Notes.md b/Notes.md
	#!/bin/bash

	### stage 1 ####
	# verify the system has a cuda-capable gpu
	# download and install the nvidia cuda toolkit and cudnn
	# setup environmental variables
	###

	### to verify your gpu is cuda enable check
	lspci | grep -i nvidia

	### remove previous installation
	sudo apt purge '.*nvidia.*' '.*cuda.*' '.*cudnn.*'
	sudo apt remove '.*nvidia.*' '.*cuda.*' '.*cudnn.*'
	sudo rm /etc/apt/sources.list.d/cuda*
	sudo apt-get autoremove && sudo apt-get autoclean
	sudo rm -rf /usr/local/cuda*

	### do system upgrade
	sudo apt update && sudo apt upgrade -y
	sudo apt install -y g++ freeglut3-dev build-essential libx11-dev libxmu-dev libxi-dev libglu1-mesa libglu1-mesa-dev

	# install nvidia driver
	sudo add-apt-repository ppa:graphics-drivers/ppa
	sudo apt update
	sudo apt install -y libnvidia-common-535 libnvidia-gl-535 nvidia-driver-535

	# install cuda deb(network)
	# Reference: https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
	wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
	sudo dpkg -i cuda-keyring_1.0-1_all.deb
	sudo apt-get update
	sudo apt install -y cuda-11-8

	# Note: you need to add below lines to ~/.bashrc or ~/.zshrc
	echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/."$(basename $SHELL)"rc
	echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/."$(basename $SHELL)"rc

	# install cuDNN v8.7
	# Reference: https://developer.nvidia.com/cudnn
	CUDNN_TAR_FILE="cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
	sudo wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
	sudo tar -xvf cudnn-linux-*.tar.xz
	sudo mv cudnn-linux-x86_64-8.7.0.84_cuda11-archive cuda

	# copy the following files into the cuda toolkit directory.
	sudo cp -P cuda/include/cudnn.h /usr/local/cuda/include
	sudo cp -P cuda/lib/libcudnn* /usr/local/cuda/lib64/
	sudo chmod a+r /usr/local/cuda/lib64/libcudnn*

	# reboot to solve "Failed to initialize NVML: Driver/library version mismatch"
	sudo reboot
	#!/usr/bin/env bash

	### stage 2 ####
	# install mamba (python 3.10) with miniforge
	###

	export PATH="$HOME/miniforge-pypy3/bin:$PATH"

	# https://github.com/conda-forge/miniforge/releases/tag/24.3.0-0 is the latest version supports python 3.10
	DOWNLOAD_URL="https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge-pypy3-Linux-x86_64.sh"
	DOWNLOAD_TMP_FILE="/tmp/$(basename ${DOWNLOAD_URL})"

	if ! which mamba &> /dev/null; then

	echo "Downloading ${DOWNLOAD_URL} ..."

	curl -# -sSLf ${DOWNLOAD_URL} -o "${DOWNLOAD_TMP_FILE}" && echo "Installing ..." && bash "${DOWNLOAD_TMP_FILE}" -b -f && rm "${DOWNLOAD_TMP_FILE}"

	fi

	echo "mamba path: $(which mamba)"

	if ! mamba env list | grep -q pyml; then

	mamba create --yes -n pyml \
	'python==3.10' \
	'notebook' 'jupyterhub' 'jupyterlab' \
	'scipy' 'numpy' 'pandas' 'matplotlib'

	fi
	#!/bin/bash

	### stage 3 ####
	# verify the nvidia driver + cuda + cudnn installation
	# install TensorFlow and PyTorch
	###

	# verify the installation
	nvidia-smi
	nvcc -V

	# activate mamba environment
	mamba activate pyml

	# install TensorFlow GPU
	python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.13.0

	# install PyTorch GPU
	python3 -m pip install torch==2.1.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
	#!/bin/bash

	### stage 4 ####
	# check if framework is actually using GPU
	###

	mamba activate pyml

	python3 -c "import tensorflow as tf; print('TensorFlow: Version: ' + tf.__version__ + ', GPU Available: ', bool(len(tf.config.list_physical_devices('GPU'))))"

	python3 -c "import torch; print('PyTorch: Version: ' + torch.__version__ + ', GPU Available: ', torch.cuda.is_available())"