Skip to content

Instantly share code, notes, and snippets.

@dpmccabe
Created April 30, 2025 19:30
Show Gist options
  • Save dpmccabe/14f78a7e50d2cd92be6293ea779bb142 to your computer and use it in GitHub Desktop.
Save dpmccabe/14f78a7e50d2cd92be6293ea779bb142 to your computer and use it in GitHub Desktop.
# Derived from https://github.com/Ensembl/ensembl-vep/blob/release/112/docker/Dockerfile,
# except that everything is installed as root (simplifies running on files in /cromwell_root).

# Ensembl git branch to build from. Declared before the first FROM, so each stage
# that needs it re-declares "ARG BRANCH" to pick up this default.
ARG BRANCH=release/113

###################################################
# Stage 1 - docker container to build ensembl-vep #
###################################################
FROM ubuntu:22.04 AS builder

# Refresh the apt index and install the build prerequisites
# (many of them are required for Bio::DB::BigFile), then drop the apt lists.
RUN apt-get update \
    && apt-get -y install \
        build-essential \
        git \
        libbz2-dev \
        liblzma-dev \
        libpng-dev \
        perl \
        perl-base \
        unzip \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# VEP directory layout
ENV OPT=/opt/vep
ENV OPT_SRC=${OPT}/src
ENV HTSLIB_DIR=${OPT_SRC}/htslib

# Bring the global BRANCH build argument into this stage
ARG BRANCH

# Every source tree is fetched into $OPT_SRC
WORKDIR ${OPT_SRC}
# Fetch every repository/library needed by VEP, then trim them to keep the image small.
# All steps share one shell, so BRANCH_OPT set at the top is visible to the git clones below.
RUN if [ "$BRANCH" = "main" ]; then \
        export BRANCH_OPT=""; \
    else \
        export BRANCH_OPT="-b $BRANCH"; \
    fi \
    # The ensembl cpanfile provides the list of required Perl libraries
    && wget -q "https://raw.githubusercontent.com/Ensembl/ensembl/$BRANCH/cpanfile" -O "ensembl_cpanfile" \
    # ensembl-vep git repository
    && git clone $BRANCH_OPT --depth 1 https://github.com/Ensembl/ensembl-vep.git \
    && chmod u+x ensembl-vep/*.pl \
    # ensembl-variation git repository: only its C code (and Makefile) is kept
    && git clone $BRANCH_OPT --depth 1 https://github.com/Ensembl/ensembl-variation.git \
    && mkdir var_c_code \
    && cp ensembl-variation/C_code/*.c ensembl-variation/C_code/Makefile var_c_code/ \
    && rm -rf ensembl-variation \
    && chmod u+x var_c_code/* \
    # bioperl-ext git repository - used by Haplosaurus
    && git clone --depth 1 https://github.com/bioperl/bioperl-ext.git \
    # ensembl-xs - compiled versions of certain key subroutines used in VEP
    && wget https://github.com/Ensembl/ensembl-xs/archive/2.3.2.zip -O ensembl-xs.zip \
    && unzip -q ensembl-xs.zip \
    && mv ensembl-xs-2.3.2 ensembl-xs \
    && rm -rf ensembl-xs.zip \
    # Other repositories: bioperl-live is needed so that installing the cpanm
    # dependencies from ensembl-vep/cpanfile takes less disk space
    && ensembl-vep/travisci/get_dependencies.sh \
    # Of bioperl-live, keep nothing but the "Bio" library
    && mv bioperl-live bioperl-live_bak \
    && mkdir bioperl-live \
    && mv bioperl-live_bak/Bio bioperl-live/ \
    && rm -rf bioperl-live_bak \
    # Prune docs, tests and dot-files from the imported libraries to reduce the image size
    && rm -rf \
        Bio-HTS/.??* Bio-HTS/Changes Bio-HTS/DISCLAIMER Bio-HTS/MANIFEST* Bio-HTS/README Bio-HTS/scripts Bio-HTS/t Bio-HTS/travisci \
        bioperl-ext/.??* bioperl-ext/Bio/SeqIO bioperl-ext/Bio/Tools bioperl-ext/Makefile.PL bioperl-ext/README* bioperl-ext/t bioperl-ext/examples \
        ensembl-xs/.??* ensembl-xs/TODO ensembl-xs/Changes ensembl-xs/INSTALL ensembl-xs/MANIFEST ensembl-xs/README ensembl-xs/t ensembl-xs/travisci \
        htslib/.??* htslib/INSTALL htslib/NEWS htslib/README* htslib/test \
    # Of kent-335_base, keep only the parts VEP needs - used by Bio::DB::BigFile (bigWig parsing)
    && mv kent-335_base kent-335_base_bak \
    && mkdir -p kent-335_base/src \
    && cp -R kent-335_base_bak/src/lib kent-335_base_bak/src/inc kent-335_base_bak/src/jkOwnLib kent-335_base/src/ \
    && cp kent-335_base_bak/src/*.sh kent-335_base/src/ \
    && rm -rf kent-335_base_bak
# Setup bioperl-ext: patch Makefile.PL in place so that "-fPIC" is inserted right
# after the opening quote of CFLAGS on the "cd libs ..." line.
# The WORKDIR is given as an absolute path so this step does not depend on
# whichever WORKDIR happens to precede it.
WORKDIR $OPT_SRC/bioperl-ext/Bio/Ext/Align/
RUN perl -pi -e"s|(cd libs.+)CFLAGS=\\\'|\$1CFLAGS=\\\'-fPIC |" Makefile.PL
# Install htslib binaries (for 'bgzip' and 'tabix')
# htslib requires the packages 'zlib1g-dev', 'libbz2-dev' and 'liblzma-dev'
# The Makefile and C sources are removed once the build is done.
WORKDIR $HTSLIB_DIR
RUN make install && rm -f Makefile *.c
# Compile Variation LD C scripts, then remove the Makefile and C sources
WORKDIR $OPT_SRC/var_c_code
RUN make && rm -f Makefile *.c
########################################################
# Stage 2 - final docker image containing ensembl-vep  #
########################################################
FROM ubuntu:22.04

# Refresh the apt index and install the required packages
# (many of them are required for Bio::DB::BigFile), then purge manpages-dev
# and clear the apt caches and lists.
RUN apt-get update \
    && apt-get -y install \
        build-essential \
        cpanminus \
        curl \
        libbz2-dev \
        libdbd-mysql-perl \
        liblzma-dev \
        libmysqlclient-dev \
        libpng-dev \
        libssl-dev \
        locales \
        openssl \
        perl \
        perl-base \
        unzip \
        vim \
        zlib1g-dev \
    && apt-get -y purge manpages-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# VEP environment: install locations
ENV OPT=/opt/vep
ENV OPT_SRC=${OPT}/src

# Perl library path: PERL5LIB_TMP holds the VEP and plugin entries,
# PERL5LIB additionally includes bioperl-live
ENV PERL5LIB_TMP=${PERL5LIB}:${OPT_SRC}/ensembl-vep:${OPT_SRC}/ensembl-vep/modules:/plugins
ENV PERL5LIB=${PERL5LIB_TMP}:${OPT_SRC}/bioperl-live

# Locations of the kent and htslib sources and of the dependencies directory
ENV KENT_SRC=${OPT}/src/kent-335_base/src
ENV HTSLIB_DIR=${OPT_SRC}/htslib
ENV DEPS=${OPT_SRC}

# Put the VEP scripts and the compiled variation C code on the PATH
ENV PATH=${OPT_SRC}/ensembl-vep:${OPT_SRC}/var_c_code:${PATH}

# Locale name, applied to LANG/LC_ALL further down
ENV LANG_VAR=en_US.UTF-8

# Bring the global BRANCH build argument into this stage
ARG BRANCH

# Bring over the libraries downloaded in stage 1 (builder)
COPY --from=builder ${OPT_SRC} ${OPT_SRC}
#############################################################
# bioperl-ext: XS-based BioPerl extensions to C libraries, giving faster alignments for haplo
WORKDIR ${OPT_SRC}/bioperl-ext/Bio/Ext/Align/
RUN perl Makefile.PL \
    && make \
    && make install \
    && rm -f Makefile*

# ensembl-xs: C re-implementation of some of the Perl subroutines, for a faster run
WORKDIR ${OPT_SRC}/ensembl-xs
RUN perl Makefile.PL \
    && make \
    && make install \
    && rm -f Makefile* cpanfile

# Back to the sources root for the remaining build steps
WORKDIR ${OPT_SRC}
# Compile the remaining C libraries, install the Perl dependencies,
# configure the locale and publish the htslib executables
RUN export MACHTYPE=$(uname -m) \
    && ensembl-vep/travisci/build_c.sh \
    # Drop the Bio-DB-HTS files that are no longer used
    && rm -rf Bio-HTS/cpanfile Bio-HTS/Build.PL Bio-HTS/Build Bio-HTS/_build Bio-HTS/INSTALL.pl \
    # Perl dependencies of ensembl and ensembl-vep (cpanm, tests skipped)
    && cpanm --installdeps --with-recommends --notest --cpanfile ensembl_cpanfile . \
    && cpanm --installdeps --with-recommends --notest --cpanfile ensembl-vep/cpanfile . \
    # bioperl and the cpanfiles go away after the cpanm installs:
    # bioperl will be reinstalled by the INSTALL.pl script
    && rm -rf bioperl-live ensembl_cpanfile ensembl-vep/cpanfile \
    # Configure "locale", see https://github.com/rocker-org/rocker/issues/19
    && echo "$LANG_VAR UTF-8" >> /etc/locale.gen \
    && locale-gen en_US.utf8 \
    && /usr/sbin/update-locale LANG=$LANG_VAR \
    # htslib executables; they also require the packages 'zlib1g-dev', 'libbz2-dev' and 'liblzma-dev'
    && cp $HTSLIB_DIR/bgzip $HTSLIB_DIR/tabix $HTSLIB_DIR/htsfile /usr/local/bin/ \
    # CPAN cache is not needed in the image
    && rm -rf /root/.cpanm

# Apply the generated locale
ENV LC_ALL=${LANG_VAR}
ENV LANG=${LANG_VAR}

# bioperl-live was deleted above, so reset PERL5LIB to the path without it
ENV PERL5LIB=${PERL5LIB_TMP}
# Docker environment for when users run VEP and INSTALL.pl inside the image:
#   VEP_NO_UPDATE   - skip VEP updates in INSTALL.pl
#   VEP_NO_HTSLIB   - avoid Faidx/HTSLIB installation in INSTALL.pl
#   VEP_NO_PLUGINS  - skip plugin installation in INSTALL.pl
#   VEP_DIR_PLUGINS - plugins directory for VEP and INSTALL.pl
ENV VEP_NO_UPDATE=1 \
    VEP_NO_HTSLIB=1 \
    VEP_NO_PLUGINS=1 \
    VEP_DIR_PLUGINS=/plugins
ENV VEP_PLUGINSDIR=${VEP_DIR_PLUGINS}

# Visiting the plugins directory with WORKDIR creates it if it does not exist yet
WORKDIR ${VEP_DIR_PLUGINS}

# Update the bash profile, then install the Ensembl API and all plugins
WORKDIR ${OPT_SRC}/ensembl-vep
RUN echo >> $OPT/.profile \
    # $PATH is expanded now (build time); \$PATH is written literally into the profile
    && echo PATH=$PATH:\$PATH >> $OPT/.profile \
    && echo export PATH >> $OPT/.profile \
    # Ensembl API and plugins
    && ./INSTALL.pl --auto ap --plugins all --pluginsdir $VEP_DIR_PLUGINS --no_update --no_htslib \
    # ensembl-vep's travisci folder is no longer needed
    && rm -rf travisci
# Install dependencies for VEP plugins. The dependency lists are downloaded from the
# config directory of the VEP_plugins repository at $BRANCH.
# Every curl call uses -f (fail on HTTP errors) so that a missing file - e.g. a
# branch that has no such config - aborts the build right there instead of saving
# the server's error page under the requested name and failing later in apt/cpanm/pip.
ENV PLUGIN_DEPS="https://raw.githubusercontent.com/Ensembl/VEP_plugins/$BRANCH/config"
# - Ubuntu packages (sed strips '#' comments from the package list)
RUN curl -fsSL -O "$PLUGIN_DEPS/ubuntu-packages.txt" && \
    apt-get update && apt-get install -y --no-install-recommends \
    $(sed -e s/\#.*//g ubuntu-packages.txt) && \
    rm -rf /var/lib/apt/lists/* ubuntu-packages.txt
# - Symlink python to python2
RUN ln -s /usr/bin/python2 /usr/bin/python
# - Perl modules (the CPAN cache and the cpanfile are removed afterwards)
RUN curl -fsSL -O "$PLUGIN_DEPS/cpanfile" && \
    cpanm --installdeps --with-recommends . && \
    rm -rf /root/.cpanm cpanfile
# - Python packages
# my_config.h is placed in the MySQL include directory first
# NOTE(review): presumably required to build a MySQL-related package from requirements.txt - confirm
RUN curl -fsSL -O https://raw.githubusercontent.com/paulfitz/mysql-connector-c/master/include/my_config.h && \
    mv my_config.h /usr/include/mysql/my_config.h
RUN curl -fsSL -O "$PLUGIN_DEPS/requirements.txt" && \
    python2 -m pip install --no-cache-dir -r requirements.txt && \
    rm requirements.txt
# Download and build the GeneSplicer binary inside the plugins directory
WORKDIR ${VEP_DIR_PLUGINS}
RUN curl -O ftp://ftp.ccb.jhu.edu/pub/software/genesplicer/GeneSplicer.tar.gz \
    && tar -xzf GeneSplicer.tar.gz \
    && rm GeneSplicer.tar.gz \
    && cd GeneSplicer/sources \
    && make \
    # The freshly built binary is moved one level up, into GeneSplicer/
    && mv genesplicer .. \
    # NOTE(review): this runs from GeneSplicer/sources, so the relative glob
    # GeneSplicer/*/ does not match the unpacked tree and nothing is removed.
    # Confirm which directories (if any) were meant to be deleted before changing it.
    && rm -rf GeneSplicer/*/
# Make the genesplicer binary available on the PATH
ENV PATH=${VEP_DIR_PLUGINS}/GeneSplicer:${PATH}
# Expose $OPT/.vep (meant to contain the VEP cache and data) as /data.
# Nothing earlier in this file visibly creates $OPT/.vep, so it is created here first;
# without it /data would be a dangling symlink and unusable unless a volume is mounted.
RUN mkdir -p $OPT/.vep && ln -s $OPT/.vep /data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment