Doing Some Big Data Tools’ Docker Images Right

ZooKeeper: Like the Big Data Tax

tickTime=2000
dataDir=/var/lib/zookeeper/
clientPort=2181
initLimit=5
syncLimit=2
server.1=zoo1:2888:3888
server.2=zoo2:2888:3888
server.3=zoo3:2888:3888
java -cp "$CLASSPATH" \
-Dcom.sun.management.jmxremote \
-Dcom.sun.management.jmxremote.local.only=false \
-Dzookeeper.logs.dir=logs/ \
-Dlog4j.configuration=file:conf/log4j.properties \
org.apache.zookeeper.server.quorum.QuorumPeerMain \
conf/zoo.cfg
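
None of this is container-friendly as-is: every ensemble member needs its own myid file plus the full server list, so both should be rendered from the environment at startup. A minimal entrypoint sketch, assuming the MYID and PEERS variables used in the bonus cluster below (the paths match the config above):

#!/bin/bash
# Entrypoint sketch: write myid and generate zoo.cfg from MYID/PEERS.
# MYID is this node's ordinal; PEERS is a comma-separated hostname list.
set -e
mkdir -p /var/lib/zookeeper
echo "$MYID" > /var/lib/zookeeper/myid
cat > conf/zoo.cfg <<EOF
tickTime=2000
dataDir=/var/lib/zookeeper/
clientPort=2181
initLimit=5
syncLimit=2
EOF
id=1
for peer in ${PEERS//,/ }; do
  echo "server.$id=$peer:2888:3888" >> conf/zoo.cfg
  id=$((id + 1))
done
exec java -cp "$CLASSPATH" \
  -Dlog4j.configuration=file:conf/log4j.properties \
  org.apache.zookeeper.server.quorum.QuorumPeerMain conf/zoo.cfg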

Kafka: The reliable messenger

java \
-Xmx256M \
-Xms128M \
-server \
-XX:+UseG1GC \
-XX:MaxGCPauseMillis=20 \
-XX:InitiatingHeapOccupancyPercent=35 \
-XX:+DisableExplicitGC \
-Djava.awt.headless=true \
-Xloggc:logs/kafkaServer-gc.log \
-verbose:gc \
-XX:+PrintGCDetails \
-XX:+PrintGCDateStamps \
-XX:+PrintGCTimeStamps \
-Dcom.sun.management.jmxremote \
-Dcom.sun.management.jmxremote.authenticate=false \
-Dcom.sun.management.jmxremote.ssl=false \
-Dkafka.logs.dir=logs/ \
-Dlog4j.configuration=file:config/log4j.properties \
-Dcom.sun.management.jmxremote.port=9999 \
-cp "libs/*" \
kafka.Kafka \
config/server.properties
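
The three things that differ per broker are the id, the ZooKeeper quorum, and the advertised address, and all of them can come from the environment. A minimal entrypoint sketch, assuming the BROKERID, ZKHOSTS, and IFACE variables used in the bonus cluster below (advertised.host.name is the old-style property; newer brokers would use advertised.listeners):

#!/bin/bash
# Entrypoint sketch: render server.properties from env, then start Kafka.
set -e
# Address on the requested interface, for clients to connect back to.
ADDR=$(ip -4 addr show "$IFACE" | grep -oP '(?<=inet\s)[\d.]+')
# "zk1,zk2,zk3" -> "zk1:2181,zk2:2181,zk3:2181"
ZK_CONNECT=$(echo "$ZKHOSTS" | sed 's/,/:2181,/g'):2181
cat > config/server.properties <<EOF
broker.id=$BROKERID
zookeeper.connect=$ZK_CONNECT
advertised.host.name=$ADDR
log.dirs=/data/kafka-logs
EOF
exec bin/kafka-server-start.sh config/server.properties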

HDFS: An almost perfect distributed file system

if [ "$COMMAND" = "namenode" ] ; then
  CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
  exec "$JAVA" -Dproc_$COMMAND $JAVA_HEAP_MAX $HADOOP_OPTS $CLASS "$@"
fi
$ ps aux | grep NameNode
root 95449 0.5 2.9 2826444 982364 ? Sl Oct18 76:06 /usr/lib/jvm/java-8-oracle//bin/java -Dproc_namenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/opt/hadoop-2.7.3/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/opt/hadoop-2.7.3 -Dhadoop.id.str=root -Dhadoop.root.logger=INFO,console -Djava.library.path=/opt/hadoop-2.7.3/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/opt/hadoop-2.7.3/logs -Dhadoop.log.file=hadoop-root-namenode-fhadoop1.log -Dhadoop.home.dir=/opt/hadoop-2.7.3 -Dhadoop.id.str=root -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/opt/hadoop-2.7.3/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.NameNode
java  \
-Dproc_namenode \
-Xmx1000m \
-Djava.net.preferIPv4Stack=true \
-Dhadoop.home.dir=$HADOOP_HOME \
-Dhadoop.id.str=root \
-Dhadoop.root.logger=INFO,console \
-Djava.library.path=$HADOOP_HOME/lib/native \
-Dhadoop.policy.file=hadoop-policy.xml \
-Dhadoop.security.logger=INFO,RFAS \
org.apache.hadoop.hdfs.server.namenode.NameNode
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://namenode.company.com:8020</value>
  </property>
</configuration>
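
HDFS reads its address from core-site.xml rather than from flags, so the Docker-friendly move is to render that file at startup. A minimal entrypoint sketch, assuming the NAMENODE variable used in the bonus cluster below and the standard $HADOOP_HOME layout:

#!/bin/bash
# Entrypoint sketch: render core-site.xml so fs.defaultFS points at the
# namenode passed in via the NAMENODE environment variable.
set -e
cat > "$HADOOP_HOME/etc/hadoop/core-site.xml" <<EOF
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://$NAMENODE:8020</value>
  </property>
</configuration>
EOF
# Hand off to the real command (namenode, datanode, ...).
exec hdfs "$@"

Since everything is driven by the environment, you can always check what a running process was actually handed by dumping its environment, null-separated, from /proc: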
$ xargs --null --max-args=1 echo < /proc/$PID/environ

Flink: The streamer and batcher

java -cp "lib/*" \
-Dproc_jobmanager \
-Dlog4j.configuration=file:conf/log4j.properties \
org.apache.flink.runtime.jobmanager.JobManager \
-configDir conf/ \
-executionMode cluster \
-host $IPADDR
java -cp "lib/*" \
-Dproc_taskmanager \
-Dlog4j.configuration=file:conf/log4j.properties \
org.apache.flink.runtime.taskmanager.TaskManager \
-configDir conf/
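
The two roles share one config file, so a single entrypoint can render the HA and checkpoint settings and then dispatch on its first argument. A minimal sketch, assuming the ZKHOSTS, HDFS, HASTORAGEDIR, ZKROOT, SLOTS, CHECKPOINTDIR, and IFACE variables used in the bonus cluster below; the config keys are the Flink 1.x high-availability names and may differ on your version:

#!/bin/bash
# Entrypoint sketch: append HA settings to flink-conf.yaml from env, then
# start the role given as the first argument.
set -e
IPADDR=$(ip -4 addr show "$IFACE" | grep -oP '(?<=inet\s)[\d.]+')
ZK_QUORUM=$(echo "$ZKHOSTS" | sed 's/,/:2181,/g'):2181
cat >> conf/flink-conf.yaml <<EOF
high-availability: zookeeper
high-availability.zookeeper.quorum: $ZK_QUORUM
high-availability.zookeeper.path.root: $ZKROOT
high-availability.storageDir: hdfs://$HDFS/$HASTORAGEDIR
state.backend: filesystem
state.backend.fs.checkpointdir: hdfs://$HDFS/$CHECKPOINTDIR
taskmanager.numberOfTaskSlots: $SLOTS
EOF
case "$1" in
  jobmanager)
    exec java -cp "lib/*" -Dlog4j.configuration=file:conf/log4j.properties \
      org.apache.flink.runtime.jobmanager.JobManager \
      -configDir conf/ -executionMode cluster -host "$IPADDR" ;;
  taskmanager)
    exec java -cp "lib/*" -Dlog4j.configuration=file:conf/log4j.properties \
      org.apache.flink.runtime.taskmanager.TaskManager -configDir conf/ ;;
  *) echo "usage: jobmanager|taskmanager" >&2; exit 1 ;;
esac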

Drill: Schema-Free Query Engine

drill.exec: {
  cluster-id: "<mydrillcluster>",
  zk.connect: "<zkhostname1>:<port>,<zkhostname2>:<port>,<zkhostname3>:<port>"
}
java \
-Xms4G -Xmx4G -XX:MaxDirectMemorySize=8G \
-XX:ReservedCodeCacheSize=1G \
-Ddrill.exec.enable-epoll=false \
-XX:MaxPermSize=512M \
-XX:+CMSClassUnloadingEnabled \
-XX:+UseG1GC \
-Dlog.path=log/drillbit.log \
-Dlog.query.path=log/drillbit_queries.json \
-cp conf/:jars/*:jars/ext/*:jars/3rdparty/*:jars/classb/* \
org.apache.drill.exec.server.Drillbit
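
A drillbit only needs the cluster id and the ZooKeeper quorum, so the override file is easy to generate. A minimal entrypoint sketch, assuming the ZKHOSTS and CLUSTERID variables used in the bonus cluster below and the conf/ and jars/ layout from the command above:

#!/bin/bash
# Entrypoint sketch: render drill-override.conf from env, then start
# the drillbit.
set -e
ZK_CONNECT=$(echo "$ZKHOSTS" | sed 's/,/:2181,/g'):2181
cat > conf/drill-override.conf <<EOF
drill.exec: {
  cluster-id: "$CLUSTERID",
  zk.connect: "$ZK_CONNECT"
}
EOF
exec java -Xms4G -Xmx4G -XX:MaxDirectMemorySize=8G \
  -cp conf/:jars/*:jars/ext/*:jars/3rdparty/*:jars/classb/* \
  org.apache.drill.exec.server.Drillbit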

Conclusion

Bonus: An example cluster

$ docker network create --subnet 10.0.50.0/24 mynet
$ for i in {1..3}; do
docker run -d --network=mynet --name=zk$i -e MYID=$i \
-e PEERS=zk1,zk2,zk3 \
mustafaakin/zookeeper
done
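
Before layering anything on top, it is worth checking that the ensemble answers. A quick probe with ZooKeeper's four-letter ruok command (the throwaway busybox container is an assumption about what's handy; a healthy server answers imok):

$ docker run --rm --network=mynet busybox sh -c 'echo ruok | nc zk1 2181'
imok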
$ for i in {1..5}; do
docker run -d --network=mynet --name=kafka$i \
-e BROKERID=$i -e ZKHOSTS=zk1,zk2,zk3 -e IFACE=eth0 \
mustafaakin/kafka
done
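
A quick way to confirm the brokers actually formed a cluster is to create a topic replicated across them; this assumes the image keeps Kafka's stock scripts on the PATH:

$ docker exec kafka1 kafka-topics.sh --zookeeper zk1:2181 --create \
  --topic smoke-test --partitions 5 --replication-factor 3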
$ docker volume create --name mydata1
$ docker run -h namenode1 --rm \
--network=mynet \
-e NAMENODE=namenode1 \
--name=namenode1 -it \
-v mydata1:/data \
mustafaakin/hadoop namenode -format
$ docker run -p 50070:50070 -d -h namenode1 \
--network=mynet \
-e NAMENODE=namenode1 \
--name=namenode1 -it \
-v mydata1:/data \
mustafaakin/hadoop namenode
$ for i in {1..5}; do
docker volume create --name hadoopdata$i
docker run --net=mynet -d -e NAMENODE=namenode1 \
--name=datanode$i -it \
-v hadoopdata$i:/data \
mustafaakin/hadoop datanode
done
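
If everything registered, the namenode should now report five live datanodes (assuming the hadoop binaries are on the image's PATH):

$ docker exec namenode1 hdfs dfsadmin -report | grep 'Live datanodes'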
ZKHOSTS="zk1,zk2,zk3"
HDFS="namenode1:8020"
HASTORAGEDIR="flink-ha/"
ZKROOT="/flink"
SLOTS="8"
CHECKPOINTDIR="checkpoints/"
IFACE="eth0"
for i in {1..3}; do
docker run -d -p 1808$i:8081 --net=mynet \
--name=jobmanager$i \
-e ZKHOSTS=$ZKHOSTS \
-e HDFS=$HDFS \
-e HASTORAGEDIR=$HASTORAGEDIR \
-e ZKROOT=$ZKROOT \
-e SLOTS=$SLOTS \
-e CHECKPOINTDIR=$CHECKPOINTDIR \
-e IFACE=$IFACE \
flink jobmanager
done
for i in {1..8}; do
docker run -d --net=mynet \
--name=taskmanager$i \
-e ZKHOSTS=$ZKHOSTS \
-e HDFS=$HDFS \
-e HASTORAGEDIR=$HASTORAGEDIR \
-e ZKROOT=$ZKROOT \
-e SLOTS=$SLOTS \
-e CHECKPOINTDIR=$CHECKPOINTDIR \
-e IFACE=$IFACE \
flink taskmanager
done
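
Each job manager's web UI is published on the host (ports 18081 through 18083 per the -p mapping above), so you can verify registered task managers and free slots from Flink's monitoring REST API, for example:

$ curl -s localhost:18081/overview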
ZKHOSTS="zk1,zk2,zk3"
CLUSTERID="mydrillcluster"
for i in {1..10}; do
docker run -d --network=mynet \
--name=drill$i \
-e ZKHOSTS=$ZKHOSTS \
-e CLUSTERID=$CLUSTERID \
drill
done
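
None of the drillbits publish their web UI (port 8047) to the host, but you can still query one over mynet, for example by listing the cluster members from the sys.drillbits system table; the sqlline path here is an assumption about the image layout, and the JDBC URL is keyed to the CLUSTERID above:

$ docker exec -it drill1 bin/sqlline \
  -u jdbc:drill:zk=zk1:2181/drill/mydrillcluster \
  -e "SELECT * FROM sys.drillbits"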
