Demystifying Containers - Part I: Kernel Space

Part I: Kernel Space

Introduction

  1. Not negotiable: They have to run on a single host. Okay, so two computers cannot run a single container.
  2. Clearly: They are groups of processes. You might know that Linux processes live inside a tree structure, so we can say containers must have a root process.
  3. Okay: They need to be isolated, whatever this means in detail.
  4. Not so clear: They have to provide a common set of features. Features tend to change over time, so we have to point out what the most common ones are.

chroot

> mkdir -p new-root/{bin,lib64}
> cp /bin/bash new-root/bin
> cp /lib64/{ld-linux-x86-64.so*,libc.so*,libdl.so.2,libreadline.so*,libtinfo.so*} new-root/lib64
> sudo chroot new-root
#include <sys/stat.h>
#include <unistd.h>

/*
 * Demonstrates that chroot(2) on its own is not a jail: a process that is
 * allowed to call chroot can move the root below its working directory,
 * walk upwards with relative paths and then re-root itself up there.
 *
 * Returns 1 as soon as one of the chroot/chdir steps fails, so that no shell
 * is started in a half-finished state. On success the process image is
 * replaced by bash and this function does not return; if the exec itself
 * fails, the return value of execl (-1) is passed on as before.
 */
int main(void)
{
    /* Result deliberately ignored: the directory may be left over from an
     * earlier run, and a real failure is caught by chroot() just below. */
    (void)mkdir(".out", 0755);

    /* Only the root is changed here, not the working directory, which
     * therefore ends up outside of the new root. */
    if (chroot(".out") != 0) {
        return 1;
    }

    /* Walk upwards from the working directory. NOTE(review): five levels
     * only reach the real root if the program is started at most five
     * directories deep - confirm for the intended demo setup. */
    if (chdir("../../../../../") != 0) {
        return 1;
    }

    /* Make the directory reached above the root of this process. */
    if (chroot(".") != 0) {
        return 1;
    }

    /* argv[0] is the program name by convention, so "-i" has to be the
     * second list element to be seen as an option. The terminator of a
     * variadic argument list must be a null pointer of type char *, a bare
     * NULL may expand to a plain integer 0. */
    return execl("/bin/bash", "bash", "-i", (char *)NULL);
}
> skopeo copy docker://opensuse/tumbleweed:latest oci:tumbleweed:latest
[output removed]
> sudo umoci unpack --image tumbleweed:latest bundle
[output removed]
> sudo chroot bundle/rootfs
#
> mkdir /proc
> mount -t proc proc /proc
> ps aux
[output removed]
> mkdir /sys
> mount -t sysfs sys /sys
> ls /sys/class/net
eth0 lo

Linux Namespaces

API

clone

unshare

setns

proc

> ls -Gg /proc/self/ns/
total 0
lrwxrwxrwx 1 0 Feb 6 18:32 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 0 Feb 6 18:32 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 0 Feb 6 18:32 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 0 Feb 6 18:32 net -> 'net:[4026532008]'
lrwxrwxrwx 1 0 Feb 6 18:32 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 0 Feb 6 18:32 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 0 Feb 6 18:32 user -> 'user:[4026531837]'
lrwxrwxrwx 1 0 Feb 6 18:32 uts -> 'uts:[4026531838]'

Available Namespaces

Mount (mnt)

> sudo unshare -m
# mkdir mount-dir
# mount -n -o size=10m -t tmpfs tmpfs mount-dir
# df mount-dir
Filesystem 1K-blocks Used Available Use% Mounted on
tmpfs 10240 0 10240 0% <PATH>/mount-dir
# touch mount-dir/{0,1,2}
> ls mount-dir
> grep mount-dir /proc/mounts
>
> grep mount-dir /proc/$(pgrep -u root bash)/mountinfo
349 399 0:84 / /mount-dir rw,relatime - tmpfs tmpfs rw,size=10240k

UNIX Time-sharing System (uts)

> sudo unshare -u
# hostname
nb
# hostname new-hostname
# hostname
new-hostname
> hostname
nb

Interprocess Communication (ipc)

Process ID (pid)

> sudo unshare -fp --mount-proc
# ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.4 0.6 18688 6608 pts/0 S 23:15 0:00 -bash
root 39 0.0 0.1 35480 1768 pts/0 R+ 23:15 0:00 ps aux

Network (net)

> sudo unshare -n
# ip l
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> sudo ip netns add mynet
> sudo ip netns list
mynet
> sudo ip netns exec mynet ip l
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> sudo ip netns exec mynet ping 127.0.0.1
connect: Network is unreachable
> sudo ip netns exec mynet ip link set dev lo up
> sudo ip netns exec mynet ping 127.0.0.1
PING 127.0.0.1 (127.0.0.1) 56(84) bytes of data.
64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.016 ms
> sudo ip link add veth0 type veth peer name veth1
> sudo ip link show type veth
11: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether b2:d1:fc:31:9c:d3 brd ff:ff:ff:ff:ff:ff
12: veth0@veth1: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether ca:0f:37:18:76:52 brd ff:ff:ff:ff:ff:ff
> sudo ip link set veth1 netns mynet
> ip link show type veth
12: veth0@if11: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
link/ether ca:0f:37:18:76:52 brd ff:ff:ff:ff:ff:ff link-netns mynet
> sudo ip netns exec mynet ip addr add 172.2.0.1/24 dev veth1
> sudo ip netns exec mynet ip link set dev veth1 up
> sudo ip addr add 172.2.0.2/24 dev veth0
> sudo ip link set dev veth0 up
> ping -c1 172.2.0.1
PING 172.2.0.1 (172.2.0.1) 56(84) bytes of data.
64 bytes from 172.2.0.1: icmp_seq=1 ttl=64 time=0.036 ms

--- 172.2.0.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.036/0.036/0.036/0.000 ms
> sudo ip netns exec mynet ping -c1 172.2.0.2
PING 172.2.0.2 (172.2.0.2) 56(84) bytes of data.
64 bytes from 172.2.0.2: icmp_seq=1 ttl=64 time=0.020 ms

--- 172.2.0.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.020/0.020/0.020/0.000 ms

User ID (user)

> id -u
1000
> unshare -U
> whoami
nobody
> cat /proc/$PID/uid_map
0 1000 1

Control Group (cgroup)

> sudo mkdir /sys/fs/cgroup/memory/demo
> ls /sys/fs/cgroup/memory/demo
cgroup.clone_children
cgroup.event_control
cgroup.procs
memory.failcnt
memory.force_empty
memory.kmem.failcnt
memory.kmem.limit_in_bytes
memory.kmem.max_usage_in_bytes
memory.kmem.slabinfo
memory.kmem.tcp.failcnt
memory.kmem.tcp.limit_in_bytes
memory.kmem.tcp.max_usage_in_bytes
memory.kmem.tcp.usage_in_bytes
memory.kmem.usage_in_bytes
memory.limit_in_bytes
memory.max_usage_in_bytes
memory.move_charge_at_immigrate
memory.numa_stat
memory.oom_control
memory.pressure_level
memory.soft_limit_in_bytes
memory.stat
memory.swappiness
memory.usage_in_bytes
memory.use_hierarchy
notify_on_release
tasks
> sudo su
# echo 100000000 > /sys/fs/cgroup/memory/demo/memory.limit_in_bytes
# echo 0 > /sys/fs/cgroup/memory/demo/memory.swappiness
# echo $$ > /sys/fs/cgroup/memory/demo/cgroup.procs
/// Grows a byte buffer in steps of 10,000,000 bytes inside a loop that has
/// no exit condition and prints the running total after every step.
pub fn main() {
    // Number of bytes added per iteration.
    const STEP: usize = 10_000_000;

    let mut hog: Vec<u8> = Vec::new();
    loop {
        // Append one step worth of bytes, each set to 1.
        let grown = hog.len() + STEP;
        hog.resize(grown, 1u8);

        // The number of completed steps followed by a literal `0` yields
        // the total in units of 10 MB, e.g. "30 MB" after three steps.
        println!("{}0 MB", hog.len() / STEP);
    }
}
# rustc memory.rs
# ./memory
10 MB
20 MB
30 MB
40 MB
50 MB
60 MB
70 MB
80 MB
90 MB
Killed

Composing Namespaces

> sudo unshare -fp --mount-proc
# ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.1 0.6 18688 6904 pts/0 S 23:36 0:00 -bash
root 39 0.0 0.1 35480 1836 pts/0 R+ 23:36 0:00 ps aux
> export PID=$(pgrep -u root bash)
> sudo ls -l /proc/$PID/ns
> sudo nsenter --pid=/proc/$PID/ns/pid unshare --mount-proc
# ps aux
root 1 0.1 0.0 10804 8840 pts/1 S+ 14:25 0:00 -bash
root 48 3.9 0.0 10804 8796 pts/3 S 14:26 0:00 -bash
root 88 0.0 0.0 7700 3760 pts/3 R+ 14:26 0:00 ps aux

Demo Application

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Size in bytes of the memory area handed to the cloned child as its stack. */
#define STACKSIZE (1024 * 1024)
/*
 * Backing memory for the child's stack. A plain char array carries no
 * alignment guarantee, but the address passed to clone (the end of this
 * array) is used as a stack pointer, so the array is aligned to 16 bytes
 * explicitly. STACKSIZE is a multiple of 16, hence the end of the array is
 * aligned as well.
 */
static _Alignas(16) char stack[STACKSIZE];
/*
 * Report a failed step on stderr.
 *
 * reason: short description of what was being attempted. It is printed
 *         together with the text strerror() yields for the current errno,
 *         so this has to be called before errno is overwritten.
 */
void print_err(char const * const reason)
{
    char const * const detail = strerror(errno);

    fprintf(stderr, "Error %s: %s\n", reason, detail);
}
/*
 * Function run by the cloned child: prepares the environment and then
 * replaces the child with the requested command.
 *
 * args: argument vector for execvp; args[0] is the program to run.
 *
 * Returns 1 after printing a message if any step fails. If execvp succeeds
 * the process image is replaced and nothing is returned.
 */
int exec(void * args)
{
    char ** const command = args;
    char const * const new_name = "new-hostname";
    key_t queue_key = {0};

    // Mount a proc filesystem on /proc
    if (mount("proc", "/proc", "proc", 0, "")) {
        print_err("mounting proc");
        return 1;
    }

    // Give the child its own hostname
    if (sethostname(new_name, strlen(new_name))) {
        print_err("setting hostname");
        return 1;
    }

    // Create a message queue using the zero key
    int const queue_id = msgget(queue_key, IPC_CREAT);
    if (queue_id == -1) {
        print_err("creating message queue");
        return 1;
    }

    // Hand over to the requested command; execvp only comes back on failure
    if (execvp(command[0], command)) {
        print_err("executing command");
        return 1;
    }

    return 0;
}
/*
 * Runs the command given on the command line in a child created with clone
 * and a set of new namespaces, then waits for it.
 *
 * argv[1] is the command, any further arguments are passed to it.
 *
 * Returns the exit status of the child if it exited normally, 128 plus the
 * signal number if it was terminated by a signal, and 1 if no command was
 * given or if clone/waitpid failed.
 */
int main(int argc, char ** argv)
{
    // Provide some feedback about the usage
    if (argc < 2) {
        fprintf(stderr, "No command specified\n");
        return 1;
    }

    // Namespace flags; SIGCHLD is the signal sent to the parent when the
    // child terminates
    const int flags = CLONE_NEWNET | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWIPC |
                      CLONE_NEWPID | CLONE_NEWUSER | SIGCHLD;

    // Create a new child process; the end of the stack area is passed as
    // the child's stack pointer
    pid_t pid = clone(exec, stack + STACKSIZE, flags, &argv[1]);
    if (pid < 0) {
        print_err("calling clone");
        return 1;
    }

    // Wait for the process to finish
    int status = 0;
    if (waitpid(pid, &status, 0) == -1) {
        print_err("waiting for pid");
        return 1;
    }

    // WEXITSTATUS is only meaningful for a normal exit
    if (WIFEXITED(status)) {
        return WEXITSTATUS(status);
    }

    // A child killed by a signal must not be reported as a success
    if (WIFSIGNALED(status)) {
        fprintf(stderr, "Child terminated by signal %d\n", WTERMSIG(status));
        return 128 + WTERMSIG(status);
    }

    return 1;
}
> gcc -o namespaces namespaces.c
> ./namespaces ip a
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> ./namespaces ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
nobody 1 0.0 0.1 36524 1828 pts/0 R+ 23:46 0:00 ps aux
> ./namespaces whoami
nobody

Putting it all Together

> sudo runc run -b bundle container
> sudo lsns | grep bash
4026532499 mnt 1 6409 root /bin/bash
4026532500 uts 1 6409 root /bin/bash
4026532504 ipc 1 6409 root /bin/bash
4026532505 pid 1 6409 root /bin/bash
4026532511 net 1 6409 root /bin/bash

Conclusion

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store