Last active
June 13, 2018 21:25
-
-
Save sczizzo/dd5767de5b1d0aad72db27c82803fb8d to your computer and use it in GitHub Desktop.
Toy container runtime in Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Dependencies: | |
# - capsh | |
# - cgroup-utils | |
# - systemd | |
# - ruby | |
# | |
require 'fileutils' | |
require 'logger' | |
require 'optparse' | |
require 'ostruct' | |
require 'tmpdir' | |
require 'securerandom' | |
require 'shellwords' | |
Thread.abort_on_exception = true | |
# Command-line handling for the container runtime.
#
# Usage: SCRIPT [OPTIONS] IMAGE CONTAINER [-- COMMAND]
class Options
  class Invalid < StandardError; end

  # Level names accepted by -l/--level (matched case-insensitively).
  LOG_LEVELS = %w[DEBUG INFO WARN ERROR FATAL UNKNOWN].freeze

  # Builds the option set from +args+ and applies the chosen log level to
  # +logger+.
  #
  # @param args [Array<String>] raw command-line arguments (e.g. ARGV)
  # @param logger [Logger] logger whose level is set from the parsed options
  # @return [OpenStruct] parsed options, including a generated +id+
  # @raise [OptionParser::ParseError] on missing, extra or invalid arguments
  def self.parse(args, logger)
    options = OpenStruct.new
    options.cap_set = []
    options.cap_drop = []
    options.cgroups = [
      'blkio',
      'cpu',
      'cpuset',
      'devices',
      'freezer',
      'hugetlb',
      'memory',
      'net_prio',
      'perf_event',
      'pids'
    ]
    options.command = ['/sbin/init']
    options.container = nil
    options.log_level = Logger::INFO
    options.environment = { 'PATH' => '/usr/sbin:/usr/bin:/sbin:/bin' }
    options.hostname = nil
    options.image = nil
    options.namespaces = ['ipc', 'mount', 'pid', 'uts'] # net, user
    options.resources = []
    options.user = nil
    options.volumes = []

    # Everything after "--" is the container command and must not be seen by
    # OptionParser, so it is split off first.
    args = parse_command! options, args
    parser(options).parse! args
    parse_positional! options, args

    logger.level = options.log_level
    options.id = options.container + '-' + SecureRandom.hex
    options.hostname ||= options.container
    options
  end

  # Splits +args+ at the first "--". The part after it replaces the default
  # command; the part before it is returned for option parsing.
  #
  # @raise [OptionParser::MissingArgument] when "--" is given with no command
  def self.parse_command!(options, args)
    dash_idx = args.index('--')
    if dash_idx
      options.command = args.drop(dash_idx + 1)
      args = args.take(dash_idx)
    end
    raise OptionParser::MissingArgument, 'command' if options.command.empty?
    args
  end

  # Assigns the two positional arguments (IMAGE, CONTAINER) left over after
  # option parsing.
  #
  # @raise [OptionParser::MissingArgument] when either one is missing
  # @raise [OptionParser::InvalidArgument] when more than two are given
  def self.parse_positional!(options, args)
    case args.size
    when 0
      raise OptionParser::MissingArgument, 'image'
    when 1
      raise OptionParser::MissingArgument, 'container'
    when 2
      options.image, options.container = args
    else
      raise OptionParser::InvalidArgument
    end
  end

  # Builds the OptionParser that writes into +options+.
  #
  # The first -c / -e / -n given replaces the corresponding default list
  # instead of appending to it; the other array options accumulate.
  def self.parser(options)
    custom_cgroups = []
    custom_environment = {}
    custom_namespaces = []

    OptionParser.new do |opts|
      opts.banner = "Usage: #{__FILE__} [OPTIONS] IMAGE CONTAINER [-- COMMAND]"
      opts.separator ''
      opts.separator 'Options:'

      opts.on '-c', '--cgroup CGROUP', 'CGroups (array)' do |cgroup|
        custom_cgroups << cgroup
        options[:cgroups] = custom_cgroups.uniq
      end

      opts.on '-d', '--drop-cap CAPABILITY', 'Drop capabilities (array)' do |cap|
        options[:cap_drop] << cap
        options[:cap_drop].uniq!
      end

      opts.on '-e', '--environment ENV_VAR', 'Environment (array)' do |env_var|
        key, value = env_var.split '=', 2
        custom_environment[key] = value
        options[:environment] = custom_environment
      end

      opts.on '-h', '--hostname NAME', 'Hostname' do |name|
        options[:hostname] = name
      end

      opts.on '-l', '--level LEVEL', 'Log level' do |level|
        # Only look up known severity names: const_get on arbitrary input
        # could raise NameError or resolve an unrelated Logger constant.
        level_name = level.upcase
        unless LOG_LEVELS.include?(level_name)
          raise OptionParser::InvalidArgument, level
        end
        options[:log_level] = Logger.const_get level_name
      end

      opts.on '-n', '--namespace NAMESPACE', 'Namespaces (array)' do |namespace|
        custom_namespaces << namespace
        options[:namespaces] = custom_namespaces.uniq
      end

      opts.on '-r', '--resource RESOURCE', 'Set resource (array)' do |resource|
        options[:resources] << resource
        options[:resources].uniq!
      end

      opts.on '-s', '--set-cap CAPABILITY', 'Set capabilities (array)' do |cap|
        options[:cap_set] << cap
        options[:cap_set].uniq!
      end

      opts.on '-u', '--user USER', 'Set user' do |user|
        options[:user] = user
      end

      opts.on '-v', '--volume SRC[:DEST[:OPTS]]', 'Volumes (array)' do |volume|
        # Locals are deliberately not named `opts`, which is the enclosing
        # OptionParser block variable.
        src, dest, raw_opts = volume.split ':', 3
        dest ||= src
        # "ro,size=1G" => { 'ro' => nil, 'size' => '1G' }; empty items are
        # skipped so that "ro,,rw" does not break the hash construction.
        mount_opts = (raw_opts || '').split(',').reject(&:empty?).map do |opt|
          name, value = opt.split('=', 2)
          [name, value]
        end
        options[:volumes] << {
          device: src,
          dir: dest,
          opts: Hash[mount_opts]
        }
      end
    end
  end
end
# Thin wrappers around the mount(8)/umount(8) command-line tools.
class Mount
  # Renders a mount option hash as the comma-separated string given to -o.
  # A nil value yields a bare flag ("ro"), anything else "key=value".
  #
  # @param opts [Hash] option name => value (or nil)
  # @return [String]
  def self.format_opts(opts={})
    opts.map do |key, value|
      [key, value].compact.map { |part| Shellwords.escape(part) }.join('=')
    end.join(',')
  end

  # Builds the argv for a mount invocation without running it.
  #
  # @param device [String] source device or directory
  # @param dir [String] mount point
  # @param type [String, Symbol, nil] filesystem type for -t
  # @param opts [Hash, nil] mount options for -o; nil or empty adds nothing
  # @param args [Array<String>] extra arguments placed right after "mount"
  # @return [Array] command words
  def self.format_command(device:, dir:, type:nil, opts:nil, args:[])
    command = ['mount', *args]
    command += ['-t', type] if type
    # opts defaults to nil, so guard before asking whether it has entries.
    command += ['-o', format_opts(opts)] if opts && opts.any?
    command + [device, dir]
  end

  # Lazily unmounts +dir+ (umount -l).
  def self.unmount(dir:)
    system Shellwords.join(['umount', '-l', dir])
  end

  # Mounts +device+ on +dir+. With a block, yields +dir+ and unmounts it when
  # the block finishes, even if it raises.
  #
  # @return [String] +dir+
  def self.mount(device:, dir:, type:nil, opts:nil, args:[])
    mount_command = format_command(
      device: device, dir: dir, type: type, opts: opts, args: args
    )
    system Shellwords.join(mount_command)
    return dir unless block_given?

    begin
      yield dir
    ensure
      unmount dir: dir
    end
    dir
  end

  # Mounts an overlay filesystem with +root+ as the lower layer and
  # upper/work directories inside a temporary directory, then calls +block+
  # with the overlay mount point. The temporary directory is removed when
  # the block returns.
  def self.overlay(root, &block)
    Dir.mktmpdir do |tmp|
      layers = {
        lowerdir: root,
        upperdir: File.join(tmp, 'upper'),
        workdir: File.join(tmp, 'work')
      }
      overlay_dir = File.join(tmp, 'overlay')
      # lowerdir (the image root) is created too if it does not exist yet.
      FileUtils.mkdir_p layers.values + [overlay_dir]
      mount(device: root, dir: overlay_dir, type: :overlay, opts: layers) do
        block.call overlay_dir
      end
    end
  end
end
# Assorted helpers: process-tree lookup and host file/volume plumbing.
class Utils
  # Returns the PIDs of all descendants of +pid+, found by calling
  # `pgrep -P` recursively. Direct children come first, then theirs.
  #
  # @param pid [Integer] root of the process tree (defaults to this process)
  # @return [Array<Integer>]
  def self.child_pids(pid=Process.pid)
    children = `pgrep -P #{pid}`.lines.map { |line| line.strip.to_i }
    children.delete(pid)
    children + children.flat_map { |child| child_pids(child) }
  end

  # Copies the host's /etc/resolv.conf to the same path under +overlay+.
  def self.copy_host_resolv_conf(overlay)
    FileUtils.cp '/etc/resolv.conf', File.join(overlay, '/etc/resolv.conf')
  end

  # Recursively bind-mounts each host volume under +overlay+ and registers an
  # exit hook that force-unmounts it.
  #
  # @param overlay [String] root directory of the container filesystem
  # @param volumes [Array<Hash>] hashes with :device, :dir, :opts and an
  #   optional :args; they are not modified
  # @return [Array<String>] the mount points created inside +overlay+
  def self.mount_host_volumes(overlay, volumes)
    volumes.map do |vol|
      overlay_vol_dir = File.join(overlay, vol[:dir])
      # Work on a copy so the caller's option hashes keep their original :dir.
      mount_spec = vol.merge(
        dir: overlay_vol_dir,
        args: Array(vol[:args]) | ['--rbind']
      )
      [
        ['mkdir', '-p', overlay_vol_dir],
        Mount.format_command(**mount_spec)
      ].each do |command|
        system Shellwords.join(command)
      end
      # at_exit hooks accumulate, whereas trap('EXIT') keeps only the most
      # recently installed handler and so would clean up the last volume only.
      at_exit do
        `#{Shellwords.join(['umount', '-f', overlay_vol_dir])} >/dev/null 2>&1`
      end
      overlay_vol_dir
    end
  end

  # Unmounts every path in +mounts+, discarding output and ignoring failures.
  def self.umount_host_volumes(mounts)
    mounts.each do |mount|
      `#{Shellwords.join(['umount', mount])} >/dev/null 2>&1`
    end
  end
end
# Wrappers around the cgroup command-line tools (cgcreate, cgdelete, cgset,
# cgexec). A "cgroup" here is a hash { controllers: [...], path: '...' }.
class CGroup
  # Builds the "-g controllers:path" argument pair shared by the cg* tools.
  #
  # @param controllers [Array<String>] controller names, joined with commas
  # @param path [String] cgroup path
  # @return [Array<String>] ['-g', 'ctrl1,ctrl2:path']
  def self.group_opt(controllers:, path:)
    ['-g', controllers.join(',') + ':' + path]
  end

  # Creates the group with cgcreate. Returns what Kernel#system returns.
  def self.create_group(controllers:, path:)
    system Shellwords.join([
      'cgcreate', *group_opt(controllers: controllers, path: path)
    ])
  end

  # Deletes the group with cgdelete. Returns what Kernel#system returns.
  def self.delete_group(controllers:, path:)
    system Shellwords.join([
      'cgdelete', *group_opt(controllers: controllers, path: path)
    ])
  end

  # Applies each "name=value" entry of +resources+ to the group via cgset -r.
  # Does nothing (returns nil) when +resources+ is empty.
  #
  # @param cgroup [Hash] group hash; only :path is used
  # @param resources [Array<String>]
  def self.resources(cgroup:, resources:)
    resource_opts = resources.flat_map { |resource| ['-r', resource] }
    return if resource_opts.empty?
    system Shellwords.join(['cgset', *resource_opts, cgroup[:path]])
  end

  # Runs +command+ inside the group using cgexec --sticky.
  #
  # @param cgroup [Hash] group hash with :controllers and :path
  # @param command [Array<String>] command words
  def self.exec(cgroup:, command:)
    # The hash is splatted into keywords explicitly; passing it positionally
    # to a keyword-only method raises ArgumentError on Ruby 3.
    system Shellwords.join([
      'cgexec', '--sticky', *group_opt(**cgroup), *command
    ])
  end

  # Creates the group, calls +block+ with the group hash, and always deletes
  # the group afterwards. Returns the block's value.
  def self.with_group(controllers:, path:, &block)
    cgroup = { controllers: controllers, path: path }
    create_group(**cgroup)
    begin
      block.call cgroup
    ensure
      delete_group(**cgroup)
    end
  end
end
# Manages directories under the systemd cgroup hierarchy and keeps this
# process's descendants registered in one of them.
class Slice
  # Joins +path+ components onto /sys/fs/cgroup/systemd.
  #
  # @return [String]
  def self.slice_path(*path)
    File.join '/sys/fs/cgroup/systemd', *path
  end

  # Creates the directory for +slice+ (and any missing parents).
  #
  # @return [String] the full path of the slice directory
  def self.create(slice)
    path = slice_path slice
    FileUtils.mkdir_p path
    path
  end

  # Removes the slice directory at +slice_path+ (a full path as returned by
  # .create), ignoring errors.
  def self.delete(slice_path)
    FileUtils.rm_rf slice_path
  end

  # Starts a background thread that, every 0.25 seconds, appends the PID of
  # each descendant of the current process to "<slice_path>/tasks".
  #
  # @param slice_path [String] full path of the slice directory
  # @return [Thread] the watcher thread; it loops until killed
  def self.watch(slice_path)
    tasks_file = File.join(slice_path, 'tasks')
    Thread.new do
      loop do
        Utils.child_pids.each do |pid|
          begin
            # The file is reopened for every PID so each one goes out in its
            # own write (presumably what the tasks interface expects; confirm).
            File.open(tasks_file, 'a') { |f| f.puts pid.to_s }
          rescue SystemCallError, IOError
            # Best effort: a failed write (presumably the process exited or
            # the slice is gone) is skipped. Only I/O failures are ignored,
            # so programming errors still surface.
          end
        end
        sleep 0.25
      end
    end
  end
end
# --- Entry point ------------------------------------------------------------
# Requires root: the steps below mount filesystems, create cgroups and
# unshare namespaces.
unless Process.uid.zero?
  class YouAintRoot < StandardError; end
  raise YouAintRoot, 'Must be run with root privileges'
end

logger = Logger.new $stderr
options = Options.parse ARGV, logger

unless File.exist? options.image
  class ImageDontExist < StandardError; end
  raise ImageDontExist, 'Could not find specified image'
end

# The image is used as the read-only lower layer; all writes made by the
# container land in a temporary upper layer that is discarded afterwards.
Mount.overlay options.image do |overlay|
  environment = options.environment.map do |k, v|
    "#{Shellwords.escape k}=#{Shellwords.escape v}"
  end.join(' ')

  # Shell script run inside the chroot: mount the pseudo filesystems, set the
  # hostname, then exec the command with only the configured environment.
  start_container = <<-END
    mount -t proc proc /proc
    mount -t tmpfs -o nosuid,strictatime,mode=755,size=1G tmpfs /tmp
    mount -t tmpfs -o nosuid,strictatime,mode=755,size=1G tmpfs /dev
    mount -t sysfs -o nosuid,noexec,nodev,ro sys /sys
    hostname #{Shellwords.escape options.hostname}
    exec env -i #{environment} \
    #{Shellwords.join(options.command)}
  END

  # capsh performs the chroot, adjusts capabilities and switches user before
  # running the script above.
  cap_chroot = "--chroot=#{overlay}"
  cap_sets = options.cap_set.map { |cap| Shellwords.escape "cap_#{cap}" }
  cap_set = cap_sets.any? ? '--cap=' + cap_sets.join(',') : nil
  cap_drops = options.cap_drop.map { |cap| Shellwords.escape "cap_#{cap}" }
  cap_drop = cap_drops.any? ? '--drop=' + cap_drops.join(',') : nil
  cap_user = options.user ? "--user=#{options.user}" : nil
  capsh_opts = [cap_chroot, cap_set, cap_drop, cap_user].compact
  capsh_command = ['capsh', *capsh_opts, '--', '-c', start_container]

  # unshare creates the requested namespaces and forks so that the container
  # command runs as a child of unshare.
  unshare_opts = options.namespaces.map { |ns| "--#{ns}" }
  unshare_command = ['unshare', *unshare_opts, '--fork', *capsh_command]

  logger.debug options: options, command: unshare_command

  CGroup.with_group controllers: options.cgroups,
                    path: "/rocker/#{options.id}" \
  do |cgroup|
    Utils.copy_host_resolv_conf overlay
    mounts = Utils.mount_host_volumes overlay, options.volumes
    Slice.create '/rocker'
    container_slice = Slice.create "/rocker/#{options.id}"
    watcher = nil
    begin
      watcher = Slice.watch container_slice
      CGroup.resources cgroup: cgroup,
                       resources: options.resources
      CGroup.exec cgroup: cgroup,
                  command: unshare_command
    ensure
      # Stop the watcher before removing the slice so it does not keep
      # writing into a directory that no longer exists.
      watcher.kill if watcher
      Slice.delete container_slice
      Utils.umount_host_volumes mounts
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment