doc: Fix typo in English lxc.container.conf(5)

[mirror_lxc.git] / doc / lxc.container.conf.sgml.in
diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in

index 922eeaac8bcb60003d904ae65ddfe165c58d22ec..f3eb13d71f34b48d3df6d8102c876e5fb94f4da5 100644 (file)
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -412,6 +412,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              <para>
                specify what kind of network virtualization to be used
                for the container.
+              Must be specified before any other option(s) on the net device.
                Multiple networks can be specified by using an additional index
                <option>i</option>
                after all <option>lxc.net.*</option> keys. For example,
@@ -474,6 +475,12 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                <option>lxc.net.[i].veth.ipv6.route</option> options.
                Several lines specify several routes.
                The route is in format x.y.z.t/m, eg. 192.168.1.0/24.
+
+              In <option>bridge</option> mode untagged VLAN membership can be set with the
+              <option>lxc.net.[i].veth.vlan.id</option> option. It accepts a special value of 'none' indicating
+              that the container port should be removed from the bridge's default untagged VLAN.
+              The <option>lxc.net.[i].veth.vlan.tagged.id</option> option can be specified multiple times to set
+              the container's bridge port membership to one or more tagged VLANs.
              </para>
  
              <para>
@@ -530,25 +537,25 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                modes are <option>l3</option>, <option>l3s</option> and
                <option>l2</option>. It defaults to <option>l3</option> mode.
                In <option>l3</option> mode TX processing up to L3 happens on the stack instance
-              attached to the slave device and packets are switched to the stack instance of the
-              master device for the L2 processing and routing from that instance will be
-              used before packets are queued on the outbound device. In this mode the slaves
+              attached to the dependent device and packets are switched to the stack instance of the
+              parent device for the L2 processing and routing from that instance will be
+              used before packets are queued on the outbound device. In this mode the dependent devices
                will not receive nor can send multicast / broadcast traffic.
                In <option>l3s</option> mode TX processing is very similar to the L3 mode except that
                iptables (conn-tracking) works in this mode and hence it is L3-symmetric (L3s).
                This will have slightly less performance but that shouldn't matter since you are
                choosing this mode over plain-L3 mode to make conn-tracking work.
                In <option>l2</option> mode TX processing happens on the stack instance attached to
-              the slave device and packets are switched and queued to the master device to send
-              out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) as well.
+              the dependent device and packets are switched and queued to the parent device to send
+              out. In this mode the dependent devices will RX/TX multicast and broadcast (if applicable) as well.
                <option>lxc.net.[i].ipvlan.isolation</option> specifies the isolation mode.
                The accepted isolation values are <option>bridge</option>,
                <option>private</option> and <option>vepa</option>.
                It defaults to <option>bridge</option>.
-              In <option>bridge</option> isolation mode slaves can cross-talk among themselves
-              apart from talking through the master device.
+              In <option>bridge</option> isolation mode dependent devices can cross-talk among themselves
+              apart from talking through the parent device.
                In <option>private</option> isolation mode the port is set in private mode.
-              i.e. port won't allow cross communication between slaves.
+              i.e. port won't allow cross communication between dependent devices.
                In <option>vepa</option> isolation mode the port is set in VEPA mode.
                i.e. port will offload switching functionality to the external entity as
                described in 802.1Qbg.
@@ -867,7 +874,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              <para>
                If set, the container will have a new pseudo tty
                instance, making this private to it. The value specifies
-              the maximum number of pseudo ttys allowed for a pts
+              the maximum number of pseudo ttys allowed for a pty
                instance (this limitation is not implemented yet).
              </para>
            </listitem>
@@ -1118,7 +1125,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
          the container at some <filename>path</filename>, and then mounts
          under <filename>path</filename>, then a TOCTTOU attack would be
          possible where the container user modifies a symbolic link under
-        his home directory at just the right time.
+        their home directory at just the right time.
        </para>
        <variablelist>
          <varlistentry>
@@ -1164,7 +1171,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                Specify a mount point corresponding to a line in the
                fstab format.
  
-              Moreover lxc supports mount propagation, such as rslave or
+              Moreover lxc supports mount propagation, such as rshared or
                rprivate, and adds three additional mount options.
                <option>optional</option> don't fail if mount does not work.
                <option>create=dir</option> or <option>create=file</option>
@@ -1172,9 +1179,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                <option>relative</option> source path is taken to be relative to
                the mounted container root. For instance,
               </para>
-<screen>
-dev/null proc/kcore none bind,relative 0 0
-</screen>
+             <programlisting>
+             dev/null proc/kcore none bind,relative 0 0
+             </programlisting>
               <para>
                Will expand dev/null to ${<option>LXC_ROOTFS_MOUNT</option>}/dev/null,
                and mount it to proc/kcore inside the container.
@@ -1490,7 +1497,21 @@ dev/null proc/kcore none bind,relative 0 0
            </term>
            <listitem>
              <para>
-              extra mount options to use when mounting the rootfs.
+              Specify extra mount options to use when mounting the rootfs.
+                The format of the mount options corresponds to the
+               format used in fstab. In addition, LXC supports the custom
+                <option>idmap=</option> mount option. This option can be used
+               to tell LXC to create an idmapped mount for the container's
+                rootfs. This is useful when the user doesn't want to recursively
+               chown the rootfs of the container to match the idmapping of the
+               user namespace the container is going to use. Instead an
+               idmapped mount can be used to handle this.
+               The argument for
+                <option>idmap=</option>
+                can either be a path pointing to a user namespace file that
+                LXC will open and use to idmap the rootfs or the special value
+                "container" which will instruct LXC to use
+               the container's user namespace to idmap the rootfs.
              </para>
            </listitem>
          </varlistentry>
@@ -1512,7 +1533,7 @@ dev/null proc/kcore none bind,relative 0 0
      </refsect2>
  
      <refsect2>
-      <title>Control group</title>
+      <title>Control groups ("cgroups")</title>
        <para>
          The control group section contains the configuration for the
          different subsystem. <command>lxc</command> does not check the
@@ -1521,10 +1542,195 @@ dev/null proc/kcore none bind,relative 0 0
          started, but has the advantage of permitting any future
          subsystem.
        </para>
+
+      <para>
+       The kernel implementation of cgroups has changed significantly over the
+       years. With Linux 4.5 support for a new cgroup filesystem was added
+       usually referred to as "cgroup2" or "unified hierarchy". Since then the
+       old cgroup filesystem is usually referred to as "cgroup1" or the
+       "legacy hierarchies". Please see the cgroups manual page for a detailed
+       explanation of the differences between the two versions.
+      </para>
+
+      <para>
+       LXC distinguishes settings for the legacy and the unified hierarchy by
+       using different configuration key prefixes. To alter settings for
+       controllers in a legacy hierarchy the key prefix
+       <option>lxc.cgroup.</option> must be used and in order to alter the
+       settings for a controller in the unified hierarchy the
+       <option>lxc.cgroup2.</option> key must be used. Note that LXC will
+       ignore <option>lxc.cgroup.</option> settings on systems that only use
+       the unified hierarchy. Conversely, it will ignore
+       <option>lxc.cgroup2.</option> options on systems that only use legacy
+       hierarchies.
+      </para>
+
+      <para>
+       At its core a cgroup hierarchy is a way to hierarchically organize
+       processes. Usually a cgroup hierarchy will have one or more
+       "controllers" enabled. A "controller" in a cgroup hierarchy is usually
+       responsible for distributing a specific type of system resource along
+       the hierarchy. Controllers include the "pids" controller, the "cpu"
+       controller, the "memory" controller and others. Some controllers
+       however do not fall into the category of distributing a system
+       resource, instead they are often referred to as "utility" controllers.
+       One utility controller is the device controller. Instead of
+       distributing a system resource it allows managing device access.
+      </para>
+
+      <para>
+       In the legacy hierarchy the device controller was implemented like most
+       other controllers as a set of files that could be written to. These
+       files were named "devices.allow" and "devices.deny". The legacy device
+       controller allowed the implementation of both "allowlists" and
+       "denylists".
+      </para>
+
+      <para>
+       An allowlist is a device program that by default blocks access to all
+       devices. In order to access specific devices "allow rules" for
+       particular devices or device classes must be specified. In contrast, a
+       denylist is a device program that by default allows access to all
+       devices. In order to restrict access to specific devices "deny rules"
+       for particular devices or device classes must be specified.
+      </para>
+
+      <para>
+       In the unified cgroup hierarchy the implementation of the device
+       controller has completely changed. Instead of files to read from and
+       write to, an eBPF program of type
+       <option>BPF_PROG_TYPE_CGROUP_DEVICE</option> can be attached to a
+       cgroup. Even though the kernel implementation has changed completely
+       LXC tries to allow for the same semantics to be followed in the legacy
+       device cgroup and the unified eBPF-based device controller. The
+       following paragraphs explain the semantics for the unified eBPF-based
+       device controller.
+      </para>
+
+      <para>
+       As mentioned the format for specifying device rules for the unified
+       eBPF-based device controller is the same as for the legacy cgroup
+       device controller; only the configuration key prefix has changed.
+       Specifically, device rules for the legacy cgroup device controller are
+       specified via <option>lxc.cgroup.devices.allow</option> and
+       <option>lxc.cgroup.devices.deny</option> whereas for the
+       cgroup2 eBPF-based device controller
+       <option>lxc.cgroup2.devices.allow</option> and
+       <option>lxc.cgroup2.devices.deny</option> must be used.
+      </para>
+      <para>
+        <itemizedlist>
+          <listitem>
+           <para>
+             An allowlist device rule
+              <programlisting>
+               lxc.cgroup2.devices.deny = a
+              </programlisting>
+             will cause LXC to instruct the kernel to block access to all
+             devices by default. To grant access to devices allow device rules
+             must be added via the <option>lxc.cgroup2.devices.allow</option>
+             key. This is referred to as an "allowlist" device program.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+             A denylist device rule
+              <programlisting>
+               lxc.cgroup2.devices.allow = a
+              </programlisting>
+             will cause LXC to instruct the kernel to allow access to all
+             devices by default. To deny access to devices deny device rules
+             must be added via the <option>lxc.cgroup2.devices.deny</option> key.
+             This is referred to as a "denylist" device program.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+             Specifying any of the aforementioned two rules will cause all
+             previous rules to be cleared, i.e. the device list will be reset.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+           When an allowlist program is requested, i.e. access to all devices
+           is blocked by default, specific deny rules for individual devices
+           or device classes are ignored.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+           When a denylist program is requested, i.e. access to all devices
+           is allowed by default, specific allow rules for individual devices
+           or device classes are ignored.
+           </para>
+         </listitem>
+        </itemizedlist>
+      </para>
+
+      <para>
+        For example the set of rules:
+        <programlisting>
+          lxc.cgroup2.devices.deny = a
+          lxc.cgroup2.devices.allow = c *:* m
+          lxc.cgroup2.devices.allow = b *:* m
+          lxc.cgroup2.devices.allow = c 1:3 rwm
+        </programlisting>
+       implements an allowlist device program, i.e. the kernel will block
+       access to all devices not specifically allowed in this list. This
+       particular program states that all character and block devices may be
+       created but only /dev/null might be read or written.
+      </para>
+
+      <para>
+        If we instead switch to the following set of rules:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+        </programlisting>
+        then LXC would instruct the kernel to implement a denylist, i.e. the
+        kernel will allow access to all devices not specifically denied in
+        this list. This particular program states that no character devices or
+        block devices might be created and that /dev/null is not allowed
+        to be read, written, or created.
+      </para>
+
+      <para>
+        Now consider the same program but followed by a "global rule"
+        which determines the type of device program (allowlist or
+        denylist) as explained above:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+          lxc.cgroup2.devices.allow = a
+        </programlisting>
+       The last line will cause LXC to reset the device list without changing
+       the type of device program.
+      </para>
+
+      <para>
+       If we specify:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+          lxc.cgroup2.devices.deny = a
+        </programlisting>
+       instead then the last line will cause LXC to reset the device list and
+       switch from a denylist program to an allowlist program.
+      </para>
        <variablelist>
          <varlistentry>
            <term>
-            <option>lxc.cgroup.[controller name]</option>
+            <option>lxc.cgroup.[controller name].[controller file]</option>
            </term>
            <listitem>
              <para>
@@ -1539,7 +1745,7 @@ dev/null proc/kcore none bind,relative 0 0
          </varlistentry>
          <varlistentry>
            <term>
-            <option>lxc.cgroup2.[controller name]</option>
+            <option>lxc.cgroup2.[controller name].[controller file]</option>
            </term>
            <listitem>
              <para>
@@ -1571,6 +1777,65 @@ dev/null proc/kcore none bind,relative 0 0
              </para>
            </listitem>
          </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.container</option>
+          </term>
+          <listitem>
+            <para>
+              This is similar to <option>lxc.cgroup.dir</option>, but must be
+              used together with <option>lxc.cgroup.dir.monitor</option> and
+              affects only the container's cgroup path. This option is mutually
+              exclusive with <option>lxc.cgroup.dir</option>.
+              Note that the final path the container attaches to may be
+              extended further by the
+              <option>lxc.cgroup.dir.container.inner</option> option.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.monitor</option>
+          </term>
+          <listitem>
+            <para>
+              This is the monitor process counterpart to
+              <option>lxc.cgroup.dir.container</option>.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.monitor.pivot</option>
+          </term>
+          <listitem>
+            <para>
+              On container termination the PID of the monitor process is attached to this cgroup.
+              This path should not be a subpath of any other configured cgroup dir to ensure
+              proper removal of other cgroup paths on container termination.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.container.inner</option>
+          </term>
+          <listitem>
+            <para>
+              Specify an additional subdirectory where the cgroup namespace
+              will be created. With this option, the cgroup limits will be
+              applied to the outer path specified in
+              <option>lxc.cgroup.dir.container</option>, which is not accessible
+              from within the container, making it possible to better enforce
+              limits for privileged containers in a way they cannot override
+              them.
+              This only works in conjunction with the
+              <option>lxc.cgroup.dir.container</option> and
+              <option>lxc.cgroup.dir.monitor</option> options and has otherwise
+              no effect.
+            </para>
+          </listitem>
+        </varlistentry>
          <varlistentry>
            <term>
              <option>lxc.cgroup.relative</option>
@@ -1676,7 +1941,7 @@ dev/null proc/kcore none bind,relative 0 0
              standard namespace identifiers as seen in the
              <filename>/proc/PID/ns</filename> directory.
              The <option>lxc.namespace.keep</option> is a
-            blacklist option, i.e. it is useful when enforcing that containers
+            denylist option, i.e. it is useful when enforcing that containers
              must keep a specific set of namespaces.
              </para>
  
@@ -1753,6 +2018,33 @@ dev/null proc/kcore none bind,relative 0 0
              </para>
            </listitem>
          </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.time.offset.boot</option>
+          </term>
+          <listitem>
+            <para>
+           Specify a positive or negative offset for the boottime clock. The
+           format accepts hours (h), minutes (m), seconds (s),
+           milliseconds (ms), microseconds (us), and nanoseconds (ns).
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.time.offset.monotonic</option>
+          </term>
+          <listitem>
+            <para>
+           Specify a positive or negative offset for the monotonic clock. The
+           format accepts hours (h), minutes (m), seconds (s),
+           milliseconds (ms), microseconds (us), and nanoseconds (ns).
+            </para>
+          </listitem>
+        </varlistentry>
+
        </variablelist>
      </refsect2>
  
@@ -1932,6 +2224,44 @@ dev/null proc/kcore none bind,relative 0 0
              <programlisting>lxc.selinux.context = system_u:system_r:lxc_t:s0:c22</programlisting>
            </listitem>
          </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.selinux.context.keyring</option>
+          </term>
+          <listitem>
+            <para>
+              Specify the SELinux context under which the container's keyring
+              should be created. By default this is the same as lxc.selinux.context, or
+              the context lxc is executed under if lxc.selinux.context has not been set.
+            </para>
+            <programlisting>lxc.selinux.context.keyring = system_u:system_r:lxc_t:s0:c22</programlisting>
+          </listitem>
+        </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>Kernel Keyring</title>
+      <para>
+        The Linux Keyring facility is primarily a way for various
+        kernel components to retain or cache security data, authentication
+        keys, encryption keys, and other data in the kernel. By default lxc
+        will create a new session keyring for the started application.
+      </para>
+      <variablelist>
+        <varlistentry>
+          <term>
+            <option>lxc.keyring.session</option>
+          </term>
+          <listitem>
+            <para>
+              Disable the creation of a new session keyring by lxc. The started
+              application will then inherit the current session keyring.
+              By default, or when passing the value 1, a new keyring will be created.
+            </para>
+            <programlisting>lxc.keyring.session = 0</programlisting>
+          </listitem>
+        </varlistentry>
        </variablelist>
      </refsect2>
  
@@ -1946,26 +2276,26 @@ dev/null proc/kcore none bind,relative 0 0
        </para>
        <para>
          Versions 1 and 2 are currently supported.  In version 1, the
-        policy is a simple whitelist.  The second line therefore must
-        read "whitelist", with the rest of the file containing one (numeric)
-        syscall number per line.  Each syscall number is whitelisted,
-        while every unlisted number is blacklisted for use in the container
+        policy is a simple allowlist.  The second line therefore must
+        read "allowlist", with the rest of the file containing one (numeric)
+        syscall number per line.  Each syscall number is allowlisted,
+        while every unlisted number is denylisted for use in the container
        </para>
  
        <para>
-       In version 2, the policy may be blacklist or whitelist,
+       In version 2, the policy may be denylist or allowlist,
         supports per-rule and per-policy default actions, and supports
         per-architecture system call resolution from textual names.
        </para>
        <para>
-       An example blacklist policy, in which all system calls are
+       An example denylist policy, in which all system calls are
         allowed except for mknod, which will simply do nothing and
         return 0 (success), looks like:
        </para>
  
        <programlisting>
        2
-      blacklist
+      denylist
        mknod errno 0
        ioctl notify
        </programlisting>
@@ -2770,7 +3100,7 @@ dev/null proc/kcore none bind,relative 0 0
          lxc.net.1.ipv6.address = 2003:db8:1:0:214:1234:fe0b:3596
          lxc.net.2.type = phys
          lxc.net.2.flags = up
-        lxc.net.2.link = dummy0
+        lxc.net.2.link = random0
          lxc.net.2.hwaddr = 4a:49:43:49:79:ff
          lxc.net.2.ipv4.address = 10.2.3.6/24
          lxc.net.2.ipv6.address = 2003:db8:1:0:214:1234:fe0b:3297
@@ -2782,6 +3112,7 @@ dev/null proc/kcore none bind,relative 0 0
          lxc.mount.fstab = /etc/fstab.complex
          lxc.mount.entry = /lib /root/myrootfs/lib none ro,bind 0 0
          lxc.rootfs.path = dir:/mnt/rootfs.complex
+        lxc.rootfs.options = idmap=container
          lxc.cap.drop = sys_module mknod setuid net_raw
          lxc.cap.drop = mac_override
        </programlisting>