doc: Fix typo in English lxc.container.conf(5)

[mirror_lxc.git] / doc / lxc.container.conf.sgml.in
diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in

index 00b51a94aa9c92d502d2840c711b4a7472465949..f3eb13d71f34b48d3df6d8102c876e5fb94f4da5 100644 (file)
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -375,7 +375,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
            <listitem>
              <para>
                The only allowed values are 0 and 1. Set this to 1 to destroy a
-              container on shutdown. 
+              container on shutdown.
              </para>
            </listitem>
          </varlistentry>
@@ -412,6 +412,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              <para>
                specify what kind of network virtualization to be used
                for the container.
+              Must be specified before any other option(s) on the net device.
                Multiple networks can be specified by using an additional index
                <option>i</option>
                after all <option>lxc.net.*</option> keys. For example,
@@ -443,14 +444,23 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              <para>
                <option>veth:</option> a virtual ethernet pair
                device is created with one side assigned to the container
-              and the other side attached to a bridge specified by
+              and the other side on the host.
+              <option>lxc.net.[i].veth.mode</option> specifies the
+              mode the veth parent will use on the host.
+              The accepted modes are <option>bridge</option> and <option>router</option>.
+              The mode defaults to bridge if not specified.
+              In <option>bridge</option> mode the host side is attached to a bridge specified by
                the <option>lxc.net.[i].link</option> option.
-              If the bridge is not specified, then the veth pair device
+              If the bridge link is not specified, then the veth pair device
                will be created but not attached to any bridge.
                Otherwise, the bridge has to be created on the system
                before starting the container.
                <command>lxc</command> won't handle any
                configuration outside of the container.
+              In <option>router</option> mode static routes are created on the host for the
+              container's IP addresses pointing to the host side veth interface.
+              Additionally Proxy ARP and Proxy NDP entries are added on the host side veth interface
+              for the gateway IPs defined in the container to allow the container to reach the host.
                By default, <command>lxc</command> chooses a name for the
                network device belonging to the outside of the
                container, but if you wish to handle
@@ -459,6 +469,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                the <option>lxc.net.[i].veth.pair</option> option (except for
                unprivileged containers where this option is ignored for security
                reasons).
+
+              Static routes can be added on the host pointing to the container using the
+              <option>lxc.net.[i].veth.ipv4.route</option> and
+              <option>lxc.net.[i].veth.ipv6.route</option> options.
+              Several lines specify several routes.
+              The route is in format x.y.z.t/m, e.g. 192.168.1.0/24.
+
+              In <option>bridge</option> mode untagged VLAN membership can be set with the
+              <option>lxc.net.[i].veth.vlan.id</option> option. It accepts a special value of 'none' indicating
+              that the container port should be removed from the bridge's default untagged VLAN.
+              The <option>lxc.net.[i].veth.vlan.tagged.id</option> option can be specified multiple times to set
+              the container's bridge port membership to one or more tagged VLANs.
              </para>
  
              <para>
@@ -479,7 +501,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                different macvlan on the same upper device. The accepted
                modes are <option>private</option>, <option>vepa</option>,
                <option>bridge</option> and <option>passthru</option>.
-             In <option>private</option> mode, the device never
+              In <option>private</option> mode, the device never
                communicates with any other device on the same upper_dev (default).
                In <option>vepa</option> mode, the new Virtual Ethernet Port
                Aggregator (VEPA) mode, it assumes that the adjacent
@@ -504,6 +526,41 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                mode is possible for one physical interface.
              </para>
  
+            <para>
+              <option>ipvlan:</option> an ipvlan interface is linked
+              with the interface specified by
+              the <option>lxc.net.[i].link</option> and assigned to
+              the container.
+              <option>lxc.net.[i].ipvlan.mode</option> specifies the
+              mode the ipvlan will use to communicate between
+              different ipvlan on the same upper device. The accepted
+              modes are <option>l3</option>, <option>l3s</option> and
+              <option>l2</option>. It defaults to <option>l3</option> mode.
+              In <option>l3</option> mode TX processing up to L3 happens on the stack instance
+              attached to the dependent device and packets are switched to the stack instance of the
+              parent device for the L2 processing and routing from that instance will be
+              used before packets are queued on the outbound device. In this mode the dependent devices
+              will not receive nor can send multicast / broadcast traffic.
+              In <option>l3s</option> mode TX processing is very similar to the L3 mode except that
+              iptables (conn-tracking) works in this mode and hence it is L3-symmetric (L3s).
+              This will have slightly less performance but that shouldn't matter since you are
+              choosing this mode over plain-L3 mode to make conn-tracking work.
+              In <option>l2</option> mode TX processing happens on the stack instance attached to
+              the dependent device and packets are switched and queued to the parent device to be sent
+              out. In this mode the dependent devices will RX/TX multicast and broadcast (if applicable) as well.
+              <option>lxc.net.[i].ipvlan.isolation</option> specifies the isolation mode.
+              The accepted isolation values are <option>bridge</option>,
+              <option>private</option> and <option>vepa</option>.
+              It defaults to <option>bridge</option>.
+              In <option>bridge</option> isolation mode dependent devices can cross-talk among themselves
+              apart from talking through the parent device.
+              In <option>private</option> isolation mode the port is set in private mode,
+              i.e. the port won't allow cross communication between dependent devices.
+              In <option>vepa</option> isolation mode the port is set in VEPA mode,
+              i.e. the port will offload switching functionality to the external entity as
+              described in 802.1Qbg.
+            </para>
+
              <para>
                <option>phys:</option> an already existing interface
                specified by the <option>lxc.net.[i].link</option> is
@@ -537,6 +594,24 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
            </listitem>
          </varlistentry>
  
+        <varlistentry>
+          <term>
+            <option>lxc.net.[i].l2proxy</option>
+          </term>
+          <listitem>
+            <para>
+              Controls whether layer 2 IP neighbour proxy entries will be added to the
+              lxc.net.[i].link interface for the IP addresses of the container.
+              Can be set to 0 or 1. Defaults to 0.
+              When used with IPv4 addresses, the following sysctl values need to be set:
+              net.ipv4.conf.[link].forwarding=1
+              When used with IPv6 addresses, the following sysctl values need to be set:
+              net.ipv6.conf.[link].proxy_ndp=1
+              net.ipv6.conf.[link].forwarding=1
+              </para>
+          </listitem>
+        </varlistentry>
+
          <varlistentry>
            <term>
              <option>lxc.net.[i].mtu</option>
@@ -604,8 +679,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                interface (as specified by the
                <option>lxc.net.[i].link</option> option) and use that as
                the gateway. <option>auto</option> is only available when
-              using the <option>veth</option> and
-              <option>macvlan</option> network types.
+              using the <option>veth</option>,
+              <option>macvlan</option> and <option>ipvlan</option> network types.
+              Can also have the special value of <option>dev</option>,
+              which means to set the default gateway as a device route.
+              This is primarily for use with layer 3 network modes, such as IPVLAN.
              </para>
            </listitem>
          </varlistentry>
@@ -638,8 +716,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                interface (as specified by the
                <option>lxc.net.[i].link</option> option) and use that as
                the gateway. <option>auto</option> is only available when
-              using the <option>veth</option> and
-              <option>macvlan</option> network types.
+              using the <option>veth</option>,
+              <option>macvlan</option> and <option>ipvlan</option> network types.
+              Can also have the special value of <option>dev</option>,
+              which means to set the default gateway as a device route.
+              This is primarily for use with layer 3 network modes, such as IPVLAN.
              </para>
            </listitem>
          </varlistentry>
@@ -674,7 +755,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                  <listitem>
                   <para>
                   LXC_NET_TYPE: the network type. This is one of the valid
-                 network types listed here (e.g. 'macvlan', 'veth').
+                 network types listed here (e.g. 'vlan', 'macvlan', 'ipvlan', 'veth').
                    </para>
                  </listitem>
  
@@ -740,7 +821,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                  <listitem>
                   <para>
                   LXC_NET_TYPE: the network type. This is one of the valid
-                 network types listed here (e.g. 'macvlan', 'veth').
+                 network types listed here (e.g. 'vlan', 'macvlan', 'ipvlan', 'veth').
                    </para>
                  </listitem>
  
@@ -793,7 +874,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              <para>
                If set, the container will have a new pseudo tty
                instance, making this private to it. The value specifies
-              the maximum number of pseudo ttys allowed for a pts
+              the maximum number of pseudo ttys allowed for a pty
                instance (this limitation is not implemented yet).
              </para>
            </listitem>
@@ -855,7 +936,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              When manually specifying a size for the log file the value should
              be a power of 2 when converted to bytes. Valid size prefixes are
              'KB', 'MB', 'GB'. (Note that all conversions are based on multiples
-            of 1024. That means 'KB' == 'KiB', 'MB' == 'MiB', 'GB' == 'GiB'. 
+            of 1024. That means 'KB' == 'KiB', 'MB' == 'MiB', 'GB' == 'GiB'.
              Additionally, the case of the suffix is ignored, i.e. 'kB', 'KB' and
              'Kb' are treated equally.)
  
@@ -990,7 +1071,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
          <filename>/dev</filename> to be set up as needed in the container
          rootfs.  If lxc.autodev is set to 1, then after mounting the container's
          rootfs LXC will mount a fresh tmpfs under <filename>/dev</filename>
-        (limited to 500k) and fill in a minimal set of initial devices.
+        (limited to 500K by default, unless defined in lxc.autodev.tmpfs.size)
+        and fill in a minimal set of initial devices.
          This is generally required when starting a container containing
          a "systemd" based "init" but may be optional at other times.  Additional
          devices in the containers /dev directory may be created through the
@@ -1008,6 +1090,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
              </para>
            </listitem>
          </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.autodev.tmpfs.size</option>
+          </term>
+          <listitem>
+            <para>
+              Set this to define the size of the /dev tmpfs.
+              The default value is 500000 (500K). If the parameter is used
+              but without value, the default value is used.
+            </para>
+          </listitem>
+        </varlistentry>
        </variablelist>
      </refsect2>
  
@@ -1030,7 +1125,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
          the container at some <filename>path</filename>, and then mounts
          under <filename>path</filename>, then a TOCTTOU attack would be
          possible where the container user modifies a symbolic link under
-        his home directory at just the right time.
+        their home directory at just the right time.
        </para>
        <variablelist>
          <varlistentry>
@@ -1076,7 +1171,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                Specify a mount point corresponding to a line in the
                fstab format.
  
-              Moreover lxc supports mount propagation, such as rslave or
+              Moreover lxc supports mount propagation, such as rshared or
                rprivate, and adds three additional mount options.
                <option>optional</option> don't fail if mount does not work.
                <option>create=dir</option> or <option>create=file</option>
@@ -1084,9 +1179,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                <option>relative</option> source path is taken to be relative to
                the mounted container root. For instance,
               </para>
-<screen>
-dev/null proc/kcore none bind,relative 0 0
-</screen>
+             <programlisting>
+             dev/null proc/kcore none bind,relative 0 0
+             </programlisting>
               <para>
                Will expand dev/null to ${<option>LXC_ROOTFS_MOUNT</option>}/dev/null,
                and mount it to proc/kcore inside the container.
@@ -1402,7 +1497,34 @@ dev/null proc/kcore none bind,relative 0 0
            </term>
            <listitem>
              <para>
-              extra mount options to use when mounting the rootfs.
+              Specify extra mount options to use when mounting the rootfs.
+                The format of the mount options corresponds to the
+               format used in fstab. In addition, LXC supports the custom
+                <option>idmap=</option> mount option. This option can be used
+               to tell LXC to create an idmapped mount for the container's
+                rootfs. This is useful when the user doesn't want to recursively
+               chown the rootfs of the container to match the idmapping of the
+               user namespace the container is going to use. Instead an
+               idmapped mount can be used to handle this.
+               The argument for
+                <option>idmap=</option>
+                can either be a path pointing to a user namespace file that
+                LXC will open and use to idmap the rootfs or the special value
+                "container" which will instruct LXC to use
+               the container's user namespace to idmap the rootfs.
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.rootfs.managed</option>
+          </term>
+          <listitem>
+            <para>
+              Set this to 0 to indicate that LXC is not managing the
+              container storage, then LXC will not modify the
+              container storage. The default is 1.
              </para>
            </listitem>
          </varlistentry>
@@ -1411,7 +1533,7 @@ dev/null proc/kcore none bind,relative 0 0
      </refsect2>
  
      <refsect2>
-      <title>Control group</title>
+      <title>Control groups ("cgroups")</title>
        <para>
          The control group section contains the configuration for the
          different subsystem. <command>lxc</command> does not check the
@@ -1420,10 +1542,195 @@ dev/null proc/kcore none bind,relative 0 0
          started, but has the advantage of permitting any future
          subsystem.
        </para>
+
+      <para>
+       The kernel implementation of cgroups has changed significantly over the
+       years. With Linux 4.5 support for a new cgroup filesystem was added
+       usually referred to as "cgroup2" or "unified hierarchy". Since then the
+       old cgroup filesystem is usually referred to as "cgroup1" or the
+       "legacy hierarchies". Please see the cgroups manual page for a detailed
+       explanation of the differences between the two versions.
+      </para>
+
+      <para>
+       LXC distinguishes settings for the legacy and the unified hierarchy by
+       using different configuration key prefixes. To alter settings for
+       controllers in a legacy hierarchy the key prefix
+       <option>lxc.cgroup.</option> must be used and in order to alter the
+       settings for a controller in the unified hierarchy the
+       <option>lxc.cgroup2.</option> key must be used. Note that LXC will
+       ignore <option>lxc.cgroup.</option> settings on systems that only use
+       the unified hierarchy. Conversely, it will ignore
+       <option>lxc.cgroup2.</option> options on systems that only use legacy
+       hierarchies.
+      </para>
+
+      <para>
+       At its core a cgroup hierarchy is a way to hierarchically organize
+       processes. Usually a cgroup hierarchy will have one or more
+       "controllers" enabled. A "controller" in a cgroup hierarchy is usually
+       responsible for distributing a specific type of system resource along
+       the hierarchy. Controllers include the "pids" controller, the "cpu"
+       controller, the "memory" controller and others. Some controllers
+       however do not fall into the category of distributing a system
+       resource, instead they are often referred to as "utility" controllers.
+       One utility controller is the device controller. Instead of
+       distributing a system resource it allows managing device access.
+      </para>
+
+      <para>
+       In the legacy hierarchy the device controller was implemented like most
+       other controllers as a set of files that could be written to. These
+       files were named "devices.allow" and "devices.deny". The legacy device
+       controller allowed the implementation of both "allowlists" and
+       "denylists".
+      </para>
+
+      <para>
+       An allowlist is a device program that by default blocks access to all
+       devices. In order to access specific devices "allow rules" for
+       particular devices or device classes must be specified. In contrast, a
+       denylist is a device program that by default allows access to all
+       devices. In order to restrict access to specific devices "deny rules"
+       for particular devices or device classes must be specified.
+      </para>
+
+      <para>
+       In the unified cgroup hierarchy the implementation of the device
+       controller has completely changed. Instead of files to read from and
+       write to, an eBPF program of type
+       <option>BPF_PROG_TYPE_CGROUP_DEVICE</option> can be attached to a
+       cgroup. Even though the kernel implementation has changed completely
+       LXC tries to allow for the same semantics to be followed in the legacy
+       device cgroup and the unified eBPF-based device controller. The
+       following paragraphs explain the semantics for the unified eBPF-based
+       device controller.
+      </para>
+
+      <para>
+       As mentioned the format for specifying device rules for the unified
+       eBPF-based device controller is the same as for the legacy cgroup
+       device controller; only the configuration key prefix has changed.
+       Specifically, device rules for the legacy cgroup device controller are
+       specified via <option>lxc.cgroup.devices.allow</option> and
+       <option>lxc.cgroup.devices.deny</option> whereas for the
+       cgroup2 eBPF-based device controller
+       <option>lxc.cgroup2.devices.allow</option> and
+       <option>lxc.cgroup2.devices.deny</option> must be used.
+      </para>
+      <para>
+        <itemizedlist>
+          <listitem>
+           <para>
+             An allowlist device rule
+              <programlisting>
+               lxc.cgroup2.devices.deny = a
+              </programlisting>
+             will cause LXC to instruct the kernel to block access to all
+             devices by default. To grant access to devices allow device rules
+             must be added via the <option>lxc.cgroup2.devices.allow</option>
+             key. This is referred to as an "allowlist" device program.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+             A denylist device rule
+              <programlisting>
+               lxc.cgroup2.devices.allow = a
+              </programlisting>
+             will cause LXC to instruct the kernel to allow access to all
+             devices by default. To deny access to devices deny device rules
+             must be added via the <option>lxc.cgroup2.devices.deny</option> key.
+             This is referred to as a "denylist" device program.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+             Specifying any of the aforementioned two rules will cause all
+             previous rules to be cleared, i.e. the device list will be reset.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+           When an allowlist program is requested, i.e. access to all devices
+           is blocked by default, specific deny rules for individual devices
+           or device classes are ignored.
+           </para>
+         </listitem>
+
+          <listitem>
+           <para>
+           When a denylist program is requested, i.e. access to all devices
+           is allowed by default, specific allow rules for individual devices
+           or device classes are ignored.
+           </para>
+         </listitem>
+        </itemizedlist>
+      </para>
+
+      <para>
+        For example the set of rules:
+        <programlisting>
+          lxc.cgroup2.devices.deny = a
+          lxc.cgroup2.devices.allow = c *:* m
+          lxc.cgroup2.devices.allow = b *:* m
+          lxc.cgroup2.devices.allow = c 1:3 rwm
+        </programlisting>
+       implements an allowlist device program, i.e. the kernel will block
+       access to all devices not specifically allowed in this list. This
+       particular program states that all character and block devices may be
+       created but only /dev/null might be read or written.
+      </para>
+
+      <para>
+        If we instead switch to the following set of rules:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+        </programlisting>
+        then LXC would instruct the kernel to implement a denylist, i.e. the
+        kernel will allow access to all devices not specifically denied in
+        this list. This particular program states that no character devices or
+        block devices might be created and that /dev/null is not allowed
+        to be read, written, or created.
+      </para>
+
+      <para>
+        Now consider the same program but followed by a "global rule"
+        which determines the type of device program (allowlist or
+        denylist) as explained above:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+          lxc.cgroup2.devices.allow = a
+        </programlisting>
+       The last line will cause LXC to reset the device list without changing
+       the type of device program.
+      </para>
+
+      <para>
+       If we specify:
+        <programlisting>
+          lxc.cgroup2.devices.allow = a
+          lxc.cgroup2.devices.deny = c *:* m
+          lxc.cgroup2.devices.deny = b *:* m
+          lxc.cgroup2.devices.deny = c 1:3 rwm
+          lxc.cgroup2.devices.deny = a
+        </programlisting>
+       instead then the last line will cause LXC to reset the device list and
+       switch from a denylist program to an allowlist program.
+      </para>
        <variablelist>
          <varlistentry>
            <term>
-            <option>lxc.cgroup.[controller name]</option>
+            <option>lxc.cgroup.[controller name].[controller file]</option>
            </term>
            <listitem>
              <para>
@@ -1438,7 +1745,7 @@ dev/null proc/kcore none bind,relative 0 0
          </varlistentry>
          <varlistentry>
            <term>
-            <option>lxc.cgroup2.[controller name]</option>
+            <option>lxc.cgroup2.[controller name].[controller file]</option>
            </term>
            <listitem>
              <para>
@@ -1470,6 +1777,79 @@ dev/null proc/kcore none bind,relative 0 0
              </para>
            </listitem>
          </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.container</option>
+          </term>
+          <listitem>
+            <para>
+              This is similar to <option>lxc.cgroup.dir</option>, but must be
+              used together with <option>lxc.cgroup.dir.monitor</option> and
+              affects only the container's cgroup path. This option is mutually
+              exclusive with <option>lxc.cgroup.dir</option>.
+              Note that the final path the container attaches to may be
+              extended further by the
+              <option>lxc.cgroup.dir.container.inner</option> option.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.monitor</option>
+          </term>
+          <listitem>
+            <para>
+              This is the monitor process counterpart to
+              <option>lxc.cgroup.dir.container</option>.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.monitor.pivot</option>
+          </term>
+          <listitem>
+            <para>
+              On container termination the PID of the monitor process is attached to this cgroup.
+              This path should not be a subpath of any other configured cgroup dir to ensure
+              proper removal of other cgroup paths on container termination.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.dir.container.inner</option>
+          </term>
+          <listitem>
+            <para>
+              Specify an additional subdirectory where the cgroup namespace
+              will be created. With this option, the cgroup limits will be
+              applied to the outer path specified in
+              <option>lxc.cgroup.dir.container</option>, which is not accessible
+              from within the container, making it possible to better enforce
+              limits for privileged containers in a way they cannot override
+              them.
+              This only works in conjunction with the
+              <option>lxc.cgroup.dir.container</option> and
+              <option>lxc.cgroup.dir.monitor</option> options and has otherwise
+              no effect.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.cgroup.relative</option>
+          </term>
+          <listitem>
+            <para>
+              Set this to 1 to instruct LXC to never escape to the
+              root cgroup. This makes it easy for users to adhere to
+              restrictions enforced by cgroup2 and
+              systemd. Specifically, this makes it possible to run LXC
+              containers as systemd services.
+            </para>
+          </listitem>
+        </varlistentry>
        </variablelist>
      </refsect2>
  
@@ -1561,7 +1941,7 @@ dev/null proc/kcore none bind,relative 0 0
              standard namespace identifiers as seen in the
              <filename>/proc/PID/ns</filename> directory.
              The <option>lxc.namespace.keep</option> is a
-            blacklist option, i.e. it is useful when enforcing that containers
+            denylist option, i.e. it is useful when enforcing that containers
              must keep a specific set of namespaces.
              </para>
  
@@ -1602,7 +1982,7 @@ dev/null proc/kcore none bind,relative 0 0
              </para>
  
              <para>
-            To inherit the namespace from another container set the 
+            To inherit the namespace from another container set the
              <option>lxc.namespace.share.[namespace identifier]</option> to the name of
              the container, e.g. <option>lxc.namespace.share.pid=c3</option>.
              </para>
@@ -1630,8 +2010,41 @@ dev/null proc/kcore none bind,relative 0 0
              process wants to inherit the other's network namespace it usually
              needs to inherit the user namespace as well.
              </para>
+
+            <para>
+            Note that without careful additional configuration of an LSM,
+            sharing user+pid namespaces with a task may allow that task to
+            escalate privileges to that of the task calling liblxc.
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.time.offset.boot</option>
+          </term>
+          <listitem>
+            <para>
+           Specify a positive or negative offset for the boottime clock. The
+           format accepts hours (h), minutes (m), seconds (s),
+           milliseconds (ms), microseconds (us), and nanoseconds (ns).
+            </para>
            </listitem>
          </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.time.offset.monotonic</option>
+          </term>
+          <listitem>
+            <para>
+           Specify a positive or negative offset for the monotonic clock. The
+           format accepts hours (h), minutes (m), seconds (s),
+           milliseconds (ms), microseconds (us), and nanoseconds (ns).
+            </para>
+          </listitem>
+        </varlistentry>
+
        </variablelist>
      </refsect2>
  
@@ -1681,7 +2094,7 @@ dev/null proc/kcore none bind,relative 0 0
            </term>
            <listitem>
              <para>
-              Specify the kernel parameters to be set. The parameters available 
+              Specify the kernel parameters to be set. The parameters available
                are those listed under /proc/sys/.
                Note that not all sysctls are namespaced. Changing non-namespaced
                sysctls will cause the system-wide setting to be modified.
@@ -1689,7 +2102,7 @@ dev/null proc/kcore none bind,relative 0 0
                  <refentrytitle><command>sysctl</command></refentrytitle>
                  <manvolnum>8</manvolnum>
                </citerefentry>.
-              If used with no value, lxc will clear the parameters specified up 
+              If used with no value, lxc will clear the parameters specified up
                to this point.
              </para>
            </listitem>
@@ -1724,6 +2137,11 @@ dev/null proc/kcore none bind,relative 0 0
               are nesting containers and are already confined), then use
              </para>
                <programlisting>lxc.apparmor.profile = unchanged</programlisting>
+            <para>
+              To instruct LXC to generate the apparmor profile,
+              use
+            </para>
+              <programlisting>lxc.apparmor.profile = generated</programlisting>
            </listitem>
          </varlistentry>
          <varlistentry>
@@ -1747,6 +2165,38 @@ dev/null proc/kcore none bind,relative 0 0
              </para>
            </listitem>
          </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.apparmor.allow_nesting</option>
+          </term>
+          <listitem>
+            <para>
+              If this is set to 1, the following changes take effect. When
+              generated apparmor profiles are used, they will contain
+              the necessary changes to allow creating a nested
+              container. In addition to the usual mount points,
+              <filename>/dev/.lxc/proc</filename>
+              and <filename>/dev/.lxc/sys</filename> will contain
+              procfs and sysfs mount points without the lxcfs
+              overlays, which, if generated apparmor profiles are
+              being used, will not be readable or writable directly.
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <term>
+            <option>lxc.apparmor.raw</option>
+          </term>
+          <listitem>
+            <para>
+              A list of raw AppArmor profile lines to append to the
+              profile. Only valid when using generated profiles.
+            </para>
+          </listitem>
+        </varlistentry>
+
        </variablelist>
      </refsect2>
  
@@ -1774,6 +2224,44 @@ dev/null proc/kcore none bind,relative 0 0
              <programlisting>lxc.selinux.context = system_u:system_r:lxc_t:s0:c22</programlisting>
            </listitem>
          </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.selinux.context.keyring</option>
+          </term>
+          <listitem>
+            <para>
+              Specify the SELinux context under which the container's keyring
+              should be created. By default this is the same as lxc.selinux.context, or
+              the context lxc is executed under if lxc.selinux.context has not been set.
+            </para>
+            <programlisting>lxc.selinux.context.keyring = system_u:system_r:lxc_t:s0:c22</programlisting>
+          </listitem>
+        </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>Kernel Keyring</title>
+      <para>
+        The Linux Keyring facility is primarily a way for various
+        kernel components to retain or cache security data, authentication
+        keys, encryption keys, and other data in the kernel. By default lxc
+        will create a new session keyring for the started application.
+      </para>
+      <variablelist>
+        <varlistentry>
+          <term>
+            <option>lxc.keyring.session</option>
+          </term>
+          <listitem>
+            <para>
+              Set this to 0 to disable the creation of a new session keyring by lxc. The started
+              application will then inherit the current session keyring.
+              By default, or when passing the value 1, a new keyring will be created.
+            </para>
+            <programlisting>lxc.keyring.session = 0</programlisting>
+          </listitem>
+        </varlistentry>
        </variablelist>
      </refsect2>
  
@@ -1788,29 +2276,48 @@ dev/null proc/kcore none bind,relative 0 0
        </para>
        <para>
          Versions 1 and 2 are currently supported.  In version 1, the
-        policy is a simple whitelist.  The second line therefore must
-        read "whitelist", with the rest of the file containing one (numeric)
-        syscall number per line.  Each syscall number is whitelisted,
-        while every unlisted number is blacklisted for use in the container
+        policy is a simple allowlist.  The second line therefore must
+        read "allowlist", with the rest of the file containing one (numeric)
+        syscall number per line.  Each syscall number is allowlisted,
+        while every unlisted number is denylisted for use in the container
        </para>
  
        <para>
-       In version 2, the policy may be blacklist or whitelist,
+       In version 2, the policy may be denylist or allowlist,
         supports per-rule and per-policy default actions, and supports
         per-architecture system call resolution from textual names.
        </para>
        <para>
-       An example blacklist policy, in which all system calls are
+       An example denylist policy, in which all system calls are
         allowed except for mknod, which will simply do nothing and
         return 0 (success), looks like:
        </para>
  
        <programlisting>
        2
-      blacklist
+      denylist
        mknod errno 0
+      ioctl notify
        </programlisting>
  
+      <para>
+      Specifying "errno" as action will cause LXC to register a seccomp filter
+      that will cause a specific errno to be returned to the caller. The errno
+      value can be specified after the "errno" action word.
+      </para>
+
+      <para>
+      Specifying "notify" as action will cause LXC to register a seccomp
+      listener and retrieve a listener file descriptor from the kernel. When a
+      syscall is made that is registered as "notify" the kernel will generate a
+      poll event and send a message over the file descriptor. The caller can
+      read this message and inspect the syscall, including its arguments. Based on
+      this information the caller is expected to send back a message informing
+      the kernel which action to take. Until that message is sent the kernel
+      will block the calling process. The format of the messages to read and
+      send is documented in seccomp itself.
+      </para>
+
        <variablelist>
          <varlistentry>
            <term>
@@ -1836,6 +2343,31 @@ dev/null proc/kcore none bind,relative 0 0
               </para>
            </listitem>
          </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.seccomp.notify.proxy</option>
+          </term>
+          <listitem>
+            <para>
+             Specify a unix socket to which LXC will connect and forward
+             seccomp events. The path must be in the form
+             unix:/path/to/socket or unix:@socket. The former specifies a
+             path-bound unix domain socket while the latter specifies an
+             abstract unix domain socket.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+          <term>
+            <option>lxc.seccomp.notify.cookie</option>
+          </term>
+          <listitem>
+            <para>
+             An additional string sent along with proxied seccomp notification
+             requests.
+            </para>
+          </listitem>
+        </varlistentry>
        </variablelist>
      </refsect2>
  
@@ -2376,7 +2908,8 @@ dev/null proc/kcore none bind,relative 0 0
            <listitem>
              <para>
                An integer used to sort the containers when auto-starting
-              a series of containers at once.
+              a series of containers at once. A lower value means an
+              earlier start.
              </para>
            </listitem>
          </varlistentry>
@@ -2567,7 +3100,7 @@ dev/null proc/kcore none bind,relative 0 0
          lxc.net.1.ipv6.address = 2003:db8:1:0:214:1234:fe0b:3596
          lxc.net.2.type = phys
          lxc.net.2.flags = up
-        lxc.net.2.link = dummy0
+        lxc.net.2.link = random0
          lxc.net.2.hwaddr = 4a:49:43:49:79:ff
          lxc.net.2.ipv4.address = 10.2.3.6/24
          lxc.net.2.ipv6.address = 2003:db8:1:0:214:1234:fe0b:3297
@@ -2579,6 +3112,7 @@ dev/null proc/kcore none bind,relative 0 0
          lxc.mount.fstab = /etc/fstab.complex
          lxc.mount.entry = /lib /root/myrootfs/lib none ro,bind 0 0
          lxc.rootfs.path = dir:/mnt/rootfs.complex
+        lxc.rootfs.options = idmap=container
          lxc.cap.drop = sys_module mknod setuid net_raw
          lxc.cap.drop = mac_override
        </programlisting>