import ha-manager docs

author Dietmar Maurer <dietmar@proxmox.com>

Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)

committer Dietmar Maurer <dietmar@proxmox.com>

Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)
author Dietmar Maurer <dietmar@proxmox.com>
Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)
committer Dietmar Maurer <dietmar@proxmox.com>
Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)
diff --git a/Makefile b/Makefile

index 797777972f905b54b4d563bdc87f42afe391d17f..d636fb6085354feab7b1588f5bfd284338c51ae8 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ VZDUMP_SOURCES=attributes.txt vzdump.adoc vzdump.1-synopsis.adoc
  PVEFW_SOURCES=attributes.txt pve-firewall.adoc pve-firewall.8-synopsis.adoc
  QM_SOURCES=attributes.txt qm.adoc qm.1-synopsis.adoc
  PCT_SOURCES=attributes.txt pct.adoc pct.1-synopsis.adoc
  PVEFW_SOURCES=attributes.txt pve-firewall.adoc pve-firewall.8-synopsis.adoc
  QM_SOURCES=attributes.txt qm.adoc qm.1-synopsis.adoc
  PCT_SOURCES=attributes.txt pct.adoc pct.1-synopsis.adoc
+HA_SOURCES=attributes.txt ha-manager.1-synopsis.adoc ha-manager.adoc
  
  SYSADMIN_SOURCES=                      \
         getting-help.adoc               \
  
  SYSADMIN_SOURCES=                      \
         getting-help.adoc               \
@@ -26,6 +27,7 @@ PVE_ADMIN_GUIDE_SOURCES=              \
         ${PVEUM_SOURCES}                \
         ${PVESM_SOURCES}                \
         ${VZDUMP_SOURCES}               \
         ${PVEUM_SOURCES}                \
         ${PVESM_SOURCES}                \
         ${VZDUMP_SOURCES}               \
+       ${HA_SOURCES}                   \
         images/cluster-nwdiag.svg       \
         images/node-nwdiag.svg          \
         pve-bibliography.adoc           \
         images/cluster-nwdiag.svg       \
         images/node-nwdiag.svg          \
         pve-bibliography.adoc           \
@@ -71,7 +73,7 @@ all: pve-admin-guide.html
  
  index.html: index.adoc ${PVE_ADMIN_GUIDE_SOURCES}
         $(MAKE) NOVIEW=1 pve-admin-guide.pdf pve-admin-guide.html pve-admin-guide.epub
  
  index.html: index.adoc ${PVE_ADMIN_GUIDE_SOURCES}
         $(MAKE) NOVIEW=1 pve-admin-guide.pdf pve-admin-guide.html pve-admin-guide.epub
-       $(MAKE) NOVIEW=1 qm.1.html pct.1.html pvesm.1.html pveum.1.html vzdump.1.html pve-firewall.8.html
+       $(MAKE) NOVIEW=1 qm.1.html pct.1.html pvesm.1.html pveum.1.html vzdump.1.html pve-firewall.8.html ha-manager.1.html
         asciidoc -a "date=$(shell date)" -a "revnumber=${RELEASE}" index.adoc 
         $(BROWSER) index.html &
  
         asciidoc -a "date=$(shell date)" -a "revnumber=${RELEASE}" index.adoc 
         $(BROWSER) index.html &
  
diff --git a/ha-manager.adoc b/ha-manager.adoc

new file mode 100644 (file)

index 0000000..321fa3d
--- /dev/null
+++ b/ha-manager.adoc
@@ -0,0 +1,207 @@
+[[chapter-ha-manager]]
+ifdef::manvolnum[]
+PVE({manvolnum})
+================
+include::attributes.txt[]
+
+NAME
+----
+
+ha-manager - Proxmox VE HA manager command line interface
+
+SYNOPSIS
+--------
+
+include::ha-manager.1-synopsis.adoc[]
+
+DESCRIPTION
+-----------
+endif::manvolnum[]
+
+ifndef::manvolnum[]
+High Availability
+=================
+include::attributes.txt[]
+endif::manvolnum[]
+
+'ha-manager' handles management of user-defined cluster services. This
+includes handling of user requests including service start, service
+disable, service relocate, and service restart. The cluster resource
+manager daemon also handles restarting and relocating services in the
+event of failures.
+
+HOW IT WORKS
+------------
+
+The local resource manager ('pve-ha-lrm') is started as a daemon on
+each node at system start and waits until the HA cluster is quorate
+and locks are working.  After initialization, the LRM determines which
+services are enabled and starts them. Also the watchdog gets
+initialized.
+
+The cluster resource manager ('pve-ha-crm') starts on each node and
+waits there for the manager lock, which can only be held by one node
+at a time.  The node which successfully acquires the manager lock gets
+promoted to the CRM; it handles cluster-wide actions like migrations
+and failures.
+
+When a node leaves the cluster quorum, its state changes to unknown.
+If the current CRM can then secure the failed node's lock, the services
+will be 'stolen' and restarted on another node.
+
+When a cluster member determines that it is no longer in the cluster
+quorum, the LRM waits for a new quorum to form. As long as there is no
+quorum the node cannot reset the watchdog. This will trigger a reboot
+after 60 seconds.
+
+CONFIGURATION
+-------------
+
+The HA stack is well integrated into the Proxmox VE API2. So, for
+example, HA can be configured via 'ha-manager' or the PVE web
+interface, both of which provide an easy-to-use tool.
+
+The resource configuration file is located at
+'/etc/pve/ha/resources.cfg' and the group configuration file at
+'/etc/pve/ha/groups.cfg'. Use the provided tools to make changes;
+there shouldn't be any need to edit them manually.
+
+RESOURCES/SERVICES AGENTS
+-------------------------
+
+A resource, also called a service, can be managed by the
+ha-manager. Currently we support virtual machines and containers.
+
+GROUPS
+------
+
+A group is a collection of cluster nodes which a service may be bound to.
+
+GROUP SETTINGS
+~~~~~~~~~~~~~~
+
+nodes::
+
+list of group node members
+
+restricted::
+
+resources bound to this group may only run on nodes defined by the
+group. If no group node member is available the resource will be
+placed in the stopped state.
+
+nofailback::
+
+the resource won't automatically fail back when a more preferred node
+(re)joins the cluster.
+
+
+RECOVERY POLICY
+---------------
+
+There are two service recovery policy settings which can be configured
+specifically for each resource.
+
+max_restart::
+
+maximal number of tries to restart a failed service on the actual
+node.  The default is set to one.
+
+max_relocate::
+
+maximal number of tries to relocate the service to a different node.
+A relocate only happens after the max_restart value is exceeded on the
+actual node. The default is set to one.
+
+Note that the relocate count state will only reset to zero when the
+service had at least one successful start. That means if a service is
+re-enabled without fixing the error only the restart policy gets
+repeated.
+
+ERROR RECOVERY
+--------------
+
+If after all tries the service state could not be recovered it gets
+placed in an error state. In this state the service won't get touched
+by the HA stack anymore.  To recover from this state you should follow
+these steps:
+
+* bring the resource back into a safe and consistent state (e.g.,
+killing its process)
+
+* disable the HA resource to place it in a stopped state
+
+* fix the error which led to this failure
+
+* *after* you fixed all errors you may enable the service again
+
+
+SERVICE OPERATIONS
+------------------
+
+This is how the basic user-initiated service operations (via
+'ha-manager') work.
+
+enable::
+
+the service will be started by the LRM if not already running.
+
+disable::
+
+the service will be stopped by the LRM if running.
+
+migrate/relocate::
+
+the service will be relocated (live) to another node.
+
+remove::
+
+the service will be removed from the HA managed resource list. Its
+current state will not be touched.
+
+start/stop::
+
+start and stop commands can be issued to the resource specific tools
+(like 'qm' or 'pct'), they will forward the request to the
+'ha-manager' which then will execute the action and set the resulting
+service state (enabled, disabled).
+
+
+SERVICE STATES
+--------------
+
+stopped::
+
+Service is stopped (confirmed by LRM)
+
+request_stop::
+
+Service should be stopped. Waiting for confirmation from LRM.
+
+started::
+
+Service is active and LRM should start it ASAP if not already running.
+
+fence::
+
+Wait for node fencing (service node is not inside quorate cluster
+partition).
+
+freeze::
+
+Do not touch the service state. We use this state while we reboot a
+node, or when we restart the LRM daemon.
+
+migrate::
+
+Migrate service (live) to other node.
+
+error::
+
+Service disabled because of LRM errors. Needs manual intervention.
+
+
+ifdef::manvolnum[]
+include::pve-copyright.adoc[]
+endif::manvolnum[]
+
diff --git a/index.adoc b/index.adoc

index bd73e409bed84549eb87ec1a2b50f138fc3388b8..e20aa9145b298c43878b879064fd8747b4aa2632 100644 (file)
--- a/index.adoc
+++ b/index.adoc
@@ -26,6 +26,7 @@ include::attributes.txt[]
  | pvesm        | link:pvesm.1.html[pvesm.1]
  | pveum        | link:pveum.1.html[pveum.1]
  | vzdump       | link:vzdump.1.html[vzdump.1]
  | pvesm        | link:pvesm.1.html[pvesm.1]
  | pveum        | link:pveum.1.html[pveum.1]
  | vzdump       | link:vzdump.1.html[vzdump.1]
+| ha-manager   | link:ha-manager.1.html[ha-manager.1]
  | pve-firewall | link:pve-firewall.8.html[pve-firewall.8]
  |===========================================================
  
  | pve-firewall | link:pve-firewall.8.html[pve-firewall.8]
  |===========================================================
  
diff --git a/pve-admin-guide.adoc b/pve-admin-guide.adoc

index 5f65c7d4348a9f52faa98bbbc20dd82c3a8e787f..764e6e9a330f105fa75dcacf3315f0669357890c 100644 (file)
--- a/pve-admin-guide.adoc
+++ b/pve-admin-guide.adoc
@@ -26,6 +26,8 @@ include::pveum.adoc[]
  
  include::pct.adoc[]
  
  
  include::pct.adoc[]
  
+include::ha-manager.adoc[]
+
  include::vzdump.adoc[]
  
  // Return to normal title levels.
  include::vzdump.adoc[]
  
  // Return to normal title levels.
@@ -85,6 +87,14 @@ include::pve-firewall.8-synopsis.adoc[]
  
  :leveloffset: 0
  
  
  :leveloffset: 0
  
+*ha-manager* - Proxmox VE HA manager
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:leveloffset: 1
+include::ha-manager.1-synopsis.adoc[]
+
+:leveloffset: 0
+
  include::pve-bibliography.adoc[]
  
  :leveloffset: 1
  include::pve-bibliography.adoc[]
  
  :leveloffset: 1
author	Dietmar Maurer <dietmar@proxmox.com>
	Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)
committer	Dietmar Maurer <dietmar@proxmox.com>
	Wed, 24 Feb 2016 10:37:48 +0000 (11:37 +0100)
Makefile		patch \| blob \| blame \| history
ha-manager.adoc	[new file with mode: 0644]	patch \| blob
index.adoc		patch \| blob \| blame \| history
pve-admin-guide.adoc		patch \| blob \| blame \| history