]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - drivers/misc/mic/cosm/cosm_scif_server.c
sched/headers: Prepare for new header dependencies before moving code to <linux/sched...
[mirror_ubuntu-artful-kernel.git] / drivers / misc / mic / cosm / cosm_scif_server.c
CommitLineData
6727b613
AD
1/*
2 * Intel MIC Platform Software Stack (MPSS)
3 *
4 * Copyright(c) 2015 Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2, as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * The full GNU General Public License is included in this distribution in
16 * the file called "COPYING".
17 *
18 * Intel MIC Coprocessor State Management (COSM) Driver
19 *
20 */
21#include <linux/kthread.h>
3f07c014
IM
22#include <linux/sched/signal.h>
23
6727b613
AD
24#include "cosm_main.h"
25
26/*
27 * The COSM driver uses SCIF to communicate between the management node and the
28 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
29 * receive a shutdown status back from the card upon completion of shutdown and
30 * (c) receive periodic heartbeat messages from the card used to deduce if the
31 * card has crashed.
32 *
33 * A COSM server consisting of a SCIF listening endpoint waits for incoming
34 * connections from the card. Upon acceptance of the connection, a separate
35 * work-item is scheduled to handle SCIF message processing for that card. The
36 * life-time of this work-item is therefore the time from which the connection
37 * from a card is accepted to the time at which the connection is closed. A new
38 * work-item starts each time the card boots and is alive till the card (a)
39 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
40 * unloaded.
41 *
42 * The COSM interactions with SCIF during card shutdown, reset and crash are
43 * as follows:
44 *
45 * Card shutdown
46 * -------------
47 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
48 * message from the host.
49 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
50 * in scif_remove(..) getting called on the card
51 * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
52 * scif_peer_unregister_device -> device_unregister for the host peer device
53 * 4. During device_unregister remove(..) method of cosm_client is invoked which
54 * closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
55 * message being sent to host SCIF. SCIF_DISCNCT message processing on the
56 * host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
57 * up the host COSM thread blocked in scif_poll(..) resulting in
58 * scif_poll(..) returning POLLHUP.
59 * 5. On the card, scif_peer_release_dev is next called which results in an
60 * SCIF_EXIT message being sent to the host and after receiving the
61 * SCIF_EXIT_ACK from the host the peer device teardown on the card is
62 * complete.
63 * 6. As part of the SCIF_EXIT message processing on the host, host sends a
64 * SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
65 * starts a similar SCIF peer device teardown sequence on the host
66 * corresponding to the card being shut down.
67 *
68 * Card reset
69 * ----------
70 * The case of interest here is when the card has not been previously shut down
71 * since most of the steps below are skipped in that case:
72 *
73 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
74 * which unregisters the SCIF HW device resulting in scif_remove(..) being
75 * called on the host.
76 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
77 * SCIF_EXIT message being sent to the card.
78 * 3. The card executes scif_stop() as part of SCIF_EXIT message
79 * processing. This results in the COSM endpoint on the card being closed and
80 * the SCIF host peer device on the card getting unregistered similar to
81 * steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
82 * host returns POLLHUP as a result.
83 * 4. On the host, card peer device unregister and SCIF HW remove(..) also
84 * subsequently complete.
85 *
86 * Card crash
87 * ----------
88 * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
89 * message from the card which would result in scif_poll(..) returning
90 * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
91 * message to itself resulting in the card SCIF peer device being unregistered,
92 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev ->
93 * scif_invalidate_ep call sequence which sets the endpoint state to
94 * DISCONNECTED and results in scif_poll(..) returning POLLHUP.
95 */
96
/* Backlog value handed to scif_listen() for the listening endpoint */
97#define COSM_SCIF_BACKLOG 16
/* Seconds added on top of COSM_HEARTBEAT_SEND_SEC to form the timeout */
98#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
/* Heartbeat timeout, in seconds */
99#define COSM_HEARTBEAT_TIMEOUT_SEC \
100 (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
/* Heartbeat timeout in milliseconds; passed to scif_poll() as its timeout */
101#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
102
/* Server kthread started by cosm_scif_init() and stopped by cosm_scif_exit() */
103static struct task_struct *server_thread;
/* Listening SCIF endpoint; NULL whenever it is not open */
104static scif_epd_t listen_epd;
105
106/* Publish MIC card's shutdown status to user space MIC daemon */
107static void cosm_update_mic_status(struct cosm_device *cdev)
108{
109 if (cdev->shutdown_status_int != MIC_NOP) {
110 cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
111 cdev->shutdown_status_int = MIC_NOP;
112 }
113}
114
115/* Store MIC card's shutdown status internally when it is received */
116static void cosm_shutdown_status_int(struct cosm_device *cdev,
117 enum mic_status shutdown_status)
118{
119 switch (shutdown_status) {
120 case MIC_HALTED:
121 case MIC_POWER_OFF:
122 case MIC_RESTART:
123 case MIC_CRASHED:
124 break;
125 default:
126 dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
127 __func__, __LINE__, shutdown_status);
128 return;
129 };
130 cdev->shutdown_status_int = shutdown_status;
131 cdev->heartbeat_watchdog_enable = false;
132
133 if (cdev->state != MIC_SHUTTING_DOWN)
134 cosm_set_state(cdev, MIC_SHUTTING_DOWN);
135}
136
137/* Non-blocking recv. Read and process all available messages */
138static void cosm_scif_recv(struct cosm_device *cdev)
139{
140 struct cosm_msg msg;
141 int rc;
142
143 while (1) {
144 rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
145 if (!rc) {
146 break;
147 } else if (rc < 0) {
148 dev_dbg(&cdev->dev, "%s: %d rc %d\n",
149 __func__, __LINE__, rc);
150 break;
151 }
152 dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
153 __func__, __LINE__, rc, msg.id);
154
155 switch (msg.id) {
156 case COSM_MSG_SHUTDOWN_STATUS:
157 cosm_shutdown_status_int(cdev, msg.shutdown_status);
158 break;
159 case COSM_MSG_HEARTBEAT:
160 /* Nothing to do, heartbeat only unblocks scif_poll */
161 break;
162 default:
163 dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
164 __func__, __LINE__, msg.id);
165 break;
166 }
167 }
168}
169
170/* Publish crashed status for this MIC card */
171static void cosm_set_crashed(struct cosm_device *cdev)
172{
173 dev_err(&cdev->dev, "node alive timeout\n");
174 cosm_shutdown_status_int(cdev, MIC_CRASHED);
175 cosm_update_mic_status(cdev);
176}
177
178/* Send host time to the MIC card to sync system time between host and MIC */
179static void cosm_send_time(struct cosm_device *cdev)
180{
181 struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
182 int rc;
183
184 getnstimeofday64(&msg.timespec);
185 rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
186 if (rc < 0)
187 dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
188 __func__, __LINE__, rc);
189}
190
191/*
192 * Close this cosm_device's endpoint after its peer endpoint on the card has
193 * been closed. In all cases except MIC card crash POLLHUP on the host is
194 * triggered by the client's endpoint being closed.
195 */
196static void cosm_scif_close(struct cosm_device *cdev)
197{
198 /*
199 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
200 * reboot notifier when shutdown is still not complete, we notify mpssd
201 * to reset the card when SCIF endpoint is closed.
202 */
203 cosm_update_mic_status(cdev);
204 scif_close(cdev->epd);
205 cdev->epd = NULL;
206 dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
207}
208
209/*
210 * Set card state to ONLINE when a new SCIF connection from a MIC card is
211 * received. Normally the state is BOOTING when the connection comes in, but can
212 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
213 */
214static int cosm_set_online(struct cosm_device *cdev)
215{
216 int rc = 0;
217
218 if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
219 cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
220 cdev->epd = cdev->newepd;
221 if (cdev->state == MIC_BOOTING)
222 cosm_set_state(cdev, MIC_ONLINE);
223 cosm_send_time(cdev);
224 dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
225 } else {
226 dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
227 __func__, __LINE__, cosm_state_string[cdev->state]);
228 rc = -EINVAL;
229 }
230 /* Drop reference acquired by bus_find_device in the server thread */
231 put_device(&cdev->dev);
232 return rc;
233}
234
235/*
236 * Work function for handling work for a SCIF connection from a particular MIC
237 * card. It first sets the card state to ONLINE and then calls scif_poll to
238 * block on activity such as incoming messages on the SCIF endpoint. When the
239 * endpoint is closed, the work function exits, completing its life cycle, from
240 * MIC card boot to card shutdown/reset/crash.
241 */
242void cosm_scif_work(struct work_struct *work)
243{
244 struct cosm_device *cdev = container_of(work, struct cosm_device,
245 scif_work);
246 struct scif_pollepd pollepd;
247 int rc;
248
249 mutex_lock(&cdev->cosm_mutex);
250 if (cosm_set_online(cdev))
251 goto exit;
252
253 while (1) {
254 pollepd.epd = cdev->epd;
255 pollepd.events = POLLIN;
256
257 /* Drop the mutex before blocking in scif_poll(..) */
258 mutex_unlock(&cdev->cosm_mutex);
259 /* poll(..) with timeout on our endpoint */
260 rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
261 mutex_lock(&cdev->cosm_mutex);
262 if (rc < 0) {
263 dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
264 __func__, __LINE__, rc);
265 continue;
266 }
267
268 /* There is a message from the card */
269 if (pollepd.revents & POLLIN)
270 cosm_scif_recv(cdev);
271
272 /* The peer endpoint is closed or this endpoint disconnected */
273 if (pollepd.revents & POLLHUP) {
274 cosm_scif_close(cdev);
275 break;
276 }
277
278 /* Did we timeout from poll? */
279 if (!rc && cdev->heartbeat_watchdog_enable)
280 cosm_set_crashed(cdev);
281 }
282exit:
283 dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
284 mutex_unlock(&cdev->cosm_mutex);
285}
286
287/*
288 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
289 * cards, finds the correct cosm_device to associate that connection with and
290 * schedules individual work items for each MIC card.
291 */
292static int cosm_scif_server(void *unused)
293{
294 struct cosm_device *cdev;
295 scif_epd_t newepd;
296 struct scif_port_id port_id;
297 int rc;
298
299 allow_signal(SIGKILL);
300
301 while (!kthread_should_stop()) {
302 rc = scif_accept(listen_epd, &port_id, &newepd,
303 SCIF_ACCEPT_SYNC);
304 if (rc < 0) {
305 if (-ERESTARTSYS != rc)
306 pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
307 continue;
308 }
309
310 /*
311 * Associate the incoming connection with a particular
312 * cosm_device, COSM device ID == SCIF node ID - 1
313 */
314 cdev = cosm_find_cdev_by_id(port_id.node - 1);
315 if (!cdev)
316 continue;
317 cdev->newepd = newepd;
318 schedule_work(&cdev->scif_work);
319 }
320
321 pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
322 return 0;
323}
324
325static int cosm_scif_listen(void)
326{
327 int rc;
328
329 listen_epd = scif_open();
330 if (!listen_epd) {
331 pr_err("%s %d scif_open failed\n", __func__, __LINE__);
332 return -ENOMEM;
333 }
334
335 rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
336 if (rc < 0) {
337 pr_err("%s %d scif_bind failed rc %d\n",
338 __func__, __LINE__, rc);
339 goto err;
340 }
341
342 rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
343 if (rc < 0) {
344 pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
345 goto err;
346 }
347 pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
348 return 0;
349err:
350 scif_close(listen_epd);
351 listen_epd = NULL;
352 return rc;
353}
354
355static void cosm_scif_listen_exit(void)
356{
357 pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
358 if (listen_epd) {
359 scif_close(listen_epd);
360 listen_epd = NULL;
361 }
362}
363
364/*
365 * Create a listening SCIF endpoint and a server kthread which accepts incoming
366 * SCIF connections from MIC cards
367 */
368int cosm_scif_init(void)
369{
370 int rc = cosm_scif_listen();
371
372 if (rc) {
373 pr_err("%s %d cosm_scif_listen rc %d\n",
374 __func__, __LINE__, rc);
375 goto err;
376 }
377
378 server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
379 if (IS_ERR(server_thread)) {
380 rc = PTR_ERR(server_thread);
381 pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
382 goto listen_exit;
383 }
384 return 0;
385listen_exit:
386 cosm_scif_listen_exit();
387err:
388 return rc;
389}
390
391/* Stop the running server thread and close the listening SCIF endpoint */
392void cosm_scif_exit(void)
393{
394 int rc;
395
396 if (!IS_ERR_OR_NULL(server_thread)) {
397 rc = send_sig(SIGKILL, server_thread, 0);
398 if (rc) {
399 pr_err("%s %d send_sig rc %d\n",
400 __func__, __LINE__, rc);
401 return;
402 }
403 kthread_stop(server_thread);
404 }
405
406 cosm_scif_listen_exit();
407}