]>
Commit | Line | Data |
---|---|---|
6727b613 AD |
1 | /* |
2 | * Intel MIC Platform Software Stack (MPSS) | |
3 | * | |
4 | * Copyright(c) 2015 Intel Corporation. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License, version 2, as | |
8 | * published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License for more details. | |
14 | * | |
15 | * The full GNU General Public License is included in this distribution in | |
16 | * the file called "COPYING". | |
17 | * | |
18 | * Intel MIC Coprocessor State Management (COSM) Driver | |
19 | * | |
20 | */ | |
21 | #include <linux/kthread.h> | |
3f07c014 IM |
22 | #include <linux/sched/signal.h> |
23 | ||
6727b613 AD |
24 | #include "cosm_main.h" |
25 | ||
26 | /* | |
27 | * The COSM driver uses SCIF to communicate between the management node and the | |
28 | * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b) | |
29 | * receive a shutdown status back from the card upon completion of shutdown and | |
30 | * (c) receive periodic heartbeat messages from the card used to deduce if the | |
31 | * card has crashed. | |
32 | * | |
33 | * A COSM server consisting of a SCIF listening endpoint waits for incoming | |
34 | * connections from the card. Upon acceptance of the connection, a separate | |
35 | * work-item is scheduled to handle SCIF message processing for that card. The | |
36 | * life-time of this work-item is therefore the time from which the connection | |
37 | * from a card is accepted to the time at which the connection is closed. A new | |
38 | * work-item starts each time the card boots and is alive till the card (a) | |
39 | * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is | |
40 | * unloaded. | |
41 | * | |
42 | * From the point of view of COSM, the interactions with SCIF during card | |
43 | * shutdown, reset and crash are as follows: | |
44 | * | |
45 | * Card shutdown | |
46 | * ------------- | |
47 | * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN | |
48 | * message from the host. | |
49 | * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting | |
50 | * in scif_remove(..) getting called on the card | |
51 | * 3. scif_remove -> scif_stop -> scif_handle_remove_node -> | |
52 | * scif_peer_unregister_device -> device_unregister for the host peer device | |
53 | * 4. During device_unregister remove(..) method of cosm_client is invoked which | |
54 | * closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT | |
55 | * message being sent to host SCIF. SCIF_DISCNCT message processing on the | |
56 | * host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes | |
57 | * up the host COSM thread blocked in scif_poll(..) resulting in | |
58 | * scif_poll(..) returning POLLHUP. | |
59 | * 5. On the card, scif_peer_release_dev is next called which results in an | |
60 | * SCIF_EXIT message being sent to the host and after receiving the | |
61 | * SCIF_EXIT_ACK from the host the peer device teardown on the card is | |
62 | * complete. | |
63 | * 6. As part of the SCIF_EXIT message processing on the host, host sends a | |
64 | * SCIF_REMOVE_NODE to itself corresponding to the card being removed. This | |
65 | * starts a similar SCIF peer device teardown sequence on the host | |
66 | * corresponding to the card being shut down. | |
67 | * | |
68 | * Card reset | |
69 | * ---------- | |
70 | * The case of interest here is when the card has not been previously shut down | |
71 | * since most of the steps below are skipped in that case: | |
72 | ||
73 | * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver | |
74 | * which unregisters the SCIF HW device resulting in scif_remove(..) being | |
75 | * called on the host. | |
76 | * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a | |
77 | * SCIF_EXIT message being sent to the card. | |
78 | * 3. The card executes scif_stop() as part of SCIF_EXIT message | |
79 | * processing. This results in the COSM endpoint on the card being closed and | |
80 | * the SCIF host peer device on the card getting unregistered similar to | |
81 | * steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the | |
82 | * host returns POLLHUP as a result. | |
83 | * 4. On the host, card peer device unregister and SCIF HW remove(..) also | |
84 | * subsequently complete. | |
85 | * | |
86 | * Card crash | |
87 | * ---------- | |
88 | * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT | |
89 | * message from the card which would result in scif_poll(..) returning | |
90 | * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE | |
91 | * message to itself resulting in the card SCIF peer device being unregistered, | |
92 | * this results in a scif_peer_release_dev -> scif_cleanup_scifdev-> | |
93 | * scif_invalidate_ep call sequence which sets the endpoint state to | |
94 | * DISCONNECTED and results in scif_poll(..) returning POLLHUP. | |
95 | */ | |
96 | ||
97 | #define COSM_SCIF_BACKLOG 16 | |
98 | #define COSM_HEARTBEAT_CHECK_DELTA_SEC 10 | |
99 | #define COSM_HEARTBEAT_TIMEOUT_SEC \ | |
100 | (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC) | |
101 | #define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC) | |
102 | ||
103 | static struct task_struct *server_thread; | |
104 | static scif_epd_t listen_epd; | |
105 | ||
106 | /* Publish MIC card's shutdown status to user space MIC daemon */ | |
107 | static void cosm_update_mic_status(struct cosm_device *cdev) | |
108 | { | |
109 | if (cdev->shutdown_status_int != MIC_NOP) { | |
110 | cosm_set_shutdown_status(cdev, cdev->shutdown_status_int); | |
111 | cdev->shutdown_status_int = MIC_NOP; | |
112 | } | |
113 | } | |
114 | ||
115 | /* Store MIC card's shutdown status internally when it is received */ | |
116 | static void cosm_shutdown_status_int(struct cosm_device *cdev, | |
117 | enum mic_status shutdown_status) | |
118 | { | |
119 | switch (shutdown_status) { | |
120 | case MIC_HALTED: | |
121 | case MIC_POWER_OFF: | |
122 | case MIC_RESTART: | |
123 | case MIC_CRASHED: | |
124 | break; | |
125 | default: | |
126 | dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n", | |
127 | __func__, __LINE__, shutdown_status); | |
128 | return; | |
129 | }; | |
130 | cdev->shutdown_status_int = shutdown_status; | |
131 | cdev->heartbeat_watchdog_enable = false; | |
132 | ||
133 | if (cdev->state != MIC_SHUTTING_DOWN) | |
134 | cosm_set_state(cdev, MIC_SHUTTING_DOWN); | |
135 | } | |
136 | ||
137 | /* Non-blocking recv. Read and process all available messages */ | |
138 | static void cosm_scif_recv(struct cosm_device *cdev) | |
139 | { | |
140 | struct cosm_msg msg; | |
141 | int rc; | |
142 | ||
143 | while (1) { | |
144 | rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0); | |
145 | if (!rc) { | |
146 | break; | |
147 | } else if (rc < 0) { | |
148 | dev_dbg(&cdev->dev, "%s: %d rc %d\n", | |
149 | __func__, __LINE__, rc); | |
150 | break; | |
151 | } | |
152 | dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n", | |
153 | __func__, __LINE__, rc, msg.id); | |
154 | ||
155 | switch (msg.id) { | |
156 | case COSM_MSG_SHUTDOWN_STATUS: | |
157 | cosm_shutdown_status_int(cdev, msg.shutdown_status); | |
158 | break; | |
159 | case COSM_MSG_HEARTBEAT: | |
160 | /* Nothing to do, heartbeat only unblocks scif_poll */ | |
161 | break; | |
162 | default: | |
163 | dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n", | |
164 | __func__, __LINE__, msg.id); | |
165 | break; | |
166 | } | |
167 | } | |
168 | } | |
169 | ||
170 | /* Publish crashed status for this MIC card */ | |
171 | static void cosm_set_crashed(struct cosm_device *cdev) | |
172 | { | |
173 | dev_err(&cdev->dev, "node alive timeout\n"); | |
174 | cosm_shutdown_status_int(cdev, MIC_CRASHED); | |
175 | cosm_update_mic_status(cdev); | |
176 | } | |
177 | ||
178 | /* Send host time to the MIC card to sync system time between host and MIC */ | |
179 | static void cosm_send_time(struct cosm_device *cdev) | |
180 | { | |
181 | struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME }; | |
182 | int rc; | |
183 | ||
184 | getnstimeofday64(&msg.timespec); | |
185 | rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); | |
186 | if (rc < 0) | |
187 | dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n", | |
188 | __func__, __LINE__, rc); | |
189 | } | |
190 | ||
191 | /* | |
192 | * Close this cosm_device's endpoint after its peer endpoint on the card has | |
193 | * been closed. In all cases except MIC card crash POLLHUP on the host is | |
194 | * triggered by the client's endpoint being closed. | |
195 | */ | |
196 | static void cosm_scif_close(struct cosm_device *cdev) | |
197 | { | |
198 | /* | |
199 | * Because SHUTDOWN_STATUS message is sent by the MIC cards in the | |
200 | * reboot notifier when shutdown is still not complete, we notify mpssd | |
201 | * to reset the card when SCIF endpoint is closed. | |
202 | */ | |
203 | cosm_update_mic_status(cdev); | |
204 | scif_close(cdev->epd); | |
205 | cdev->epd = NULL; | |
206 | dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); | |
207 | } | |
208 | ||
209 | /* | |
210 | * Set card state to ONLINE when a new SCIF connection from a MIC card is | |
211 | * received. Normally the state is BOOTING when the connection comes in, but can | |
212 | * be ONLINE if cosm_client driver on the card was unloaded and then reloaded. | |
213 | */ | |
214 | static int cosm_set_online(struct cosm_device *cdev) | |
215 | { | |
216 | int rc = 0; | |
217 | ||
218 | if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) { | |
219 | cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable; | |
220 | cdev->epd = cdev->newepd; | |
221 | if (cdev->state == MIC_BOOTING) | |
222 | cosm_set_state(cdev, MIC_ONLINE); | |
223 | cosm_send_time(cdev); | |
224 | dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); | |
225 | } else { | |
226 | dev_warn(&cdev->dev, "%s %d not going online in state: %s\n", | |
227 | __func__, __LINE__, cosm_state_string[cdev->state]); | |
228 | rc = -EINVAL; | |
229 | } | |
230 | /* Drop reference acquired by bus_find_device in the server thread */ | |
231 | put_device(&cdev->dev); | |
232 | return rc; | |
233 | } | |
234 | ||
235 | /* | |
236 | * Work function for handling work for a SCIF connection from a particular MIC | |
237 | * card. It first sets the card state to ONLINE and then calls scif_poll to | |
238 | * block on activity such as incoming messages on the SCIF endpoint. When the | |
239 | * endpoint is closed, the work function exits, completing its life cycle, from | |
240 | * MIC card boot to card shutdown/reset/crash. | |
241 | */ | |
242 | void cosm_scif_work(struct work_struct *work) | |
243 | { | |
244 | struct cosm_device *cdev = container_of(work, struct cosm_device, | |
245 | scif_work); | |
246 | struct scif_pollepd pollepd; | |
247 | int rc; | |
248 | ||
249 | mutex_lock(&cdev->cosm_mutex); | |
250 | if (cosm_set_online(cdev)) | |
251 | goto exit; | |
252 | ||
253 | while (1) { | |
254 | pollepd.epd = cdev->epd; | |
255 | pollepd.events = POLLIN; | |
256 | ||
257 | /* Drop the mutex before blocking in scif_poll(..) */ | |
258 | mutex_unlock(&cdev->cosm_mutex); | |
259 | /* poll(..) with timeout on our endpoint */ | |
260 | rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC); | |
261 | mutex_lock(&cdev->cosm_mutex); | |
262 | if (rc < 0) { | |
263 | dev_err(&cdev->dev, "%s %d scif_poll rc %d\n", | |
264 | __func__, __LINE__, rc); | |
265 | continue; | |
266 | } | |
267 | ||
268 | /* There is a message from the card */ | |
269 | if (pollepd.revents & POLLIN) | |
270 | cosm_scif_recv(cdev); | |
271 | ||
272 | /* The peer endpoint is closed or this endpoint disconnected */ | |
273 | if (pollepd.revents & POLLHUP) { | |
274 | cosm_scif_close(cdev); | |
275 | break; | |
276 | } | |
277 | ||
278 | /* Did we timeout from poll? */ | |
279 | if (!rc && cdev->heartbeat_watchdog_enable) | |
280 | cosm_set_crashed(cdev); | |
281 | } | |
282 | exit: | |
283 | dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__); | |
284 | mutex_unlock(&cdev->cosm_mutex); | |
285 | } | |
286 | ||
287 | /* | |
288 | * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC | |
289 | * cards, finds the correct cosm_device to associate that connection with and | |
290 | * schedules individual work items for each MIC card. | |
291 | */ | |
292 | static int cosm_scif_server(void *unused) | |
293 | { | |
294 | struct cosm_device *cdev; | |
295 | scif_epd_t newepd; | |
296 | struct scif_port_id port_id; | |
297 | int rc; | |
298 | ||
299 | allow_signal(SIGKILL); | |
300 | ||
301 | while (!kthread_should_stop()) { | |
302 | rc = scif_accept(listen_epd, &port_id, &newepd, | |
303 | SCIF_ACCEPT_SYNC); | |
304 | if (rc < 0) { | |
305 | if (-ERESTARTSYS != rc) | |
306 | pr_err("%s %d rc %d\n", __func__, __LINE__, rc); | |
307 | continue; | |
308 | } | |
309 | ||
310 | /* | |
311 | * Associate the incoming connection with a particular | |
312 | * cosm_device, COSM device ID == SCIF node ID - 1 | |
313 | */ | |
314 | cdev = cosm_find_cdev_by_id(port_id.node - 1); | |
315 | if (!cdev) | |
316 | continue; | |
317 | cdev->newepd = newepd; | |
318 | schedule_work(&cdev->scif_work); | |
319 | } | |
320 | ||
321 | pr_debug("%s %d Server thread stopped\n", __func__, __LINE__); | |
322 | return 0; | |
323 | } | |
324 | ||
325 | static int cosm_scif_listen(void) | |
326 | { | |
327 | int rc; | |
328 | ||
329 | listen_epd = scif_open(); | |
330 | if (!listen_epd) { | |
331 | pr_err("%s %d scif_open failed\n", __func__, __LINE__); | |
332 | return -ENOMEM; | |
333 | } | |
334 | ||
335 | rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT); | |
336 | if (rc < 0) { | |
337 | pr_err("%s %d scif_bind failed rc %d\n", | |
338 | __func__, __LINE__, rc); | |
339 | goto err; | |
340 | } | |
341 | ||
342 | rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG); | |
343 | if (rc < 0) { | |
344 | pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc); | |
345 | goto err; | |
346 | } | |
347 | pr_debug("%s %d listen_epd set up\n", __func__, __LINE__); | |
348 | return 0; | |
349 | err: | |
350 | scif_close(listen_epd); | |
351 | listen_epd = NULL; | |
352 | return rc; | |
353 | } | |
354 | ||
355 | static void cosm_scif_listen_exit(void) | |
356 | { | |
357 | pr_debug("%s %d closing listen_epd\n", __func__, __LINE__); | |
358 | if (listen_epd) { | |
359 | scif_close(listen_epd); | |
360 | listen_epd = NULL; | |
361 | } | |
362 | } | |
363 | ||
364 | /* | |
365 | * Create a listening SCIF endpoint and a server kthread which accepts incoming | |
366 | * SCIF connections from MIC cards | |
367 | */ | |
368 | int cosm_scif_init(void) | |
369 | { | |
370 | int rc = cosm_scif_listen(); | |
371 | ||
372 | if (rc) { | |
373 | pr_err("%s %d cosm_scif_listen rc %d\n", | |
374 | __func__, __LINE__, rc); | |
375 | goto err; | |
376 | } | |
377 | ||
378 | server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server"); | |
379 | if (IS_ERR(server_thread)) { | |
380 | rc = PTR_ERR(server_thread); | |
381 | pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc); | |
382 | goto listen_exit; | |
383 | } | |
384 | return 0; | |
385 | listen_exit: | |
386 | cosm_scif_listen_exit(); | |
387 | err: | |
388 | return rc; | |
389 | } | |
390 | ||
391 | /* Stop the running server thread and close the listening SCIF endpoint */ | |
392 | void cosm_scif_exit(void) | |
393 | { | |
394 | int rc; | |
395 | ||
396 | if (!IS_ERR_OR_NULL(server_thread)) { | |
397 | rc = send_sig(SIGKILL, server_thread, 0); | |
398 | if (rc) { | |
399 | pr_err("%s %d send_sig rc %d\n", | |
400 | __func__, __LINE__, rc); | |
401 | return; | |
402 | } | |
403 | kthread_stop(server_thread); | |
404 | } | |
405 | ||
406 | cosm_scif_listen_exit(); | |
407 | } |