]>
Commit | Line | Data |
---|---|---|
00e0f34c AG |
1 | /* |
2 | * Copyright (c) 2006 Oracle. All rights reserved. | |
3 | * | |
4 | * This software is available to you under a choice of one of two | |
5 | * licenses. You may choose to be licensed under the terms of the GNU | |
6 | * General Public License (GPL) Version 2, available from the file | |
7 | * COPYING in the main directory of this source tree, or the | |
8 | * OpenIB.org BSD license below: | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or | |
11 | * without modification, are permitted provided that the following | |
12 | * conditions are met: | |
13 | * | |
14 | * - Redistributions of source code must retain the above | |
15 | * copyright notice, this list of conditions and the following | |
16 | * disclaimer. | |
17 | * | |
18 | * - Redistributions in binary form must reproduce the above | |
19 | * copyright notice, this list of conditions and the following | |
20 | * disclaimer in the documentation and/or other materials | |
21 | * provided with the distribution. | |
22 | * | |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
30 | * SOFTWARE. | |
31 | * | |
32 | */ | |
33 | #include <linux/kernel.h> | |
34 | #include <linux/random.h> | |
bc3b2d7f | 35 | #include <linux/export.h> |
00e0f34c AG |
36 | |
37 | #include "rds.h" | |
38 | ||
39 | /* | |
40 | * All of connection management is simplified by serializing it through | |
41 | * work queues that execute in a connection managing thread. | |
42 | * | |
43 | * TCP wants to send acks through sendpage() in response to data_ready(), | |
44 | * but it needs a process context to do so. | |
45 | * | |
46 | * The receive paths need to allocate but can't drop packets (!) so we have | |
47 | * a thread around to block allocating if the receive fast path sees an | |
48 | * allocation failure. | |
49 | */ | |
50 | ||
51 | /* Grand Unified Theory of connection life cycle: | |
52 | * At any point in time, the connection can be in one of these states: | |
53 | * DOWN, CONNECTING, UP, DISCONNECTING, ERROR | |
54 | * | |
55 | * The following transitions are possible: | |
56 | * ANY -> ERROR | |
57 | * UP -> DISCONNECTING | |
58 | * ERROR -> DISCONNECTING | |
59 | * DISCONNECTING -> DOWN | |
60 | * DOWN -> CONNECTING | |
61 | * CONNECTING -> UP | |
62 | * | |
63 | * Transition to state DISCONNECTING/DOWN: | |
64 | * - Inside the shutdown worker; synchronizes with xmit path | |
0f4b1c7e | 65 | * through RDS_IN_XMIT, and with connection management callbacks |
00e0f34c AG |
66 | * via c_cm_lock. |
67 | * | |
68 | * For receive callbacks, we rely on the underlying transport | |
69 | * (TCP, IB/RDMA) to provide the necessary synchronisation. | |
70 | */ | |
71 | struct workqueue_struct *rds_wq; | |
616b757a | 72 | EXPORT_SYMBOL_GPL(rds_wq); |
00e0f34c | 73 | |
0cb43965 | 74 | void rds_connect_path_complete(struct rds_conn_path *cp, int curr) |
00e0f34c | 75 | { |
0cb43965 | 76 | if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { |
00e0f34c AG |
77 | printk(KERN_WARNING "%s: Cannot transition to state UP, " |
78 | "current state is %d\n", | |
79 | __func__, | |
0cb43965 SV |
80 | atomic_read(&cp->cp_state)); |
81 | rds_conn_path_drop(cp); | |
00e0f34c AG |
82 | return; |
83 | } | |
84 | ||
85 | rdsdebug("conn %p for %pI4 to %pI4 complete\n", | |
0cb43965 | 86 | cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); |
00e0f34c | 87 | |
0cb43965 SV |
88 | cp->cp_reconnect_jiffies = 0; |
89 | set_bit(0, &cp->cp_conn->c_map_queued); | |
90 | queue_delayed_work(rds_wq, &cp->cp_send_w, 0); | |
91 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); | |
00e0f34c | 92 | } |
9c79440e SV |
93 | EXPORT_SYMBOL_GPL(rds_connect_path_complete); |
94 | ||
95 | void rds_connect_complete(struct rds_connection *conn) | |
96 | { | |
0cb43965 | 97 | rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); |
9c79440e | 98 | } |
616b757a | 99 | EXPORT_SYMBOL_GPL(rds_connect_complete); |
00e0f34c AG |
100 | |
101 | /* | |
102 | * This random exponential backoff is relied on to eventually resolve racing | |
103 | * connects. | |
104 | * | |
105 | * If connect attempts race then both parties drop both connections and come | |
106 | * here to wait for a random amount of time before trying again. Eventually | |
107 | * the backoff range will be so much greater than the time it takes to | |
108 | * establish a connection that one of the pair will establish the connection | |
109 | * before the other's random delay fires. | |
110 | * | |
111 | * Connection attempts that arrive while a connection is already established | |
112 | * are also considered to be racing connects. This lets a connection from | |
113 | * a rebooted machine replace an existing stale connection before the transport | |
114 | * notices that the connection has failed. | |
115 | * | |
116 | * We should *always* start with a random backoff; otherwise a broken connection | |
117 | * will always take several iterations to be re-established. | |
118 | */ | |
0cb43965 | 119 | void rds_queue_reconnect(struct rds_conn_path *cp) |
00e0f34c AG |
120 | { |
121 | unsigned long rand; | |
0cb43965 | 122 | struct rds_connection *conn = cp->cp_conn; |
00e0f34c AG |
123 | |
124 | rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", | |
125 | conn, &conn->c_laddr, &conn->c_faddr, | |
0cb43965 | 126 | cp->cp_reconnect_jiffies); |
00e0f34c | 127 | |
8315011a SV |
128 | /* let peer with smaller addr initiate reconnect, to avoid duels */ |
129 | if (conn->c_trans->t_type == RDS_TRANS_TCP && | |
130 | conn->c_laddr > conn->c_faddr) | |
131 | return; | |
132 | ||
0cb43965 SV |
133 | set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); |
134 | if (cp->cp_reconnect_jiffies == 0) { | |
135 | cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; | |
136 | queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); | |
00e0f34c AG |
137 | return; |
138 | } | |
139 | ||
140 | get_random_bytes(&rand, sizeof(rand)); | |
141 | rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", | |
0cb43965 | 142 | rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, |
00e0f34c | 143 | conn, &conn->c_laddr, &conn->c_faddr); |
0cb43965 SV |
144 | queue_delayed_work(rds_wq, &cp->cp_conn_w, |
145 | rand % cp->cp_reconnect_jiffies); | |
00e0f34c | 146 | |
0cb43965 | 147 | cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, |
00e0f34c AG |
148 | rds_sysctl_reconnect_max_jiffies); |
149 | } | |
150 | ||
151 | void rds_connect_worker(struct work_struct *work) | |
152 | { | |
0cb43965 SV |
153 | struct rds_conn_path *cp = container_of(work, |
154 | struct rds_conn_path, | |
155 | cp_conn_w.work); | |
156 | struct rds_connection *conn = cp->cp_conn; | |
00e0f34c AG |
157 | int ret; |
158 | ||
5916e2c1 SV |
159 | if (cp->cp_index > 1 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) |
160 | return; | |
0cb43965 | 161 | clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); |
b04e8554 SV |
162 | ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); |
163 | if (ret) { | |
164 | ret = conn->c_trans->conn_path_connect(cp); | |
00e0f34c AG |
165 | rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", |
166 | conn, &conn->c_laddr, &conn->c_faddr, ret); | |
167 | ||
168 | if (ret) { | |
0cb43965 SV |
169 | if (rds_conn_path_transition(cp, |
170 | RDS_CONN_CONNECTING, | |
171 | RDS_CONN_DOWN)) | |
172 | rds_queue_reconnect(cp); | |
00e0f34c | 173 | else |
fb1b3dc4 SV |
174 | rds_conn_path_error(cp, |
175 | "RDS: connect failed\n"); | |
00e0f34c AG |
176 | } |
177 | } | |
178 | } | |
179 | ||
00e0f34c AG |
180 | void rds_send_worker(struct work_struct *work) |
181 | { | |
0cb43965 SV |
182 | struct rds_conn_path *cp = container_of(work, |
183 | struct rds_conn_path, | |
184 | cp_send_w.work); | |
00e0f34c AG |
185 | int ret; |
186 | ||
0cb43965 SV |
187 | if (rds_conn_path_state(cp) == RDS_CONN_UP) { |
188 | clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); | |
1f9ecd7e | 189 | ret = rds_send_xmit(cp); |
db6526dc | 190 | cond_resched(); |
0cb43965 | 191 | rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); |
00e0f34c AG |
192 | switch (ret) { |
193 | case -EAGAIN: | |
194 | rds_stats_inc(s_send_immediate_retry); | |
0cb43965 | 195 | queue_delayed_work(rds_wq, &cp->cp_send_w, 0); |
00e0f34c AG |
196 | break; |
197 | case -ENOMEM: | |
198 | rds_stats_inc(s_send_delayed_retry); | |
0cb43965 | 199 | queue_delayed_work(rds_wq, &cp->cp_send_w, 2); |
00e0f34c AG |
200 | default: |
201 | break; | |
202 | } | |
203 | } | |
204 | } | |
205 | ||
206 | void rds_recv_worker(struct work_struct *work) | |
207 | { | |
0cb43965 SV |
208 | struct rds_conn_path *cp = container_of(work, |
209 | struct rds_conn_path, | |
210 | cp_recv_w.work); | |
00e0f34c AG |
211 | int ret; |
212 | ||
0cb43965 | 213 | if (rds_conn_path_state(cp) == RDS_CONN_UP) { |
2da43c4a | 214 | ret = cp->cp_conn->c_trans->recv_path(cp); |
0cb43965 | 215 | rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); |
00e0f34c AG |
216 | switch (ret) { |
217 | case -EAGAIN: | |
218 | rds_stats_inc(s_recv_immediate_retry); | |
0cb43965 | 219 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); |
00e0f34c AG |
220 | break; |
221 | case -ENOMEM: | |
222 | rds_stats_inc(s_recv_delayed_retry); | |
0cb43965 | 223 | queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); |
00e0f34c AG |
224 | default: |
225 | break; | |
226 | } | |
227 | } | |
228 | } | |
229 | ||
2dc39357 AG |
230 | void rds_shutdown_worker(struct work_struct *work) |
231 | { | |
0cb43965 SV |
232 | struct rds_conn_path *cp = container_of(work, |
233 | struct rds_conn_path, | |
234 | cp_down_w); | |
2dc39357 | 235 | |
d769ef81 | 236 | rds_conn_shutdown(cp); |
2dc39357 AG |
237 | } |
238 | ||
00e0f34c AG |
239 | void rds_threads_exit(void) |
240 | { | |
241 | destroy_workqueue(rds_wq); | |
242 | } | |
243 | ||
ef87b7ea | 244 | int rds_threads_init(void) |
00e0f34c | 245 | { |
80c51be5 | 246 | rds_wq = create_singlethread_workqueue("krdsd"); |
8690bfa1 | 247 | if (!rds_wq) |
00e0f34c AG |
248 | return -ENOMEM; |
249 | ||
250 | return 0; | |
251 | } |