path: root/net/smc
author     We-unite <3205135446@qq.com>    2025-03-08 22:04:20 +0800
committer  We-unite <3205135446@qq.com>    2025-03-08 22:04:20 +0800
commit     a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch)
tree       84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /net/smc
download   ohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.tar.gz
           ohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.zip
Initial commit: OpenHarmony-v4.0-Release
Diffstat (limited to 'net/smc')
-rw-r--r--  net/smc/Kconfig        21
-rw-r--r--  net/smc/Makefile        5
-rw-r--r--  net/smc/af_smc.c     2624
-rw-r--r--  net/smc/smc.h         300
-rw-r--r--  net/smc/smc_cdc.c     476
-rw-r--r--  net/smc/smc_cdc.h     305
-rw-r--r--  net/smc/smc_clc.c     784
-rw-r--r--  net/smc/smc_clc.h     333
-rw-r--r--  net/smc/smc_close.c   499
-rw-r--r--  net/smc/smc_close.h    30
-rw-r--r--  net/smc/smc_core.c   1973
-rw-r--r--  net/smc/smc_core.h    425
-rw-r--r--  net/smc/smc_diag.c    283
-rw-r--r--  net/smc/smc_ib.c      643
-rw-r--r--  net/smc/smc_ib.h       91
-rw-r--r--  net/smc/smc_ism.c     439
-rw-r--r--  net/smc/smc_ism.h      56
-rw-r--r--  net/smc/smc_llc.c    1974
-rw-r--r--  net/smc/smc_llc.h     109
-rw-r--r--  net/smc/smc_netns.h    21
-rw-r--r--  net/smc/smc_pnet.c   1174
-rw-r--r--  net/smc/smc_pnet.h     70
-rw-r--r--  net/smc/smc_rx.c      444
-rw-r--r--  net/smc/smc_rx.h       31
-rw-r--r--  net/smc/smc_tx.c      646
-rw-r--r--  net/smc/smc_tx.h       39
-rw-r--r--  net/smc/smc_wr.c      720
-rw-r--r--  net/smc/smc_wr.h      131
28 files changed, 14646 insertions, 0 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000..1ab3c5a2c
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,21 @@
1# SPDX-License-Identifier: GPL-2.0-only
2config SMC
3 tristate "SMC socket protocol family"
4 depends on INET && INFINIBAND
5 help
6 SMC-R provides a "sockets over RDMA" solution making use of
7 RDMA over Converged Ethernet (RoCE) technology to upgrade
8 AF_INET TCP connections transparently.
9 The Linux implementation of the SMC-R solution is designed as
10 a separate socket family SMC.
11
12	  Select this option if you want to run SMC socket applications.
13
14config SMC_DIAG
15 tristate "SMC: socket monitoring interface"
16 depends on SMC
17 help
18 Support for SMC socket monitoring interface used by tools such as
19 smcss.
20
21	  If unsure, say Y.
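
The help text above describes SMC-R as a separate socket family that transparently upgrades AF_INET TCP connections. A minimal client sketch (illustrative only, not part of this commit; the numeric AF_SMC/SMCPROTO_SMC fallbacks, peer address and port are assumptions) shows the explicit user-space side — when no SMC path is available the connection simply proceeds as plain TCP:

/* minimal AF_SMC client sketch; error handling trimmed */
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef AF_SMC
#define AF_SMC 43          /* assumed value if the libc headers lack it */
#endif
#ifndef SMCPROTO_SMC
#define SMCPROTO_SMC 0     /* SMC over IPv4; SMCPROTO_SMC6 for IPv6 */
#endif

int main(void)
{
	struct sockaddr_in peer = { .sin_family = AF_INET,
				    .sin_port = htons(12345) };
	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);

	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);   /* placeholder peer */
	if (fd < 0 || connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		perror("smc connect");
		return 1;
	}
	/* handshake done; if no SMC path exists the socket has fallen back
	 * to TCP and is used exactly like an AF_INET stream socket */
	write(fd, "hello", 5);
	close(fd);
	return 0;
}
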
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000..cb1254541
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SMC) += smc.o
3obj-$(CONFIG_SMC_DIAG) += smc_diag.o
4smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
5smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000..41cbc7c89
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,2624 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
6 * applies to SOCK_STREAM sockets only
7 * offers an alternative communication option for TCP-protocol sockets
8 * applicable with RoCE-cards only
9 *
10 * Initial restrictions:
11 * - support for alternate links postponed
12 *
13 * Copyright IBM Corp. 2016, 2018
14 *
15 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
16 * based on prototype from Frank Blaschka
17 */
18
19#define KMSG_COMPONENT "smc"
20#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21
22#include <linux/module.h>
23#include <linux/socket.h>
24#include <linux/workqueue.h>
25#include <linux/in.h>
26#include <linux/sched/signal.h>
27#include <linux/if_vlan.h>
28#include <linux/rcupdate_wait.h>
29#include <linux/ctype.h>
30
31#include <net/sock.h>
32#include <net/tcp.h>
33#include <net/smc.h>
34#include <asm/ioctls.h>
35
36#include <net/net_namespace.h>
37#include <net/netns/generic.h>
38#include "smc_netns.h"
39
40#include "smc.h"
41#include "smc_clc.h"
42#include "smc_llc.h"
43#include "smc_cdc.h"
44#include "smc_core.h"
45#include "smc_ib.h"
46#include "smc_ism.h"
47#include "smc_pnet.h"
48#include "smc_tx.h"
49#include "smc_rx.h"
50#include "smc_close.h"
51
52static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
53 * creation on server
54 */
55static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
56 * creation on client
57 */
58
59struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
60struct workqueue_struct *smc_close_wq; /* wq for close work */
61
62static void smc_tcp_listen_work(struct work_struct *);
63static void smc_connect_work(struct work_struct *);
64
65static void smc_set_keepalive(struct sock *sk, int val)
66{
67 struct smc_sock *smc = smc_sk(sk);
68
69 smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
70}
71
72static struct smc_hashinfo smc_v4_hashinfo = {
73 .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
74};
75
76static struct smc_hashinfo smc_v6_hashinfo = {
77 .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
78};
79
80int smc_hash_sk(struct sock *sk)
81{
82 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
83 struct hlist_head *head;
84
85 head = &h->ht;
86
87 write_lock_bh(&h->lock);
88 sk_add_node(sk, head);
89 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
90 write_unlock_bh(&h->lock);
91
92 return 0;
93}
94EXPORT_SYMBOL_GPL(smc_hash_sk);
95
96void smc_unhash_sk(struct sock *sk)
97{
98 struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
99
100 write_lock_bh(&h->lock);
101 if (sk_del_node_init(sk))
102 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
103 write_unlock_bh(&h->lock);
104}
105EXPORT_SYMBOL_GPL(smc_unhash_sk);
106
107struct proto smc_proto = {
108 .name = "SMC",
109 .owner = THIS_MODULE,
110 .keepalive = smc_set_keepalive,
111 .hash = smc_hash_sk,
112 .unhash = smc_unhash_sk,
113 .obj_size = sizeof(struct smc_sock),
114 .h.smc_hash = &smc_v4_hashinfo,
115 .slab_flags = SLAB_TYPESAFE_BY_RCU,
116};
117EXPORT_SYMBOL_GPL(smc_proto);
118
119struct proto smc_proto6 = {
120 .name = "SMC6",
121 .owner = THIS_MODULE,
122 .keepalive = smc_set_keepalive,
123 .hash = smc_hash_sk,
124 .unhash = smc_unhash_sk,
125 .obj_size = sizeof(struct smc_sock),
126 .h.smc_hash = &smc_v6_hashinfo,
127 .slab_flags = SLAB_TYPESAFE_BY_RCU,
128};
129EXPORT_SYMBOL_GPL(smc_proto6);
130
131static void smc_restore_fallback_changes(struct smc_sock *smc)
132{
133 if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
134 smc->clcsock->file->private_data = smc->sk.sk_socket;
135 smc->clcsock->file = NULL;
136 }
137}
138
139static int __smc_release(struct smc_sock *smc)
140{
141 struct sock *sk = &smc->sk;
142 int rc = 0;
143
144 if (!smc->use_fallback) {
145 rc = smc_close_active(smc);
146 sock_set_flag(sk, SOCK_DEAD);
147 sk->sk_shutdown |= SHUTDOWN_MASK;
148 } else {
149 if (sk->sk_state != SMC_CLOSED) {
150 if (sk->sk_state != SMC_LISTEN &&
151 sk->sk_state != SMC_INIT)
152 sock_put(sk); /* passive closing */
153 if (sk->sk_state == SMC_LISTEN) {
154 /* wake up clcsock accept */
155 rc = kernel_sock_shutdown(smc->clcsock,
156 SHUT_RDWR);
157 }
158 sk->sk_state = SMC_CLOSED;
159 sk->sk_state_change(sk);
160 }
161 smc_restore_fallback_changes(smc);
162 }
163
164 sk->sk_prot->unhash(sk);
165
166 if (sk->sk_state == SMC_CLOSED) {
167 if (smc->clcsock) {
168 release_sock(sk);
169 smc_clcsock_release(smc);
170 lock_sock(sk);
171 }
172 if (!smc->use_fallback)
173 smc_conn_free(&smc->conn);
174 }
175
176 return rc;
177}
178
179static int smc_release(struct socket *sock)
180{
181 struct sock *sk = sock->sk;
182 struct smc_sock *smc;
183 int old_state, rc = 0;
184
185 if (!sk)
186 goto out;
187
188 sock_hold(sk); /* sock_put below */
189 smc = smc_sk(sk);
190
191 old_state = sk->sk_state;
192
193 /* cleanup for a dangling non-blocking connect */
194 if (smc->connect_nonblock && old_state == SMC_INIT)
195 tcp_abort(smc->clcsock->sk, ECONNABORTED);
196
197 if (cancel_work_sync(&smc->connect_work))
198 sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
199
200 if (sk->sk_state == SMC_LISTEN)
201 /* smc_close_non_accepted() is called and acquires
202 * sock lock for child sockets again
203 */
204 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
205 else
206 lock_sock(sk);
207
208 if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
209 !smc->use_fallback)
210 smc_close_active_abort(smc);
211
212 rc = __smc_release(smc);
213
214 /* detach socket */
215 sock_orphan(sk);
216 sock->sk = NULL;
217 release_sock(sk);
218
219 sock_put(sk); /* sock_hold above */
220 sock_put(sk); /* final sock_put */
221out:
222 return rc;
223}
224
225static void smc_destruct(struct sock *sk)
226{
227 if (sk->sk_state != SMC_CLOSED)
228 return;
229 if (!sock_flag(sk, SOCK_DEAD))
230 return;
231
232 sk_refcnt_debug_dec(sk);
233}
234
235static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
236 int protocol)
237{
238 struct smc_sock *smc;
239 struct proto *prot;
240 struct sock *sk;
241
242 prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
243 sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
244 if (!sk)
245 return NULL;
246
247 sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
248 sk->sk_state = SMC_INIT;
249 sk->sk_destruct = smc_destruct;
250 sk->sk_protocol = protocol;
251 smc = smc_sk(sk);
252 INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
253 INIT_WORK(&smc->connect_work, smc_connect_work);
254 INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
255 INIT_LIST_HEAD(&smc->accept_q);
256 spin_lock_init(&smc->accept_q_lock);
257 spin_lock_init(&smc->conn.send_lock);
258 sk->sk_prot->hash(sk);
259 sk_refcnt_debug_inc(sk);
260 mutex_init(&smc->clcsock_release_lock);
261
262 return sk;
263}
264
265static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
266 int addr_len)
267{
268 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
269 struct sock *sk = sock->sk;
270 struct smc_sock *smc;
271 int rc;
272
273 smc = smc_sk(sk);
274
275 /* replicate tests from inet_bind(), to be safe wrt. future changes */
276 rc = -EINVAL;
277 if (addr_len < sizeof(struct sockaddr_in))
278 goto out;
279
280 rc = -EAFNOSUPPORT;
281 if (addr->sin_family != AF_INET &&
282 addr->sin_family != AF_INET6 &&
283 addr->sin_family != AF_UNSPEC)
284 goto out;
285 /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
286 if (addr->sin_family == AF_UNSPEC &&
287 addr->sin_addr.s_addr != htonl(INADDR_ANY))
288 goto out;
289
290 lock_sock(sk);
291
292 /* Check if socket is already active */
293 rc = -EINVAL;
294 if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
295 goto out_rel;
296
297 smc->clcsock->sk->sk_reuse = sk->sk_reuse;
298 rc = kernel_bind(smc->clcsock, uaddr, addr_len);
299
300out_rel:
301 release_sock(sk);
302out:
303 return rc;
304}
305
306static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
307 unsigned long mask)
308{
309 /* options we don't get control via setsockopt for */
310 nsk->sk_type = osk->sk_type;
311 nsk->sk_sndbuf = osk->sk_sndbuf;
312 nsk->sk_rcvbuf = osk->sk_rcvbuf;
313 nsk->sk_sndtimeo = osk->sk_sndtimeo;
314 nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
315 nsk->sk_mark = osk->sk_mark;
316 nsk->sk_priority = osk->sk_priority;
317 nsk->sk_rcvlowat = osk->sk_rcvlowat;
318 nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
319 nsk->sk_err = osk->sk_err;
320
321 nsk->sk_flags &= ~mask;
322 nsk->sk_flags |= osk->sk_flags & mask;
323}
324
325#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
326 (1UL << SOCK_KEEPOPEN) | \
327 (1UL << SOCK_LINGER) | \
328 (1UL << SOCK_BROADCAST) | \
329 (1UL << SOCK_TIMESTAMP) | \
330 (1UL << SOCK_DBG) | \
331 (1UL << SOCK_RCVTSTAMP) | \
332 (1UL << SOCK_RCVTSTAMPNS) | \
333 (1UL << SOCK_LOCALROUTE) | \
334 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
335 (1UL << SOCK_RXQ_OVFL) | \
336 (1UL << SOCK_WIFI_STATUS) | \
337 (1UL << SOCK_NOFCS) | \
338 (1UL << SOCK_FILTER_LOCKED) | \
339 (1UL << SOCK_TSTAMP_NEW))
340/* copy only relevant settings and flags of SOL_SOCKET level from smc to
341 * clc socket (since smc is not called for these options from net/core)
342 */
343static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
344{
345 smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
346}
347
348#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
349 (1UL << SOCK_KEEPOPEN) | \
350 (1UL << SOCK_LINGER) | \
351 (1UL << SOCK_DBG))
352/* copy only settings and flags relevant for smc from clc to smc socket */
353static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
354{
355 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
356}
357
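/* Illustrative user-space sketch (not part of this file): SOL_SOCKET options
 * are processed in net/core, so they reach the internal CLC/TCP socket only
 * through the .keepalive hook (smc_set_keepalive() above) and the
 * SK_FLAGS_SMC_TO_CLC copy at connect()/listen() time; the application just
 * issues the ordinary call on the SMC fd. */
#include <sys/socket.h>

static int smc_enable_keepalive(int smc_fd)
{
	int on = 1;

	/* mirrored onto the clcsock by smc_set_keepalive() and copied again
	 * via smc_copy_sock_settings_to_clc() at connect()/listen() time */
	return setsockopt(smc_fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
}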
358/* register the new rmb on all links */
359static int smcr_lgr_reg_rmbs(struct smc_link *link,
360 struct smc_buf_desc *rmb_desc)
361{
362 struct smc_link_group *lgr = link->lgr;
363 int i, rc = 0;
364
365 rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
366 if (rc)
367 return rc;
368 /* protect against parallel smc_llc_cli_rkey_exchange() and
369 * parallel smcr_link_reg_rmb()
370 */
371 mutex_lock(&lgr->llc_conf_mutex);
372 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
373 if (!smc_link_active(&lgr->lnk[i]))
374 continue;
375 rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
376 if (rc)
377 goto out;
378 }
379
380 /* exchange confirm_rkey msg with peer */
381 rc = smc_llc_do_confirm_rkey(link, rmb_desc);
382 if (rc) {
383 rc = -EFAULT;
384 goto out;
385 }
386 rmb_desc->is_conf_rkey = true;
387out:
388 mutex_unlock(&lgr->llc_conf_mutex);
389 smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
390 return rc;
391}
392
393static int smcr_clnt_conf_first_link(struct smc_sock *smc)
394{
395 struct smc_link *link = smc->conn.lnk;
396 struct smc_llc_qentry *qentry;
397 int rc;
398
399 /* receive CONFIRM LINK request from server over RoCE fabric */
400 qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
401 SMC_LLC_CONFIRM_LINK);
402 if (!qentry) {
403 struct smc_clc_msg_decline dclc;
404
405 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
406 SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
407 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
408 }
409 smc_llc_save_peer_uid(qentry);
410 rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
411 smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
412 if (rc)
413 return SMC_CLC_DECL_RMBE_EC;
414
415 rc = smc_ib_modify_qp_rts(link);
416 if (rc)
417 return SMC_CLC_DECL_ERR_RDYLNK;
418
419 smc_wr_remember_qp_attr(link);
420
421 if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
422 return SMC_CLC_DECL_ERR_REGRMB;
423
424 /* confirm_rkey is implicit on 1st contact */
425 smc->conn.rmb_desc->is_conf_rkey = true;
426
427 /* send CONFIRM LINK response over RoCE fabric */
428 rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
429 if (rc < 0)
430 return SMC_CLC_DECL_TIMEOUT_CL;
431
432 smc_llc_link_active(link);
433 smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
434
435 /* optional 2nd link, receive ADD LINK request from server */
436 qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
437 SMC_LLC_ADD_LINK);
438 if (!qentry) {
439 struct smc_clc_msg_decline dclc;
440
441 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
442 SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
443 if (rc == -EAGAIN)
444 rc = 0; /* no DECLINE received, go with one link */
445 return rc;
446 }
447 smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
448 smc_llc_cli_add_link(link, qentry);
449 return 0;
450}
451
452static void smcr_conn_save_peer_info(struct smc_sock *smc,
453 struct smc_clc_msg_accept_confirm *clc)
454{
455 int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
456
457 smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
458 smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
459 smc->conn.peer_rmbe_size = bufsize;
460 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
461 smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
462}
463
464static bool smc_isascii(char *hostname)
465{
466 int i;
467
468 for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
469 if (!isascii(hostname[i]))
470 return false;
471 return true;
472}
473
474static void smcd_conn_save_peer_info(struct smc_sock *smc,
475 struct smc_clc_msg_accept_confirm *clc)
476{
477 int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
478
479 smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
480 smc->conn.peer_token = clc->d0.token;
481 /* msg header takes up space in the buffer */
482 smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
483 atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
484 smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
485 if (clc->hdr.version > SMC_V1 &&
486 (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
487 struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
488 (struct smc_clc_msg_accept_confirm_v2 *)clc;
489 struct smc_clc_first_contact_ext *fce =
490 (struct smc_clc_first_contact_ext *)
491 (((u8 *)clc_v2) + sizeof(*clc_v2));
492
493 memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
494 SMC_MAX_EID_LEN);
495 smc->conn.lgr->peer_os = fce->os_type;
496 smc->conn.lgr->peer_smc_release = fce->release;
497 if (smc_isascii(fce->hostname))
498 memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
499 SMC_MAX_HOSTNAME_LEN);
500 }
501}
502
503static void smc_conn_save_peer_info(struct smc_sock *smc,
504 struct smc_clc_msg_accept_confirm *clc)
505{
506 if (smc->conn.lgr->is_smcd)
507 smcd_conn_save_peer_info(smc, clc);
508 else
509 smcr_conn_save_peer_info(smc, clc);
510}
511
512static void smc_link_save_peer_info(struct smc_link *link,
513 struct smc_clc_msg_accept_confirm *clc)
514{
515 link->peer_qpn = ntoh24(clc->r0.qpn);
516 memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
517 memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
518 link->peer_psn = ntoh24(clc->r0.psn);
519 link->peer_mtu = clc->r0.qp_mtu;
520}
521
522static void smc_switch_to_fallback(struct smc_sock *smc)
523{
524 wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
525 wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
526 unsigned long flags;
527
528 smc->use_fallback = true;
529 if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
530 smc->clcsock->file = smc->sk.sk_socket->file;
531 smc->clcsock->file->private_data = smc->clcsock;
532 smc->clcsock->wq.fasync_list =
533 smc->sk.sk_socket->wq.fasync_list;
534
535 /* There may be some entries remaining in
536 * smc socket->wq, which should be removed
537 * to clcsocket->wq during the fallback.
538 */
539 spin_lock_irqsave(&smc_wait->lock, flags);
540 spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
541 list_splice_init(&smc_wait->head, &clc_wait->head);
542 spin_unlock(&clc_wait->lock);
543 spin_unlock_irqrestore(&smc_wait->lock, flags);
544 }
545}
546
547/* fall back during connect */
548static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
549{
550 smc_switch_to_fallback(smc);
551 smc->fallback_rsn = reason_code;
552 smc_copy_sock_settings_to_clc(smc);
553 smc->connect_nonblock = 0;
554 if (smc->sk.sk_state == SMC_INIT)
555 smc->sk.sk_state = SMC_ACTIVE;
556 return 0;
557}
558
559/* decline and fall back during connect */
560static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
561 u8 version)
562{
563 int rc;
564
565 if (reason_code < 0) { /* error, fallback is not possible */
566 if (smc->sk.sk_state == SMC_INIT)
567 sock_put(&smc->sk); /* passive closing */
568 return reason_code;
569 }
570 if (reason_code != SMC_CLC_DECL_PEERDECL) {
571 rc = smc_clc_send_decline(smc, reason_code, version);
572 if (rc < 0) {
573 if (smc->sk.sk_state == SMC_INIT)
574 sock_put(&smc->sk); /* passive closing */
575 return rc;
576 }
577 }
578 return smc_connect_fallback(smc, reason_code);
579}
580
581/* abort connecting */
582static void smc_connect_abort(struct smc_sock *smc, int local_first)
583{
584 if (local_first)
585 smc_lgr_cleanup_early(&smc->conn);
586 else
587 smc_conn_free(&smc->conn);
588}
589
590/* check if there is a rdma device available for this connection. */
591/* called for connect and listen */
592static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
593{
594 /* PNET table look up: search active ib_device and port
595 * within same PNETID that also contains the ethernet device
596 * used for the internal TCP socket
597 */
598 smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
599 if (!ini->ib_dev)
600 return SMC_CLC_DECL_NOSMCRDEV;
601 return 0;
602}
603
604/* check if there is an ISM device available for this connection. */
605/* called for connect and listen */
606static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
607{
608 /* Find ISM device with same PNETID as connecting interface */
609 smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
610 if (!ini->ism_dev[0])
611 return SMC_CLC_DECL_NOSMCDDEV;
612 else
613 ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
614 return 0;
615}
616
617/* is chid unique for the ism devices that are already determined? */
618static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
619 int cnt)
620{
621 int i = (!ini->ism_dev[0]) ? 1 : 0;
622
623 for (; i < cnt; i++)
624 if (ini->ism_chid[i] == chid)
625 return false;
626 return true;
627}
628
629/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
630 * PNETID matching net_device)
631 */
632static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
633 struct smc_init_info *ini)
634{
635 int rc = SMC_CLC_DECL_NOSMCDDEV;
636 struct smcd_dev *smcd;
637 int i = 1;
638 u16 chid;
639
640 if (smcd_indicated(ini->smc_type_v1))
641 rc = 0; /* already initialized for V1 */
642 mutex_lock(&smcd_dev_list.mutex);
643 list_for_each_entry(smcd, &smcd_dev_list.list, list) {
644 if (smcd->going_away || smcd == ini->ism_dev[0])
645 continue;
646 chid = smc_ism_get_chid(smcd);
647 if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
648 continue;
649 if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
650 smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
651 ini->ism_dev[i] = smcd;
652 ini->ism_chid[i] = chid;
653 ini->is_smcd = true;
654 rc = 0;
655 i++;
656 if (i > SMC_MAX_ISM_DEVS)
657 break;
658 }
659 }
660 mutex_unlock(&smcd_dev_list.mutex);
661 ini->ism_offered_cnt = i - 1;
662 if (!ini->ism_dev[0] && !ini->ism_dev[1])
663 ini->smcd_version = 0;
664
665 return rc;
666}
667
668/* Check for VLAN ID and register it on ISM device just for CLC handshake */
669static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
670 struct smc_init_info *ini)
671{
672 if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
673 return SMC_CLC_DECL_ISMVLANERR;
674 return 0;
675}
676
677static int smc_find_proposal_devices(struct smc_sock *smc,
678 struct smc_init_info *ini)
679{
680 int rc = 0;
681
682 /* check if there is an ism device available */
683 if (ini->smcd_version & SMC_V1) {
684 if (smc_find_ism_device(smc, ini) ||
685 smc_connect_ism_vlan_setup(smc, ini)) {
686 if (ini->smc_type_v1 == SMC_TYPE_B)
687 ini->smc_type_v1 = SMC_TYPE_R;
688 else
689 ini->smc_type_v1 = SMC_TYPE_N;
690 } /* else ISM V1 is supported for this connection */
691 if (smc_find_rdma_device(smc, ini)) {
692 if (ini->smc_type_v1 == SMC_TYPE_B)
693 ini->smc_type_v1 = SMC_TYPE_D;
694 else
695 ini->smc_type_v1 = SMC_TYPE_N;
696 } /* else RDMA is supported for this connection */
697 }
698 if (smc_ism_v2_capable && smc_find_ism_v2_device_clnt(smc, ini))
699 ini->smc_type_v2 = SMC_TYPE_N;
700
701 /* if neither ISM nor RDMA are supported, fallback */
702 if (!smcr_indicated(ini->smc_type_v1) &&
703 ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
704 rc = SMC_CLC_DECL_NOSMCDEV;
705
706 return rc;
707}
708
709/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
710 * used, the VLAN ID will be registered again during the connection setup.
711 */
712static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
713 struct smc_init_info *ini)
714{
715 if (!smcd_indicated(ini->smc_type_v1))
716 return 0;
717 if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
718 return SMC_CLC_DECL_CNFERR;
719 return 0;
720}
721
722#define SMC_CLC_MAX_ACCEPT_LEN \
723 (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
724 sizeof(struct smc_clc_first_contact_ext) + \
725 sizeof(struct smc_clc_msg_trail))
726
727/* CLC handshake during connect */
728static int smc_connect_clc(struct smc_sock *smc,
729 struct smc_clc_msg_accept_confirm_v2 *aclc2,
730 struct smc_init_info *ini)
731{
732 int rc = 0;
733
734 /* do inband token exchange */
735 rc = smc_clc_send_proposal(smc, ini);
736 if (rc)
737 return rc;
738 /* receive SMC Accept CLC message */
739 return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
740 SMC_CLC_ACCEPT, CLC_WAIT_TIME);
741}
742
743/* setup for RDMA connection of client */
744static int smc_connect_rdma(struct smc_sock *smc,
745 struct smc_clc_msg_accept_confirm *aclc,
746 struct smc_init_info *ini)
747{
748 int i, reason_code = 0;
749 struct smc_link *link;
750
751 ini->is_smcd = false;
752 ini->ib_lcl = &aclc->r0.lcl;
753 ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
754 ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
755
756 mutex_lock(&smc_client_lgr_pending);
757 reason_code = smc_conn_create(smc, ini);
758 if (reason_code) {
759 mutex_unlock(&smc_client_lgr_pending);
760 return reason_code;
761 }
762
763 smc_conn_save_peer_info(smc, aclc);
764
765 if (ini->first_contact_local) {
766 link = smc->conn.lnk;
767 } else {
768 /* set link that was assigned by server */
769 link = NULL;
770 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
771 struct smc_link *l = &smc->conn.lgr->lnk[i];
772
773 if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
774 !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
775 SMC_GID_SIZE) &&
776 !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
777 sizeof(l->peer_mac))) {
778 link = l;
779 break;
780 }
781 }
782 if (!link) {
783 reason_code = SMC_CLC_DECL_NOSRVLINK;
784 goto connect_abort;
785 }
786 smc->conn.lnk = link;
787 }
788
789 /* create send buffer and rmb */
790 if (smc_buf_create(smc, false)) {
791 reason_code = SMC_CLC_DECL_MEM;
792 goto connect_abort;
793 }
794
795 if (ini->first_contact_local)
796 smc_link_save_peer_info(link, aclc);
797
798 if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
799 reason_code = SMC_CLC_DECL_ERR_RTOK;
800 goto connect_abort;
801 }
802
803 smc_close_init(smc);
804 smc_rx_init(smc);
805
806 if (ini->first_contact_local) {
807 if (smc_ib_ready_link(link)) {
808 reason_code = SMC_CLC_DECL_ERR_RDYLNK;
809 goto connect_abort;
810 }
811 } else {
812 if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
813 reason_code = SMC_CLC_DECL_ERR_REGRMB;
814 goto connect_abort;
815 }
816 }
817 smc_rmb_sync_sg_for_device(&smc->conn);
818
819 reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
820 SMC_V1);
821 if (reason_code)
822 goto connect_abort;
823
824 smc_tx_init(smc);
825
826 if (ini->first_contact_local) {
827 /* QP confirmation over RoCE fabric */
828 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
829 reason_code = smcr_clnt_conf_first_link(smc);
830 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
831 if (reason_code)
832 goto connect_abort;
833 }
834 mutex_unlock(&smc_client_lgr_pending);
835
836 smc_copy_sock_settings_to_clc(smc);
837 smc->connect_nonblock = 0;
838 if (smc->sk.sk_state == SMC_INIT)
839 smc->sk.sk_state = SMC_ACTIVE;
840
841 return 0;
842connect_abort:
843 smc_connect_abort(smc, ini->first_contact_local);
844 mutex_unlock(&smc_client_lgr_pending);
845 smc->connect_nonblock = 0;
846
847 return reason_code;
848}
849
850/* The server has chosen one of the proposed ISM devices for the communication.
851 * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
852 */
853static int
854smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
855 struct smc_init_info *ini)
856{
857 int i;
858
859 for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
860 if (ini->ism_chid[i] == ntohs(aclc->chid)) {
861 ini->ism_selected = i;
862 return 0;
863 }
864 }
865
866 return -EPROTO;
867}
868
869/* setup for ISM connection of client */
870static int smc_connect_ism(struct smc_sock *smc,
871 struct smc_clc_msg_accept_confirm *aclc,
872 struct smc_init_info *ini)
873{
874 int rc = 0;
875
876 ini->is_smcd = true;
877 ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
878
879 if (aclc->hdr.version == SMC_V2) {
880 struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
881 (struct smc_clc_msg_accept_confirm_v2 *)aclc;
882
883 rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
884 if (rc)
885 return rc;
886 }
887 ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
888
889 /* there is only one lgr role for SMC-D; use server lock */
890 mutex_lock(&smc_server_lgr_pending);
891 rc = smc_conn_create(smc, ini);
892 if (rc) {
893 mutex_unlock(&smc_server_lgr_pending);
894 return rc;
895 }
896
897 /* Create send and receive buffers */
898 rc = smc_buf_create(smc, true);
899 if (rc) {
900 rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
901 goto connect_abort;
902 }
903
904 smc_conn_save_peer_info(smc, aclc);
905 smc_close_init(smc);
906 smc_rx_init(smc);
907 smc_tx_init(smc);
908
909 rc = smc_clc_send_confirm(smc, ini->first_contact_local,
910 aclc->hdr.version);
911 if (rc)
912 goto connect_abort;
913 mutex_unlock(&smc_server_lgr_pending);
914
915 smc_copy_sock_settings_to_clc(smc);
916 smc->connect_nonblock = 0;
917 if (smc->sk.sk_state == SMC_INIT)
918 smc->sk.sk_state = SMC_ACTIVE;
919
920 return 0;
921connect_abort:
922 smc_connect_abort(smc, ini->first_contact_local);
923 mutex_unlock(&smc_server_lgr_pending);
924 smc->connect_nonblock = 0;
925
926 return rc;
927}
928
929/* check if received accept type and version matches a proposed one */
930static int smc_connect_check_aclc(struct smc_init_info *ini,
931 struct smc_clc_msg_accept_confirm *aclc)
932{
933 if ((aclc->hdr.typev1 == SMC_TYPE_R &&
934 !smcr_indicated(ini->smc_type_v1)) ||
935 (aclc->hdr.typev1 == SMC_TYPE_D &&
936 ((!smcd_indicated(ini->smc_type_v1) &&
937 !smcd_indicated(ini->smc_type_v2)) ||
938 (aclc->hdr.version == SMC_V1 &&
939 !smcd_indicated(ini->smc_type_v1)) ||
940 (aclc->hdr.version == SMC_V2 &&
941 !smcd_indicated(ini->smc_type_v2)))))
942 return SMC_CLC_DECL_MODEUNSUPP;
943
944 return 0;
945}
946
947/* perform steps before actually connecting */
948static int __smc_connect(struct smc_sock *smc)
949{
950 u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
951 struct smc_clc_msg_accept_confirm_v2 *aclc2;
952 struct smc_clc_msg_accept_confirm *aclc;
953 struct smc_init_info *ini = NULL;
954 u8 *buf = NULL;
955 int rc = 0;
956
957 if (smc->use_fallback)
958 return smc_connect_fallback(smc, smc->fallback_rsn);
959
960 /* if peer has not signalled SMC-capability, fall back */
961 if (!tcp_sk(smc->clcsock->sk)->syn_smc)
962 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
963
964 /* IPSec connections opt out of SMC optimizations */
965 if (using_ipsec(smc))
966 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
967 version);
968
969 ini = kzalloc(sizeof(*ini), GFP_KERNEL);
970 if (!ini)
971 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
972 version);
973
974 ini->smcd_version = SMC_V1;
975 ini->smcd_version |= smc_ism_v2_capable ? SMC_V2 : 0;
976 ini->smc_type_v1 = SMC_TYPE_B;
977 ini->smc_type_v2 = smc_ism_v2_capable ? SMC_TYPE_D : SMC_TYPE_N;
978
979 /* get vlan id from IP device */
980 if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
981 ini->smcd_version &= ~SMC_V1;
982 ini->smc_type_v1 = SMC_TYPE_N;
983 if (!ini->smcd_version) {
984 rc = SMC_CLC_DECL_GETVLANERR;
985 goto fallback;
986 }
987 }
988
989 rc = smc_find_proposal_devices(smc, ini);
990 if (rc)
991 goto fallback;
992
993 buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
994 if (!buf) {
995 rc = SMC_CLC_DECL_MEM;
996 goto fallback;
997 }
998 aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
999 aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
1000
1001 /* perform CLC handshake */
1002 rc = smc_connect_clc(smc, aclc2, ini);
1003 if (rc)
1004 goto vlan_cleanup;
1005
1006 /* check if smc modes and versions of CLC proposal and accept match */
1007 rc = smc_connect_check_aclc(ini, aclc);
1008 version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
1009 ini->smcd_version = version;
1010 if (rc)
1011 goto vlan_cleanup;
1012
1013 /* depending on previous steps, connect using rdma or ism */
1014 if (aclc->hdr.typev1 == SMC_TYPE_R)
1015 rc = smc_connect_rdma(smc, aclc, ini);
1016 else if (aclc->hdr.typev1 == SMC_TYPE_D)
1017 rc = smc_connect_ism(smc, aclc, ini);
1018 if (rc)
1019 goto vlan_cleanup;
1020
1021 smc_connect_ism_vlan_cleanup(smc, ini);
1022 kfree(buf);
1023 kfree(ini);
1024 return 0;
1025
1026vlan_cleanup:
1027 smc_connect_ism_vlan_cleanup(smc, ini);
1028 kfree(buf);
1029fallback:
1030 kfree(ini);
1031 return smc_connect_decline_fallback(smc, rc, version);
1032}
1033
1034static void smc_connect_work(struct work_struct *work)
1035{
1036 struct smc_sock *smc = container_of(work, struct smc_sock,
1037 connect_work);
1038 long timeo = smc->sk.sk_sndtimeo;
1039 int rc = 0;
1040
1041 if (!timeo)
1042 timeo = MAX_SCHEDULE_TIMEOUT;
1043 lock_sock(smc->clcsock->sk);
1044 if (smc->clcsock->sk->sk_err) {
1045 smc->sk.sk_err = smc->clcsock->sk->sk_err;
1046 } else if ((1 << smc->clcsock->sk->sk_state) &
1047 (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
1048 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
1049 if ((rc == -EPIPE) &&
1050 ((1 << smc->clcsock->sk->sk_state) &
1051 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
1052 rc = 0;
1053 }
1054 release_sock(smc->clcsock->sk);
1055 lock_sock(&smc->sk);
1056 if (rc != 0 || smc->sk.sk_err) {
1057 smc->sk.sk_state = SMC_CLOSED;
1058 if (rc == -EPIPE || rc == -EAGAIN)
1059 smc->sk.sk_err = EPIPE;
1060 else if (rc == -ECONNREFUSED)
1061 smc->sk.sk_err = ECONNREFUSED;
1062 else if (signal_pending(current))
1063 smc->sk.sk_err = -sock_intr_errno(timeo);
1064 sock_put(&smc->sk); /* passive closing */
1065 goto out;
1066 }
1067
1068 rc = __smc_connect(smc);
1069 if (rc < 0)
1070 smc->sk.sk_err = -rc;
1071
1072out:
1073 if (!sock_flag(&smc->sk, SOCK_DEAD)) {
1074 if (smc->sk.sk_err) {
1075 smc->sk.sk_state_change(&smc->sk);
1076 } else { /* allow polling before and after fallback decision */
1077 smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
1078 smc->sk.sk_write_space(&smc->sk);
1079 }
1080 }
1081 release_sock(&smc->sk);
1082}
1083
1084static int smc_connect(struct socket *sock, struct sockaddr *addr,
1085 int alen, int flags)
1086{
1087 struct sock *sk = sock->sk;
1088 struct smc_sock *smc;
1089 int rc = -EINVAL;
1090
1091 smc = smc_sk(sk);
1092
1093 /* separate smc parameter checking to be safe */
1094 if (alen < sizeof(addr->sa_family))
1095 goto out_err;
1096 if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
1097 goto out_err;
1098
1099 lock_sock(sk);
1100 switch (sk->sk_state) {
1101 default:
1102 goto out;
1103 case SMC_ACTIVE:
1104 rc = -EISCONN;
1105 goto out;
1106 case SMC_INIT:
1107 rc = 0;
1108 break;
1109 }
1110
1111 smc_copy_sock_settings_to_clc(smc);
1112 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1113 if (smc->connect_nonblock) {
1114 rc = -EALREADY;
1115 goto out;
1116 }
1117 rc = kernel_connect(smc->clcsock, addr, alen, flags);
1118 if (rc && rc != -EINPROGRESS)
1119 goto out;
1120
1121 if (smc->use_fallback)
1122 goto out;
1123 sock_hold(&smc->sk); /* sock put in passive closing */
1124 if (flags & O_NONBLOCK) {
1125 if (queue_work(smc_hs_wq, &smc->connect_work))
1126 smc->connect_nonblock = 1;
1127 rc = -EINPROGRESS;
1128 } else {
1129 rc = __smc_connect(smc);
1130 if (rc < 0)
1131 goto out;
1132 else
1133 rc = 0; /* success cases including fallback */
1134 }
1135
1136out:
1137 release_sock(sk);
1138out_err:
1139 return rc;
1140}
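/* Illustrative user-space sketch (not part of this file): with O_NONBLOCK,
 * smc_connect() above queues connect_work and returns -EINPROGRESS while
 * smc_connect_work() finishes the handshake (or falls back to TCP)
 * asynchronously, so the classic non-blocking connect pattern applies
 * unchanged.  The 5s timeout is illustrative only. */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/socket.h>

static int connect_smc_nonblock(int fd, const struct sockaddr *peer,
				socklen_t peer_len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	socklen_t elen = sizeof(int);
	int err = 0;

	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
	if (connect(fd, peer, peer_len) == 0)
		return 0;			/* connected immediately */
	if (errno != EINPROGRESS)
		return -errno;			/* hard failure */

	/* woken by sk_write_space()/sk_state_change() from smc_connect_work() */
	poll(&pfd, 1, 5000);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen);
	return err ? -err : 0;			/* 0: SMC or TCP fallback ready */
}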
1141
1142static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
1143{
1144 struct socket *new_clcsock = NULL;
1145 struct sock *lsk = &lsmc->sk;
1146 struct sock *new_sk;
1147 int rc = -EINVAL;
1148
1149 release_sock(lsk);
1150 new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
1151 if (!new_sk) {
1152 rc = -ENOMEM;
1153 lsk->sk_err = ENOMEM;
1154 *new_smc = NULL;
1155 lock_sock(lsk);
1156 goto out;
1157 }
1158 *new_smc = smc_sk(new_sk);
1159
1160 mutex_lock(&lsmc->clcsock_release_lock);
1161 if (lsmc->clcsock)
1162 rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
1163 mutex_unlock(&lsmc->clcsock_release_lock);
1164 lock_sock(lsk);
1165 if (rc < 0 && rc != -EAGAIN)
1166 lsk->sk_err = -rc;
1167 if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
1168 new_sk->sk_prot->unhash(new_sk);
1169 if (new_clcsock)
1170 sock_release(new_clcsock);
1171 new_sk->sk_state = SMC_CLOSED;
1172 sock_set_flag(new_sk, SOCK_DEAD);
1173 sock_put(new_sk); /* final */
1174 *new_smc = NULL;
1175 goto out;
1176 }
1177
1178 /* new clcsock has inherited the smc listen-specific sk_data_ready
1179 * function; switch it back to the original sk_data_ready function
1180 */
1181 new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
1182 (*new_smc)->clcsock = new_clcsock;
1183out:
1184 return rc;
1185}
1186
1187/* add a just created sock to the accept queue of the listen sock as
1188 * candidate for a following socket accept call from user space
1189 */
1190static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
1191{
1192 struct smc_sock *par = smc_sk(parent);
1193
1194 sock_hold(sk); /* sock_put in smc_accept_unlink () */
1195 spin_lock(&par->accept_q_lock);
1196 list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
1197 spin_unlock(&par->accept_q_lock);
1198 sk_acceptq_added(parent);
1199}
1200
1201/* remove a socket from the accept queue of its parental listening socket */
1202static void smc_accept_unlink(struct sock *sk)
1203{
1204 struct smc_sock *par = smc_sk(sk)->listen_smc;
1205
1206 spin_lock(&par->accept_q_lock);
1207 list_del_init(&smc_sk(sk)->accept_q);
1208 spin_unlock(&par->accept_q_lock);
1209 sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
1210 sock_put(sk); /* sock_hold in smc_accept_enqueue */
1211}
1212
1213/* remove a sock from the accept queue to bind it to a new socket created
1214 * for a socket accept call from user space
1215 */
1216struct sock *smc_accept_dequeue(struct sock *parent,
1217 struct socket *new_sock)
1218{
1219 struct smc_sock *isk, *n;
1220 struct sock *new_sk;
1221
1222 list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
1223 new_sk = (struct sock *)isk;
1224
1225 smc_accept_unlink(new_sk);
1226 if (new_sk->sk_state == SMC_CLOSED) {
1227 new_sk->sk_prot->unhash(new_sk);
1228 if (isk->clcsock) {
1229 sock_release(isk->clcsock);
1230 isk->clcsock = NULL;
1231 }
1232 sock_put(new_sk); /* final */
1233 continue;
1234 }
1235 if (new_sock) {
1236 sock_graft(new_sk, new_sock);
1237 if (isk->use_fallback) {
1238 smc_sk(new_sk)->clcsock->file = new_sock->file;
1239 isk->clcsock->file->private_data = isk->clcsock;
1240 }
1241 }
1242 return new_sk;
1243 }
1244 return NULL;
1245}
1246
1247/* clean up for a created but never accepted sock */
1248void smc_close_non_accepted(struct sock *sk)
1249{
1250 struct smc_sock *smc = smc_sk(sk);
1251
1252 sock_hold(sk); /* sock_put below */
1253 lock_sock(sk);
1254 if (!sk->sk_lingertime)
1255 /* wait for peer closing */
1256 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
1257 __smc_release(smc);
1258 release_sock(sk);
1259 sock_put(sk); /* sock_hold above */
1260 sock_put(sk); /* final sock_put */
1261}
1262
1263static int smcr_serv_conf_first_link(struct smc_sock *smc)
1264{
1265 struct smc_link *link = smc->conn.lnk;
1266 struct smc_llc_qentry *qentry;
1267 int rc;
1268
1269 if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
1270 return SMC_CLC_DECL_ERR_REGRMB;
1271
1272 /* send CONFIRM LINK request to client over the RoCE fabric */
1273 rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1274 if (rc < 0)
1275 return SMC_CLC_DECL_TIMEOUT_CL;
1276
1277 /* receive CONFIRM LINK response from client over the RoCE fabric */
1278 qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
1279 SMC_LLC_CONFIRM_LINK);
1280 if (!qentry) {
1281 struct smc_clc_msg_decline dclc;
1282
1283 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1284 SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1285 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1286 }
1287 smc_llc_save_peer_uid(qentry);
1288 rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
1289 smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
1290 if (rc)
1291 return SMC_CLC_DECL_RMBE_EC;
1292
1293 /* confirm_rkey is implicit on 1st contact */
1294 smc->conn.rmb_desc->is_conf_rkey = true;
1295
1296 smc_llc_link_active(link);
1297 smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
1298
1299 /* initial contact - try to establish second link */
1300 smc_llc_srv_add_link(link);
1301 return 0;
1302}
1303
1304/* listen worker: finish */
1305static void smc_listen_out(struct smc_sock *new_smc)
1306{
1307 struct smc_sock *lsmc = new_smc->listen_smc;
1308 struct sock *newsmcsk = &new_smc->sk;
1309
1310 if (lsmc->sk.sk_state == SMC_LISTEN) {
1311 lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1312 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1313 release_sock(&lsmc->sk);
1314 } else { /* no longer listening */
1315 smc_close_non_accepted(newsmcsk);
1316 }
1317
1318 /* Wake up accept */
1319 lsmc->sk.sk_data_ready(&lsmc->sk);
1320 sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1321}
1322
1323/* listen worker: finish in state connected */
1324static void smc_listen_out_connected(struct smc_sock *new_smc)
1325{
1326 struct sock *newsmcsk = &new_smc->sk;
1327
1328 if (newsmcsk->sk_state == SMC_INIT)
1329 newsmcsk->sk_state = SMC_ACTIVE;
1330
1331 smc_listen_out(new_smc);
1332}
1333
1334/* listen worker: finish in error state */
1335static void smc_listen_out_err(struct smc_sock *new_smc)
1336{
1337 struct sock *newsmcsk = &new_smc->sk;
1338
1339 if (newsmcsk->sk_state == SMC_INIT)
1340 sock_put(&new_smc->sk); /* passive closing */
1341 newsmcsk->sk_state = SMC_CLOSED;
1342
1343 smc_listen_out(new_smc);
1344}
1345
1346/* listen worker: decline and fall back if possible */
1347static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1348 int local_first, u8 version)
1349{
1350 /* RDMA setup failed, switch back to TCP */
1351 if (local_first)
1352 smc_lgr_cleanup_early(&new_smc->conn);
1353 else
1354 smc_conn_free(&new_smc->conn);
1355 if (reason_code < 0) { /* error, no fallback possible */
1356 smc_listen_out_err(new_smc);
1357 return;
1358 }
1359 smc_switch_to_fallback(new_smc);
1360 new_smc->fallback_rsn = reason_code;
1361 if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1362 if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
1363 smc_listen_out_err(new_smc);
1364 return;
1365 }
1366 }
1367 smc_listen_out_connected(new_smc);
1368}
1369
1370/* listen worker: version checking */
1371static int smc_listen_v2_check(struct smc_sock *new_smc,
1372 struct smc_clc_msg_proposal *pclc,
1373 struct smc_init_info *ini)
1374{
1375 struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
1376 struct smc_clc_v2_extension *pclc_v2_ext;
1377
1378 ini->smc_type_v1 = pclc->hdr.typev1;
1379 ini->smc_type_v2 = pclc->hdr.typev2;
1380 ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
1381 if (pclc->hdr.version > SMC_V1)
1382 ini->smcd_version |=
1383 ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
1384 if (!smc_ism_v2_capable) {
1385 ini->smcd_version &= ~SMC_V2;
1386 goto out;
1387 }
1388 pclc_v2_ext = smc_get_clc_v2_ext(pclc);
1389 if (!pclc_v2_ext) {
1390 ini->smcd_version &= ~SMC_V2;
1391 goto out;
1392 }
1393 pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
1394 if (!pclc_smcd_v2_ext)
1395 ini->smcd_version &= ~SMC_V2;
1396
1397out:
1398 if (!ini->smcd_version) {
1399 if (pclc->hdr.typev1 == SMC_TYPE_B ||
1400 pclc->hdr.typev2 == SMC_TYPE_B)
1401 return SMC_CLC_DECL_NOSMCDEV;
1402 if (pclc->hdr.typev1 == SMC_TYPE_D ||
1403 pclc->hdr.typev2 == SMC_TYPE_D)
1404 return SMC_CLC_DECL_NOSMCDDEV;
1405 return SMC_CLC_DECL_NOSMCRDEV;
1406 }
1407
1408 return 0;
1409}
1410
1411/* listen worker: check prefixes */
1412static int smc_listen_prfx_check(struct smc_sock *new_smc,
1413 struct smc_clc_msg_proposal *pclc)
1414{
1415 struct smc_clc_msg_proposal_prefix *pclc_prfx;
1416 struct socket *newclcsock = new_smc->clcsock;
1417
1418 if (pclc->hdr.typev1 == SMC_TYPE_N)
1419 return 0;
1420 pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1421 if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1422 return SMC_CLC_DECL_DIFFPREFIX;
1423
1424 return 0;
1425}
1426
1427/* listen worker: initialize connection and buffers */
1428static int smc_listen_rdma_init(struct smc_sock *new_smc,
1429 struct smc_init_info *ini)
1430{
1431 int rc;
1432
1433 /* allocate connection / link group */
1434 rc = smc_conn_create(new_smc, ini);
1435 if (rc)
1436 return rc;
1437
1438 /* create send buffer and rmb */
1439 if (smc_buf_create(new_smc, false))
1440 return SMC_CLC_DECL_MEM;
1441
1442 return 0;
1443}
1444
1445/* listen worker: initialize connection and buffers for SMC-D */
1446static int smc_listen_ism_init(struct smc_sock *new_smc,
1447 struct smc_init_info *ini)
1448{
1449 int rc;
1450
1451 rc = smc_conn_create(new_smc, ini);
1452 if (rc)
1453 return rc;
1454
1455 /* Create send and receive buffers */
1456 rc = smc_buf_create(new_smc, true);
1457 if (rc) {
1458 if (ini->first_contact_local)
1459 smc_lgr_cleanup_early(&new_smc->conn);
1460 else
1461 smc_conn_free(&new_smc->conn);
1462 return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
1463 SMC_CLC_DECL_MEM;
1464 }
1465
1466 return 0;
1467}
1468
1469static bool smc_is_already_selected(struct smcd_dev *smcd,
1470 struct smc_init_info *ini,
1471 int matches)
1472{
1473 int i;
1474
1475 for (i = 0; i < matches; i++)
1476 if (smcd == ini->ism_dev[i])
1477 return true;
1478
1479 return false;
1480}
1481
1482/* check for ISM devices matching proposed ISM devices */
1483static void smc_check_ism_v2_match(struct smc_init_info *ini,
1484 u16 proposed_chid, u64 proposed_gid,
1485 unsigned int *matches)
1486{
1487 struct smcd_dev *smcd;
1488
1489 list_for_each_entry(smcd, &smcd_dev_list.list, list) {
1490 if (smcd->going_away)
1491 continue;
1492 if (smc_is_already_selected(smcd, ini, *matches))
1493 continue;
1494 if (smc_ism_get_chid(smcd) == proposed_chid &&
1495 !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
1496 ini->ism_peer_gid[*matches] = proposed_gid;
1497 ini->ism_dev[*matches] = smcd;
1498 (*matches)++;
1499 break;
1500 }
1501 }
1502}
1503
1504static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
1505 struct smc_clc_msg_proposal *pclc,
1506 struct smc_init_info *ini)
1507{
1508 struct smc_clc_smcd_v2_extension *smcd_v2_ext;
1509 struct smc_clc_v2_extension *smc_v2_ext;
1510 struct smc_clc_msg_smcd *pclc_smcd;
1511 unsigned int matches = 0;
1512 u8 smcd_version;
1513 u8 *eid = NULL;
1514 int i;
1515
1516 if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
1517 goto not_found;
1518
1519 pclc_smcd = smc_get_clc_msg_smcd(pclc);
1520 smc_v2_ext = smc_get_clc_v2_ext(pclc);
1521 smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
1522 if (!smcd_v2_ext ||
1523 !smc_v2_ext->hdr.flag.seid) /* no system EID support for SMCD */
1524 goto not_found;
1525
1526 mutex_lock(&smcd_dev_list.mutex);
1527 if (pclc_smcd->ism.chid)
1528 /* check for ISM device matching proposed native ISM device */
1529 smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
1530 ntohll(pclc_smcd->ism.gid), &matches);
1531 for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
1532 /* check for ISM devices matching proposed non-native ISM
1533 * devices
1534 */
1535 smc_check_ism_v2_match(ini,
1536 ntohs(smcd_v2_ext->gidchid[i - 1].chid),
1537 ntohll(smcd_v2_ext->gidchid[i - 1].gid),
1538 &matches);
1539 }
1540 mutex_unlock(&smcd_dev_list.mutex);
1541
1542 if (ini->ism_dev[0]) {
1543 smc_ism_get_system_eid(ini->ism_dev[0], &eid);
1544 if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN))
1545 goto not_found;
1546 } else {
1547 goto not_found;
1548 }
1549
1550 /* separate - outside the smcd_dev_list.lock */
1551 smcd_version = ini->smcd_version;
1552 for (i = 0; i < matches; i++) {
1553 ini->smcd_version = SMC_V2;
1554 ini->is_smcd = true;
1555 ini->ism_selected = i;
1556 if (smc_listen_ism_init(new_smc, ini))
1557 /* try next active ISM device */
1558 continue;
1559 return; /* matching and usable V2 ISM device found */
1560 }
1561 /* no V2 ISM device could be initialized */
1562 ini->smcd_version = smcd_version; /* restore original value */
1563
1564not_found:
1565 ini->smcd_version &= ~SMC_V2;
1566 ini->ism_dev[0] = NULL;
1567 ini->is_smcd = false;
1568}
1569
1570static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
1571 struct smc_clc_msg_proposal *pclc,
1572 struct smc_init_info *ini)
1573{
1574 struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
1575
1576 /* check if ISM V1 is available */
1577 if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
1578 goto not_found;
1579 ini->is_smcd = true; /* prepare ISM check */
1580 ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
1581 if (smc_find_ism_device(new_smc, ini))
1582 goto not_found;
1583 ini->ism_selected = 0;
1584 if (!smc_listen_ism_init(new_smc, ini))
1585 return; /* V1 ISM device found */
1586
1587not_found:
1588 ini->ism_dev[0] = NULL;
1589 ini->is_smcd = false;
1590}
1591
1592/* listen worker: register buffers */
1593static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
1594{
1595 struct smc_connection *conn = &new_smc->conn;
1596
1597 if (!local_first) {
1598 if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
1599 return SMC_CLC_DECL_ERR_REGRMB;
1600 }
1601 smc_rmb_sync_sg_for_device(&new_smc->conn);
1602
1603 return 0;
1604}
1605
1606static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
1607 struct smc_clc_msg_proposal *pclc,
1608 struct smc_init_info *ini)
1609{
1610 int rc;
1611
1612 if (!smcr_indicated(ini->smc_type_v1))
1613 return SMC_CLC_DECL_NOSMCDEV;
1614
1615 /* prepare RDMA check */
1616 ini->ib_lcl = &pclc->lcl;
1617 rc = smc_find_rdma_device(new_smc, ini);
1618 if (rc) {
1619 /* no RDMA device found */
1620 if (ini->smc_type_v1 == SMC_TYPE_B)
1621 /* neither ISM nor RDMA device found */
1622 rc = SMC_CLC_DECL_NOSMCDEV;
1623 return rc;
1624 }
1625 rc = smc_listen_rdma_init(new_smc, ini);
1626 if (rc)
1627 return rc;
1628 return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
1629}
1630
1631/* determine the local device matching to proposal */
1632static int smc_listen_find_device(struct smc_sock *new_smc,
1633 struct smc_clc_msg_proposal *pclc,
1634 struct smc_init_info *ini)
1635{
1636 int rc;
1637
1638 /* check for ISM device matching V2 proposed device */
1639 smc_find_ism_v2_device_serv(new_smc, pclc, ini);
1640 if (ini->ism_dev[0])
1641 return 0;
1642
1643 if (!(ini->smcd_version & SMC_V1))
1644 return SMC_CLC_DECL_NOSMCDEV;
1645
1646 /* check for matching IP prefix and subnet length */
1647 rc = smc_listen_prfx_check(new_smc, pclc);
1648 if (rc)
1649 return rc;
1650
1651 /* get vlan id from IP device */
1652 if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
1653 return SMC_CLC_DECL_GETVLANERR;
1654
1655 /* check for ISM device matching V1 proposed device */
1656 smc_find_ism_v1_device_serv(new_smc, pclc, ini);
1657 if (ini->ism_dev[0])
1658 return 0;
1659
1660 if (pclc->hdr.typev1 == SMC_TYPE_D)
1661 return SMC_CLC_DECL_NOSMCDDEV; /* skip RDMA and decline */
1662
1663 /* check if RDMA is available */
1664 return smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
1665}
1666
1667/* listen worker: finish RDMA setup */
1668static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1669 struct smc_clc_msg_accept_confirm *cclc,
1670 bool local_first)
1671{
1672 struct smc_link *link = new_smc->conn.lnk;
1673 int reason_code = 0;
1674
1675 if (local_first)
1676 smc_link_save_peer_info(link, cclc);
1677
1678 if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
1679 return SMC_CLC_DECL_ERR_RTOK;
1680
1681 if (local_first) {
1682 if (smc_ib_ready_link(link))
1683 return SMC_CLC_DECL_ERR_RDYLNK;
1684 /* QP confirmation over RoCE fabric */
1685 smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
1686 reason_code = smcr_serv_conf_first_link(new_smc);
1687 smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
1688 }
1689 return reason_code;
1690}
1691
1692/* setup for connection of server */
1693static void smc_listen_work(struct work_struct *work)
1694{
1695 struct smc_sock *new_smc = container_of(work, struct smc_sock,
1696 smc_listen_work);
1697 u8 version = smc_ism_v2_capable ? SMC_V2 : SMC_V1;
1698 struct socket *newclcsock = new_smc->clcsock;
1699 struct smc_clc_msg_accept_confirm *cclc;
1700 struct smc_clc_msg_proposal_area *buf;
1701 struct smc_clc_msg_proposal *pclc;
1702 struct smc_init_info *ini = NULL;
1703 int rc = 0;
1704
1705 if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1706 return smc_listen_out_err(new_smc);
1707
1708 if (new_smc->use_fallback) {
1709 smc_listen_out_connected(new_smc);
1710 return;
1711 }
1712
1713 /* check if peer is smc capable */
1714 if (!tcp_sk(newclcsock->sk)->syn_smc) {
1715 smc_switch_to_fallback(new_smc);
1716 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1717 smc_listen_out_connected(new_smc);
1718 return;
1719 }
1720
1721 /* do inband token exchange -
1722 * wait for and receive SMC Proposal CLC message
1723 */
1724 buf = kzalloc(sizeof(*buf), GFP_KERNEL);
1725 if (!buf) {
1726 rc = SMC_CLC_DECL_MEM;
1727 goto out_decl;
1728 }
1729 pclc = (struct smc_clc_msg_proposal *)buf;
1730 rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
1731 SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1732 if (rc)
1733 goto out_decl;
1734 version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
1735
1736 /* IPSec connections opt out of SMC optimizations */
1737 if (using_ipsec(new_smc)) {
1738 rc = SMC_CLC_DECL_IPSEC;
1739 goto out_decl;
1740 }
1741
1742 ini = kzalloc(sizeof(*ini), GFP_KERNEL);
1743 if (!ini) {
1744 rc = SMC_CLC_DECL_MEM;
1745 goto out_decl;
1746 }
1747
1748 /* initial version checking */
1749 rc = smc_listen_v2_check(new_smc, pclc, ini);
1750 if (rc)
1751 goto out_decl;
1752
1753 mutex_lock(&smc_server_lgr_pending);
1754 smc_close_init(new_smc);
1755 smc_rx_init(new_smc);
1756 smc_tx_init(new_smc);
1757
1758 /* determine ISM or RoCE device used for connection */
1759 rc = smc_listen_find_device(new_smc, pclc, ini);
1760 if (rc)
1761 goto out_unlock;
1762
1763 /* send SMC Accept CLC message */
1764 rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
1765 ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1);
1766 if (rc)
1767 goto out_unlock;
1768
1769 /* SMC-D does not need this lock any more */
1770 if (ini->is_smcd)
1771 mutex_unlock(&smc_server_lgr_pending);
1772
1773 /* receive SMC Confirm CLC message */
1774 memset(buf, 0, sizeof(*buf));
1775 cclc = (struct smc_clc_msg_accept_confirm *)buf;
1776 rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
1777 SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1778 if (rc) {
1779 if (!ini->is_smcd)
1780 goto out_unlock;
1781 goto out_decl;
1782 }
1783
1784 /* finish worker */
1785 if (!ini->is_smcd) {
1786 rc = smc_listen_rdma_finish(new_smc, cclc,
1787 ini->first_contact_local);
1788 if (rc)
1789 goto out_unlock;
1790 mutex_unlock(&smc_server_lgr_pending);
1791 }
1792 smc_conn_save_peer_info(new_smc, cclc);
1793 smc_listen_out_connected(new_smc);
1794 goto out_free;
1795
1796out_unlock:
1797 mutex_unlock(&smc_server_lgr_pending);
1798out_decl:
1799 smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
1800 version);
1801out_free:
1802 kfree(ini);
1803 kfree(buf);
1804}
1805
1806static void smc_tcp_listen_work(struct work_struct *work)
1807{
1808 struct smc_sock *lsmc = container_of(work, struct smc_sock,
1809 tcp_listen_work);
1810 struct sock *lsk = &lsmc->sk;
1811 struct smc_sock *new_smc;
1812 int rc = 0;
1813
1814 lock_sock(lsk);
1815 while (lsk->sk_state == SMC_LISTEN) {
1816 rc = smc_clcsock_accept(lsmc, &new_smc);
1817 if (rc) /* clcsock accept queue empty or error */
1818 goto out;
1819 if (!new_smc)
1820 continue;
1821
1822 new_smc->listen_smc = lsmc;
1823 new_smc->use_fallback = lsmc->use_fallback;
1824 new_smc->fallback_rsn = lsmc->fallback_rsn;
1825 sock_hold(lsk); /* sock_put in smc_listen_work */
1826 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1827 smc_copy_sock_settings_to_smc(new_smc);
1828 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1829 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1830 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1831 if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
1832 sock_put(&new_smc->sk);
1833 }
1834
1835out:
1836 release_sock(lsk);
1837 sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
1838}
1839
1840static void smc_clcsock_data_ready(struct sock *listen_clcsock)
1841{
1842 struct smc_sock *lsmc;
1843
1844 lsmc = (struct smc_sock *)
1845 ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
1846 if (!lsmc)
1847 return;
1848 lsmc->clcsk_data_ready(listen_clcsock);
1849 if (lsmc->sk.sk_state == SMC_LISTEN) {
1850 sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
1851 if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
1852 sock_put(&lsmc->sk);
1853 }
1854}
1855
1856static int smc_listen(struct socket *sock, int backlog)
1857{
1858 struct sock *sk = sock->sk;
1859 struct smc_sock *smc;
1860 int rc;
1861
1862 smc = smc_sk(sk);
1863 lock_sock(sk);
1864
1865 rc = -EINVAL;
1866 if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
1867 smc->connect_nonblock)
1868 goto out;
1869
1870 rc = 0;
1871 if (sk->sk_state == SMC_LISTEN) {
1872 sk->sk_max_ack_backlog = backlog;
1873 goto out;
1874 }
 1875	/* some socket options are handled in core, so they cannot be applied
 1876	 * to the clc socket -- copy smc socket options to clc socket
1877 */
1878 smc_copy_sock_settings_to_clc(smc);
1879 if (!smc->use_fallback)
1880 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1881
1882 /* save original sk_data_ready function and establish
1883 * smc-specific sk_data_ready function
1884 */
1885 smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
1886 smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
1887 smc->clcsock->sk->sk_user_data =
1888 (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
1889 rc = kernel_listen(smc->clcsock, backlog);
1890 if (rc) {
1891 smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
1892 goto out;
1893 }
1894 sk->sk_max_ack_backlog = backlog;
1895 sk->sk_ack_backlog = 0;
1896 sk->sk_state = SMC_LISTEN;
1897
1898out:
1899 release_sock(sk);
1900 return rc;
1901}
1902
1903static int smc_accept(struct socket *sock, struct socket *new_sock,
1904 int flags, bool kern)
1905{
1906 struct sock *sk = sock->sk, *nsk;
1907 DECLARE_WAITQUEUE(wait, current);
1908 struct smc_sock *lsmc;
1909 long timeo;
1910 int rc = 0;
1911
1912 lsmc = smc_sk(sk);
1913 sock_hold(sk); /* sock_put below */
1914 lock_sock(sk);
1915
1916 if (lsmc->sk.sk_state != SMC_LISTEN) {
1917 rc = -EINVAL;
1918 release_sock(sk);
1919 goto out;
1920 }
1921
1922 /* Wait for an incoming connection */
1923 timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1924 add_wait_queue_exclusive(sk_sleep(sk), &wait);
1925 while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1926 set_current_state(TASK_INTERRUPTIBLE);
1927 if (!timeo) {
1928 rc = -EAGAIN;
1929 break;
1930 }
1931 release_sock(sk);
1932 timeo = schedule_timeout(timeo);
1933 /* wakeup by sk_data_ready in smc_listen_work() */
1934 sched_annotate_sleep();
1935 lock_sock(sk);
1936 if (signal_pending(current)) {
1937 rc = sock_intr_errno(timeo);
1938 break;
1939 }
1940 }
1941 set_current_state(TASK_RUNNING);
1942 remove_wait_queue(sk_sleep(sk), &wait);
1943
1944 if (!rc)
1945 rc = sock_error(nsk);
1946 release_sock(sk);
1947 if (rc)
1948 goto out;
1949
1950 if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1951 /* wait till data arrives on the socket */
1952 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1953 MSEC_PER_SEC);
1954 if (smc_sk(nsk)->use_fallback) {
1955 struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1956
1957 lock_sock(clcsk);
1958 if (skb_queue_empty(&clcsk->sk_receive_queue))
1959 sk_wait_data(clcsk, &timeo, NULL);
1960 release_sock(clcsk);
1961 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1962 lock_sock(nsk);
1963 smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1964 release_sock(nsk);
1965 }
1966 }
1967
1968out:
1969 sock_put(sk); /* sock_hold above */
1970 return rc;
1971}
1972
1973static int smc_getname(struct socket *sock, struct sockaddr *addr,
1974 int peer)
1975{
1976 struct smc_sock *smc;
1977
1978 if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1979 (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1980 return -ENOTCONN;
1981
1982 smc = smc_sk(sock->sk);
1983
1984 return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1985}
1986
1987static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1988{
1989 struct sock *sk = sock->sk;
1990 struct smc_sock *smc;
1991 int rc = -EPIPE;
1992
1993 smc = smc_sk(sk);
1994 lock_sock(sk);
1995 if ((sk->sk_state != SMC_ACTIVE) &&
1996 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1997 (sk->sk_state != SMC_INIT))
1998 goto out;
1999
2000 if (msg->msg_flags & MSG_FASTOPEN) {
2001 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
2002 smc_switch_to_fallback(smc);
2003 smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
2004 } else {
2005 rc = -EINVAL;
2006 goto out;
2007 }
2008 }
2009
2010 if (smc->use_fallback)
2011 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
2012 else
2013 rc = smc_tx_sendmsg(smc, msg, len);
2014out:
2015 release_sock(sk);
2016 return rc;
2017}
2018
2019static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2020 int flags)
2021{
2022 struct sock *sk = sock->sk;
2023 struct smc_sock *smc;
2024 int rc = -ENOTCONN;
2025
2026 smc = smc_sk(sk);
2027 lock_sock(sk);
2028 if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
2029 /* socket was connected before, no more data to read */
2030 rc = 0;
2031 goto out;
2032 }
2033 if ((sk->sk_state == SMC_INIT) ||
2034 (sk->sk_state == SMC_LISTEN) ||
2035 (sk->sk_state == SMC_CLOSED))
2036 goto out;
2037
2038 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
2039 rc = 0;
2040 goto out;
2041 }
2042
2043 if (smc->use_fallback) {
2044 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
2045 } else {
2046 msg->msg_namelen = 0;
2047 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
2048 }
2049
2050out:
2051 release_sock(sk);
2052 return rc;
2053}
2054
2055static __poll_t smc_accept_poll(struct sock *parent)
2056{
2057 struct smc_sock *isk = smc_sk(parent);
2058 __poll_t mask = 0;
2059
2060 spin_lock(&isk->accept_q_lock);
2061 if (!list_empty(&isk->accept_q))
2062 mask = EPOLLIN | EPOLLRDNORM;
2063 spin_unlock(&isk->accept_q_lock);
2064
2065 return mask;
2066}
2067
2068static __poll_t smc_poll(struct file *file, struct socket *sock,
2069 poll_table *wait)
2070{
2071 struct sock *sk = sock->sk;
2072 struct smc_sock *smc;
2073 __poll_t mask = 0;
2074
2075 if (!sk)
2076 return EPOLLNVAL;
2077
2078 smc = smc_sk(sock->sk);
2079 if (smc->use_fallback) {
2080 /* delegate to CLC child sock */
2081 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
2082 sk->sk_err = smc->clcsock->sk->sk_err;
2083 } else {
2084 if (sk->sk_state != SMC_CLOSED)
2085 sock_poll_wait(file, sock, wait);
2086 if (sk->sk_err)
2087 mask |= EPOLLERR;
2088 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
2089 (sk->sk_state == SMC_CLOSED))
2090 mask |= EPOLLHUP;
2091 if (sk->sk_state == SMC_LISTEN) {
2092 /* woken up by sk_data_ready in smc_listen_work() */
2093 mask |= smc_accept_poll(sk);
 2094		} else if (smc->use_fallback) { /* as a result of connect_work() */
2095 mask |= smc->clcsock->ops->poll(file, smc->clcsock,
2096 wait);
2097 sk->sk_err = smc->clcsock->sk->sk_err;
2098 } else {
2099 if ((sk->sk_state != SMC_INIT &&
2100 atomic_read(&smc->conn.sndbuf_space)) ||
2101 sk->sk_shutdown & SEND_SHUTDOWN) {
2102 mask |= EPOLLOUT | EPOLLWRNORM;
2103 } else {
2104 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2105 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2106 }
2107 if (atomic_read(&smc->conn.bytes_to_rcv))
2108 mask |= EPOLLIN | EPOLLRDNORM;
2109 if (sk->sk_shutdown & RCV_SHUTDOWN)
2110 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
2111 if (sk->sk_state == SMC_APPCLOSEWAIT1)
2112 mask |= EPOLLIN;
2113 if (smc->conn.urg_state == SMC_URG_VALID)
2114 mask |= EPOLLPRI;
2115 }
2116 }
2117
2118 return mask;
2119}
2120
2121static int smc_shutdown(struct socket *sock, int how)
2122{
2123 struct sock *sk = sock->sk;
2124 bool do_shutdown = true;
2125 struct smc_sock *smc;
2126 int rc = -EINVAL;
2127 int old_state;
2128 int rc1 = 0;
2129
2130 smc = smc_sk(sk);
2131
2132 if ((how < SHUT_RD) || (how > SHUT_RDWR))
2133 return rc;
2134
2135 lock_sock(sk);
2136
2137 rc = -ENOTCONN;
2138 if ((sk->sk_state != SMC_ACTIVE) &&
2139 (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
2140 (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
2141 (sk->sk_state != SMC_APPCLOSEWAIT1) &&
2142 (sk->sk_state != SMC_APPCLOSEWAIT2) &&
2143 (sk->sk_state != SMC_APPFINCLOSEWAIT))
2144 goto out;
2145 if (smc->use_fallback) {
2146 rc = kernel_sock_shutdown(smc->clcsock, how);
2147 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
2148 if (sk->sk_shutdown == SHUTDOWN_MASK) {
2149 sk->sk_state = SMC_CLOSED;
2150 sock_put(sk);
2151 }
2152 goto out;
2153 }
2154 switch (how) {
2155 case SHUT_RDWR: /* shutdown in both directions */
2156 old_state = sk->sk_state;
2157 rc = smc_close_active(smc);
2158 if (old_state == SMC_ACTIVE &&
2159 sk->sk_state == SMC_PEERCLOSEWAIT1)
2160 do_shutdown = false;
2161 break;
2162 case SHUT_WR:
2163 rc = smc_close_shutdown_write(smc);
2164 break;
2165 case SHUT_RD:
2166 rc = 0;
2167 /* nothing more to do because peer is not involved */
2168 break;
2169 }
2170 if (do_shutdown && smc->clcsock)
2171 rc1 = kernel_sock_shutdown(smc->clcsock, how);
2172 /* map sock_shutdown_cmd constants to sk_shutdown value range */
2173 sk->sk_shutdown |= how + 1;
2174
2175out:
2176 release_sock(sk);
2177 return rc ? rc : rc1;
2178}
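
The final statement sk->sk_shutdown |= how + 1 only works because of how the shutdown constants are laid out numerically. The worked mapping below is illustrative, assumes the standard Linux constant values, and is not part of af_smc.c:

/* Worked mapping for "sk->sk_shutdown |= how + 1", assuming the usual values
 * SHUT_RD=0, SHUT_WR=1, SHUT_RDWR=2 and RCV_SHUTDOWN=1, SEND_SHUTDOWN=2,
 * SHUTDOWN_MASK=3:
 *
 *   how = SHUT_RD   (0)  ->  sk_shutdown |= 1  (RCV_SHUTDOWN)
 *   how = SHUT_WR   (1)  ->  sk_shutdown |= 2  (SEND_SHUTDOWN)
 *   how = SHUT_RDWR (2)  ->  sk_shutdown |= 3  (SHUTDOWN_MASK)
 */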
2179
2180static int smc_setsockopt(struct socket *sock, int level, int optname,
2181 sockptr_t optval, unsigned int optlen)
2182{
2183 struct sock *sk = sock->sk;
2184 struct smc_sock *smc;
2185 int val, rc;
2186
2187 if (level == SOL_TCP && optname == TCP_ULP)
2188 return -EOPNOTSUPP;
2189
2190 smc = smc_sk(sk);
2191
2192 /* generic setsockopts reaching us here always apply to the
2193 * CLC socket
2194 */
2195 if (unlikely(!smc->clcsock->ops->setsockopt))
2196 rc = -EOPNOTSUPP;
2197 else
2198 rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
2199 optval, optlen);
2200 if (smc->clcsock->sk->sk_err) {
2201 sk->sk_err = smc->clcsock->sk->sk_err;
2202 sk->sk_error_report(sk);
2203 }
2204
2205 if (optlen < sizeof(int))
2206 return -EINVAL;
2207 if (copy_from_sockptr(&val, optval, sizeof(int)))
2208 return -EFAULT;
2209
2210 lock_sock(sk);
2211 if (rc || smc->use_fallback)
2212 goto out;
2213 switch (optname) {
2214 case TCP_FASTOPEN:
2215 case TCP_FASTOPEN_CONNECT:
2216 case TCP_FASTOPEN_KEY:
2217 case TCP_FASTOPEN_NO_COOKIE:
2218 /* option not supported by SMC */
2219 if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
2220 smc_switch_to_fallback(smc);
2221 smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
2222 } else {
2223 rc = -EINVAL;
2224 }
2225 break;
2226 case TCP_NODELAY:
2227 if (sk->sk_state != SMC_INIT &&
2228 sk->sk_state != SMC_LISTEN &&
2229 sk->sk_state != SMC_CLOSED) {
2230 if (val)
2231 mod_delayed_work(smc->conn.lgr->tx_wq,
2232 &smc->conn.tx_work, 0);
2233 }
2234 break;
2235 case TCP_CORK:
2236 if (sk->sk_state != SMC_INIT &&
2237 sk->sk_state != SMC_LISTEN &&
2238 sk->sk_state != SMC_CLOSED) {
2239 if (!val)
2240 mod_delayed_work(smc->conn.lgr->tx_wq,
2241 &smc->conn.tx_work, 0);
2242 }
2243 break;
2244 case TCP_DEFER_ACCEPT:
2245 smc->sockopt_defer_accept = val;
2246 break;
2247 default:
2248 break;
2249 }
2250out:
2251 release_sock(sk);
2252
2253 return rc;
2254}
2255
2256static int smc_getsockopt(struct socket *sock, int level, int optname,
2257 char __user *optval, int __user *optlen)
2258{
2259 struct smc_sock *smc;
2260
2261 smc = smc_sk(sock->sk);
2262 /* socket options apply to the CLC socket */
2263 if (unlikely(!smc->clcsock->ops->getsockopt))
2264 return -EOPNOTSUPP;
2265 return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
2266 optval, optlen);
2267}
2268
2269static int smc_ioctl(struct socket *sock, unsigned int cmd,
2270 unsigned long arg)
2271{
2272 union smc_host_cursor cons, urg;
2273 struct smc_connection *conn;
2274 struct smc_sock *smc;
2275 int answ;
2276
2277 smc = smc_sk(sock->sk);
2278 conn = &smc->conn;
2279 lock_sock(&smc->sk);
2280 if (smc->use_fallback) {
2281 if (!smc->clcsock) {
2282 release_sock(&smc->sk);
2283 return -EBADF;
2284 }
2285 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
2286 release_sock(&smc->sk);
2287 return answ;
2288 }
2289 switch (cmd) {
2290 case SIOCINQ: /* same as FIONREAD */
2291 if (smc->sk.sk_state == SMC_LISTEN) {
2292 release_sock(&smc->sk);
2293 return -EINVAL;
2294 }
2295 if (smc->sk.sk_state == SMC_INIT ||
2296 smc->sk.sk_state == SMC_CLOSED)
2297 answ = 0;
2298 else
2299 answ = atomic_read(&smc->conn.bytes_to_rcv);
2300 break;
2301 case SIOCOUTQ:
 2302		/* output queue size (not sent + not acked) */
2303 if (smc->sk.sk_state == SMC_LISTEN) {
2304 release_sock(&smc->sk);
2305 return -EINVAL;
2306 }
2307 if (smc->sk.sk_state == SMC_INIT ||
2308 smc->sk.sk_state == SMC_CLOSED)
2309 answ = 0;
2310 else
2311 answ = smc->conn.sndbuf_desc->len -
2312 atomic_read(&smc->conn.sndbuf_space);
2313 break;
2314 case SIOCOUTQNSD:
 2315		/* output queue size (not sent only) */
2316 if (smc->sk.sk_state == SMC_LISTEN) {
2317 release_sock(&smc->sk);
2318 return -EINVAL;
2319 }
2320 if (smc->sk.sk_state == SMC_INIT ||
2321 smc->sk.sk_state == SMC_CLOSED)
2322 answ = 0;
2323 else
2324 answ = smc_tx_prepared_sends(&smc->conn);
2325 break;
2326 case SIOCATMARK:
2327 if (smc->sk.sk_state == SMC_LISTEN) {
2328 release_sock(&smc->sk);
2329 return -EINVAL;
2330 }
2331 if (smc->sk.sk_state == SMC_INIT ||
2332 smc->sk.sk_state == SMC_CLOSED) {
2333 answ = 0;
2334 } else {
2335 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
2336 smc_curs_copy(&urg, &conn->urg_curs, conn);
2337 answ = smc_curs_diff(conn->rmb_desc->len,
2338 &cons, &urg) == 1;
2339 }
2340 break;
2341 default:
2342 release_sock(&smc->sk);
2343 return -ENOIOCTLCMD;
2344 }
2345 release_sock(&smc->sk);
2346
2347 return put_user(answ, (int __user *)arg);
2348}
2349
2350static ssize_t smc_sendpage(struct socket *sock, struct page *page,
2351 int offset, size_t size, int flags)
2352{
2353 struct sock *sk = sock->sk;
2354 struct smc_sock *smc;
2355 int rc = -EPIPE;
2356
2357 smc = smc_sk(sk);
2358 lock_sock(sk);
2359 if (sk->sk_state != SMC_ACTIVE) {
2360 release_sock(sk);
2361 goto out;
2362 }
2363 release_sock(sk);
2364 if (smc->use_fallback)
2365 rc = kernel_sendpage(smc->clcsock, page, offset,
2366 size, flags);
2367 else
2368 rc = sock_no_sendpage(sock, page, offset, size, flags);
2369
2370out:
2371 return rc;
2372}
2373
2374/* Map the affected portions of the rmbe into an spd, note the number of bytes
2375 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 2376 * updates until a respective page has been fully processed.
 2377 * Note that subsequent recv() calls have to wait until all splice() processing
 2378 * has completed.
2379 */
2380static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
2381 struct pipe_inode_info *pipe, size_t len,
2382 unsigned int flags)
2383{
2384 struct sock *sk = sock->sk;
2385 struct smc_sock *smc;
2386 int rc = -ENOTCONN;
2387
2388 smc = smc_sk(sk);
2389 lock_sock(sk);
2390 if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
2391 /* socket was connected before, no more data to read */
2392 rc = 0;
2393 goto out;
2394 }
2395 if (sk->sk_state == SMC_INIT ||
2396 sk->sk_state == SMC_LISTEN ||
2397 sk->sk_state == SMC_CLOSED)
2398 goto out;
2399
2400 if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
2401 rc = 0;
2402 goto out;
2403 }
2404
2405 if (smc->use_fallback) {
2406 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
2407 pipe, len, flags);
2408 } else {
2409 if (*ppos) {
2410 rc = -ESPIPE;
2411 goto out;
2412 }
2413 if (flags & SPLICE_F_NONBLOCK)
2414 flags = MSG_DONTWAIT;
2415 else
2416 flags = 0;
2417 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
2418 }
2419out:
2420 release_sock(sk);
2421
2422 return rc;
2423}
2424
2425/* must look like tcp */
2426static const struct proto_ops smc_sock_ops = {
2427 .family = PF_SMC,
2428 .owner = THIS_MODULE,
2429 .release = smc_release,
2430 .bind = smc_bind,
2431 .connect = smc_connect,
2432 .socketpair = sock_no_socketpair,
2433 .accept = smc_accept,
2434 .getname = smc_getname,
2435 .poll = smc_poll,
2436 .ioctl = smc_ioctl,
2437 .listen = smc_listen,
2438 .shutdown = smc_shutdown,
2439 .setsockopt = smc_setsockopt,
2440 .getsockopt = smc_getsockopt,
2441 .sendmsg = smc_sendmsg,
2442 .recvmsg = smc_recvmsg,
2443 .mmap = sock_no_mmap,
2444 .sendpage = smc_sendpage,
2445 .splice_read = smc_splice_read,
2446};
2447
2448static int smc_create(struct net *net, struct socket *sock, int protocol,
2449 int kern)
2450{
2451 int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
2452 struct smc_sock *smc;
2453 struct sock *sk;
2454 int rc;
2455
2456 rc = -ESOCKTNOSUPPORT;
2457 if (sock->type != SOCK_STREAM)
2458 goto out;
2459
2460 rc = -EPROTONOSUPPORT;
2461 if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
2462 goto out;
2463
2464 rc = -ENOBUFS;
2465 sock->ops = &smc_sock_ops;
2466 sk = smc_sock_alloc(net, sock, protocol);
2467 if (!sk)
2468 goto out;
2469
2470 /* create internal TCP socket for CLC handshake and fallback */
2471 smc = smc_sk(sk);
2472 smc->use_fallback = false; /* assume rdma capability first */
2473 smc->fallback_rsn = 0;
2474 rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
2475 &smc->clcsock);
2476 if (rc) {
2477 sk_common_release(sk);
2478 goto out;
2479 }
2480 smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
2481 smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
2482
2483out:
2484 return rc;
2485}
2486
2487static const struct net_proto_family smc_sock_family_ops = {
2488 .family = PF_SMC,
2489 .owner = THIS_MODULE,
2490 .create = smc_create,
2491};
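
smc_create() above, reached through smc_sock_family_ops, is what a user-space socket(AF_SMC, SOCK_STREAM, ...) call ends up in. The sketch below is illustrative rather than part of the module; it assumes AF_SMC is 43 on this kernel and defines it locally in case the libc headers do not provide it.

/* Hypothetical user-space sketch, not part of af_smc.c */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43		/* assumed value, define locally if missing */
#endif
#define SMCPROTO_SMC 0		/* IPv4 flavour, see smc.h above */

int main(void)
{
	/* SOCK_STREAM is mandatory; anything else returns -ESOCKTNOSUPPORT */
	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);

	if (fd < 0) {
		perror("socket(AF_SMC)");
		return 1;
	}
	/* bind()/connect() then behave like AF_INET; smc_create() has set up
	 * the internal TCP clcsock used for the CLC handshake and fallback.
	 */
	close(fd);
	return 0;
}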
2492
2493unsigned int smc_net_id;
2494
2495static __net_init int smc_net_init(struct net *net)
2496{
2497 return smc_pnet_net_init(net);
2498}
2499
2500static void __net_exit smc_net_exit(struct net *net)
2501{
2502 smc_pnet_net_exit(net);
2503}
2504
2505static struct pernet_operations smc_net_ops = {
2506 .init = smc_net_init,
2507 .exit = smc_net_exit,
2508 .id = &smc_net_id,
2509 .size = sizeof(struct smc_net),
2510};
2511
2512static int __init smc_init(void)
2513{
2514 int rc;
2515
2516 rc = register_pernet_subsys(&smc_net_ops);
2517 if (rc)
2518 return rc;
2519
2520 smc_ism_init();
2521 smc_clc_init();
2522
2523 rc = smc_pnet_init();
2524 if (rc)
2525 goto out_pernet_subsys;
2526
2527 rc = -ENOMEM;
2528 smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
2529 if (!smc_hs_wq)
2530 goto out_pnet;
2531
2532 smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
2533 if (!smc_close_wq)
2534 goto out_alloc_hs_wq;
2535
2536 rc = smc_core_init();
2537 if (rc) {
2538 pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
2539 goto out_alloc_wqs;
2540 }
2541
2542 rc = smc_llc_init();
2543 if (rc) {
2544 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2545 goto out_core;
2546 }
2547
2548 rc = smc_cdc_init();
2549 if (rc) {
2550 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2551 goto out_core;
2552 }
2553
2554 rc = proto_register(&smc_proto, 1);
2555 if (rc) {
2556 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2557 goto out_core;
2558 }
2559
2560 rc = proto_register(&smc_proto6, 1);
2561 if (rc) {
2562 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2563 goto out_proto;
2564 }
2565
2566 rc = sock_register(&smc_sock_family_ops);
2567 if (rc) {
2568 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2569 goto out_proto6;
2570 }
2571 INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2572 INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2573
2574 rc = smc_ib_register_client();
2575 if (rc) {
2576 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2577 goto out_sock;
2578 }
2579
2580 static_branch_enable(&tcp_have_smc);
2581 return 0;
2582
2583out_sock:
2584 sock_unregister(PF_SMC);
2585out_proto6:
2586 proto_unregister(&smc_proto6);
2587out_proto:
2588 proto_unregister(&smc_proto);
2589out_core:
2590 smc_core_exit();
2591out_alloc_wqs:
2592 destroy_workqueue(smc_close_wq);
2593out_alloc_hs_wq:
2594 destroy_workqueue(smc_hs_wq);
2595out_pnet:
2596 smc_pnet_exit();
2597out_pernet_subsys:
2598 unregister_pernet_subsys(&smc_net_ops);
2599
2600 return rc;
2601}
2602
2603static void __exit smc_exit(void)
2604{
2605 static_branch_disable(&tcp_have_smc);
2606 sock_unregister(PF_SMC);
2607 smc_core_exit();
2608 smc_ib_unregister_client();
2609 destroy_workqueue(smc_close_wq);
2610 destroy_workqueue(smc_hs_wq);
2611 proto_unregister(&smc_proto6);
2612 proto_unregister(&smc_proto);
2613 smc_pnet_exit();
2614 unregister_pernet_subsys(&smc_net_ops);
2615 rcu_barrier();
2616}
2617
2618module_init(smc_init);
2619module_exit(smc_exit);
2620
2621MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2622MODULE_DESCRIPTION("smc socket address family");
2623MODULE_LICENSE("GPL");
2624MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000..e6919fe31
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,300 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Definitions for the SMC module (socket related)
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11#ifndef __SMC_H
12#define __SMC_H
13
14#include <linux/socket.h>
15#include <linux/types.h>
16#include <linux/compiler.h> /* __aligned */
17#include <net/sock.h>
18
19#include "smc_ib.h"
20
21#define SMC_V1 1 /* SMC version V1 */
22#define SMC_V2 2 /* SMC version V2 */
23#define SMC_RELEASE 0
24
25#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
26#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
27
28#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
29 * devices
30 */
31
32#define SMC_MAX_HOSTNAME_LEN 32
33#define SMC_MAX_EID_LEN 32
34
35extern struct proto smc_proto;
36extern struct proto smc_proto6;
37
38#ifdef ATOMIC64_INIT
39#define KERNEL_HAS_ATOMIC64
40#endif
41
42enum smc_state { /* possible states of an SMC socket */
43 SMC_ACTIVE = 1,
44 SMC_INIT = 2,
45 SMC_CLOSED = 7,
46 SMC_LISTEN = 10,
47 /* normal close */
48 SMC_PEERCLOSEWAIT1 = 20,
49 SMC_PEERCLOSEWAIT2 = 21,
50 SMC_APPFINCLOSEWAIT = 24,
51 SMC_APPCLOSEWAIT1 = 22,
52 SMC_APPCLOSEWAIT2 = 23,
53 SMC_PEERFINCLOSEWAIT = 25,
54 /* abnormal close */
55 SMC_PEERABORTWAIT = 26,
56 SMC_PROCESSABORT = 27,
57};
58
59struct smc_link_group;
60
61struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
62 u8 type;
63} __aligned(1);
64
65struct smc_cdc_conn_state_flags {
66#if defined(__BIG_ENDIAN_BITFIELD)
67 u8 peer_done_writing : 1; /* Sending done indicator */
68 u8 peer_conn_closed : 1; /* Peer connection closed indicator */
69 u8 peer_conn_abort : 1; /* Abnormal close indicator */
70 u8 reserved : 5;
71#elif defined(__LITTLE_ENDIAN_BITFIELD)
72 u8 reserved : 5;
73 u8 peer_conn_abort : 1;
74 u8 peer_conn_closed : 1;
75 u8 peer_done_writing : 1;
76#endif
77};
78
79struct smc_cdc_producer_flags {
80#if defined(__BIG_ENDIAN_BITFIELD)
81 u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
82 u8 urg_data_pending : 1; /* Urgent Data Pending */
83 u8 urg_data_present : 1; /* Urgent Data Present */
84 u8 cons_curs_upd_req : 1; /* cursor update requested */
85 u8 failover_validation : 1;/* message replay due to failover */
86 u8 reserved : 3;
87#elif defined(__LITTLE_ENDIAN_BITFIELD)
88 u8 reserved : 3;
89 u8 failover_validation : 1;
90 u8 cons_curs_upd_req : 1;
91 u8 urg_data_present : 1;
92 u8 urg_data_pending : 1;
93 u8 write_blocked : 1;
94#endif
95};
96
97/* in host byte order */
98union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
99 struct {
100 u16 reserved;
101 u16 wrap; /* window wrap sequence number */
102 u32 count; /* cursor (= offset) part */
103 };
104#ifdef KERNEL_HAS_ATOMIC64
105 atomic64_t acurs; /* for atomic processing */
106#else
107 u64 acurs; /* for atomic processing */
108#endif
109} __aligned(8);
110
111/* in host byte order, except for flag bitfields in network byte order */
112struct smc_host_cdc_msg { /* Connection Data Control message */
113 struct smc_wr_rx_hdr common; /* .type = 0xFE */
114 u8 len; /* length = 44 */
115 u16 seqno; /* connection seq # */
116 u32 token; /* alert_token */
117 union smc_host_cursor prod; /* producer cursor */
118 union smc_host_cursor cons; /* consumer cursor,
119 * piggy backed "ack"
120 */
121 struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
122 struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
123 u8 reserved[18];
124} __aligned(8);
125
126enum smc_urg_state {
127 SMC_URG_VALID = 1, /* data present */
128 SMC_URG_NOTYET = 2, /* data pending */
129 SMC_URG_READ = 3, /* data was already read */
130};
131
132struct smc_connection {
133 struct rb_node alert_node;
134 struct smc_link_group *lgr; /* link group of connection */
135 struct smc_link *lnk; /* assigned SMC-R link */
136 u32 alert_token_local; /* unique conn. id */
137 u8 peer_rmbe_idx; /* from tcp handshake */
138 int peer_rmbe_size; /* size of peer rx buffer */
139 atomic_t peer_rmbe_space;/* remaining free bytes in peer
140 * rmbe
141 */
142 int rtoken_idx; /* idx to peer RMB rkey/addr */
143
144 struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
145 struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
146 int rmbe_size_short;/* compressed notation */
147 int rmbe_update_limit;
148 /* lower limit for consumer
149 * cursor update
150 */
151
152 struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
153 * buffer for CDC msg send
154 * .prod cf. TCP snd_nxt
155 * .cons cf. TCP sends ack
156 */
157 union smc_host_cursor local_tx_ctrl_fin;
158 /* prod crsr - confirmed by peer
159 */
160 union smc_host_cursor tx_curs_prep; /* tx - prepared data
161 * snd_max..wmem_alloc
162 */
163 union smc_host_cursor tx_curs_sent; /* tx - sent data
164 * snd_nxt ?
165 */
166 union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
167 * snd-wnd-begin ?
168 */
169 atomic_t sndbuf_space; /* remaining space in sndbuf */
170 u16 tx_cdc_seq; /* sequence # for CDC send */
171 u16 tx_cdc_seq_fin; /* sequence # - tx completed */
172 spinlock_t send_lock; /* protect wr_sends */
173 atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
174 * - inc when post wqe,
175 * - dec on polled tx cqe
176 */
177 wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
178 struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
179 u32 tx_off; /* base offset in peer rmb */
180
181 struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
182 * .prod cf. TCP rcv_nxt
183 * .cons cf. TCP snd_una
184 */
185 union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
186 * source of snd_una ?
187 */
188 union smc_host_cursor urg_curs; /* points at urgent byte */
189 enum smc_urg_state urg_state;
190 bool urg_tx_pend; /* urgent data staged */
191 bool urg_rx_skip_pend;
192 /* indicate urgent oob data
193 * read, but previous regular
194 * data still pending
195 */
196 char urg_rx_byte; /* urgent byte */
197 atomic_t bytes_to_rcv; /* arrived data,
198 * not yet received
199 */
200 atomic_t splice_pending; /* number of spliced bytes
201 * pending processing
202 */
203#ifndef KERNEL_HAS_ATOMIC64
204 spinlock_t acurs_lock; /* protect cursors */
205#endif
206 struct work_struct close_work; /* peer sent some closing */
207 struct work_struct abort_work; /* abort the connection */
208 struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
209 u8 rx_off; /* receive offset:
210 * 0 for SMC-R, 32 for SMC-D
211 */
212 u64 peer_token; /* SMC-D token of peer */
213 u8 killed : 1; /* abnormal termination */
214 u8 out_of_sync : 1; /* out of sync with peer */
215};
216
217struct smc_sock { /* smc sock container */
218 struct sock sk;
219 struct socket *clcsock; /* internal tcp socket */
220 void (*clcsk_data_ready)(struct sock *sk);
 221						/* original data_ready fct. */
222 struct smc_connection conn; /* smc connection */
223 struct smc_sock *listen_smc; /* listen parent */
224 struct work_struct connect_work; /* handle non-blocking connect*/
225 struct work_struct tcp_listen_work;/* handle tcp socket accepts */
226 struct work_struct smc_listen_work;/* prepare new accept socket */
227 struct list_head accept_q; /* sockets to be accepted */
228 spinlock_t accept_q_lock; /* protects accept_q */
229 bool use_fallback; /* fallback to tcp */
230 int fallback_rsn; /* reason for fallback */
231 u32 peer_diagnosis; /* decline reason from peer */
232 int sockopt_defer_accept;
233 /* sockopt TCP_DEFER_ACCEPT
234 * value
235 */
236 u8 wait_close_tx_prepared : 1;
237 /* shutdown wr or close
238 * started, waiting for unsent
239 * data to be sent
240 */
241 u8 connect_nonblock : 1;
242 /* non-blocking connect in
243 * flight
244 */
245 struct mutex clcsock_release_lock;
246 /* protects clcsock of a listen
247 * socket
 248						 */
249};
250
251static inline struct smc_sock *smc_sk(const struct sock *sk)
252{
253 return (struct smc_sock *)sk;
254}
255
256extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
257extern struct workqueue_struct *smc_close_wq; /* wq for close work */
258
259#define SMC_SYSTEMID_LEN 8
260
261extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
262
263#define ntohll(x) be64_to_cpu(x)
264#define htonll(x) cpu_to_be64(x)
265
 266/* convert a u32 value into network byte order, store it into a 3 byte field */
267static inline void hton24(u8 *net, u32 host)
268{
269 __be32 t;
270
271 t = cpu_to_be32(host);
272 memcpy(net, ((u8 *)&t) + 1, 3);
273}
274
 275/* convert a received 3 byte field into host byte order */
276static inline u32 ntoh24(u8 *net)
277{
278 __be32 t = 0;
279
280 memcpy(((u8 *)&t) + 1, net, 3);
281 return be32_to_cpu(t);
282}
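
hton24() and ntoh24() pack the low 24 bits of a host u32 into the 3-byte network-order fields used by the CLC/LLC messages. A small round-trip sketch, illustrative only and not part of smc.h:

/* Illustrative round trip of the 3-byte helpers above (not part of smc.h);
 * only the low 24 bits of the host value are representable.
 */
static inline void smc_hton24_example(void)
{
	u8 buf[3];

	hton24(buf, 0x123456);
	/* buf[0] == 0x12, buf[1] == 0x34, buf[2] == 0x56 (network byte order) */
	WARN_ON(ntoh24(buf) != 0x123456);
}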
283
284#ifdef CONFIG_XFRM
285static inline bool using_ipsec(struct smc_sock *smc)
286{
287 return (smc->clcsock->sk->sk_policy[0] ||
288 smc->clcsock->sk->sk_policy[1]) ? true : false;
289}
290#else
291static inline bool using_ipsec(struct smc_sock *smc)
292{
293 return false;
294}
295#endif
296
297struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
298void smc_close_non_accepted(struct sock *sk);
299
300#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000..94503f36b
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,476 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Connection Data Control (CDC)
6 * handles flow control
7 *
8 * Copyright IBM Corp. 2016
9 *
10 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#include <linux/spinlock.h>
14
15#include "smc.h"
16#include "smc_wr.h"
17#include "smc_cdc.h"
18#include "smc_tx.h"
19#include "smc_rx.h"
20#include "smc_close.h"
21
22/********************************** send *************************************/
23
24/* handler for send/transmission completion of a CDC msg */
25static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
26 struct smc_link *link,
27 enum ib_wc_status wc_status)
28{
29 struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
30 struct smc_connection *conn = cdcpend->conn;
31 struct smc_sock *smc;
32 int diff;
33
34 smc = container_of(conn, struct smc_sock, conn);
35 bh_lock_sock(&smc->sk);
36 if (!wc_status) {
37 diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
38 &cdcpend->conn->tx_curs_fin,
39 &cdcpend->cursor);
40 /* sndbuf_space is decreased in smc_sendmsg */
41 smp_mb__before_atomic();
42 atomic_add(diff, &cdcpend->conn->sndbuf_space);
43 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
44 smp_mb__after_atomic();
45 smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
46 smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor,
47 conn);
48 conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
49 }
50
51 if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52 unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53 wake_up(&conn->cdc_pend_tx_wq);
54 WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
55
56 smc_tx_sndbuf_nonfull(smc);
57 bh_unlock_sock(&smc->sk);
58}
59
60int smc_cdc_get_free_slot(struct smc_connection *conn,
61 struct smc_link *link,
62 struct smc_wr_buf **wr_buf,
63 struct smc_rdma_wr **wr_rdma_buf,
64 struct smc_cdc_tx_pend **pend)
65{
66 int rc;
67
68 rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
69 wr_rdma_buf,
70 (struct smc_wr_tx_pend_priv **)pend);
71 if (conn->killed) {
72 /* abnormal termination */
73 if (!rc)
74 smc_wr_tx_put_slot(link,
75 (struct smc_wr_tx_pend_priv *)(*pend));
76 rc = -EPIPE;
77 }
78 return rc;
79}
80
81static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
82 struct smc_cdc_tx_pend *pend)
83{
84 BUILD_BUG_ON_MSG(
85 sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
86 "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
87 BUILD_BUG_ON_MSG(
88 offsetofend(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
89 "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
90 BUILD_BUG_ON_MSG(
91 sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
92 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
93 pend->conn = conn;
94 pend->cursor = conn->tx_curs_sent;
95 pend->p_cursor = conn->local_tx_ctrl.prod;
96 pend->ctrl_seq = conn->tx_cdc_seq;
97}
98
99int smc_cdc_msg_send(struct smc_connection *conn,
100 struct smc_wr_buf *wr_buf,
101 struct smc_cdc_tx_pend *pend)
102{
103 struct smc_link *link = conn->lnk;
104 union smc_host_cursor cfed;
105 int rc;
106
107 smc_cdc_add_pending_send(conn, pend);
108
109 conn->tx_cdc_seq++;
110 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
111 smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
112
113 atomic_inc(&conn->cdc_pend_tx_wr);
114 smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
115
116 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
117 if (!rc) {
118 smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
119 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
120 } else {
121 conn->tx_cdc_seq--;
122 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
123 atomic_dec(&conn->cdc_pend_tx_wr);
124 }
125
126 return rc;
127}
128
 129/* send a validation msg indicating the move of a conn to another QP link */
130int smcr_cdc_msg_send_validation(struct smc_connection *conn,
131 struct smc_cdc_tx_pend *pend,
132 struct smc_wr_buf *wr_buf)
133{
134 struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
135 struct smc_link *link = conn->lnk;
136 struct smc_cdc_msg *peer;
137 int rc;
138
139 peer = (struct smc_cdc_msg *)wr_buf;
140 peer->common.type = local->common.type;
141 peer->len = local->len;
142 peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */
143 peer->token = htonl(local->token);
144 peer->prod_flags.failover_validation = 1;
145
146 /* We need to set pend->conn here to make sure smc_cdc_tx_handler()
 147	 * can handle it properly
148 */
149 smc_cdc_add_pending_send(conn, pend);
150
151 atomic_inc(&conn->cdc_pend_tx_wr);
152 smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
153
154 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
155 if (unlikely(rc))
156 atomic_dec(&conn->cdc_pend_tx_wr);
157
158 return rc;
159}
160
161static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
162{
163 struct smc_cdc_tx_pend *pend;
164 struct smc_wr_buf *wr_buf;
165 struct smc_link *link;
166 bool again = false;
167 int rc;
168
169again:
170 link = conn->lnk;
171 if (!smc_wr_tx_link_hold(link))
172 return -ENOLINK;
173 rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend);
174 if (rc)
175 goto put_out;
176
177 spin_lock_bh(&conn->send_lock);
178 if (link != conn->lnk) {
 179		/* link of connection changed, try again one time */
180 spin_unlock_bh(&conn->send_lock);
181 smc_wr_tx_put_slot(link,
182 (struct smc_wr_tx_pend_priv *)pend);
183 smc_wr_tx_link_put(link);
184 if (again)
185 return -ENOLINK;
186 again = true;
187 goto again;
188 }
189 rc = smc_cdc_msg_send(conn, wr_buf, pend);
190 spin_unlock_bh(&conn->send_lock);
191put_out:
192 smc_wr_tx_link_put(link);
193 return rc;
194}
195
196int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
197{
198 int rc;
199
200 if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
201 return -EPIPE;
202
203 if (conn->lgr->is_smcd) {
204 spin_lock_bh(&conn->send_lock);
205 rc = smcd_cdc_msg_send(conn);
206 spin_unlock_bh(&conn->send_lock);
207 } else {
208 rc = smcr_cdc_get_slot_and_msg_send(conn);
209 }
210
211 return rc;
212}
213
214void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
215{
216 wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
217}
218
 219/* Send an SMC-D CDC header.
220 * This increments the free space available in our send buffer.
221 * Also update the confirmed receive buffer with what was sent to the peer.
222 */
223int smcd_cdc_msg_send(struct smc_connection *conn)
224{
225 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
226 union smc_host_cursor curs;
227 struct smcd_cdc_msg cdc;
228 int rc, diff;
229
230 memset(&cdc, 0, sizeof(cdc));
231 cdc.common.type = SMC_CDC_MSG_TYPE;
232 curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.prod.acurs);
233 cdc.prod.wrap = curs.wrap;
234 cdc.prod.count = curs.count;
235 curs.acurs.counter = atomic64_read(&conn->local_tx_ctrl.cons.acurs);
236 cdc.cons.wrap = curs.wrap;
237 cdc.cons.count = curs.count;
238 cdc.cons.prod_flags = conn->local_tx_ctrl.prod_flags;
239 cdc.cons.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
240 rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
241 if (rc)
242 return rc;
243 smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn);
244 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
245 /* Calculate transmitted data and increment free send buffer space */
246 diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
247 &conn->tx_curs_sent);
248 /* increased by confirmed number of bytes */
249 smp_mb__before_atomic();
250 atomic_add(diff, &conn->sndbuf_space);
251 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
252 smp_mb__after_atomic();
253 smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
254
255 smc_tx_sndbuf_nonfull(smc);
256 return rc;
257}
258
259/********************************* receive ***********************************/
260
261static inline bool smc_cdc_before(u16 seq1, u16 seq2)
262{
263 return (s16)(seq1 - seq2) < 0;
264}
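
The cast to s16 turns the subtraction into a wrapping (serial-number style) comparison, so ordering stays correct when the 16-bit seqno wraps. An illustrative check, not part of smc_cdc.c:

/* Illustrative only: comparison across the 16-bit seqno wrap */
static inline void smc_cdc_before_example(void)
{
	WARN_ON(!smc_cdc_before(0xfffe, 0x0001)); /* 0xfffe is "before" 0x0001 */
	WARN_ON(smc_cdc_before(0x0001, 0xfffe));  /* but not the other way round */
}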
265
266static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
267 int *diff_prod)
268{
269 struct smc_connection *conn = &smc->conn;
270 char *base;
271
272 /* new data included urgent business */
273 smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
274 conn->urg_state = SMC_URG_VALID;
275 if (!sock_flag(&smc->sk, SOCK_URGINLINE))
276 /* we'll skip the urgent byte, so don't account for it */
277 (*diff_prod)--;
278 base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
279 if (conn->urg_curs.count)
280 conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
281 else
282 conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
283 sk_send_sigurg(&smc->sk);
284}
285
286static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
287 struct smc_link *link)
288{
289 struct smc_connection *conn = &smc->conn;
290 u16 recv_seq = ntohs(cdc->seqno);
291 s16 diff;
292
293 /* check that seqnum was seen before */
294 diff = conn->local_rx_ctrl.seqno - recv_seq;
295 if (diff < 0) { /* diff larger than 0x7fff */
296 /* drop connection */
297 conn->out_of_sync = 1; /* prevent any further receives */
298 spin_lock_bh(&conn->send_lock);
299 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
300 conn->lnk = link;
301 spin_unlock_bh(&conn->send_lock);
302 sock_hold(&smc->sk); /* sock_put in abort_work */
303 if (!queue_work(smc_close_wq, &conn->abort_work))
304 sock_put(&smc->sk);
305 }
306}
307
308static void smc_cdc_msg_recv_action(struct smc_sock *smc,
309 struct smc_cdc_msg *cdc)
310{
311 union smc_host_cursor cons_old, prod_old;
312 struct smc_connection *conn = &smc->conn;
313 int diff_cons, diff_prod;
314
315 smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
316 smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
317 smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
318
319 diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
320 &conn->local_rx_ctrl.cons);
321 if (diff_cons) {
322 /* peer_rmbe_space is decreased during data transfer with RDMA
323 * write
324 */
325 smp_mb__before_atomic();
326 atomic_add(diff_cons, &conn->peer_rmbe_space);
327 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
328 smp_mb__after_atomic();
329 }
330
331 diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
332 &conn->local_rx_ctrl.prod);
333 if (diff_prod) {
334 if (conn->local_rx_ctrl.prod_flags.urg_data_present)
335 smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
336 /* bytes_to_rcv is decreased in smc_recvmsg */
337 smp_mb__before_atomic();
338 atomic_add(diff_prod, &conn->bytes_to_rcv);
339 /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
340 smp_mb__after_atomic();
341 smc->sk.sk_data_ready(&smc->sk);
342 } else {
343 if (conn->local_rx_ctrl.prod_flags.write_blocked)
344 smc->sk.sk_data_ready(&smc->sk);
345 if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
346 conn->urg_state = SMC_URG_NOTYET;
347 }
348
349 /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
350 if ((diff_cons && smc_tx_prepared_sends(conn)) ||
351 conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
352 conn->local_rx_ctrl.prod_flags.urg_data_pending)
353 smc_tx_sndbuf_nonempty(conn);
354
355 if (diff_cons && conn->urg_tx_pend &&
356 atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
357 /* urg data confirmed by peer, indicate we're ready for more */
358 conn->urg_tx_pend = false;
359 smc->sk.sk_write_space(&smc->sk);
360 }
361
362 if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
363 smc->sk.sk_err = ECONNRESET;
364 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
365 }
366 if (smc_cdc_rxed_any_close_or_senddone(conn)) {
367 smc->sk.sk_shutdown |= RCV_SHUTDOWN;
368 if (smc->clcsock && smc->clcsock->sk)
369 smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
370 sock_set_flag(&smc->sk, SOCK_DONE);
371 sock_hold(&smc->sk); /* sock_put in close_work */
372 if (!queue_work(smc_close_wq, &conn->close_work))
373 sock_put(&smc->sk);
374 }
375}
376
377/* called under tasklet context */
378static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
379{
380 sock_hold(&smc->sk);
381 bh_lock_sock(&smc->sk);
382 smc_cdc_msg_recv_action(smc, cdc);
383 bh_unlock_sock(&smc->sk);
384 sock_put(&smc->sk); /* no free sk in softirq-context */
385}
386
387/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ
 388 * handler to indicate an update in the DMBE.
389 *
390 * Context:
391 * - tasklet context
392 */
393static void smcd_cdc_rx_tsklet(unsigned long data)
394{
395 struct smc_connection *conn = (struct smc_connection *)data;
396 struct smcd_cdc_msg *data_cdc;
397 struct smcd_cdc_msg cdc;
398 struct smc_sock *smc;
399
400 if (!conn || conn->killed)
401 return;
402
403 data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr;
404 smcd_curs_copy(&cdc.prod, &data_cdc->prod, conn);
405 smcd_curs_copy(&cdc.cons, &data_cdc->cons, conn);
406 smc = container_of(conn, struct smc_sock, conn);
407 smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc);
408}
409
410/* Initialize receive tasklet. Called from ISM device IRQ handler to start
411 * receiver side.
412 */
413void smcd_cdc_rx_init(struct smc_connection *conn)
414{
415 tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
416}
417
418/***************************** init, exit, misc ******************************/
419
420static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
421{
422 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
423 struct smc_cdc_msg *cdc = buf;
424 struct smc_connection *conn;
425 struct smc_link_group *lgr;
426 struct smc_sock *smc;
427
428 if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
429 return; /* short message */
430 if (cdc->len != SMC_WR_TX_SIZE)
431 return; /* invalid message */
432
433 /* lookup connection */
434 lgr = smc_get_lgr(link);
435 read_lock_bh(&lgr->conns_lock);
436 conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
437 read_unlock_bh(&lgr->conns_lock);
438 if (!conn || conn->out_of_sync)
439 return;
440 smc = container_of(conn, struct smc_sock, conn);
441
442 if (cdc->prod_flags.failover_validation) {
443 smc_cdc_msg_validate(smc, cdc, link);
444 return;
445 }
446 if (smc_cdc_before(ntohs(cdc->seqno),
447 conn->local_rx_ctrl.seqno))
448 /* received seqno is old */
449 return;
450
451 smc_cdc_msg_recv(smc, cdc);
452}
453
454static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
455 {
456 .handler = smc_cdc_rx_handler,
457 .type = SMC_CDC_MSG_TYPE
458 },
459 {
460 .handler = NULL,
461 }
462};
463
464int __init smc_cdc_init(void)
465{
466 struct smc_wr_rx_handler *handler;
467 int rc = 0;
468
469 for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
470 INIT_HLIST_NODE(&handler->list);
471 rc = smc_wr_rx_register_handler(handler);
472 if (rc)
473 break;
474 }
475 return rc;
476}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000..696cc11f2
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,305 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Connection Data Control (CDC)
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_CDC_H
13#define SMC_CDC_H
14
15#include <linux/kernel.h> /* max_t */
16#include <linux/atomic.h>
17#include <linux/in.h>
18#include <linux/compiler.h>
19
20#include "smc.h"
21#include "smc_core.h"
22#include "smc_wr.h"
23
24#define SMC_CDC_MSG_TYPE 0xFE
25
26/* in network byte order */
27union smc_cdc_cursor { /* SMC cursor */
28 struct {
29 __be16 reserved;
30 __be16 wrap;
31 __be32 count;
32 };
33#ifdef KERNEL_HAS_ATOMIC64
34 atomic64_t acurs; /* for atomic processing */
35#else
36 u64 acurs; /* for atomic processing */
37#endif
38} __aligned(8);
39
40/* in network byte order */
41struct smc_cdc_msg {
42 struct smc_wr_rx_hdr common; /* .type = 0xFE */
43 u8 len; /* 44 */
44 __be16 seqno;
45 __be32 token;
46 union smc_cdc_cursor prod;
47 union smc_cdc_cursor cons; /* piggy backed "ack" */
48 struct smc_cdc_producer_flags prod_flags;
49 struct smc_cdc_conn_state_flags conn_state_flags;
50 u8 reserved[18];
51};
52
53/* SMC-D cursor format */
54union smcd_cdc_cursor {
55 struct {
56 u16 wrap;
57 u32 count;
58 struct smc_cdc_producer_flags prod_flags;
59 struct smc_cdc_conn_state_flags conn_state_flags;
60 } __packed;
61#ifdef KERNEL_HAS_ATOMIC64
62 atomic64_t acurs; /* for atomic processing */
63#else
64 u64 acurs; /* for atomic processing */
65#endif
66} __aligned(8);
67
68/* CDC message for SMC-D */
69struct smcd_cdc_msg {
70 struct smc_wr_rx_hdr common; /* Type = 0xFE */
71 u8 res1[7];
72 union smcd_cdc_cursor prod;
73 union smcd_cdc_cursor cons;
74 u8 res3[8];
75} __aligned(8);
76
77static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
78{
79 return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
80 conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
81}
82
83static inline bool smc_cdc_rxed_any_close_or_senddone(
84 struct smc_connection *conn)
85{
86 return smc_cdc_rxed_any_close(conn) ||
87 conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
88}
89
90static inline void smc_curs_add(int size, union smc_host_cursor *curs,
91 int value)
92{
93 curs->count += value;
94 if (curs->count >= size) {
95 curs->wrap++;
96 curs->count -= size;
97 }
98}
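
A host cursor is a (wrap, count) offset into a ring buffer of "size" bytes; smc_curs_add() advances count and bumps wrap once the end of the buffer is passed. An illustrative sketch, not part of smc_cdc.h, assuming a 16 KB RMBE:

/* Illustrative only: wrap behaviour of smc_curs_add() for size 16384 */
static inline void smc_curs_add_example(void)
{
	union smc_host_cursor curs = { .wrap = 3, .count = 16380 };

	smc_curs_add(16384, &curs, 10);
	/* count would reach 16390 >= size, so wrap becomes 4 and count 6 */
}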
99
100/* Copy cursor src into tgt */
101static inline void smc_curs_copy(union smc_host_cursor *tgt,
102 union smc_host_cursor *src,
103 struct smc_connection *conn)
104{
105#ifndef KERNEL_HAS_ATOMIC64
106 unsigned long flags;
107
108 spin_lock_irqsave(&conn->acurs_lock, flags);
109 tgt->acurs = src->acurs;
110 spin_unlock_irqrestore(&conn->acurs_lock, flags);
111#else
112 atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
113#endif
114}
115
116static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
117 union smc_cdc_cursor *src,
118 struct smc_connection *conn)
119{
120#ifndef KERNEL_HAS_ATOMIC64
121 unsigned long flags;
122
123 spin_lock_irqsave(&conn->acurs_lock, flags);
124 tgt->acurs = src->acurs;
125 spin_unlock_irqrestore(&conn->acurs_lock, flags);
126#else
127 atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
128#endif
129}
130
131static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt,
132 union smcd_cdc_cursor *src,
133 struct smc_connection *conn)
134{
135#ifndef KERNEL_HAS_ATOMIC64
136 unsigned long flags;
137
138 spin_lock_irqsave(&conn->acurs_lock, flags);
139 tgt->acurs = src->acurs;
140 spin_unlock_irqrestore(&conn->acurs_lock, flags);
141#else
142 atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
143#endif
144}
145
146/* calculate cursor difference between old and new, where old <= new and
147 * difference cannot exceed size
148 */
149static inline int smc_curs_diff(unsigned int size,
150 union smc_host_cursor *old,
151 union smc_host_cursor *new)
152{
153 if (old->wrap != new->wrap)
154 return max_t(int, 0,
155 ((size - old->count) + new->count));
156
157 return max_t(int, 0, (new->count - old->count));
158}
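
Two worked cases for smc_curs_diff(), illustrative only and not part of smc_cdc.h: with equal wrap values the difference is simply new->count - old->count, and with one wrap in between it is (size - old->count) + new->count.

/* Illustrative only: smc_curs_diff() for size 16384 */
static inline void smc_curs_diff_example(void)
{
	union smc_host_cursor old = { .wrap = 4, .count = 16000 };
	union smc_host_cursor new = { .wrap = 4, .count = 16100 };

	WARN_ON(smc_curs_diff(16384, &old, &new) != 100); /* same wrap */

	new.wrap = 5;
	new.count = 200;
	/* wrapped once: (16384 - 16000) + 200 */
	WARN_ON(smc_curs_diff(16384, &old, &new) != 584);
}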
159
160/* calculate cursor difference between old and new - returns negative
161 * value in case old > new
162 */
163static inline int smc_curs_comp(unsigned int size,
164 union smc_host_cursor *old,
165 union smc_host_cursor *new)
166{
167 if (old->wrap > new->wrap ||
168 (old->wrap == new->wrap && old->count > new->count))
169 return -smc_curs_diff(size, new, old);
170 return smc_curs_diff(size, old, new);
171}
172
173/* calculate cursor difference between old and new, where old <= new and
174 * difference may exceed size
175 */
176static inline int smc_curs_diff_large(unsigned int size,
177 union smc_host_cursor *old,
178 union smc_host_cursor *new)
179{
180 if (old->wrap < new->wrap)
181 return min_t(int,
182 (size - old->count) + new->count +
183 (new->wrap - old->wrap - 1) * size,
184 size);
185
186 if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */
187 return min_t(int,
188 (size - old->count) + new->count +
189 (new->wrap + 0xffff - old->wrap) * size,
190 size);
191
192 return max_t(int, 0, (new->count - old->count));
193}
194
195static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
196 union smc_host_cursor *local,
197 union smc_host_cursor *save,
198 struct smc_connection *conn)
199{
200 smc_curs_copy(save, local, conn);
201 peer->count = htonl(save->count);
202 peer->wrap = htons(save->wrap);
203 /* peer->reserved = htons(0); must be ensured by caller */
204}
205
206static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
207 struct smc_connection *conn,
208 union smc_host_cursor *save)
209{
210 struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
211
212 peer->common.type = local->common.type;
213 peer->len = local->len;
214 peer->seqno = htons(local->seqno);
215 peer->token = htonl(local->token);
216 smc_host_cursor_to_cdc(&peer->prod, &local->prod, save, conn);
217 smc_host_cursor_to_cdc(&peer->cons, &local->cons, save, conn);
218 peer->prod_flags = local->prod_flags;
219 peer->conn_state_flags = local->conn_state_flags;
220}
221
222static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
223 union smc_cdc_cursor *peer,
224 struct smc_connection *conn)
225{
226 union smc_host_cursor temp, old;
227 union smc_cdc_cursor net;
228
229 smc_curs_copy(&old, local, conn);
230 smc_curs_copy_net(&net, peer, conn);
231 temp.count = ntohl(net.count);
232 temp.wrap = ntohs(net.wrap);
233 if ((old.wrap > temp.wrap) && temp.wrap)
234 return;
235 if ((old.wrap == temp.wrap) &&
236 (old.count > temp.count))
237 return;
238 smc_curs_copy(local, &temp, conn);
239}
240
241static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
242 struct smc_cdc_msg *peer,
243 struct smc_connection *conn)
244{
245 local->common.type = peer->common.type;
246 local->len = peer->len;
247 local->seqno = ntohs(peer->seqno);
248 local->token = ntohl(peer->token);
249 smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
250 smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
251 local->prod_flags = peer->prod_flags;
252 local->conn_state_flags = peer->conn_state_flags;
253}
254
255static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
256 struct smcd_cdc_msg *peer,
257 struct smc_connection *conn)
258{
259 union smc_host_cursor temp;
260
261 temp.wrap = peer->prod.wrap;
262 temp.count = peer->prod.count;
263 smc_curs_copy(&local->prod, &temp, conn);
264
265 temp.wrap = peer->cons.wrap;
266 temp.count = peer->cons.count;
267 smc_curs_copy(&local->cons, &temp, conn);
268 local->prod_flags = peer->cons.prod_flags;
269 local->conn_state_flags = peer->cons.conn_state_flags;
270}
271
272static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
273 struct smc_cdc_msg *peer,
274 struct smc_connection *conn)
275{
276 if (conn->lgr->is_smcd)
277 smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer, conn);
278 else
279 smcr_cdc_msg_to_host(local, peer, conn);
280}
281
282struct smc_cdc_tx_pend {
283 struct smc_connection *conn; /* socket connection */
284 union smc_host_cursor cursor; /* tx sndbuf cursor sent */
285 union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
286 u16 ctrl_seq; /* conn. tx sequence # */
287};
288
289int smc_cdc_get_free_slot(struct smc_connection *conn,
290 struct smc_link *link,
291 struct smc_wr_buf **wr_buf,
292 struct smc_rdma_wr **wr_rdma_buf,
293 struct smc_cdc_tx_pend **pend);
294void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
295int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
296 struct smc_cdc_tx_pend *pend);
297int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
298int smcd_cdc_msg_send(struct smc_connection *conn);
299int smcr_cdc_msg_send_validation(struct smc_connection *conn,
300 struct smc_cdc_tx_pend *pend,
301 struct smc_wr_buf *wr_buf);
302int smc_cdc_init(void) __init;
303void smcd_cdc_rx_init(struct smc_connection *conn);
304
305#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000..5ee5b2ce2
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,784 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * CLC (connection layer control) handshake over initial TCP socket to
6 * prepare for RDMA traffic
7 *
8 * Copyright IBM Corp. 2016, 2018
9 *
10 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#include <linux/in.h>
14#include <linux/inetdevice.h>
15#include <linux/if_ether.h>
16#include <linux/sched/signal.h>
17#include <linux/utsname.h>
18#include <linux/ctype.h>
19
20#include <net/addrconf.h>
21#include <net/sock.h>
22#include <net/tcp.h>
23
24#include "smc.h"
25#include "smc_core.h"
26#include "smc_clc.h"
27#include "smc_ib.h"
28#include "smc_ism.h"
29
30#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
31#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
32#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
33#define SMC_CLC_RECV_BUF_LEN 100
34
35/* eye catcher "SMCR" EBCDIC for CLC messages */
36static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
37/* eye catcher "SMCD" EBCDIC for CLC messages */
38static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
39
40static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN];
41
42/* check arriving CLC proposal */
43static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc)
44{
45 struct smc_clc_msg_proposal_prefix *pclc_prfx;
46 struct smc_clc_smcd_v2_extension *smcd_v2_ext;
47 struct smc_clc_msg_hdr *hdr = &pclc->hdr;
48 struct smc_clc_v2_extension *v2_ext;
49
50 v2_ext = smc_get_clc_v2_ext(pclc);
51 pclc_prfx = smc_clc_proposal_get_prefix(pclc);
52 if (hdr->version == SMC_V1) {
53 if (hdr->typev1 == SMC_TYPE_N)
54 return false;
55 if (ntohs(hdr->length) !=
56 sizeof(*pclc) + ntohs(pclc->iparea_offset) +
57 sizeof(*pclc_prfx) +
58 pclc_prfx->ipv6_prefixes_cnt *
59 sizeof(struct smc_clc_ipv6_prefix) +
60 sizeof(struct smc_clc_msg_trail))
61 return false;
62 } else {
63 if (ntohs(hdr->length) !=
64 sizeof(*pclc) +
65 sizeof(struct smc_clc_msg_smcd) +
66 (hdr->typev1 != SMC_TYPE_N ?
67 sizeof(*pclc_prfx) +
68 pclc_prfx->ipv6_prefixes_cnt *
69 sizeof(struct smc_clc_ipv6_prefix) : 0) +
70 (hdr->typev2 != SMC_TYPE_N ?
71 sizeof(*v2_ext) +
72 v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) +
73 (smcd_indicated(hdr->typev2) ?
74 sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt *
75 sizeof(struct smc_clc_smcd_gid_chid) :
76 0) +
77 sizeof(struct smc_clc_msg_trail))
78 return false;
79 }
80 return true;
81}
82
83/* check arriving CLC accept or confirm */
84static bool
85smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
86{
87 struct smc_clc_msg_hdr *hdr = &clc_v2->hdr;
88
89 if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
90 return false;
91 if (hdr->version == SMC_V1) {
92 if ((hdr->typev1 == SMC_TYPE_R &&
93 ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
94 (hdr->typev1 == SMC_TYPE_D &&
95 ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
96 return false;
97 } else {
98 if (hdr->typev1 == SMC_TYPE_D &&
99 ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 &&
100 (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 +
101 sizeof(struct smc_clc_first_contact_ext)))
102 return false;
103 }
104 return true;
105}
106
107static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len)
108{
109 memset(fce, 0, sizeof(*fce));
110 fce->os_type = SMC_CLC_OS_LINUX;
111 fce->release = SMC_RELEASE;
112 memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname));
113 (*len) += sizeof(*fce);
114}
115
116/* check if received message has a correct header length and contains valid
117 * heading and trailing eyecatchers
118 */
119static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
120{
121 struct smc_clc_msg_accept_confirm_v2 *clc_v2;
122 struct smc_clc_msg_proposal *pclc;
123 struct smc_clc_msg_decline *dclc;
124 struct smc_clc_msg_trail *trl;
125
126 if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
127 memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
128 return false;
129 switch (clcm->type) {
130 case SMC_CLC_PROPOSAL:
131 pclc = (struct smc_clc_msg_proposal *)clcm;
132 if (!smc_clc_msg_prop_valid(pclc))
133 return false;
134 trl = (struct smc_clc_msg_trail *)
135 ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
136 break;
137 case SMC_CLC_ACCEPT:
138 case SMC_CLC_CONFIRM:
139 clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm;
140 if (!smc_clc_msg_acc_conf_valid(clc_v2))
141 return false;
142 trl = (struct smc_clc_msg_trail *)
143 ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) -
144 sizeof(*trl));
145 break;
146 case SMC_CLC_DECLINE:
147 dclc = (struct smc_clc_msg_decline *)clcm;
148 if (ntohs(dclc->hdr.length) != sizeof(*dclc))
149 return false;
150 trl = &dclc->trl;
151 break;
152 default:
153 return false;
154 }
155 if (check_trl &&
156 memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
157 memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
158 return false;
159 return true;
160}
161
162/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
163static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
164 struct smc_clc_msg_proposal_prefix *prop)
165{
166 struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
167 const struct in_ifaddr *ifa;
168
169 if (!in_dev)
170 return -ENODEV;
171
172 in_dev_for_each_ifa_rcu(ifa, in_dev) {
173 if (!inet_ifa_match(ipv4, ifa))
174 continue;
175 prop->prefix_len = inet_mask_len(ifa->ifa_mask);
176 prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
177 /* prop->ipv6_prefixes_cnt = 0; already done by memset before */
178 return 0;
179 }
180 return -ENOENT;
181}
182
183/* fill CLC proposal msg with ipv6 prefixes from device */
184static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
185 struct smc_clc_msg_proposal_prefix *prop,
186 struct smc_clc_ipv6_prefix *ipv6_prfx)
187{
188#if IS_ENABLED(CONFIG_IPV6)
189 struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
190 struct inet6_ifaddr *ifa;
191 int cnt = 0;
192
193 if (!in6_dev)
194 return -ENODEV;
195 /* use a maximum of 8 IPv6 prefixes from device */
196 list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
197 if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
198 continue;
199 ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
200 &ifa->addr, ifa->prefix_len);
201 ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
202 cnt++;
203 if (cnt == SMC_CLC_MAX_V6_PREFIX)
204 break;
205 }
206 prop->ipv6_prefixes_cnt = cnt;
207 if (cnt)
208 return 0;
209#endif
210 return -ENOENT;
211}
212
213/* retrieve and set prefixes in CLC proposal msg */
214static int smc_clc_prfx_set(struct socket *clcsock,
215 struct smc_clc_msg_proposal_prefix *prop,
216 struct smc_clc_ipv6_prefix *ipv6_prfx)
217{
218 struct dst_entry *dst = sk_dst_get(clcsock->sk);
219 struct sockaddr_storage addrs;
220 struct sockaddr_in6 *addr6;
221 struct sockaddr_in *addr;
222 int rc = -ENOENT;
223
224 if (!dst) {
225 rc = -ENOTCONN;
226 goto out;
227 }
228 if (!dst->dev) {
229 rc = -ENODEV;
230 goto out_rel;
231 }
232 /* get address to which the internal TCP socket is bound */
233 if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0)
234 goto out_rel;
235 /* analyze IP specific data of net_device belonging to TCP socket */
236 addr6 = (struct sockaddr_in6 *)&addrs;
237 rcu_read_lock();
238 if (addrs.ss_family == PF_INET) {
239 /* IPv4 */
240 addr = (struct sockaddr_in *)&addrs;
241 rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
242 } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
243 /* mapped IPv4 address - peer is IPv4 only */
244 rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
245 prop);
246 } else {
247 /* IPv6 */
248 rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
249 }
250 rcu_read_unlock();
251out_rel:
252 dst_release(dst);
253out:
254 return rc;
255}
256
257/* match ipv4 addrs of dev against addr in CLC proposal */
258static int smc_clc_prfx_match4_rcu(struct net_device *dev,
259 struct smc_clc_msg_proposal_prefix *prop)
260{
261 struct in_device *in_dev = __in_dev_get_rcu(dev);
262 const struct in_ifaddr *ifa;
263
264 if (!in_dev)
265 return -ENODEV;
266 in_dev_for_each_ifa_rcu(ifa, in_dev) {
267 if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
268 inet_ifa_match(prop->outgoing_subnet, ifa))
269 return 0;
270 }
271
272 return -ENOENT;
273}
274
275/* match ipv6 addrs of dev against addrs in CLC proposal */
276static int smc_clc_prfx_match6_rcu(struct net_device *dev,
277 struct smc_clc_msg_proposal_prefix *prop)
278{
279#if IS_ENABLED(CONFIG_IPV6)
280 struct inet6_dev *in6_dev = __in6_dev_get(dev);
281 struct smc_clc_ipv6_prefix *ipv6_prfx;
282 struct inet6_ifaddr *ifa;
283 int i, max;
284
285 if (!in6_dev)
286 return -ENODEV;
287 /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
288 ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
289 max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
290 list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
291 if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
292 continue;
293 for (i = 0; i < max; i++) {
294 if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
295 ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
296 ifa->prefix_len))
297 return 0;
298 }
299 }
300#endif
301 return -ENOENT;
302}
303
304/* check if proposed prefixes match one of our device prefixes */
305int smc_clc_prfx_match(struct socket *clcsock,
306 struct smc_clc_msg_proposal_prefix *prop)
307{
308 struct dst_entry *dst = sk_dst_get(clcsock->sk);
309 int rc;
310
311 if (!dst) {
312 rc = -ENOTCONN;
313 goto out;
314 }
315 if (!dst->dev) {
316 rc = -ENODEV;
317 goto out_rel;
318 }
319 rcu_read_lock();
320 if (!prop->ipv6_prefixes_cnt)
321 rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
322 else
323 rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
324 rcu_read_unlock();
325out_rel:
326 dst_release(dst);
327out:
328 return rc;
329}
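The IPv4 case above matches the peer's proposed subnet against a local interface via inet_mask_len() and inet_ifa_match() from <linux/inetdevice.h>. A hedged sketch of the underlying comparison (the example_v4_prefix_match() helper is illustrative only):

/* illustrative: a proposal such as 192.168.1.0/24 matches a local address
 * 192.168.1.42/255.255.255.0 because the masked addresses agree and the
 * prefix lengths are equal
 */
static bool example_v4_prefix_match(__be32 outgoing_subnet, u8 prefix_len,
				    __be32 ifa_address, __be32 ifa_mask)
{
	return prefix_len == inet_mask_len(ifa_mask) &&
	       !((outgoing_subnet ^ ifa_address) & ifa_mask);
}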
330
331/* Wait for data on the tcp-socket, analyze received data
332 * Returns:
333 * 0 on success, if the message received was not a decline.
334 * SMC_CLC_DECL_PEERDECL if a decline was received for fallback w/o another decline being sent.
335 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
336 */
337int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
338 u8 expected_type, unsigned long timeout)
339{
340 long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
341 struct sock *clc_sk = smc->clcsock->sk;
342 struct smc_clc_msg_hdr *clcm = buf;
343 struct msghdr msg = {NULL, 0};
344 int reason_code = 0;
345 struct kvec vec = {buf, buflen};
346 int len, datlen, recvlen;
347 bool check_trl = true;
348 int krflags;
349
350 /* peek the first few bytes to determine length of data to receive
351 * so we don't consume any subsequent CLC message or payload data
352 * in the TCP byte stream
353 */
354 /*
355 * Caller must make sure that buflen is no less than
356 * sizeof(struct smc_clc_msg_hdr)
357 */
358 krflags = MSG_PEEK | MSG_WAITALL;
359 clc_sk->sk_rcvtimeo = timeout;
360 iov_iter_kvec(&msg.msg_iter, READ, &vec, 1,
361 sizeof(struct smc_clc_msg_hdr));
362 len = sock_recvmsg(smc->clcsock, &msg, krflags);
363 if (signal_pending(current)) {
364 reason_code = -EINTR;
365 clc_sk->sk_err = EINTR;
366 smc->sk.sk_err = EINTR;
367 goto out;
368 }
369 if (clc_sk->sk_err) {
370 reason_code = -clc_sk->sk_err;
371 if (clc_sk->sk_err == EAGAIN &&
372 expected_type == SMC_CLC_DECLINE)
373 clc_sk->sk_err = 0; /* reset for fallback usage */
374 else
375 smc->sk.sk_err = clc_sk->sk_err;
376 goto out;
377 }
378 if (!len) { /* peer has performed orderly shutdown */
379 smc->sk.sk_err = ECONNRESET;
380 reason_code = -ECONNRESET;
381 goto out;
382 }
383 if (len < 0) {
384 if (len != -EAGAIN || expected_type != SMC_CLC_DECLINE)
385 smc->sk.sk_err = -len;
386 reason_code = len;
387 goto out;
388 }
389 datlen = ntohs(clcm->length);
390 if ((len < sizeof(struct smc_clc_msg_hdr)) ||
391 (clcm->version < SMC_V1) ||
392 ((clcm->type != SMC_CLC_DECLINE) &&
393 (clcm->type != expected_type))) {
394 smc->sk.sk_err = EPROTO;
395 reason_code = -EPROTO;
396 goto out;
397 }
398
399 /* receive the complete CLC message */
400 memset(&msg, 0, sizeof(struct msghdr));
401 if (datlen > buflen) {
402 check_trl = false;
403 recvlen = buflen;
404 } else {
405 recvlen = datlen;
406 }
407 iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
408 krflags = MSG_WAITALL;
409 len = sock_recvmsg(smc->clcsock, &msg, krflags);
410 if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) {
411 smc->sk.sk_err = EPROTO;
412 reason_code = -EPROTO;
413 goto out;
414 }
415 datlen -= len;
416 while (datlen) {
417 u8 tmp[SMC_CLC_RECV_BUF_LEN];
418
419 vec.iov_base = &tmp;
420 vec.iov_len = SMC_CLC_RECV_BUF_LEN;
421 /* receive remaining proposal message */
422 recvlen = datlen > SMC_CLC_RECV_BUF_LEN ?
423 SMC_CLC_RECV_BUF_LEN : datlen;
424 iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
425 len = sock_recvmsg(smc->clcsock, &msg, krflags);
426 datlen -= len;
427 }
428 if (clcm->type == SMC_CLC_DECLINE) {
429 struct smc_clc_msg_decline *dclc;
430
431 dclc = (struct smc_clc_msg_decline *)clcm;
432 reason_code = SMC_CLC_DECL_PEERDECL;
433 smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
434 if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
435 SMC_FIRST_CONTACT_MASK) {
436 smc->conn.lgr->sync_err = 1;
437 smc_lgr_terminate_sched(smc->conn.lgr);
438 }
439 }
440
441out:
442 clc_sk->sk_rcvtimeo = rcvtimeo;
443 return reason_code;
444}
445
446/* send CLC DECLINE message across internal TCP socket */
447int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
448{
449 struct smc_clc_msg_decline dclc;
450 struct msghdr msg;
451 struct kvec vec;
452 int len;
453
454 memset(&dclc, 0, sizeof(dclc));
455 memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
456 dclc.hdr.type = SMC_CLC_DECLINE;
457 dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
458 dclc.hdr.version = version;
459 dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
460 dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
461 SMC_FIRST_CONTACT_MASK : 0;
462 if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) &&
463 smc_ib_is_valid_local_systemid())
464 memcpy(dclc.id_for_peer, local_systemid,
465 sizeof(local_systemid));
466 dclc.peer_diagnosis = htonl(peer_diag_info);
467 memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
468
469 memset(&msg, 0, sizeof(msg));
470 vec.iov_base = &dclc;
471 vec.iov_len = sizeof(struct smc_clc_msg_decline);
472 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
473 sizeof(struct smc_clc_msg_decline));
474 if (len < 0 || len < sizeof(struct smc_clc_msg_decline))
475 len = -EPROTO;
476 return len > 0 ? 0 : len;
477}
478
479/* send CLC PROPOSAL message across internal TCP socket */
480int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
481{
482 struct smc_clc_smcd_v2_extension *smcd_v2_ext;
483 struct smc_clc_msg_proposal_prefix *pclc_prfx;
484 struct smc_clc_msg_proposal *pclc_base;
485 struct smc_clc_smcd_gid_chid *gidchids;
486 struct smc_clc_msg_proposal_area *pclc;
487 struct smc_clc_ipv6_prefix *ipv6_prfx;
488 struct smc_clc_v2_extension *v2_ext;
489 struct smc_clc_msg_smcd *pclc_smcd;
490 struct smc_clc_msg_trail *trl;
491 int len, i, plen, rc;
492 int reason_code = 0;
493 struct kvec vec[8];
494 struct msghdr msg;
495
496 pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
497 if (!pclc)
498 return -ENOMEM;
499
500 pclc_base = &pclc->pclc_base;
501 pclc_smcd = &pclc->pclc_smcd;
502 pclc_prfx = &pclc->pclc_prfx;
503 ipv6_prfx = pclc->pclc_prfx_ipv6;
504 v2_ext = &pclc->pclc_v2_ext;
505 smcd_v2_ext = &pclc->pclc_smcd_v2_ext;
506 gidchids = pclc->pclc_gidchids;
507 trl = &pclc->pclc_trl;
508
509 pclc_base->hdr.version = SMC_V2;
510 pclc_base->hdr.typev1 = ini->smc_type_v1;
511 pclc_base->hdr.typev2 = ini->smc_type_v2;
512 plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl);
513
514 /* retrieve ip prefixes for CLC proposal msg */
515 if (ini->smc_type_v1 != SMC_TYPE_N) {
516 rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
517 if (rc) {
518 if (ini->smc_type_v2 == SMC_TYPE_N) {
519 kfree(pclc);
520 return SMC_CLC_DECL_CNFERR;
521 }
522 pclc_base->hdr.typev1 = SMC_TYPE_N;
523 } else {
524 pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
525 plen += sizeof(*pclc_prfx) +
526 pclc_prfx->ipv6_prefixes_cnt *
527 sizeof(ipv6_prfx[0]);
528 }
529 }
530
531 /* build SMC Proposal CLC message */
532 memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
533 sizeof(SMC_EYECATCHER));
534 pclc_base->hdr.type = SMC_CLC_PROPOSAL;
535 if (smcr_indicated(ini->smc_type_v1)) {
536 /* add SMC-R specifics */
537 memcpy(pclc_base->lcl.id_for_peer, local_systemid,
538 sizeof(local_systemid));
539 memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
540 memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
541 ETH_ALEN);
542 }
543 if (smcd_indicated(ini->smc_type_v1)) {
544 /* add SMC-D specifics */
545 if (ini->ism_dev[0]) {
546 pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid);
547 pclc_smcd->ism.chid =
548 htons(smc_ism_get_chid(ini->ism_dev[0]));
549 }
550 }
551 if (ini->smc_type_v2 == SMC_TYPE_N) {
552 pclc_smcd->v2_ext_offset = 0;
553 } else {
554 u16 v2_ext_offset;
555 u8 *eid = NULL;
556
557 v2_ext_offset = sizeof(*pclc_smcd) -
558 offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
559 if (ini->smc_type_v1 != SMC_TYPE_N)
560 v2_ext_offset += sizeof(*pclc_prfx) +
561 pclc_prfx->ipv6_prefixes_cnt *
562 sizeof(ipv6_prfx[0]);
563 pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
564 v2_ext->hdr.eid_cnt = 0;
565 v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
566 v2_ext->hdr.flag.release = SMC_RELEASE;
567 v2_ext->hdr.flag.seid = 1;
568 v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
569 offsetofend(struct smc_clnt_opts_area_hdr,
570 smcd_v2_ext_offset) +
571 v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
572 if (ini->ism_dev[0])
573 smc_ism_get_system_eid(ini->ism_dev[0], &eid);
574 else
575 smc_ism_get_system_eid(ini->ism_dev[1], &eid);
576 if (eid)
577 memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
578 plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext);
579 if (ini->ism_offered_cnt) {
580 for (i = 1; i <= ini->ism_offered_cnt; i++) {
581 gidchids[i - 1].gid =
582 htonll(ini->ism_dev[i]->local_gid);
583 gidchids[i - 1].chid =
584 htons(smc_ism_get_chid(ini->ism_dev[i]));
585 }
586 plen += ini->ism_offered_cnt *
587 sizeof(struct smc_clc_smcd_gid_chid);
588 }
589 }
590 pclc_base->hdr.length = htons(plen);
591 memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
592
593 /* send SMC Proposal CLC message */
594 memset(&msg, 0, sizeof(msg));
595 i = 0;
596 vec[i].iov_base = pclc_base;
597 vec[i++].iov_len = sizeof(*pclc_base);
598 vec[i].iov_base = pclc_smcd;
599 vec[i++].iov_len = sizeof(*pclc_smcd);
600 if (ini->smc_type_v1 != SMC_TYPE_N) {
601 vec[i].iov_base = pclc_prfx;
602 vec[i++].iov_len = sizeof(*pclc_prfx);
603 if (pclc_prfx->ipv6_prefixes_cnt > 0) {
604 vec[i].iov_base = ipv6_prfx;
605 vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
606 sizeof(ipv6_prfx[0]);
607 }
608 }
609 if (ini->smc_type_v2 != SMC_TYPE_N) {
610 vec[i].iov_base = v2_ext;
611 vec[i++].iov_len = sizeof(*v2_ext);
612 vec[i].iov_base = smcd_v2_ext;
613 vec[i++].iov_len = sizeof(*smcd_v2_ext);
614 if (ini->ism_offered_cnt) {
615 vec[i].iov_base = gidchids;
616 vec[i++].iov_len = ini->ism_offered_cnt *
617 sizeof(struct smc_clc_smcd_gid_chid);
618 }
619 }
620 vec[i].iov_base = trl;
621 vec[i++].iov_len = sizeof(*trl);
622 /* due to the few bytes needed for clc-handshake this cannot block */
623 len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
624 if (len < 0) {
625 smc->sk.sk_err = smc->clcsock->sk->sk_err;
626 reason_code = -smc->sk.sk_err;
627 } else if (len < ntohs(pclc_base->hdr.length)) {
628 reason_code = -ENETUNREACH;
629 smc->sk.sk_err = -reason_code;
630 }
631
632 kfree(pclc);
633 return reason_code;
634}
635
636/* build and send CLC CONFIRM / ACCEPT message */
637static int smc_clc_send_confirm_accept(struct smc_sock *smc,
638 struct smc_clc_msg_accept_confirm_v2 *clc_v2,
639 int first_contact, u8 version)
640{
641 struct smc_connection *conn = &smc->conn;
642 struct smc_clc_msg_accept_confirm *clc;
643 struct smc_clc_first_contact_ext fce;
644 struct smc_clc_msg_trail trl;
645 struct kvec vec[3];
646 struct msghdr msg;
647 int i, len;
648
649 /* send SMC Confirm CLC msg */
650 clc = (struct smc_clc_msg_accept_confirm *)clc_v2;
651 clc->hdr.version = version; /* SMC version */
652 if (first_contact)
653 clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK;
654 if (conn->lgr->is_smcd) {
655 /* SMC-D specific settings */
656 memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
657 sizeof(SMCD_EYECATCHER));
658 clc->hdr.typev1 = SMC_TYPE_D;
659 clc->d0.gid = conn->lgr->smcd->local_gid;
660 clc->d0.token = conn->rmb_desc->token;
661 clc->d0.dmbe_size = conn->rmbe_size_short;
662 clc->d0.dmbe_idx = 0;
663 memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
664 if (version == SMC_V1) {
665 clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
666 } else {
667 u8 *eid = NULL;
668
669 clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd));
670 smc_ism_get_system_eid(conn->lgr->smcd, &eid);
671 if (eid)
672 memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN);
673 len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
674 if (first_contact)
675 smc_clc_fill_fce(&fce, &len);
676 clc_v2->hdr.length = htons(len);
677 }
678 memcpy(trl.eyecatcher, SMCD_EYECATCHER,
679 sizeof(SMCD_EYECATCHER));
680 } else {
681 struct smc_link *link = conn->lnk;
682
683 /* SMC-R specific settings */
684 link = conn->lnk;
685 memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
686 sizeof(SMC_EYECATCHER));
687 clc->hdr.typev1 = SMC_TYPE_R;
688 clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
689 memcpy(clc->r0.lcl.id_for_peer, local_systemid,
690 sizeof(local_systemid));
691 memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
692 memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
693 ETH_ALEN);
694 hton24(clc->r0.qpn, link->roce_qp->qp_num);
695 clc->r0.rmb_rkey =
696 htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey);
697 clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
698 clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
699 switch (clc->hdr.type) {
700 case SMC_CLC_ACCEPT:
701 clc->r0.qp_mtu = link->path_mtu;
702 break;
703 case SMC_CLC_CONFIRM:
704 clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
705 break;
706 }
707 clc->r0.rmbe_size = conn->rmbe_size_short;
708 clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
709 (conn->rmb_desc->sgt[link->link_idx].sgl));
710 hton24(clc->r0.psn, link->psn_initial);
711 memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
712 }
713
714 memset(&msg, 0, sizeof(msg));
715 i = 0;
716 vec[i].iov_base = clc_v2;
717 if (version > SMC_V1)
718 vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl);
719 else
720 vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
721 SMCD_CLC_ACCEPT_CONFIRM_LEN :
722 SMCR_CLC_ACCEPT_CONFIRM_LEN) -
723 sizeof(trl);
724 if (version > SMC_V1 && first_contact) {
725 vec[i].iov_base = &fce;
726 vec[i++].iov_len = sizeof(fce);
727 }
728 vec[i].iov_base = &trl;
729 vec[i++].iov_len = sizeof(trl);
730 return kernel_sendmsg(smc->clcsock, &msg, vec, 1,
731 ntohs(clc->hdr.length));
732}
733
734/* send CLC CONFIRM message across internal TCP socket */
735int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
736 u8 version)
737{
738 struct smc_clc_msg_accept_confirm_v2 cclc_v2;
739 int reason_code = 0;
740 int len;
741
742 /* send SMC Confirm CLC msg */
743 memset(&cclc_v2, 0, sizeof(cclc_v2));
744 cclc_v2.hdr.type = SMC_CLC_CONFIRM;
745 len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
746 version);
747 if (len < ntohs(cclc_v2.hdr.length)) {
748 if (len >= 0) {
749 reason_code = -ENETUNREACH;
750 smc->sk.sk_err = -reason_code;
751 } else {
752 smc->sk.sk_err = smc->clcsock->sk->sk_err;
753 reason_code = -smc->sk.sk_err;
754 }
755 }
756 return reason_code;
757}
758
759/* send CLC ACCEPT message across internal TCP socket */
760int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
761 u8 version)
762{
763 struct smc_clc_msg_accept_confirm_v2 aclc_v2;
764 int len;
765
766 memset(&aclc_v2, 0, sizeof(aclc_v2));
767 aclc_v2.hdr.type = SMC_CLC_ACCEPT;
768 len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
769 version);
770 if (len < ntohs(aclc_v2.hdr.length))
771 len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
772
773 return len > 0 ? 0 : len;
774}
775
776void __init smc_clc_init(void)
777{
778 struct new_utsname *u;
779
780 memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */
781 u = utsname();
782 memcpy(smc_hostname, u->nodename,
783 min_t(size_t, strlen(u->nodename), sizeof(smc_hostname)));
784}
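The functions above implement both directions of the CLC handshake; on the client side af_smc.c drives them in the order proposal, accept, confirm. A hedged sketch of that ordering, with buffer sizing and error handling trimmed; example_clc_client_handshake() is illustrative and not a function in this file:

/* illustrative sketch of the client-side CLC ordering */
static int example_clc_client_handshake(struct smc_sock *smc,
					struct smc_init_info *ini)
{
	struct smc_clc_msg_accept_confirm_v2 aclc;
	int rc;

	rc = smc_clc_send_proposal(smc, ini);		/* propose SMC */
	if (rc)
		return rc;
	rc = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),	/* wait for accept */
			      SMC_CLC_ACCEPT, CLC_WAIT_TIME);
	if (rc)
		return rc;
	/* ... set up link group / buffers based on the accept ... */
	return smc_clc_send_confirm(smc, false, aclc.hdr.version);
}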
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000..c579d1d59
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,333 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * CLC (connection layer control) handshake over initial TCP socket to
6 * prepare for RDMA traffic
7 *
8 * Copyright IBM Corp. 2016
9 *
10 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#ifndef _SMC_CLC_H
14#define _SMC_CLC_H
15
16#include <rdma/ib_verbs.h>
17
18#include "smc.h"
19
20#define SMC_CLC_PROPOSAL 0x01
21#define SMC_CLC_ACCEPT 0x02
22#define SMC_CLC_CONFIRM 0x03
23#define SMC_CLC_DECLINE 0x04
24
25#define SMC_TYPE_R 0 /* SMC-R only */
26#define SMC_TYPE_D 1 /* SMC-D only */
27#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
28#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
29#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
30#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
31#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
32#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
33#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
34#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
35#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
36#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
37#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
38#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
39#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
40#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
41#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
42#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
43#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
44#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
45#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
46#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */
47#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */
48#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */
49#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */
50#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
51#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
52#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
53#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
54#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
55#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */
56
57#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
58
59struct smc_clc_msg_hdr { /* header1 of clc messages */
60 u8 eyecatcher[4]; /* eye catcher */
61 u8 type; /* proposal / accept / confirm / decline */
62 __be16 length;
63#if defined(__BIG_ENDIAN_BITFIELD)
64 u8 version : 4,
65 typev2 : 2,
66 typev1 : 2;
67#elif defined(__LITTLE_ENDIAN_BITFIELD)
68 u8 typev1 : 2,
69 typev2 : 2,
70 version : 4;
71#endif
72} __packed; /* format defined in RFC7609 */
73
74struct smc_clc_msg_trail { /* trailer of clc messages */
75 u8 eyecatcher[4];
76};
77
78struct smc_clc_msg_local { /* header2 of clc messages */
79 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
80 u8 gid[16]; /* gid of ib_device port */
81 u8 mac[6]; /* mac of ib_device port */
82};
83
84/* Struct would be 4 byte aligned, but it is used in an array that is sent
85 * to peers and must conform to RFC7609, hence we need to use packed here.
86 */
87struct smc_clc_ipv6_prefix {
88 struct in6_addr prefix;
89 u8 prefix_len;
90} __packed; /* format defined in RFC7609 */
91
92#if defined(__BIG_ENDIAN_BITFIELD)
93struct smc_clc_v2_flag {
94 u8 release : 4,
95 rsvd : 3,
96 seid : 1;
97};
98#elif defined(__LITTLE_ENDIAN_BITFIELD)
99struct smc_clc_v2_flag {
100 u8 seid : 1,
101 rsvd : 3,
102 release : 4;
103};
104#endif
105
106struct smc_clnt_opts_area_hdr {
107 u8 eid_cnt; /* number of user defined EIDs */
108 u8 ism_gid_cnt; /* number of ISMv2 GIDs */
109 u8 reserved1;
110 struct smc_clc_v2_flag flag;
111 u8 reserved2[2];
112 __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
113};
114
115struct smc_clc_smcd_gid_chid {
116 __be64 gid; /* ISM GID */
117 __be16 chid; /* ISMv2 CHID */
118} __packed; /* format defined in
119 * IBM Shared Memory Communications Version 2
120 * (https://www.ibm.com/support/pages/node/6326337)
121 */
122
123struct smc_clc_v2_extension {
124 struct smc_clnt_opts_area_hdr hdr;
125 u8 roce[16]; /* RoCEv2 GID */
126 u8 reserved[16];
127 u8 user_eids[][SMC_MAX_EID_LEN];
128};
129
130struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
131 __be32 outgoing_subnet; /* subnet mask */
132 u8 prefix_len; /* number of significant bits in mask */
133 u8 reserved[2];
134 u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
135} __aligned(4);
136
137struct smc_clc_msg_smcd { /* SMC-D GID information */
138 struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
139 __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
140 u8 reserved[28];
141};
142
143struct smc_clc_smcd_v2_extension {
144 u8 system_eid[SMC_MAX_EID_LEN];
145 u8 reserved[16];
146 struct smc_clc_smcd_gid_chid gidchid[];
147};
148
149struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
150 struct smc_clc_msg_hdr hdr;
151 struct smc_clc_msg_local lcl;
152 __be16 iparea_offset; /* offset to IP address information area */
153} __aligned(4);
154
155#define SMC_CLC_MAX_V6_PREFIX 8
156
157struct smc_clc_msg_proposal_area {
158 struct smc_clc_msg_proposal pclc_base;
159 struct smc_clc_msg_smcd pclc_smcd;
160 struct smc_clc_msg_proposal_prefix pclc_prfx;
161 struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
162 struct smc_clc_v2_extension pclc_v2_ext;
163 struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
164 struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS];
165 struct smc_clc_msg_trail pclc_trl;
166};
167
168struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
169 struct smc_clc_msg_local lcl;
170 u8 qpn[3]; /* QP number */
171 __be32 rmb_rkey; /* RMB rkey */
172 u8 rmbe_idx; /* Index of RMBE in RMB */
173 __be32 rmbe_alert_token; /* unique connection id */
174 #if defined(__BIG_ENDIAN_BITFIELD)
175 u8 rmbe_size : 4, /* buf size (compressed) */
176 qp_mtu : 4; /* QP mtu */
177#elif defined(__LITTLE_ENDIAN_BITFIELD)
178 u8 qp_mtu : 4,
179 rmbe_size : 4;
180#endif
181 u8 reserved;
182 __be64 rmb_dma_addr; /* RMB virtual address */
183 u8 reserved2;
184 u8 psn[3]; /* packet sequence number */
185} __packed;
186
187struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
188 u64 gid; /* Sender GID */
189 u64 token; /* DMB token */
190 u8 dmbe_idx; /* DMBE index */
191#if defined(__BIG_ENDIAN_BITFIELD)
192 u8 dmbe_size : 4, /* buf size (compressed) */
193 reserved3 : 4;
194#elif defined(__LITTLE_ENDIAN_BITFIELD)
195 u8 reserved3 : 4,
196 dmbe_size : 4;
197#endif
198 u16 reserved4;
199 __be32 linkid; /* Link identifier */
200} __packed;
201
202#define SMC_CLC_OS_ZOS 1
203#define SMC_CLC_OS_LINUX 2
204#define SMC_CLC_OS_AIX 3
205
206struct smc_clc_first_contact_ext {
207 u8 reserved1;
208#if defined(__BIG_ENDIAN_BITFIELD)
209 u8 os_type : 4,
210 release : 4;
211#elif defined(__LITTLE_ENDIAN_BITFIELD)
212 u8 release : 4,
213 os_type : 4;
214#endif
215 u8 reserved2[2];
216 u8 hostname[SMC_MAX_HOSTNAME_LEN];
217};
218
219struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
220 struct smc_clc_msg_hdr hdr;
221 union {
222 struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
223 struct { /* SMC-D */
224 struct smcd_clc_msg_accept_confirm_common d0;
225 u32 reserved5[3];
226 };
227 };
228} __packed; /* format defined in RFC7609 */
229
230struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */
231 struct smc_clc_msg_hdr hdr;
232 union {
233 struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
234 struct { /* SMC-D */
235 struct smcd_clc_msg_accept_confirm_common d0;
236 __be16 chid;
237 u8 eid[SMC_MAX_EID_LEN];
238 u8 reserved5[8];
239 };
240 };
241};
242
243struct smc_clc_msg_decline { /* clc decline message */
244 struct smc_clc_msg_hdr hdr;
245 u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
246 __be32 peer_diagnosis; /* diagnosis information */
247#if defined(__BIG_ENDIAN_BITFIELD)
248 u8 os_type : 4,
249 reserved : 4;
250#elif defined(__LITTLE_ENDIAN_BITFIELD)
251 u8 reserved : 4,
252 os_type : 4;
253#endif
254 u8 reserved2[3];
255 struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
256} __aligned(4);
257
258/* determine start of the prefix area within the proposal message */
259static inline struct smc_clc_msg_proposal_prefix *
260smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
261{
262 return (struct smc_clc_msg_proposal_prefix *)
263 ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
264}
265
266static inline bool smcr_indicated(int smc_type)
267{
268 return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B;
269}
270
271static inline bool smcd_indicated(int smc_type)
272{
273 return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
274}
275
276/* get SMC-D info from proposal message */
277static inline struct smc_clc_msg_smcd *
278smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
279{
280 if (smcd_indicated(prop->hdr.typev1) &&
281 ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
282 return NULL;
283
284 return (struct smc_clc_msg_smcd *)(prop + 1);
285}
286
287static inline struct smc_clc_v2_extension *
288smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
289{
290 struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);
291
292 if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset))
293 return NULL;
294
295 return (struct smc_clc_v2_extension *)
296 ((u8 *)prop_smcd +
297 offsetof(struct smc_clc_msg_smcd, v2_ext_offset) +
298 sizeof(prop_smcd->v2_ext_offset) +
299 ntohs(prop_smcd->v2_ext_offset));
300}
301
302static inline struct smc_clc_smcd_v2_extension *
303smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
304{
305 if (!prop_v2ext)
306 return NULL;
307 if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset))
308 return NULL;
309
310 return (struct smc_clc_smcd_v2_extension *)
311 ((u8 *)prop_v2ext +
312 offsetof(struct smc_clc_v2_extension, hdr) +
313 offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) +
314 sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) +
315 ntohs(prop_v2ext->hdr.smcd_v2_ext_offset));
316}
317
318struct smcd_dev;
319struct smc_init_info;
320
321int smc_clc_prfx_match(struct socket *clcsock,
322 struct smc_clc_msg_proposal_prefix *prop);
323int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
324 u8 expected_type, unsigned long timeout);
325int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
326int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
327int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
328 u8 version);
329int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
330 u8 version);
331void smc_clc_init(void) __init;
332
333#endif
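Putting the helpers above together: a server that received a proposal typically checks the indicated type and, for SMC-R, matches the proposed IP prefixes against its own device. A hedged sketch using only helpers declared in this header; example_check_proposal() is illustrative:

/* illustrative sketch of basic proposal checking on the server side */
static int example_check_proposal(struct socket *clcsock,
				  struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *prfx;

	if (smcr_indicated(pclc->hdr.typev1)) {
		prfx = smc_clc_proposal_get_prefix(pclc);
		return smc_clc_prfx_match(clcsock, prfx);
	}
	if (smcd_indicated(pclc->hdr.typev1) && !smc_get_clc_msg_smcd(pclc))
		return -EPROTO;	/* SMC-D info not at the expected offset */
	return 0;
}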
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000..84102db5b
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,499 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Socket Closing - normal and abnormal
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/workqueue.h>
13#include <linux/sched/signal.h>
14
15#include <net/sock.h>
16#include <net/tcp.h>
17
18#include "smc.h"
19#include "smc_tx.h"
20#include "smc_cdc.h"
21#include "smc_close.h"
22
23/* release the clcsock that is assigned to the smc_sock */
24void smc_clcsock_release(struct smc_sock *smc)
25{
26 struct socket *tcp;
27
28 if (smc->listen_smc && current_work() != &smc->smc_listen_work)
29 cancel_work_sync(&smc->smc_listen_work);
30 mutex_lock(&smc->clcsock_release_lock);
31 if (smc->clcsock) {
32 tcp = smc->clcsock;
33 smc->clcsock = NULL;
34 sock_release(tcp);
35 }
36 mutex_unlock(&smc->clcsock_release_lock);
37}
38
39static void smc_close_cleanup_listen(struct sock *parent)
40{
41 struct sock *sk;
42
43 /* Close non-accepted connections */
44 while ((sk = smc_accept_dequeue(parent, NULL)))
45 smc_close_non_accepted(sk);
46}
47
48/* wait for sndbuf data being transmitted */
49static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
50{
51 DEFINE_WAIT_FUNC(wait, woken_wake_function);
52 struct sock *sk = &smc->sk;
53
54 if (!timeout)
55 return;
56
57 if (!smc_tx_prepared_sends(&smc->conn))
58 return;
59
60 smc->wait_close_tx_prepared = 1;
61 add_wait_queue(sk_sleep(sk), &wait);
62 while (!signal_pending(current) && timeout) {
63 int rc;
64
65 rc = sk_wait_event(sk, &timeout,
66 !smc_tx_prepared_sends(&smc->conn) ||
67 sk->sk_err == ECONNABORTED ||
68 sk->sk_err == ECONNRESET ||
69 smc->conn.killed,
70 &wait);
71 if (rc)
72 break;
73 }
74 remove_wait_queue(sk_sleep(sk), &wait);
75 smc->wait_close_tx_prepared = 0;
76}
77
78void smc_close_wake_tx_prepared(struct smc_sock *smc)
79{
80 if (smc->wait_close_tx_prepared)
81 /* wake up socket closing */
82 smc->sk.sk_state_change(&smc->sk);
83}
84
85static int smc_close_wr(struct smc_connection *conn)
86{
87 conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
88
89 return smc_cdc_get_slot_and_msg_send(conn);
90}
91
92static int smc_close_final(struct smc_connection *conn)
93{
94 if (atomic_read(&conn->bytes_to_rcv))
95 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
96 else
97 conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
98 if (conn->killed)
99 return -EPIPE;
100
101 return smc_cdc_get_slot_and_msg_send(conn);
102}
103
104int smc_close_abort(struct smc_connection *conn)
105{
106 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
107
108 return smc_cdc_get_slot_and_msg_send(conn);
109}
110
111static void smc_close_cancel_work(struct smc_sock *smc)
112{
113 struct sock *sk = &smc->sk;
114
115 release_sock(sk);
116 cancel_work_sync(&smc->conn.close_work);
117 cancel_delayed_work_sync(&smc->conn.tx_work);
118 lock_sock(sk);
119}
120
121/* terminate smc socket abnormally - active abort
122 * link group is terminated, i.e. RDMA communication no longer possible
123 */
124void smc_close_active_abort(struct smc_sock *smc)
125{
126 struct sock *sk = &smc->sk;
127 bool release_clcsock = false;
128
129 if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) {
130 sk->sk_err = ECONNABORTED;
131 if (smc->clcsock && smc->clcsock->sk)
132 tcp_abort(smc->clcsock->sk, ECONNABORTED);
133 }
134 switch (sk->sk_state) {
135 case SMC_ACTIVE:
136 case SMC_APPCLOSEWAIT1:
137 case SMC_APPCLOSEWAIT2:
138 sk->sk_state = SMC_PEERABORTWAIT;
139 smc_close_cancel_work(smc);
140 if (sk->sk_state != SMC_PEERABORTWAIT)
141 break;
142 sk->sk_state = SMC_CLOSED;
143 sock_put(sk); /* (postponed) passive closing */
144 break;
145 case SMC_PEERCLOSEWAIT1:
146 case SMC_PEERCLOSEWAIT2:
147 case SMC_PEERFINCLOSEWAIT:
148 sk->sk_state = SMC_PEERABORTWAIT;
149 smc_close_cancel_work(smc);
150 if (sk->sk_state != SMC_PEERABORTWAIT)
151 break;
152 sk->sk_state = SMC_CLOSED;
153 smc_conn_free(&smc->conn);
154 release_clcsock = true;
155 sock_put(sk); /* passive closing */
156 break;
157 case SMC_PROCESSABORT:
158 case SMC_APPFINCLOSEWAIT:
159 sk->sk_state = SMC_PEERABORTWAIT;
160 smc_close_cancel_work(smc);
161 if (sk->sk_state != SMC_PEERABORTWAIT)
162 break;
163 sk->sk_state = SMC_CLOSED;
164 smc_conn_free(&smc->conn);
165 release_clcsock = true;
166 break;
167 case SMC_INIT:
168 case SMC_PEERABORTWAIT:
169 case SMC_CLOSED:
170 break;
171 }
172
173 sock_set_flag(sk, SOCK_DEAD);
174 sk->sk_state_change(sk);
175
176 if (release_clcsock) {
177 release_sock(sk);
178 smc_clcsock_release(smc);
179 lock_sock(sk);
180 }
181}
182
183static inline bool smc_close_sent_any_close(struct smc_connection *conn)
184{
185 return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort ||
186 conn->local_tx_ctrl.conn_state_flags.peer_conn_closed;
187}
188
189int smc_close_active(struct smc_sock *smc)
190{
191 struct smc_cdc_conn_state_flags *txflags =
192 &smc->conn.local_tx_ctrl.conn_state_flags;
193 struct smc_connection *conn = &smc->conn;
194 struct sock *sk = &smc->sk;
195 int old_state;
196 long timeout;
197 int rc = 0;
198 int rc1 = 0;
199
200 timeout = current->flags & PF_EXITING ?
201 0 : sock_flag(sk, SOCK_LINGER) ?
202 sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
203
204 old_state = sk->sk_state;
205again:
206 switch (sk->sk_state) {
207 case SMC_INIT:
208 sk->sk_state = SMC_CLOSED;
209 break;
210 case SMC_LISTEN:
211 sk->sk_state = SMC_CLOSED;
212 sk->sk_state_change(sk); /* wake up accept */
213 if (smc->clcsock && smc->clcsock->sk) {
214 smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
215 smc->clcsock->sk->sk_user_data = NULL;
216 rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
217 }
218 smc_close_cleanup_listen(sk);
219 release_sock(sk);
220 flush_work(&smc->tcp_listen_work);
221 lock_sock(sk);
222 break;
223 case SMC_ACTIVE:
224 smc_close_stream_wait(smc, timeout);
225 release_sock(sk);
226 cancel_delayed_work_sync(&conn->tx_work);
227 lock_sock(sk);
228 if (sk->sk_state == SMC_ACTIVE) {
229 /* send close request */
230 rc = smc_close_final(conn);
231 sk->sk_state = SMC_PEERCLOSEWAIT1;
232
233 /* actively shut down the clcsock before the peer closes it,
234 * to prevent the peer from entering TIME_WAIT state.
235 */
236 if (smc->clcsock && smc->clcsock->sk) {
237 rc1 = kernel_sock_shutdown(smc->clcsock,
238 SHUT_RDWR);
239 rc = rc ? rc : rc1;
240 }
241 } else {
242 /* peer event has changed the state */
243 goto again;
244 }
245 break;
246 case SMC_APPFINCLOSEWAIT:
247 /* socket already shutdown wr or both (active close) */
248 if (txflags->peer_done_writing &&
249 !smc_close_sent_any_close(conn)) {
250 /* just shutdown wr done, send close request */
251 rc = smc_close_final(conn);
252 }
253 sk->sk_state = SMC_CLOSED;
254 break;
255 case SMC_APPCLOSEWAIT1:
256 case SMC_APPCLOSEWAIT2:
257 if (!smc_cdc_rxed_any_close(conn))
258 smc_close_stream_wait(smc, timeout);
259 release_sock(sk);
260 cancel_delayed_work_sync(&conn->tx_work);
261 lock_sock(sk);
262 if (sk->sk_state != SMC_APPCLOSEWAIT1 &&
263 sk->sk_state != SMC_APPCLOSEWAIT2)
264 goto again;
265 /* confirm close from peer */
266 rc = smc_close_final(conn);
267 if (smc_cdc_rxed_any_close(conn)) {
268 /* peer has closed the socket already */
269 sk->sk_state = SMC_CLOSED;
270 sock_put(sk); /* postponed passive closing */
271 } else {
272 /* peer has just issued a shutdown write */
273 sk->sk_state = SMC_PEERFINCLOSEWAIT;
274 }
275 break;
276 case SMC_PEERCLOSEWAIT1:
277 case SMC_PEERCLOSEWAIT2:
278 if (txflags->peer_done_writing &&
279 !smc_close_sent_any_close(conn)) {
280 /* just shutdown wr done, send close request */
281 rc = smc_close_final(conn);
282 }
283 /* peer sending PeerConnectionClosed will cause transition */
284 break;
285 case SMC_PEERFINCLOSEWAIT:
286 /* peer sending PeerConnectionClosed will cause transition */
287 break;
288 case SMC_PROCESSABORT:
289 rc = smc_close_abort(conn);
290 sk->sk_state = SMC_CLOSED;
291 break;
292 case SMC_PEERABORTWAIT:
293 sk->sk_state = SMC_CLOSED;
294 break;
295 case SMC_CLOSED:
296 /* nothing to do, add tracing in future patch */
297 break;
298 }
299
300 if (old_state != sk->sk_state)
301 sk->sk_state_change(sk);
302 return rc;
303}
304
305static void smc_close_passive_abort_received(struct smc_sock *smc)
306{
307 struct smc_cdc_conn_state_flags *txflags =
308 &smc->conn.local_tx_ctrl.conn_state_flags;
309 struct sock *sk = &smc->sk;
310
311 switch (sk->sk_state) {
312 case SMC_INIT:
313 case SMC_ACTIVE:
314 case SMC_APPCLOSEWAIT1:
315 sk->sk_state = SMC_PROCESSABORT;
316 sock_put(sk); /* passive closing */
317 break;
318 case SMC_APPFINCLOSEWAIT:
319 sk->sk_state = SMC_PROCESSABORT;
320 break;
321 case SMC_PEERCLOSEWAIT1:
322 case SMC_PEERCLOSEWAIT2:
323 if (txflags->peer_done_writing &&
324 !smc_close_sent_any_close(&smc->conn))
325 /* just shutdown, but not yet closed locally */
326 sk->sk_state = SMC_PROCESSABORT;
327 else
328 sk->sk_state = SMC_CLOSED;
329 sock_put(sk); /* passive closing */
330 break;
331 case SMC_APPCLOSEWAIT2:
332 case SMC_PEERFINCLOSEWAIT:
333 sk->sk_state = SMC_CLOSED;
334 sock_put(sk); /* passive closing */
335 break;
336 case SMC_PEERABORTWAIT:
337 sk->sk_state = SMC_CLOSED;
338 break;
339 case SMC_PROCESSABORT:
340 /* nothing to do, add tracing in future patch */
341 break;
342 }
343}
344
345/* Either some kind of closing has been received: peer_conn_closed,
346 * peer_conn_abort, or peer_done_writing
347 * or the link group of the connection terminates abnormally.
348 */
349static void smc_close_passive_work(struct work_struct *work)
350{
351 struct smc_connection *conn = container_of(work,
352 struct smc_connection,
353 close_work);
354 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
355 struct smc_cdc_conn_state_flags *rxflags;
356 bool release_clcsock = false;
357 struct sock *sk = &smc->sk;
358 int old_state;
359
360 lock_sock(sk);
361 old_state = sk->sk_state;
362
363 rxflags = &conn->local_rx_ctrl.conn_state_flags;
364 if (rxflags->peer_conn_abort) {
365 /* peer has not received all data */
366 smc_close_passive_abort_received(smc);
367 release_sock(&smc->sk);
368 cancel_delayed_work_sync(&conn->tx_work);
369 lock_sock(&smc->sk);
370 goto wakeup;
371 }
372
373 switch (sk->sk_state) {
374 case SMC_INIT:
375 sk->sk_state = SMC_APPCLOSEWAIT1;
376 break;
377 case SMC_ACTIVE:
378 sk->sk_state = SMC_APPCLOSEWAIT1;
379 /* postpone sock_put() for passive closing to cover
380 * received SEND_SHUTDOWN as well
381 */
382 break;
383 case SMC_PEERCLOSEWAIT1:
384 if (rxflags->peer_done_writing)
385 sk->sk_state = SMC_PEERCLOSEWAIT2;
386 fallthrough;
387 /* to check for closing */
388 case SMC_PEERCLOSEWAIT2:
389 if (!smc_cdc_rxed_any_close(conn))
390 break;
391 if (sock_flag(sk, SOCK_DEAD) &&
392 smc_close_sent_any_close(conn)) {
393 /* smc_release has already been called locally */
394 sk->sk_state = SMC_CLOSED;
395 } else {
396 /* just shutdown, but not yet closed locally */
397 sk->sk_state = SMC_APPFINCLOSEWAIT;
398 }
399 sock_put(sk); /* passive closing */
400 break;
401 case SMC_PEERFINCLOSEWAIT:
402 if (smc_cdc_rxed_any_close(conn)) {
403 sk->sk_state = SMC_CLOSED;
404 sock_put(sk); /* passive closing */
405 }
406 break;
407 case SMC_APPCLOSEWAIT1:
408 case SMC_APPCLOSEWAIT2:
409 /* postpone sock_put() for passive closing to cover
410 * received SEND_SHUTDOWN as well
411 */
412 break;
413 case SMC_APPFINCLOSEWAIT:
414 case SMC_PEERABORTWAIT:
415 case SMC_PROCESSABORT:
416 case SMC_CLOSED:
417 /* nothing to do, add tracing in future patch */
418 break;
419 }
420
421wakeup:
422 sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
423 sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
424
425 if (old_state != sk->sk_state) {
426 sk->sk_state_change(sk);
427 if ((sk->sk_state == SMC_CLOSED) &&
428 (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
429 smc_conn_free(conn);
430 if (smc->clcsock)
431 release_clcsock = true;
432 }
433 }
434 release_sock(sk);
435 if (release_clcsock)
436 smc_clcsock_release(smc);
437 sock_put(sk); /* sock_hold done by schedulers of close_work */
438}
439
440int smc_close_shutdown_write(struct smc_sock *smc)
441{
442 struct smc_connection *conn = &smc->conn;
443 struct sock *sk = &smc->sk;
444 int old_state;
445 long timeout;
446 int rc = 0;
447
448 timeout = current->flags & PF_EXITING ?
449 0 : sock_flag(sk, SOCK_LINGER) ?
450 sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT;
451
452 old_state = sk->sk_state;
453again:
454 switch (sk->sk_state) {
455 case SMC_ACTIVE:
456 smc_close_stream_wait(smc, timeout);
457 release_sock(sk);
458 cancel_delayed_work_sync(&conn->tx_work);
459 lock_sock(sk);
460 if (sk->sk_state != SMC_ACTIVE)
461 goto again;
462 /* send close wr request */
463 rc = smc_close_wr(conn);
464 sk->sk_state = SMC_PEERCLOSEWAIT1;
465 break;
466 case SMC_APPCLOSEWAIT1:
467 /* passive close */
468 if (!smc_cdc_rxed_any_close(conn))
469 smc_close_stream_wait(smc, timeout);
470 release_sock(sk);
471 cancel_delayed_work_sync(&conn->tx_work);
472 lock_sock(sk);
473 if (sk->sk_state != SMC_APPCLOSEWAIT1)
474 goto again;
475 /* confirm close from peer */
476 rc = smc_close_wr(conn);
477 sk->sk_state = SMC_APPCLOSEWAIT2;
478 break;
479 case SMC_APPCLOSEWAIT2:
480 case SMC_PEERFINCLOSEWAIT:
481 case SMC_PEERCLOSEWAIT1:
482 case SMC_PEERCLOSEWAIT2:
483 case SMC_APPFINCLOSEWAIT:
484 case SMC_PROCESSABORT:
485 case SMC_PEERABORTWAIT:
486 /* nothing to do, add tracing in future patch */
487 break;
488 }
489
490 if (old_state != sk->sk_state)
491 sk->sk_state_change(sk);
492 return rc;
493}
494
495/* Initialize close properties on connection establishment. */
496void smc_close_init(struct smc_sock *smc)
497{
498 INIT_WORK(&smc->conn.close_work, smc_close_passive_work);
499}
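Read together, smc_close_active() and smc_close_passive_work() above give roughly the following normal-path progressions (a simplified summary; abort and shutdown-write variants are omitted):

  active close:   SMC_ACTIVE --send close via smc_close_final()--> SMC_PEERCLOSEWAIT1
                  --peer's closing received--> SMC_PEERCLOSEWAIT2 / SMC_CLOSED

  passive close:  SMC_ACTIVE --peer's closing received--> SMC_APPCLOSEWAIT1
                  --local close, smc_close_final()--> SMC_CLOSED if the peer
                  already closed, otherwise SMC_PEERFINCLOSEWAIT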
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000..634fea2b7
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,30 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Socket Closing
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_CLOSE_H
13#define SMC_CLOSE_H
14
15#include <linux/workqueue.h>
16
17#include "smc.h"
18
19#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
20#define SMC_CLOSE_SOCK_PUT_DELAY HZ
21
22void smc_close_wake_tx_prepared(struct smc_sock *smc);
23int smc_close_active(struct smc_sock *smc);
24int smc_close_shutdown_write(struct smc_sock *smc);
25void smc_close_init(struct smc_sock *smc);
26void smc_clcsock_release(struct smc_sock *smc);
27int smc_close_abort(struct smc_connection *conn);
28void smc_close_active_abort(struct smc_sock *smc);
29
30#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000..bf485a201
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,1973 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Basic Transport Functions exploiting Infiniband API
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/socket.h>
13#include <linux/if_vlan.h>
14#include <linux/random.h>
15#include <linux/workqueue.h>
16#include <linux/wait.h>
17#include <linux/reboot.h>
18#include <linux/mutex.h>
19#include <net/tcp.h>
20#include <net/sock.h>
21#include <rdma/ib_verbs.h>
22#include <rdma/ib_cache.h>
23
24#include "smc.h"
25#include "smc_clc.h"
26#include "smc_core.h"
27#include "smc_ib.h"
28#include "smc_wr.h"
29#include "smc_llc.h"
30#include "smc_cdc.h"
31#include "smc_close.h"
32#include "smc_ism.h"
33
34#define SMC_LGR_NUM_INCR 256
35#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
36#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
37
38static struct smc_lgr_list smc_lgr_list = { /* established link groups */
39 .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
40 .list = LIST_HEAD_INIT(smc_lgr_list.list),
41 .num = 0,
42};
43
44static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
45static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
46
47static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
48 struct smc_buf_desc *buf_desc);
49static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
50
51static void smc_link_down_work(struct work_struct *work);
52
53/* return head of link group list and its lock for a given link group */
54static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
55 spinlock_t **lgr_lock)
56{
57 if (lgr->is_smcd) {
58 *lgr_lock = &lgr->smcd->lgr_lock;
59 return &lgr->smcd->lgr_list;
60 }
61
62 *lgr_lock = &smc_lgr_list.lock;
63 return &smc_lgr_list.list;
64}
65
66static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
67{
68 /* client link group creation always follows the server link group
69 * creation. For client use a somewhat higher removal delay time,
70 * otherwise there is a risk of out-of-sync link groups.
71 */
72 if (!lgr->freeing) {
73 mod_delayed_work(system_wq, &lgr->free_work,
74 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
75 SMC_LGR_FREE_DELAY_CLNT :
76 SMC_LGR_FREE_DELAY_SERV);
77 }
78}
79
80/* Register connection's alert token in our lookup structure.
81 * To use rbtrees we have to implement our own insert core.
82 * Requires @conns_lock
83 * @smc connection to register
84 * The rbtree insert itself cannot fail, so no return value is needed.
85 */
86static void smc_lgr_add_alert_token(struct smc_connection *conn)
87{
88 struct rb_node **link, *parent = NULL;
89 u32 token = conn->alert_token_local;
90
91 link = &conn->lgr->conns_all.rb_node;
92 while (*link) {
93 struct smc_connection *cur = rb_entry(*link,
94 struct smc_connection, alert_node);
95
96 parent = *link;
97 if (cur->alert_token_local > token)
98 link = &parent->rb_left;
99 else
100 link = &parent->rb_right;
101 }
102 /* Put the new node there */
103 rb_link_node(&conn->alert_node, parent, link);
104 rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
105}
106
107/* assign an SMC-R link to the connection */
108static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first)
109{
110 enum smc_link_state expected = first ? SMC_LNK_ACTIVATING :
111 SMC_LNK_ACTIVE;
112 int i, j;
113
114 /* do link balancing */
115 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
116 struct smc_link *lnk = &conn->lgr->lnk[i];
117
118 if (lnk->state != expected || lnk->link_is_asym)
119 continue;
120 if (conn->lgr->role == SMC_CLNT) {
121 conn->lnk = lnk; /* temporary, SMC server assigns link*/
122 break;
123 }
124 if (conn->lgr->conns_num % 2) {
125 for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
126 struct smc_link *lnk2;
127
128 lnk2 = &conn->lgr->lnk[j];
129 if (lnk2->state == expected &&
130 !lnk2->link_is_asym) {
131 conn->lnk = lnk2;
132 break;
133 }
134 }
135 }
136 if (!conn->lnk)
137 conn->lnk = lnk;
138 break;
139 }
140 if (!conn->lnk)
141 return SMC_CLC_DECL_NOACTLINK;
142 return 0;
143}
144
145/* Register connection in link group by assigning an alert token
146 * registered in a search tree.
147 * Requires @conns_lock
148 * Note that '0' is a reserved value and not assigned.
149 */
150static int smc_lgr_register_conn(struct smc_connection *conn, bool first)
151{
152 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
153 static atomic_t nexttoken = ATOMIC_INIT(0);
154 int rc;
155
156 if (!conn->lgr->is_smcd) {
157 rc = smcr_lgr_conn_assign_link(conn, first);
158 if (rc)
159 return rc;
160 }
161 /* find a new alert_token_local value not yet used by some connection
162 * in this link group
163 */
164 sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
165 while (!conn->alert_token_local) {
166 conn->alert_token_local = atomic_inc_return(&nexttoken);
167 if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
168 conn->alert_token_local = 0;
169 }
170 smc_lgr_add_alert_token(conn);
171 conn->lgr->conns_num++;
172 return 0;
173}
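The token allocated above is the key used to find the connection again when a CDC message arrives for the link group. A hedged sketch of the lookup side (roughly what smc_lgr_find_conn() in smc_core.h does; the sketch is illustrative, not a copy of that header):

/* illustrative: look up a connection by alert token in the lgr rbtree */
static struct smc_connection *example_find_conn(u32 token,
						struct smc_link_group *lgr)
{
	struct rb_node *node = lgr->conns_all.rb_node;

	while (node) {
		struct smc_connection *cur =
			rb_entry(node, struct smc_connection, alert_node);

		if (cur->alert_token_local > token)
			node = node->rb_left;
		else if (cur->alert_token_local < token)
			node = node->rb_right;
		else
			return cur;	/* token matches this connection */
	}
	return NULL;
}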
174
175/* Unregister connection and reset the alert token of the given connection
176 */
177static void __smc_lgr_unregister_conn(struct smc_connection *conn)
178{
179 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
180 struct smc_link_group *lgr = conn->lgr;
181
182 rb_erase(&conn->alert_node, &lgr->conns_all);
183 lgr->conns_num--;
184 conn->alert_token_local = 0;
185 sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
186}
187
188/* Unregister connection from lgr
189 */
190static void smc_lgr_unregister_conn(struct smc_connection *conn)
191{
192 struct smc_link_group *lgr = conn->lgr;
193
194 if (!lgr)
195 return;
196 write_lock_bh(&lgr->conns_lock);
197 if (conn->alert_token_local) {
198 __smc_lgr_unregister_conn(conn);
199 }
200 write_unlock_bh(&lgr->conns_lock);
201 conn->lgr = NULL;
202}
203
204void smc_lgr_cleanup_early(struct smc_connection *conn)
205{
206 struct smc_link_group *lgr = conn->lgr;
207 spinlock_t *lgr_lock;
208
209 if (!lgr)
210 return;
211
212 smc_conn_free(conn);
213 smc_lgr_list_head(lgr, &lgr_lock);
214 spin_lock_bh(lgr_lock);
215 /* do not use this link group for new connections */
216 if (!list_empty(&lgr->list))
217 list_del_init(&lgr->list);
218 spin_unlock_bh(lgr_lock);
219 __smc_lgr_terminate(lgr, true);
220}
221
222static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
223{
224 int i;
225
226 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
227 struct smc_link *lnk = &lgr->lnk[i];
228
229 if (smc_link_sendable(lnk))
230 lnk->state = SMC_LNK_INACTIVE;
231 }
232 wake_up_all(&lgr->llc_msg_waiter);
233 wake_up_all(&lgr->llc_flow_waiter);
234}
235
236static void smc_lgr_free(struct smc_link_group *lgr);
237
238static void smc_lgr_free_work(struct work_struct *work)
239{
240 struct smc_link_group *lgr = container_of(to_delayed_work(work),
241 struct smc_link_group,
242 free_work);
243 spinlock_t *lgr_lock;
244 bool conns;
245
246 smc_lgr_list_head(lgr, &lgr_lock);
247 spin_lock_bh(lgr_lock);
248 if (lgr->freeing) {
249 spin_unlock_bh(lgr_lock);
250 return;
251 }
252 read_lock_bh(&lgr->conns_lock);
253 conns = RB_EMPTY_ROOT(&lgr->conns_all);
254 read_unlock_bh(&lgr->conns_lock);
255 if (!conns) { /* number of lgr connections is no longer zero */
256 spin_unlock_bh(lgr_lock);
257 return;
258 }
259 list_del_init(&lgr->list); /* remove from smc_lgr_list */
260 lgr->freeing = 1; /* this instance does the freeing, no new schedule */
261 spin_unlock_bh(lgr_lock);
262 cancel_delayed_work(&lgr->free_work);
263
264 if (!lgr->is_smcd && !lgr->terminating)
265 smc_llc_send_link_delete_all(lgr, true,
266 SMC_LLC_DEL_PROG_INIT_TERM);
267 if (lgr->is_smcd && !lgr->terminating)
268 smc_ism_signal_shutdown(lgr);
269 if (!lgr->is_smcd)
270 smcr_lgr_link_deactivate_all(lgr);
271 smc_lgr_free(lgr);
272}
273
274static void smc_lgr_terminate_work(struct work_struct *work)
275{
276 struct smc_link_group *lgr = container_of(work, struct smc_link_group,
277 terminate_work);
278
279 __smc_lgr_terminate(lgr, true);
280}
281
282/* return next unique link id for the lgr */
283static u8 smcr_next_link_id(struct smc_link_group *lgr)
284{
285 u8 link_id;
286 int i;
287
288 while (1) {
289again:
290 link_id = ++lgr->next_link_id;
291 if (!link_id) /* skip zero as link_id */
292 link_id = ++lgr->next_link_id;
293 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
294 if (smc_link_usable(&lgr->lnk[i]) &&
295 lgr->lnk[i].link_id == link_id)
296 goto again;
297 }
298 break;
299 }
300 return link_id;
301}
302
303int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
304 u8 link_idx, struct smc_init_info *ini)
305{
306 u8 rndvec[3];
307 int rc;
308
309 get_device(&ini->ib_dev->ibdev->dev);
310 atomic_inc(&ini->ib_dev->lnk_cnt);
311 lnk->link_id = smcr_next_link_id(lgr);
312 lnk->lgr = lgr;
313 lnk->link_idx = link_idx;
314 lnk->smcibdev = ini->ib_dev;
315 lnk->ibport = ini->ib_port;
316 lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
317 smc_llc_link_set_uid(lnk);
318 INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
319 if (!ini->ib_dev->initialized) {
320 rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev);
321 if (rc)
322 goto out;
323 }
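	/* build a random 24-bit initial packet sequence number from three random bytes */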
324 get_random_bytes(rndvec, sizeof(rndvec));
325 lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
326 (rndvec[2] << 16);
327 rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
328 ini->vlan_id, lnk->gid, &lnk->sgid_index);
329 if (rc)
330 goto out;
331 rc = smc_llc_link_init(lnk);
332 if (rc)
333 goto out;
334 rc = smc_wr_alloc_link_mem(lnk);
335 if (rc)
336 goto clear_llc_lnk;
337 rc = smc_ib_create_protection_domain(lnk);
338 if (rc)
339 goto free_link_mem;
340 rc = smc_ib_create_queue_pair(lnk);
341 if (rc)
342 goto dealloc_pd;
343 rc = smc_wr_create_link(lnk);
344 if (rc)
345 goto destroy_qp;
346 lnk->state = SMC_LNK_ACTIVATING;
347 return 0;
348
349destroy_qp:
350 smc_ib_destroy_queue_pair(lnk);
351dealloc_pd:
352 smc_ib_dealloc_protection_domain(lnk);
353free_link_mem:
354 smc_wr_free_link_mem(lnk);
355clear_llc_lnk:
356 smc_llc_link_clear(lnk, false);
357out:
358 put_device(&ini->ib_dev->ibdev->dev);
359 memset(lnk, 0, sizeof(struct smc_link));
360 lnk->state = SMC_LNK_UNUSED;
361 if (!atomic_dec_return(&ini->ib_dev->lnk_cnt))
362 wake_up(&ini->ib_dev->lnks_deleted);
363 return rc;
364}
365
366/* create a new SMC link group */
367static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
368{
369 struct smc_link_group *lgr;
370 struct list_head *lgr_list;
371 struct smc_link *lnk;
372 spinlock_t *lgr_lock;
373 u8 link_idx;
374 int rc = 0;
375 int i;
376
377 if (ini->is_smcd && ini->vlan_id) {
378 if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected],
379 ini->vlan_id)) {
380 rc = SMC_CLC_DECL_ISMVLANERR;
381 goto out;
382 }
383 }
384
385 lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
386 if (!lgr) {
387 rc = SMC_CLC_DECL_MEM;
388 goto ism_put_vlan;
389 }
390 lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
391 SMC_LGR_ID_SIZE, &lgr->id);
392 if (!lgr->tx_wq) {
393 rc = -ENOMEM;
394 goto free_lgr;
395 }
396 lgr->is_smcd = ini->is_smcd;
397 lgr->sync_err = 0;
398 lgr->terminating = 0;
399 lgr->freeing = 0;
400 lgr->vlan_id = ini->vlan_id;
401 mutex_init(&lgr->sndbufs_lock);
402 mutex_init(&lgr->rmbs_lock);
403 rwlock_init(&lgr->conns_lock);
404 for (i = 0; i < SMC_RMBE_SIZES; i++) {
405 INIT_LIST_HEAD(&lgr->sndbufs[i]);
406 INIT_LIST_HEAD(&lgr->rmbs[i]);
407 }
408 lgr->next_link_id = 0;
409 smc_lgr_list.num += SMC_LGR_NUM_INCR;
410 memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
411 INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
412 INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
413 lgr->conns_all = RB_ROOT;
414 if (ini->is_smcd) {
415 /* SMC-D specific settings */
416 get_device(&ini->ism_dev[ini->ism_selected]->dev);
417 lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected];
418 lgr->smcd = ini->ism_dev[ini->ism_selected];
419 lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list;
420 lgr_lock = &lgr->smcd->lgr_lock;
421 lgr->smc_version = ini->smcd_version;
422 lgr->peer_shutdown = 0;
423 atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
424 } else {
425 /* SMC-R specific settings */
426 lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
427 memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
428 SMC_SYSTEMID_LEN);
429 memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1],
430 SMC_MAX_PNETID_LEN);
431 smc_llc_lgr_init(lgr, smc);
432
433 link_idx = SMC_SINGLE_LINK;
434 lnk = &lgr->lnk[link_idx];
435 rc = smcr_link_init(lgr, lnk, link_idx, ini);
436 if (rc)
437 goto free_wq;
438 lgr_list = &smc_lgr_list.list;
439 lgr_lock = &smc_lgr_list.lock;
440 atomic_inc(&lgr_cnt);
441 }
442 smc->conn.lgr = lgr;
443 spin_lock_bh(lgr_lock);
444 list_add_tail(&lgr->list, lgr_list);
445 spin_unlock_bh(lgr_lock);
446 return 0;
447
448free_wq:
449 destroy_workqueue(lgr->tx_wq);
450free_lgr:
451 kfree(lgr);
452ism_put_vlan:
453 if (ini->is_smcd && ini->vlan_id)
454 smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id);
455out:
456 if (rc < 0) {
457 if (rc == -ENOMEM)
458 rc = SMC_CLC_DECL_MEM;
459 else
460 rc = SMC_CLC_DECL_INTERR;
461 }
462 return rc;
463}
464
465static int smc_write_space(struct smc_connection *conn)
466{
467 int buffer_len = conn->peer_rmbe_size;
468 union smc_host_cursor prod;
469 union smc_host_cursor cons;
470 int space;
471
472 smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
473 smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
474 /* determine rx_buf space */
475 space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod);
476 return space;
477}
478
479static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
480 struct smc_wr_buf *wr_buf)
481{
482 struct smc_connection *conn = &smc->conn;
483 union smc_host_cursor cons, fin;
484 int rc = 0;
485 int diff;
486
487 smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn);
488 smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn);
489 /* set prod cursor to old state, enforce tx_rdma_writes() */
490 smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn);
491 smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
492
493 if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) {
494 /* cons cursor advanced more than fin, and prod was set
495 * fin above, so now prod is smaller than cons. Fix that.
496 */
497 diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons);
498 smc_curs_add(conn->sndbuf_desc->len,
499 &conn->tx_curs_sent, diff);
500 smc_curs_add(conn->sndbuf_desc->len,
501 &conn->tx_curs_fin, diff);
502
503 smp_mb__before_atomic();
504 atomic_add(diff, &conn->sndbuf_space);
505 smp_mb__after_atomic();
506
507 smc_curs_add(conn->peer_rmbe_size,
508 &conn->local_tx_ctrl.prod, diff);
509 smc_curs_add(conn->peer_rmbe_size,
510 &conn->local_tx_ctrl_fin, diff);
511 }
512 /* recalculate, value is used by tx_rdma_writes() */
513 atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn));
514
515 if (smc->sk.sk_state != SMC_INIT &&
516 smc->sk.sk_state != SMC_CLOSED) {
517 rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
518 if (!rc) {
519 queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
520 smc->sk.sk_data_ready(&smc->sk);
521 }
522 } else {
523 smc_wr_tx_put_slot(conn->lnk,
524 (struct smc_wr_tx_pend_priv *)pend);
525 }
526 return rc;
527}
528
529struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
530 struct smc_link *from_lnk, bool is_dev_err)
531{
532 struct smc_link *to_lnk = NULL;
533 struct smc_cdc_tx_pend *pend;
534 struct smc_connection *conn;
535 struct smc_wr_buf *wr_buf;
536 struct smc_sock *smc;
537 struct rb_node *node;
538 int i, rc = 0;
539
540 /* link is inactive, wake up tx waiters */
541 smc_wr_wakeup_tx_wait(from_lnk);
542
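	/* pick another active link; on a device error skip links that share
	 * the failing link's device and port
	 */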
543 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
544 if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx)
545 continue;
546 if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
547 from_lnk->ibport == lgr->lnk[i].ibport) {
548 continue;
549 }
550 to_lnk = &lgr->lnk[i];
551 break;
552 }
553 if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) {
554 smc_lgr_terminate_sched(lgr);
555 return NULL;
556 }
557again:
558 read_lock_bh(&lgr->conns_lock);
559 for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) {
560 conn = rb_entry(node, struct smc_connection, alert_node);
561 if (conn->lnk != from_lnk)
562 continue;
563 smc = container_of(conn, struct smc_sock, conn);
564 /* conn->lnk not yet set in SMC_INIT state */
565 if (smc->sk.sk_state == SMC_INIT)
566 continue;
567 if (smc->sk.sk_state == SMC_CLOSED ||
568 smc->sk.sk_state == SMC_PEERCLOSEWAIT1 ||
569 smc->sk.sk_state == SMC_PEERCLOSEWAIT2 ||
570 smc->sk.sk_state == SMC_APPFINCLOSEWAIT ||
571 smc->sk.sk_state == SMC_APPCLOSEWAIT1 ||
572 smc->sk.sk_state == SMC_APPCLOSEWAIT2 ||
573 smc->sk.sk_state == SMC_PEERFINCLOSEWAIT ||
574 smc->sk.sk_state == SMC_PEERABORTWAIT ||
575 smc->sk.sk_state == SMC_PROCESSABORT) {
576 spin_lock_bh(&conn->send_lock);
577 conn->lnk = to_lnk;
578 spin_unlock_bh(&conn->send_lock);
579 continue;
580 }
581 sock_hold(&smc->sk);
582 read_unlock_bh(&lgr->conns_lock);
583 /* pre-fetch buffer outside of send_lock, might sleep */
584 rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend);
585 if (rc)
586 goto err_out;
587 /* avoid race with smcr_tx_sndbuf_nonempty() */
588 spin_lock_bh(&conn->send_lock);
589 conn->lnk = to_lnk;
590 rc = smc_switch_cursor(smc, pend, wr_buf);
591 spin_unlock_bh(&conn->send_lock);
592 sock_put(&smc->sk);
593 if (rc)
594 goto err_out;
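		/* conns_lock was released above; restart the tree walk, the
		 * tree may have changed meanwhile
		 */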
595 goto again;
596 }
597 read_unlock_bh(&lgr->conns_lock);
598 smc_wr_tx_link_put(to_lnk);
599 return to_lnk;
600
601err_out:
602 smcr_link_down_cond_sched(to_lnk);
603 smc_wr_tx_link_put(to_lnk);
604 return NULL;
605}
606
607static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc,
608 struct smc_link_group *lgr)
609{
610 int rc;
611
612 if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) {
613 /* unregister rmb with peer */
614 rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
615 if (!rc) {
616 /* protect against smc_llc_cli_rkey_exchange() */
617 mutex_lock(&lgr->llc_conf_mutex);
618 smc_llc_do_delete_rkey(lgr, rmb_desc);
619 rmb_desc->is_conf_rkey = false;
620 mutex_unlock(&lgr->llc_conf_mutex);
621 smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
622 }
623 }
624
625 if (rmb_desc->is_reg_err) {
626 /* buf registration failed, reuse not possible */
627 mutex_lock(&lgr->rmbs_lock);
628 list_del(&rmb_desc->list);
629 mutex_unlock(&lgr->rmbs_lock);
630
631 smc_buf_free(lgr, true, rmb_desc);
632 } else {
633 rmb_desc->used = 0;
634 }
635}
636
637static void smc_buf_unuse(struct smc_connection *conn,
638 struct smc_link_group *lgr)
639{
640 if (conn->sndbuf_desc)
641 conn->sndbuf_desc->used = 0;
642 if (conn->rmb_desc && lgr->is_smcd)
643 conn->rmb_desc->used = 0;
644 else if (conn->rmb_desc)
645 smcr_buf_unuse(conn->rmb_desc, lgr);
646}
647
648/* remove a finished connection from its link group */
649void smc_conn_free(struct smc_connection *conn)
650{
651 struct smc_link_group *lgr = conn->lgr;
652
653 if (!lgr)
654 return;
655 if (lgr->is_smcd) {
656 if (!list_empty(&lgr->list))
657 smc_ism_unset_conn(conn);
658 tasklet_kill(&conn->rx_tsklet);
659 } else {
660 smc_cdc_wait_pend_tx_wr(conn);
661 if (current_work() != &conn->abort_work)
662 cancel_work_sync(&conn->abort_work);
663 }
664 if (!list_empty(&lgr->list)) {
665 smc_buf_unuse(conn, lgr); /* allow buffer reuse */
666 smc_lgr_unregister_conn(conn);
667 }
668
669 if (!lgr->conns_num)
670 smc_lgr_schedule_free_work(lgr);
671}
672
673/* unregister a link from a buf_desc */
674static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb,
675 struct smc_link *lnk)
676{
677 if (is_rmb)
678 buf_desc->is_reg_mr[lnk->link_idx] = false;
679 if (!buf_desc->is_map_ib[lnk->link_idx])
680 return;
681 if (is_rmb) {
682 if (buf_desc->mr_rx[lnk->link_idx]) {
683 smc_ib_put_memory_region(
684 buf_desc->mr_rx[lnk->link_idx]);
685 buf_desc->mr_rx[lnk->link_idx] = NULL;
686 }
687 smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE);
688 } else {
689 smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE);
690 }
691 sg_free_table(&buf_desc->sgt[lnk->link_idx]);
692 buf_desc->is_map_ib[lnk->link_idx] = false;
693}
694
695/* unmap all buffers of lgr for a deleted link */
696static void smcr_buf_unmap_lgr(struct smc_link *lnk)
697{
698 struct smc_link_group *lgr = lnk->lgr;
699 struct smc_buf_desc *buf_desc, *bf;
700 int i;
701
702 for (i = 0; i < SMC_RMBE_SIZES; i++) {
703 mutex_lock(&lgr->rmbs_lock);
704 list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
705 smcr_buf_unmap_link(buf_desc, true, lnk);
706 mutex_unlock(&lgr->rmbs_lock);
707 mutex_lock(&lgr->sndbufs_lock);
708 list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
709 list)
710 smcr_buf_unmap_link(buf_desc, false, lnk);
711 mutex_unlock(&lgr->sndbufs_lock);
712 }
713}
714
715static void smcr_rtoken_clear_link(struct smc_link *lnk)
716{
717 struct smc_link_group *lgr = lnk->lgr;
718 int i;
719
720 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
721 lgr->rtokens[i][lnk->link_idx].rkey = 0;
722 lgr->rtokens[i][lnk->link_idx].dma_addr = 0;
723 }
724}
725
726/* must be called under lgr->llc_conf_mutex lock */
727void smcr_link_clear(struct smc_link *lnk, bool log)
728{
729 struct smc_ib_device *smcibdev;
730
731 if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED)
732 return;
733 lnk->peer_qpn = 0;
734 smc_llc_link_clear(lnk, log);
735 smcr_buf_unmap_lgr(lnk);
736 smcr_rtoken_clear_link(lnk);
737 smc_ib_modify_qp_error(lnk);
738 smc_wr_free_link(lnk);
739 smc_ib_destroy_queue_pair(lnk);
740 smc_ib_dealloc_protection_domain(lnk);
741 smc_wr_free_link_mem(lnk);
742 put_device(&lnk->smcibdev->ibdev->dev);
743 smcibdev = lnk->smcibdev;
744 memset(lnk, 0, sizeof(struct smc_link));
745 lnk->state = SMC_LNK_UNUSED;
746 if (!atomic_dec_return(&smcibdev->lnk_cnt))
747 wake_up(&smcibdev->lnks_deleted);
748}
749
750static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
751 struct smc_buf_desc *buf_desc)
752{
753 int i;
754
755 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
756 smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]);
757
758 if (buf_desc->pages)
759 __free_pages(buf_desc->pages, buf_desc->order);
760 kfree(buf_desc);
761}
762
763static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
764 struct smc_buf_desc *buf_desc)
765{
766 if (is_dmb) {
767 /* restore original buf len */
768 buf_desc->len += sizeof(struct smcd_cdc_msg);
769 smc_ism_unregister_dmb(lgr->smcd, buf_desc);
770 } else {
771 kfree(buf_desc->cpu_addr);
772 }
773 kfree(buf_desc);
774}
775
776static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
777 struct smc_buf_desc *buf_desc)
778{
779 if (lgr->is_smcd)
780 smcd_buf_free(lgr, is_rmb, buf_desc);
781 else
782 smcr_buf_free(lgr, is_rmb, buf_desc);
783}
784
785static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
786{
787 struct smc_buf_desc *buf_desc, *bf_desc;
788 struct list_head *buf_list;
789 int i;
790
791 for (i = 0; i < SMC_RMBE_SIZES; i++) {
792 if (is_rmb)
793 buf_list = &lgr->rmbs[i];
794 else
795 buf_list = &lgr->sndbufs[i];
796 list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
797 list) {
798 list_del(&buf_desc->list);
799 smc_buf_free(lgr, is_rmb, buf_desc);
800 }
801 }
802}
803
804static void smc_lgr_free_bufs(struct smc_link_group *lgr)
805{
806 /* free send buffers */
807 __smc_lgr_free_bufs(lgr, false);
808 /* free rmbs */
809 __smc_lgr_free_bufs(lgr, true);
810}
811
812/* remove a link group */
813static void smc_lgr_free(struct smc_link_group *lgr)
814{
815 int i;
816
817 if (!lgr->is_smcd) {
818 mutex_lock(&lgr->llc_conf_mutex);
819 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
820 if (lgr->lnk[i].state != SMC_LNK_UNUSED)
821 smcr_link_clear(&lgr->lnk[i], false);
822 }
823 mutex_unlock(&lgr->llc_conf_mutex);
824 smc_llc_lgr_clear(lgr);
825 }
826
827 smc_lgr_free_bufs(lgr);
828 destroy_workqueue(lgr->tx_wq);
829 if (lgr->is_smcd) {
830 smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
831 put_device(&lgr->smcd->dev);
832 if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
833 wake_up(&lgr->smcd->lgrs_deleted);
834 } else {
835 if (!atomic_dec_return(&lgr_cnt))
836 wake_up(&lgrs_deleted);
837 }
838 kfree(lgr);
839}
840
841static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
842{
843 int i;
844
845 for (i = 0; i < SMC_RMBE_SIZES; i++) {
846 struct smc_buf_desc *buf_desc;
847
848 list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
849 buf_desc->len += sizeof(struct smcd_cdc_msg);
850 smc_ism_unregister_dmb(lgr->smcd, buf_desc);
851 }
852 }
853}
854
855static void smc_sk_wake_ups(struct smc_sock *smc)
856{
857 smc->sk.sk_write_space(&smc->sk);
858 smc->sk.sk_data_ready(&smc->sk);
859 smc->sk.sk_state_change(&smc->sk);
860}
861
862/* kill a connection */
863static void smc_conn_kill(struct smc_connection *conn, bool soft)
864{
865 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
866
867 if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
868 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
869 else
870 smc_close_abort(conn);
871 conn->killed = 1;
872 smc->sk.sk_err = ECONNABORTED;
873 smc_sk_wake_ups(smc);
874 if (conn->lgr->is_smcd) {
875 smc_ism_unset_conn(conn);
876 if (soft)
877 tasklet_kill(&conn->rx_tsklet);
878 else
879 tasklet_unlock_wait(&conn->rx_tsklet);
880 } else {
881 smc_cdc_wait_pend_tx_wr(conn);
882 }
883 smc_lgr_unregister_conn(conn);
884 smc_close_active_abort(smc);
885}
886
887static void smc_lgr_cleanup(struct smc_link_group *lgr)
888{
889 if (lgr->is_smcd) {
890 smc_ism_signal_shutdown(lgr);
891 smcd_unregister_all_dmbs(lgr);
892 } else {
893 u32 rsn = lgr->llc_termination_rsn;
894
895 if (!rsn)
896 rsn = SMC_LLC_DEL_PROG_INIT_TERM;
897 smc_llc_send_link_delete_all(lgr, false, rsn);
898 smcr_lgr_link_deactivate_all(lgr);
899 }
900}
901
902/* terminate link group
903 * @soft: true if link group shutdown can take its time
904 * false if immediate link group shutdown is required
905 */
906static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
907{
908 struct smc_connection *conn;
909 struct smc_sock *smc;
910 struct rb_node *node;
911
912 if (lgr->terminating)
913 return; /* lgr already terminating */
914 /* cancel free_work sync, will terminate when lgr->freeing is set */
915 cancel_delayed_work_sync(&lgr->free_work);
916 lgr->terminating = 1;
917
918 /* kill remaining link group connections */
919 read_lock_bh(&lgr->conns_lock);
920 node = rb_first(&lgr->conns_all);
921 while (node) {
922 read_unlock_bh(&lgr->conns_lock);
923 conn = rb_entry(node, struct smc_connection, alert_node);
924 smc = container_of(conn, struct smc_sock, conn);
925 sock_hold(&smc->sk); /* sock_put below */
926 lock_sock(&smc->sk);
927 smc_conn_kill(conn, soft);
928 release_sock(&smc->sk);
929 sock_put(&smc->sk); /* sock_hold above */
930 read_lock_bh(&lgr->conns_lock);
931 node = rb_first(&lgr->conns_all);
932 }
933 read_unlock_bh(&lgr->conns_lock);
934 smc_lgr_cleanup(lgr);
935 smc_lgr_free(lgr);
936}
937
938/* unlink link group and schedule termination */
939void smc_lgr_terminate_sched(struct smc_link_group *lgr)
940{
941 spinlock_t *lgr_lock;
942
943 smc_lgr_list_head(lgr, &lgr_lock);
944 spin_lock_bh(lgr_lock);
945 if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
946 spin_unlock_bh(lgr_lock);
947 return; /* lgr already terminating */
948 }
949 list_del_init(&lgr->list);
950 lgr->freeing = 1;
951 spin_unlock_bh(lgr_lock);
952 schedule_work(&lgr->terminate_work);
953}
954
955/* Called when a peer lgr shutdown (normal or abnormal) is received */
956void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
957{
958 struct smc_link_group *lgr, *l;
959 LIST_HEAD(lgr_free_list);
960
961 /* run common cleanup function and build free list */
962 spin_lock_bh(&dev->lgr_lock);
963 list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
964 if ((!peer_gid || lgr->peer_gid == peer_gid) &&
965 (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
966 if (peer_gid) /* peer triggered termination */
967 lgr->peer_shutdown = 1;
968 list_move(&lgr->list, &lgr_free_list);
969 lgr->freeing = 1;
970 }
971 }
972 spin_unlock_bh(&dev->lgr_lock);
973
974 /* cancel the regular free workers and actually free lgrs */
975 list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
976 list_del_init(&lgr->list);
977 schedule_work(&lgr->terminate_work);
978 }
979}
980
981/* Called when an SMCD device is removed or the smc module is unloaded */
982void smc_smcd_terminate_all(struct smcd_dev *smcd)
983{
984 struct smc_link_group *lgr, *lg;
985 LIST_HEAD(lgr_free_list);
986
987 spin_lock_bh(&smcd->lgr_lock);
988 list_splice_init(&smcd->lgr_list, &lgr_free_list);
989 list_for_each_entry(lgr, &lgr_free_list, list)
990 lgr->freeing = 1;
991 spin_unlock_bh(&smcd->lgr_lock);
992
993 list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
994 list_del_init(&lgr->list);
995 __smc_lgr_terminate(lgr, false);
996 }
997
998 if (atomic_read(&smcd->lgr_cnt))
999 wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
1000}
1001
1002/* Called when an SMCR device is removed or the smc module is unloaded.
1003 * If smcibdev is given, all SMCR link groups using this device are terminated.
1004 * If smcibdev is NULL, all SMCR link groups are terminated.
1005 */
1006void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
1007{
1008 struct smc_link_group *lgr, *lg;
1009 LIST_HEAD(lgr_free_list);
1010 int i;
1011
1012 spin_lock_bh(&smc_lgr_list.lock);
1013 if (!smcibdev) {
1014 list_splice_init(&smc_lgr_list.list, &lgr_free_list);
1015 list_for_each_entry(lgr, &lgr_free_list, list)
1016 lgr->freeing = 1;
1017 } else {
1018 list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
1019 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1020 if (lgr->lnk[i].smcibdev == smcibdev)
1021 smcr_link_down_cond_sched(&lgr->lnk[i]);
1022 }
1023 }
1024 }
1025 spin_unlock_bh(&smc_lgr_list.lock);
1026
1027 list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
1028 list_del_init(&lgr->list);
1029 smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM);
1030 __smc_lgr_terminate(lgr, false);
1031 }
1032
1033 if (smcibdev) {
1034 if (atomic_read(&smcibdev->lnk_cnt))
1035 wait_event(smcibdev->lnks_deleted,
1036 !atomic_read(&smcibdev->lnk_cnt));
1037 } else {
1038 if (atomic_read(&lgr_cnt))
1039 wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
1040 }
1041}
1042
1043/* set new lgr type and clear all asymmetric link tagging */
1044void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type)
1045{
1046 char *lgr_type = "";
1047 int i;
1048
1049 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
1050 if (smc_link_usable(&lgr->lnk[i]))
1051 lgr->lnk[i].link_is_asym = false;
1052 if (lgr->type == new_type)
1053 return;
1054 lgr->type = new_type;
1055
1056 switch (lgr->type) {
1057 case SMC_LGR_NONE:
1058 lgr_type = "NONE";
1059 break;
1060 case SMC_LGR_SINGLE:
1061 lgr_type = "SINGLE";
1062 break;
1063 case SMC_LGR_SYMMETRIC:
1064 lgr_type = "SYMMETRIC";
1065 break;
1066 case SMC_LGR_ASYMMETRIC_PEER:
1067 lgr_type = "ASYMMETRIC_PEER";
1068 break;
1069 case SMC_LGR_ASYMMETRIC_LOCAL:
1070 lgr_type = "ASYMMETRIC_LOCAL";
1071 break;
1072 }
1073 pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: "
1074 "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id,
1075 lgr_type, lgr->pnet_id);
1076}
1077
1078/* set new lgr type and tag a link as asymmetric */
1079void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
1080 enum smc_lgr_type new_type, int asym_lnk_idx)
1081{
1082 smcr_lgr_set_type(lgr, new_type);
1083 lgr->lnk[asym_lnk_idx].link_is_asym = true;
1084}
1085
1086/* abort connection, abort_work scheduled from tasklet context */
1087static void smc_conn_abort_work(struct work_struct *work)
1088{
1089 struct smc_connection *conn = container_of(work,
1090 struct smc_connection,
1091 abort_work);
1092 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
1093
1094 lock_sock(&smc->sk);
1095 smc_conn_kill(conn, true);
1096 release_sock(&smc->sk);
1097 sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */
1098}
1099
1100void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
1101{
1102 struct smc_link_group *lgr, *n;
1103
1104 list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
1105 struct smc_link *link;
1106
1107 if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
1108 SMC_MAX_PNETID_LEN) ||
1109 lgr->type == SMC_LGR_SYMMETRIC ||
1110 lgr->type == SMC_LGR_ASYMMETRIC_PEER)
1111 continue;
1112
1113 /* trigger local add link processing */
1114 link = smc_llc_usable_link(lgr);
1115 if (link)
1116 smc_llc_add_link_local(link);
1117 }
1118}
1119
1120/* link is down - switch connections to alternate link,
1121 * must be called under lgr->llc_conf_mutex lock
1122 */
1123static void smcr_link_down(struct smc_link *lnk)
1124{
1125 struct smc_link_group *lgr = lnk->lgr;
1126 struct smc_link *to_lnk;
1127 int del_link_id;
1128
1129 if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
1130 return;
1131
1132 to_lnk = smc_switch_conns(lgr, lnk, true);
1133 if (!to_lnk) { /* no backup link available */
1134 smcr_link_clear(lnk, true);
1135 return;
1136 }
1137 smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
1138 del_link_id = lnk->link_id;
1139
1140 if (lgr->role == SMC_SERV) {
1141 /* trigger local delete link processing */
1142 smc_llc_srv_delete_link_local(to_lnk, del_link_id);
1143 } else {
1144 if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
1145 /* another llc task is ongoing */
1146 mutex_unlock(&lgr->llc_conf_mutex);
1147 wait_event_timeout(lgr->llc_flow_waiter,
1148 (list_empty(&lgr->list) ||
1149 lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
1150 SMC_LLC_WAIT_TIME);
1151 mutex_lock(&lgr->llc_conf_mutex);
1152 }
1153 if (!list_empty(&lgr->list)) {
1154 smc_llc_send_delete_link(to_lnk, del_link_id,
1155 SMC_LLC_REQ, true,
1156 SMC_LLC_DEL_LOST_PATH);
1157 smcr_link_clear(lnk, true);
1158 }
1159 wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */
1160 }
1161}
1162
1163/* must be called under lgr->llc_conf_mutex lock */
1164void smcr_link_down_cond(struct smc_link *lnk)
1165{
1166 if (smc_link_downing(&lnk->state))
1167 smcr_link_down(lnk);
1168}
1169
1170/* will get the lgr->llc_conf_mutex lock */
1171void smcr_link_down_cond_sched(struct smc_link *lnk)
1172{
1173 if (smc_link_downing(&lnk->state))
1174 schedule_work(&lnk->link_down_wrk);
1175}
1176
1177void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
1178{
1179 struct smc_link_group *lgr, *n;
1180 int i;
1181
1182 list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
1183 if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
1184 SMC_MAX_PNETID_LEN))
1185 continue; /* lgr is not affected */
1186 if (list_empty(&lgr->list))
1187 continue;
1188 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1189 struct smc_link *lnk = &lgr->lnk[i];
1190
1191 if (smc_link_usable(lnk) &&
1192 lnk->smcibdev == smcibdev && lnk->ibport == ibport)
1193 smcr_link_down_cond_sched(lnk);
1194 }
1195 }
1196}
1197
1198static void smc_link_down_work(struct work_struct *work)
1199{
1200 struct smc_link *link = container_of(work, struct smc_link,
1201 link_down_wrk);
1202 struct smc_link_group *lgr = link->lgr;
1203
1204 if (list_empty(&lgr->list))
1205 return;
1206 wake_up_all(&lgr->llc_msg_waiter);
1207 mutex_lock(&lgr->llc_conf_mutex);
1208 smcr_link_down(link);
1209 mutex_unlock(&lgr->llc_conf_mutex);
1210}
1211
1212static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
1213 struct netdev_nested_priv *priv)
1214{
1215 unsigned short *vlan_id = (unsigned short *)priv->data;
1216
1217 if (is_vlan_dev(lower_dev)) {
1218 *vlan_id = vlan_dev_vlan_id(lower_dev);
1219 return 1;
1220 }
1221
1222 return 0;
1223}
1224
1225/* Determine vlan of internal TCP socket. */
1226int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
1227{
1228 struct dst_entry *dst = sk_dst_get(clcsock->sk);
1229 struct netdev_nested_priv priv;
1230 struct net_device *ndev;
1231 int rc = 0;
1232
1233 ini->vlan_id = 0;
1234 if (!dst) {
1235 rc = -ENOTCONN;
1236 goto out;
1237 }
1238 if (!dst->dev) {
1239 rc = -ENODEV;
1240 goto out_rel;
1241 }
1242
1243 ndev = dst->dev;
1244 if (is_vlan_dev(ndev)) {
1245 ini->vlan_id = vlan_dev_vlan_id(ndev);
1246 goto out_rel;
1247 }
1248
1249 priv.data = (void *)&ini->vlan_id;
1250 rtnl_lock();
1251 netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv);
1252 rtnl_unlock();
1253
1254out_rel:
1255 dst_release(dst);
1256out:
1257 return rc;
1258}
1259
1260static bool smcr_lgr_match(struct smc_link_group *lgr,
1261 struct smc_clc_msg_local *lcl,
1262 enum smc_lgr_role role, u32 clcqpn)
1263{
1264 int i;
1265
1266 if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) ||
1267 lgr->role != role)
1268 return false;
1269
1270 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1271 if (!smc_link_active(&lgr->lnk[i]))
1272 continue;
1273 if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
1274 !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
1275 !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac)))
1276 return true;
1277 }
1278 return false;
1279}
1280
1281static bool smcd_lgr_match(struct smc_link_group *lgr,
1282 struct smcd_dev *smcismdev, u64 peer_gid)
1283{
1284 return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
1285}
1286
1287/* create a new SMC connection (and a new link group if necessary) */
1288int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
1289{
1290 struct smc_connection *conn = &smc->conn;
1291 struct list_head *lgr_list;
1292 struct smc_link_group *lgr;
1293 enum smc_lgr_role role;
1294 spinlock_t *lgr_lock;
1295 int rc = 0;
1296
1297 lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
1298 &smc_lgr_list.list;
1299 lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock :
1300 &smc_lgr_list.lock;
1301 ini->first_contact_local = 1;
1302 role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
1303 if (role == SMC_CLNT && ini->first_contact_peer)
1304 /* create new link group as well */
1305 goto create;
1306
1307 /* determine if an existing link group can be reused */
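	/* reuse requires the same peer (system id/GID for SMC-R, ISM GID for
	 * SMC-D), no sync error, a matching vlan id for SMC-V1, and - for an
	 * SMC-R server - a free connection and rtoken slot in the link group
	 */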
1308 spin_lock_bh(lgr_lock);
1309 list_for_each_entry(lgr, lgr_list, list) {
1310 write_lock_bh(&lgr->conns_lock);
1311 if ((ini->is_smcd ?
1312 smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
1313 ini->ism_peer_gid[ini->ism_selected]) :
1314 smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
1315 !lgr->sync_err &&
1316 (ini->smcd_version == SMC_V2 ||
1317 lgr->vlan_id == ini->vlan_id) &&
1318 (role == SMC_CLNT || ini->is_smcd ||
1319 (lgr->conns_num < SMC_RMBS_PER_LGR_MAX &&
1320 !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) {
1321 /* link group found */
1322 ini->first_contact_local = 0;
1323 conn->lgr = lgr;
1324 rc = smc_lgr_register_conn(conn, false);
1325 write_unlock_bh(&lgr->conns_lock);
1326 if (!rc && delayed_work_pending(&lgr->free_work))
1327 cancel_delayed_work(&lgr->free_work);
1328 break;
1329 }
1330 write_unlock_bh(&lgr->conns_lock);
1331 }
1332 spin_unlock_bh(lgr_lock);
1333 if (rc)
1334 return rc;
1335
1336 if (role == SMC_CLNT && !ini->first_contact_peer &&
1337 ini->first_contact_local) {
1338 /* Server reuses a link group, but Client wants to start
1339 * a new one:
1340 * send out_of_sync decline, reason: synchronization error
1341 */
1342 return SMC_CLC_DECL_SYNCERR;
1343 }
1344
1345create:
1346 if (ini->first_contact_local) {
1347 rc = smc_lgr_create(smc, ini);
1348 if (rc)
1349 goto out;
1350 lgr = conn->lgr;
1351 write_lock_bh(&lgr->conns_lock);
1352 rc = smc_lgr_register_conn(conn, true);
1353 write_unlock_bh(&lgr->conns_lock);
1354 if (rc)
1355 goto out;
1356 }
1357 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
1358 conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
1359 conn->urg_state = SMC_URG_READ;
1360 init_waitqueue_head(&conn->cdc_pend_tx_wq);
1361 INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
1362 if (ini->is_smcd) {
1363 conn->rx_off = sizeof(struct smcd_cdc_msg);
1364 smcd_cdc_rx_init(conn); /* init tasklet for this conn */
1365 } else {
1366 conn->rx_off = 0;
1367 }
1368#ifndef KERNEL_HAS_ATOMIC64
1369 spin_lock_init(&conn->acurs_lock);
1370#endif
1371
1372out:
1373 return rc;
1374}
1375
1376/* convert the RMB size into the compressed notation - minimum 16K.
1377 * In contrast to plain ilog2, this rounds towards the next power of 2,
1378 * so the socket application gets at least its desired sndbuf / rcvbuf size.
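 * For example, 70000 bytes compresses to 3, which uncompresses back to
 * 131072 (128K); anything up to SMC_BUF_MIN_SIZE (16K) compresses to 0.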
1379 */
1380static u8 smc_compress_bufsize(int size)
1381{
1382 u8 compressed;
1383
1384 if (size <= SMC_BUF_MIN_SIZE)
1385 return 0;
1386
1387 size = (size - 1) >> 14;
1388 compressed = ilog2(size) + 1;
1389 if (compressed >= SMC_RMBE_SIZES)
1390 compressed = SMC_RMBE_SIZES - 1;
1391 return compressed;
1392}
1393
1394/* convert the RMB size from compressed notation into integer */
1395int smc_uncompress_bufsize(u8 compressed)
1396{
1397 u32 size;
1398
1399 size = 0x00000001 << (((int)compressed) + 14);
1400 return (int)size;
1401}
1402
1403/* try to reuse a sndbuf or rmb description slot for a certain
1404 * buffer size; if not available, return NULL
1405 */
1406static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
1407 struct mutex *lock,
1408 struct list_head *buf_list)
1409{
1410 struct smc_buf_desc *buf_slot;
1411
1412 mutex_lock(lock);
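	/* atomically claim the first unused buffer of this size class */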
1413 list_for_each_entry(buf_slot, buf_list, list) {
1414 if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
1415 mutex_unlock(lock);
1416 return buf_slot;
1417 }
1418 }
1419 mutex_unlock(lock);
1420 return NULL;
1421}
1422
1423/* one of the conditions for announcing a receiver's current window size is
1424 * that it "results in a minimum increase in the window size of 10% of the
1425 * receive buffer space" [RFC7609]
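 * e.g. a 65536-byte RMBE yields an update limit of 6553 bytes, unless
 * SOCK_MIN_SNDBUF / 2 is larger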
1426 */
1427static inline int smc_rmb_wnd_update_limit(int rmbe_size)
1428{
1429 return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
1430}
1431
1432/* map an rmb buf to a link */
1433static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
1434 struct smc_link *lnk)
1435{
1436 int rc;
1437
1438 if (buf_desc->is_map_ib[lnk->link_idx])
1439 return 0;
1440
1441 rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL);
1442 if (rc)
1443 return rc;
1444 sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
1445 buf_desc->cpu_addr, buf_desc->len);
1446
1447 /* map sg table to DMA address */
1448 rc = smc_ib_buf_map_sg(lnk, buf_desc,
1449 is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
1450 /* SMC protocol depends on mapping to one DMA address only */
1451 if (rc != 1) {
1452 rc = -EAGAIN;
1453 goto free_table;
1454 }
1455
1456 /* create a new memory region for the RMB */
1457 if (is_rmb) {
1458 rc = smc_ib_get_memory_region(lnk->roce_pd,
1459 IB_ACCESS_REMOTE_WRITE |
1460 IB_ACCESS_LOCAL_WRITE,
1461 buf_desc, lnk->link_idx);
1462 if (rc)
1463 goto buf_unmap;
1464 smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE);
1465 }
1466 buf_desc->is_map_ib[lnk->link_idx] = true;
1467 return 0;
1468
1469buf_unmap:
1470 smc_ib_buf_unmap_sg(lnk, buf_desc,
1471 is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
1472free_table:
1473 sg_free_table(&buf_desc->sgt[lnk->link_idx]);
1474 return rc;
1475}
1476
1477/* register a new rmb on IB device,
1478 * must be called under lgr->llc_conf_mutex lock
1479 */
1480int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
1481{
1482 if (list_empty(&link->lgr->list))
1483 return -ENOLINK;
1484 if (!rmb_desc->is_reg_mr[link->link_idx]) {
1485 /* register memory region for new rmb */
1486 if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) {
1487 rmb_desc->is_reg_err = true;
1488 return -EFAULT;
1489 }
1490 rmb_desc->is_reg_mr[link->link_idx] = true;
1491 }
1492 return 0;
1493}
1494
1495static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
1496 struct list_head *lst, bool is_rmb)
1497{
1498 struct smc_buf_desc *buf_desc, *bf;
1499 int rc = 0;
1500
1501 mutex_lock(lock);
1502 list_for_each_entry_safe(buf_desc, bf, lst, list) {
1503 if (!buf_desc->used)
1504 continue;
1505 rc = smcr_buf_map_link(buf_desc, is_rmb, lnk);
1506 if (rc)
1507 goto out;
1508 }
1509out:
1510 mutex_unlock(lock);
1511 return rc;
1512}
1513
1514/* map all used buffers of lgr for a new link */
1515int smcr_buf_map_lgr(struct smc_link *lnk)
1516{
1517 struct smc_link_group *lgr = lnk->lgr;
1518 int i, rc = 0;
1519
1520 for (i = 0; i < SMC_RMBE_SIZES; i++) {
1521 rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock,
1522 &lgr->rmbs[i], true);
1523 if (rc)
1524 return rc;
1525 rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock,
1526 &lgr->sndbufs[i], false);
1527 if (rc)
1528 return rc;
1529 }
1530 return 0;
1531}
1532
1533/* register all used buffers of lgr for a new link,
1534 * must be called under lgr->llc_conf_mutex lock
1535 */
1536int smcr_buf_reg_lgr(struct smc_link *lnk)
1537{
1538 struct smc_link_group *lgr = lnk->lgr;
1539 struct smc_buf_desc *buf_desc, *bf;
1540 int i, rc = 0;
1541
1542 mutex_lock(&lgr->rmbs_lock);
1543 for (i = 0; i < SMC_RMBE_SIZES; i++) {
1544 list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
1545 if (!buf_desc->used)
1546 continue;
1547 rc = smcr_link_reg_rmb(lnk, buf_desc);
1548 if (rc)
1549 goto out;
1550 }
1551 }
1552out:
1553 mutex_unlock(&lgr->rmbs_lock);
1554 return rc;
1555}
1556
1557static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
1558 bool is_rmb, int bufsize)
1559{
1560 struct smc_buf_desc *buf_desc;
1561
1562 /* try to alloc a new buffer */
1563 buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
1564 if (!buf_desc)
1565 return ERR_PTR(-ENOMEM);
1566
1567 buf_desc->order = get_order(bufsize);
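	/* opportunistic high-order allocation: no warning, no dipping into
	 * reserves, no hard retries; the caller falls back to a smaller
	 * buffer size on failure
	 */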
1568 buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
1569 __GFP_NOMEMALLOC | __GFP_COMP |
1570 __GFP_NORETRY | __GFP_ZERO,
1571 buf_desc->order);
1572 if (!buf_desc->pages) {
1573 kfree(buf_desc);
1574 return ERR_PTR(-EAGAIN);
1575 }
1576 buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
1577 buf_desc->len = bufsize;
1578 return buf_desc;
1579}
1580
1581/* map buf_desc on all usable links,
1582 * unused buffers stay mapped as long as the link is up
1583 */
1584static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
1585 struct smc_buf_desc *buf_desc, bool is_rmb)
1586{
1587 int i, rc = 0, cnt = 0;
1588
1589 /* protect against parallel link reconfiguration */
1590 mutex_lock(&lgr->llc_conf_mutex);
1591 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1592 struct smc_link *lnk = &lgr->lnk[i];
1593
1594 if (!smc_link_usable(lnk))
1595 continue;
1596 if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
1597 rc = -ENOMEM;
1598 goto out;
1599 }
1600 cnt++;
1601 }
1602out:
1603 mutex_unlock(&lgr->llc_conf_mutex);
1604 if (!rc && !cnt)
1605 rc = -EINVAL;
1606 return rc;
1607}
1608
1609#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
1610
1611static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
1612 bool is_dmb, int bufsize)
1613{
1614 struct smc_buf_desc *buf_desc;
1615 int rc;
1616
1617 if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
1618 return ERR_PTR(-EAGAIN);
1619
1620 /* try to alloc a new DMB */
1621 buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
1622 if (!buf_desc)
1623 return ERR_PTR(-ENOMEM);
1624 if (is_dmb) {
1625 rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
1626 if (rc) {
1627 kfree(buf_desc);
1628 if (rc == -ENOMEM)
1629 return ERR_PTR(-EAGAIN);
1630 if (rc == -ENOSPC)
1631 return ERR_PTR(-ENOSPC);
1632 return ERR_PTR(-EIO);
1633 }
1634 buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
1635 /* CDC header stored in buf. So, pretend it was smaller */
1636 buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
1637 } else {
1638 buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
1639 __GFP_NOWARN | __GFP_NORETRY |
1640 __GFP_NOMEMALLOC);
1641 if (!buf_desc->cpu_addr) {
1642 kfree(buf_desc);
1643 return ERR_PTR(-EAGAIN);
1644 }
1645 buf_desc->len = bufsize;
1646 }
1647 return buf_desc;
1648}
1649
1650static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
1651{
1652 struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
1653 struct smc_connection *conn = &smc->conn;
1654 struct smc_link_group *lgr = conn->lgr;
1655 struct list_head *buf_list;
1656 int bufsize, bufsize_short;
1657 struct mutex *lock; /* lock buffer list */
1658 int sk_buf_size;
1659
1660 if (is_rmb)
1661 /* use socket recv buffer size (w/o overhead) as start value */
1662 sk_buf_size = smc->sk.sk_rcvbuf / 2;
1663 else
1664 /* use socket send buffer size (w/o overhead) as start value */
1665 sk_buf_size = smc->sk.sk_sndbuf / 2;
1666
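	/* start from the requested size and fall back to smaller RMBE sizes
	 * until a reusable slot is found or a new buffer can be allocated
	 */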
1667 for (bufsize_short = smc_compress_bufsize(sk_buf_size);
1668 bufsize_short >= 0; bufsize_short--) {
1669
1670 if (is_rmb) {
1671 lock = &lgr->rmbs_lock;
1672 buf_list = &lgr->rmbs[bufsize_short];
1673 } else {
1674 lock = &lgr->sndbufs_lock;
1675 buf_list = &lgr->sndbufs[bufsize_short];
1676 }
1677 bufsize = smc_uncompress_bufsize(bufsize_short);
1678 if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
1679 continue;
1680
1681 /* check for reusable slot in the link group */
1682 buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
1683 if (buf_desc) {
1684 memset(buf_desc->cpu_addr, 0, bufsize);
1685 break; /* found reusable slot */
1686 }
1687
1688 if (is_smcd)
1689 buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
1690 else
1691 buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
1692
1693 if (PTR_ERR(buf_desc) == -ENOMEM)
1694 break;
1695 if (IS_ERR(buf_desc))
1696 continue;
1697
1698 buf_desc->used = 1;
1699 mutex_lock(lock);
1700 list_add(&buf_desc->list, buf_list);
1701 mutex_unlock(lock);
1702 break; /* found */
1703 }
1704
1705 if (IS_ERR(buf_desc))
1706 return PTR_ERR(buf_desc);
1707
1708 if (!is_smcd) {
1709 if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
1710 smcr_buf_unuse(buf_desc, lgr);
1711 return -ENOMEM;
1712 }
1713 }
1714
1715 if (is_rmb) {
1716 conn->rmb_desc = buf_desc;
1717 conn->rmbe_size_short = bufsize_short;
1718 smc->sk.sk_rcvbuf = bufsize * 2;
1719 atomic_set(&conn->bytes_to_rcv, 0);
1720 conn->rmbe_update_limit =
1721 smc_rmb_wnd_update_limit(buf_desc->len);
1722 if (is_smcd)
1723 smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
1724 } else {
1725 conn->sndbuf_desc = buf_desc;
1726 smc->sk.sk_sndbuf = bufsize * 2;
1727 atomic_set(&conn->sndbuf_space, bufsize);
1728 }
1729 return 0;
1730}
1731
1732void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
1733{
1734 if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
1735 return;
1736 smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
1737}
1738
1739void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
1740{
1741 if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk))
1742 return;
1743 smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
1744}
1745
1746void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
1747{
1748 int i;
1749
1750 if (!conn->lgr || conn->lgr->is_smcd)
1751 return;
1752 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1753 if (!smc_link_active(&conn->lgr->lnk[i]))
1754 continue;
1755 smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
1756 DMA_FROM_DEVICE);
1757 }
1758}
1759
1760void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
1761{
1762 int i;
1763
1764 if (!conn->lgr || conn->lgr->is_smcd)
1765 return;
1766 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1767 if (!smc_link_active(&conn->lgr->lnk[i]))
1768 continue;
1769 smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc,
1770 DMA_FROM_DEVICE);
1771 }
1772}
1773
1774/* create the send and receive buffer for an SMC socket;
1775 * receive buffers are called RMBs;
1776 * (even though the SMC protocol allows more than one RMB-element per RMB,
1777 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
1778 * extra RMB for every connection in a link group)
1779 */
1780int smc_buf_create(struct smc_sock *smc, bool is_smcd)
1781{
1782 int rc;
1783
1784 /* create send buffer */
1785 rc = __smc_buf_create(smc, is_smcd, false);
1786 if (rc)
1787 return rc;
1788 /* create rmb */
1789 rc = __smc_buf_create(smc, is_smcd, true);
1790 if (rc) {
1791 mutex_lock(&smc->conn.lgr->sndbufs_lock);
1792 list_del(&smc->conn.sndbuf_desc->list);
1793 mutex_unlock(&smc->conn.lgr->sndbufs_lock);
1794 smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
1795 smc->conn.sndbuf_desc = NULL;
1796 }
1797 return rc;
1798}
1799
1800static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
1801{
1802 int i;
1803
1804 for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
1805 if (!test_and_set_bit(i, lgr->rtokens_used_mask))
1806 return i;
1807 }
1808 return -ENOSPC;
1809}
1810
1811static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
1812 u32 rkey)
1813{
1814 int i;
1815
1816 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
1817 if (test_bit(i, lgr->rtokens_used_mask) &&
1818 lgr->rtokens[i][lnk_idx].rkey == rkey)
1819 return i;
1820 }
1821 return -ENOENT;
1822}
1823
1824/* set rtoken for a new link to an existing rmb */
1825void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
1826 __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
1827{
1828 int rtok_idx;
1829
1830 rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
1831 if (rtok_idx == -ENOENT)
1832 return;
1833 lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey);
1834 lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr);
1835}
1836
1837/* set rtoken for a new link whose link_id is given */
1838void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
1839 __be64 nw_vaddr, __be32 nw_rkey)
1840{
1841 u64 dma_addr = be64_to_cpu(nw_vaddr);
1842 u32 rkey = ntohl(nw_rkey);
1843 bool found = false;
1844 int link_idx;
1845
1846 for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) {
1847 if (lgr->lnk[link_idx].link_id == link_id) {
1848 found = true;
1849 break;
1850 }
1851 }
1852 if (!found)
1853 return;
1854 lgr->rtokens[rtok_idx][link_idx].rkey = rkey;
1855 lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr;
1856}
1857
1858/* add a new rtoken from peer */
1859int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
1860{
1861 struct smc_link_group *lgr = smc_get_lgr(lnk);
1862 u64 dma_addr = be64_to_cpu(nw_vaddr);
1863 u32 rkey = ntohl(nw_rkey);
1864 int i;
1865
1866 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
1867 if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
1868 lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
1869 test_bit(i, lgr->rtokens_used_mask)) {
1870 /* already in list */
1871 return i;
1872 }
1873 }
1874 i = smc_rmb_reserve_rtoken_idx(lgr);
1875 if (i < 0)
1876 return i;
1877 lgr->rtokens[i][lnk->link_idx].rkey = rkey;
1878 lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
1879 return i;
1880}
1881
1882/* delete an rtoken from all links */
1883int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
1884{
1885 struct smc_link_group *lgr = smc_get_lgr(lnk);
1886 u32 rkey = ntohl(nw_rkey);
1887 int i, j;
1888
1889 for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
1890 if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
1891 test_bit(i, lgr->rtokens_used_mask)) {
1892 for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) {
1893 lgr->rtokens[i][j].rkey = 0;
1894 lgr->rtokens[i][j].dma_addr = 0;
1895 }
1896 clear_bit(i, lgr->rtokens_used_mask);
1897 return 0;
1898 }
1899 }
1900 return -ENOENT;
1901}
1902
1903/* save rkey and dma_addr received from peer during clc handshake */
1904int smc_rmb_rtoken_handling(struct smc_connection *conn,
1905 struct smc_link *lnk,
1906 struct smc_clc_msg_accept_confirm *clc)
1907{
1908 conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
1909 clc->r0.rmb_rkey);
1910 if (conn->rtoken_idx < 0)
1911 return conn->rtoken_idx;
1912 return 0;
1913}
1914
1915static void smc_core_going_away(void)
1916{
1917 struct smc_ib_device *smcibdev;
1918 struct smcd_dev *smcd;
1919
1920 mutex_lock(&smc_ib_devices.mutex);
1921 list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
1922 int i;
1923
1924 for (i = 0; i < SMC_MAX_PORTS; i++)
1925 set_bit(i, smcibdev->ports_going_away);
1926 }
1927 mutex_unlock(&smc_ib_devices.mutex);
1928
1929 mutex_lock(&smcd_dev_list.mutex);
1930 list_for_each_entry(smcd, &smcd_dev_list.list, list) {
1931 smcd->going_away = 1;
1932 }
1933 mutex_unlock(&smcd_dev_list.mutex);
1934}
1935
1936/* Clean up all SMC link groups */
1937static void smc_lgrs_shutdown(void)
1938{
1939 struct smcd_dev *smcd;
1940
1941 smc_core_going_away();
1942
1943 smc_smcr_terminate_all(NULL);
1944
1945 mutex_lock(&smcd_dev_list.mutex);
1946 list_for_each_entry(smcd, &smcd_dev_list.list, list)
1947 smc_smcd_terminate_all(smcd);
1948 mutex_unlock(&smcd_dev_list.mutex);
1949}
1950
1951static int smc_core_reboot_event(struct notifier_block *this,
1952 unsigned long event, void *ptr)
1953{
1954 smc_lgrs_shutdown();
1955 smc_ib_unregister_client();
1956 return 0;
1957}
1958
1959static struct notifier_block smc_reboot_notifier = {
1960 .notifier_call = smc_core_reboot_event,
1961};
1962
1963int __init smc_core_init(void)
1964{
1965 return register_reboot_notifier(&smc_reboot_notifier);
1966}
1967
1968/* Called (from smc_exit) when module is removed */
1969void smc_core_exit(void)
1970{
1971 unregister_reboot_notifier(&smc_reboot_notifier);
1972 smc_lgrs_shutdown();
1973}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000..9364d0f35
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,425 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Definitions for SMC Connections, Link Groups and Links
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef _SMC_CORE_H
13#define _SMC_CORE_H
14
15#include <linux/atomic.h>
16#include <rdma/ib_verbs.h>
17
18#include "smc.h"
19#include "smc_ib.h"
20
21#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
22
23struct smc_lgr_list { /* list of link group definition */
24 struct list_head list;
25 spinlock_t lock; /* protects list of link groups */
26 u32 num; /* unique link group number */
27};
28
29enum smc_lgr_role { /* possible roles of a link group */
30 SMC_CLNT, /* client */
31 SMC_SERV /* server */
32};
33
34enum smc_link_state { /* possible states of a link */
35 SMC_LNK_UNUSED, /* link is unused */
36 SMC_LNK_INACTIVE, /* link is inactive */
37 SMC_LNK_ACTIVATING, /* link is being activated */
38 SMC_LNK_ACTIVE, /* link is active */
39};
40
41#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
42
43struct smc_wr_buf {
44 u8 raw[SMC_WR_BUF_SIZE];
45};
46
47#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
48
49enum smc_wr_reg_state {
50 POSTED, /* ib_wr_reg_mr request posted */
51 CONFIRMED, /* ib_wr_reg_mr response: successful */
52 FAILED /* ib_wr_reg_mr response: failure */
53};
54
55struct smc_rdma_sge { /* sges for RDMA writes */
56 struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
57};
58
59#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per
60 * message send
61 */
62
63struct smc_rdma_sges { /* sges per message send */
64 struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES];
65};
66
67struct smc_rdma_wr { /* work requests per message
68 * send
69 */
70 struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
71};
72
73#define SMC_LGR_ID_SIZE 4
74
75struct smc_link {
76 struct smc_ib_device *smcibdev; /* ib-device */
77 u8 ibport; /* port - values 1 | 2 */
78 struct ib_pd *roce_pd; /* IB protection domain,
79 * unique for every RoCE QP
80 */
81 struct ib_qp *roce_qp; /* IB queue pair */
82 struct ib_qp_attr qp_attr; /* IB queue pair attributes */
83
84 struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
85 struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
86 struct ib_sge *wr_tx_sges; /* WR send gather meta data */
87 struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
88 struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
89 struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
90 struct completion *wr_tx_compl; /* WR send CQE completion */
91 /* above four vectors have wr_tx_cnt elements and use the same index */
92 dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
93 atomic_long_t wr_tx_id; /* seq # of last sent WR */
94 unsigned long *wr_tx_mask; /* bit mask of used indexes */
95 u32 wr_tx_cnt; /* number of WR send buffers */
96 wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
97 atomic_t wr_tx_refcnt; /* tx refs to link */
98
99 struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
100 struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
101 struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
102 /* above three vectors have wr_rx_cnt elements and use the same index */
103 dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
104 u64 wr_rx_id; /* seq # of last recv WR */
105 u32 wr_rx_cnt; /* number of WR recv buffers */
106 unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
107
108 struct ib_reg_wr wr_reg; /* WR register memory region */
109 wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
110 atomic_t wr_reg_refcnt; /* reg refs to link */
111 enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
112
113 u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
114 u8 sgid_index; /* gid index for vlan id */
115 u32 peer_qpn; /* QP number of peer */
116 enum ib_mtu path_mtu; /* used mtu */
117 enum ib_mtu peer_mtu; /* mtu size of peer */
118 u32 psn_initial; /* QP tx initial packet seqno */
119 u32 peer_psn; /* QP rx initial packet seqno */
120 u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
121 u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
122 u8 link_id; /* unique # within link group */
123 u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
124 u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
125 u8 link_idx; /* index in lgr link array */
126 u8 link_is_asym; /* is link asymmetric? */
127 struct smc_link_group *lgr; /* parent link group */
128 struct work_struct link_down_wrk; /* wrk to bring link down */
129
130 enum smc_link_state state; /* state of link */
131 struct delayed_work llc_testlink_wrk; /* testlink worker */
132 struct completion llc_testlink_resp; /* wait for rx of testlink */
133 int llc_testlink_time; /* testlink interval */
134};
135
136/* The SMC protocol allows up to 8 links per link group; this implementation
137 * currently supports at most SMC_LINKS_PER_LGR_MAX parallel links.
138 */
139#define SMC_LINKS_PER_LGR_MAX 3
140#define SMC_SINGLE_LINK 0
141
142/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
143struct smc_buf_desc {
144 struct list_head list;
145 void *cpu_addr; /* virtual address of buffer */
146 struct page *pages;
147 int len; /* length of buffer */
148 u32 used; /* currently used / unused */
149 union {
150 struct { /* SMC-R */
151 struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
152 /* virtual buffer */
153 struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
154 /* for rmb only: memory region
155 * incl. rkey provided to peer
156 */
157 u32 order; /* allocation order */
158
159 u8 is_conf_rkey;
160 /* confirm_rkey done */
161 u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX];
162 /* mem region registered */
163 u8 is_map_ib[SMC_LINKS_PER_LGR_MAX];
164 /* mem region mapped to lnk */
165 u8 is_reg_err;
166 /* buffer registration err */
167 };
168 struct { /* SMC-D */
169 unsigned short sba_idx;
170 /* SBA index number */
171 u64 token;
172 /* DMB token number */
173 dma_addr_t dma_addr;
174 /* DMA address */
175 };
176 };
177};
178
179struct smc_rtoken { /* address/key of remote RMB */
180 u64 dma_addr;
181 u32 rkey;
182};
183
184#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
185#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
186/* theoretically, the RFC states that largest size would be 512K,
187 * i.e. compressed 5 and thus 6 sizes (0..5), despite
188 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
189 */
190
191struct smcd_dev;
192
193enum smc_lgr_type { /* redundancy state of lgr */
194 SMC_LGR_NONE, /* no active links, lgr to be deleted */
195 SMC_LGR_SINGLE, /* 1 active RNIC on each peer */
196 SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */
197 SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */
198 SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */
199};
200
201enum smc_llc_flowtype {
202 SMC_LLC_FLOW_NONE = 0,
203 SMC_LLC_FLOW_ADD_LINK = 2,
204 SMC_LLC_FLOW_DEL_LINK = 4,
205 SMC_LLC_FLOW_RKEY = 6,
206};
207
208struct smc_llc_qentry;
209
210struct smc_llc_flow {
211 enum smc_llc_flowtype type;
212 struct smc_llc_qentry *qentry;
213};
214
215struct smc_link_group {
216 struct list_head list;
217 struct rb_root conns_all; /* connection tree */
218 rwlock_t conns_lock; /* protects conns_all */
219 unsigned int conns_num; /* current # of connections */
220 unsigned short vlan_id; /* vlan id of link group */
221
222 struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
223 struct mutex sndbufs_lock; /* protects tx buffers */
224 struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
225 struct mutex rmbs_lock; /* protects rx buffers */
226
227 u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
228 struct delayed_work free_work; /* delayed freeing of an lgr */
229 struct work_struct terminate_work; /* abnormal lgr termination */
230 struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
231 u8 sync_err : 1; /* lgr no longer fits to peer */
232 u8 terminating : 1;/* lgr is terminating */
233 u8 freeing : 1; /* lgr is being freed */
234
235 bool is_smcd; /* SMC-R or SMC-D */
236 u8 smc_version;
237 u8 negotiated_eid[SMC_MAX_EID_LEN];
238 u8 peer_os; /* peer operating system */
239 u8 peer_smc_release;
240 u8 peer_hostname[SMC_MAX_HOSTNAME_LEN];
241 union {
242 struct { /* SMC-R */
243 enum smc_lgr_role role;
244 /* client or server */
245 struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
246 /* smc link */
247 char peer_systemid[SMC_SYSTEMID_LEN];
248 /* unique system_id of peer */
249 struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
250 [SMC_LINKS_PER_LGR_MAX];
251 /* remote addr/key pairs */
252 DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
253 /* used rtoken elements */
254 u8 next_link_id;
255 enum smc_lgr_type type;
256 /* redundancy state */
257 u8 pnet_id[SMC_MAX_PNETID_LEN + 1];
258 /* pnet id of this lgr */
259 struct list_head llc_event_q;
260 /* queue for llc events */
261 spinlock_t llc_event_q_lock;
262 /* protects llc_event_q */
263 struct mutex llc_conf_mutex;
264 /* protects lgr reconfig. */
265 struct work_struct llc_add_link_work;
266 struct work_struct llc_del_link_work;
267 struct work_struct llc_event_work;
268 /* llc event worker */
269 wait_queue_head_t llc_flow_waiter;
270 /* w4 next llc event */
271 wait_queue_head_t llc_msg_waiter;
272 /* w4 next llc msg */
273 struct smc_llc_flow llc_flow_lcl;
274 /* llc local control field */
275 struct smc_llc_flow llc_flow_rmt;
276 /* llc remote control field */
277 struct smc_llc_qentry *delayed_event;
278 /* arrived when flow active */
279 spinlock_t llc_flow_lock;
280 /* protects llc flow */
281 int llc_testlink_time;
282 /* link keep alive time */
283 u32 llc_termination_rsn;
284 /* rsn code for termination */
285 };
286 struct { /* SMC-D */
287 u64 peer_gid;
288 /* Peer GID (remote) */
289 struct smcd_dev *smcd;
290 /* ISM device for VLAN reg. */
291 u8 peer_shutdown : 1;
292 /* peer triggered shutdown */
293 };
294 };
295};
296
297struct smc_clc_msg_local;
298
299struct smc_init_info {
300 u8 is_smcd;
301 u8 smc_type_v1;
302 u8 smc_type_v2;
303 u8 first_contact_peer;
304 u8 first_contact_local;
305 unsigned short vlan_id;
306 /* SMC-R */
307 struct smc_clc_msg_local *ib_lcl;
308 struct smc_ib_device *ib_dev;
309 u8 ib_gid[SMC_GID_SIZE];
310 u8 ib_port;
311 u32 ib_clcqpn;
312 /* SMC-D */
313 u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
314 struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1];
315 u16 ism_chid[SMC_MAX_ISM_DEVS + 1];
316 u8 ism_offered_cnt; /* # of ISM devices offered */
317 u8 ism_selected; /* index of selected ISM dev*/
318 u8 smcd_version;
319};
320
321/* Find the connection associated with the given alert token in the link group.
322 * To use rbtrees we have to implement our own search core.
323 * Requires @conns_lock
324 * @token alert token to search for
325 * @lgr link group to search in
326 * Returns connection associated with token if found, NULL otherwise.
327 */
328static inline struct smc_connection *smc_lgr_find_conn(
329 u32 token, struct smc_link_group *lgr)
330{
331 struct smc_connection *res = NULL;
332 struct rb_node *node;
333
334 node = lgr->conns_all.rb_node;
335 while (node) {
336 struct smc_connection *cur = rb_entry(node,
337 struct smc_connection, alert_node);
338
339 if (cur->alert_token_local > token) {
340 node = node->rb_left;
341 } else {
342 if (cur->alert_token_local < token) {
343 node = node->rb_right;
344 } else {
345 res = cur;
346 break;
347 }
348 }
349 }
350
351 return res;
352}
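Since the lookup requires conns_lock, a caller typically takes the read side of the lock, searches, and acts on the connection before releasing it. A hypothetical caller sketch (illustrative only, not part of this header):

	/* illustrative only: look up a connection by alert token while
	 * holding the read side of conns_lock; the connection must be used
	 * (or pinned via its socket) before the lock is dropped
	 */
	static void smc_lgr_with_conn(struct smc_link_group *lgr, u32 token)
	{
		struct smc_connection *conn;

		read_lock_bh(&lgr->conns_lock);
		conn = smc_lgr_find_conn(token, lgr);
		if (conn) {
			/* ... act on conn while the lock is held ... */
		}
		read_unlock_bh(&lgr->conns_lock);
	}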
353
354/* returns true if the specified link is usable */
355static inline bool smc_link_usable(struct smc_link *lnk)
356{
357 if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE)
358 return false;
359 return true;
360}
361
362static inline bool smc_link_sendable(struct smc_link *lnk)
363{
364 return smc_link_usable(lnk) &&
365 lnk->qp_attr.cur_qp_state == IB_QPS_RTS;
366}
367
368static inline bool smc_link_active(struct smc_link *lnk)
369{
370 return lnk->state == SMC_LNK_ACTIVE;
371}
372
373struct smc_sock;
374struct smc_clc_msg_accept_confirm;
375struct smc_clc_msg_local;
376
377void smc_lgr_cleanup_early(struct smc_connection *conn);
378void smc_lgr_terminate_sched(struct smc_link_group *lgr);
379void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
380void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
381void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
382 unsigned short vlan);
383void smc_smcd_terminate_all(struct smcd_dev *dev);
384void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
385int smc_buf_create(struct smc_sock *smc, bool is_smcd);
386int smc_uncompress_bufsize(u8 compressed);
387int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
388 struct smc_clc_msg_accept_confirm *clc);
389int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
390int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
391void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
392 __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
393void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
394 __be64 nw_vaddr, __be32 nw_rkey);
395void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
396void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
397void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
398void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
399int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
400
401void smc_conn_free(struct smc_connection *conn);
402int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
403void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
404int smc_core_init(void);
405void smc_core_exit(void);
406
407int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
408 u8 link_idx, struct smc_init_info *ini);
409void smcr_link_clear(struct smc_link *lnk, bool log);
410int smcr_buf_map_lgr(struct smc_link *lnk);
411int smcr_buf_reg_lgr(struct smc_link *lnk);
412void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
413void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
414 enum smc_lgr_type new_type, int asym_lnk_idx);
415int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc);
416struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
417 struct smc_link *from_lnk, bool is_dev_err);
418void smcr_link_down_cond(struct smc_link *lnk);
419void smcr_link_down_cond_sched(struct smc_link *lnk);
420
421static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
422{
423 return link->lgr;
424}
425#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000..f15fca59b
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,283 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Monitoring SMC transport protocol sockets
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/sock_diag.h>
17#include <linux/inet_diag.h>
18#include <linux/smc_diag.h>
19#include <net/netlink.h>
20#include <net/smc.h>
21
22#include "smc.h"
23#include "smc_core.h"
24
25struct smc_diag_dump_ctx {
26 int pos[2];
27};
28
29static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
30{
31 return (struct smc_diag_dump_ctx *)cb->ctx;
32}
33
34static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
35{
36 sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
37 be16_to_cpu(((__be16 *)gid_raw)[0]),
38 be16_to_cpu(((__be16 *)gid_raw)[1]),
39 be16_to_cpu(((__be16 *)gid_raw)[2]),
40 be16_to_cpu(((__be16 *)gid_raw)[3]),
41 be16_to_cpu(((__be16 *)gid_raw)[4]),
42 be16_to_cpu(((__be16 *)gid_raw)[5]),
43 be16_to_cpu(((__be16 *)gid_raw)[6]),
44 be16_to_cpu(((__be16 *)gid_raw)[7]));
45}
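For illustration, the helper above renders a 16-byte RoCE GID as eight big-endian 16-bit groups. A stand-alone sketch with a made-up GID value (hypothetical, user space only):

	#include <stdio.h>

	int main(void)
	{
		/* made-up link-local RoCE GID */
		unsigned char gid[16] = { 0xfe, 0x80, 0, 0, 0, 0, 0, 0,
					  0x02, 0x02, 0xc9, 0xff,
					  0xfe, 0x3f, 0x12, 0x34 };
		int i;

		/* prints fe80:0000:0000:0000:0202:c9ff:fe3f:1234 */
		for (i = 0; i < 16; i += 2)
			printf("%04x%s", (gid[i] << 8) | gid[i + 1],
			       i < 14 ? ":" : "\n");
		return 0;
	}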
46
47static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
48{
49 struct smc_sock *smc = smc_sk(sk);
50
51 memset(r, 0, sizeof(*r));
52 r->diag_family = sk->sk_family;
53 sock_diag_save_cookie(sk, r->id.idiag_cookie);
54 if (!smc->clcsock)
55 return;
56 r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
57 r->id.idiag_dport = smc->clcsock->sk->sk_dport;
58 r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
59 if (sk->sk_protocol == SMCPROTO_SMC) {
60 r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
61 r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
62#if IS_ENABLED(CONFIG_IPV6)
63 } else if (sk->sk_protocol == SMCPROTO_SMC6) {
64 memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
65 sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
66 memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
67 sizeof(smc->clcsock->sk->sk_v6_daddr));
68#endif
69 }
70}
71
72static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
73 struct smc_diag_msg *r,
74 struct user_namespace *user_ns)
75{
76 if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
77 return 1;
78
79 r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
80 r->diag_inode = sock_i_ino(sk);
81 return 0;
82}
83
84static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
85 struct netlink_callback *cb,
86 const struct smc_diag_req *req,
87 struct nlattr *bc)
88{
89 struct smc_sock *smc = smc_sk(sk);
90 struct smc_diag_fallback fallback;
91 struct user_namespace *user_ns;
92 struct smc_diag_msg *r;
93 struct nlmsghdr *nlh;
94
95 nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
96 cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
97 if (!nlh)
98 return -EMSGSIZE;
99
100 r = nlmsg_data(nlh);
101 smc_diag_msg_common_fill(r, sk);
102 r->diag_state = sk->sk_state;
103 if (smc->use_fallback)
104 r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
105 else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
106 r->diag_mode = SMC_DIAG_MODE_SMCD;
107 else
108 r->diag_mode = SMC_DIAG_MODE_SMCR;
109 user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
110 if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
111 goto errout;
112
113 fallback.reason = smc->fallback_rsn;
114 fallback.peer_diagnosis = smc->peer_diagnosis;
115 if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
116 goto errout;
117
118 if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
119 smc->conn.alert_token_local) {
120 struct smc_connection *conn = &smc->conn;
121 struct smc_diag_conninfo cinfo = {
122 .token = conn->alert_token_local,
123 .sndbuf_size = conn->sndbuf_desc ?
124 conn->sndbuf_desc->len : 0,
125 .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
126 .peer_rmbe_size = conn->peer_rmbe_size,
127
128 .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
129 .rx_prod.count = conn->local_rx_ctrl.prod.count,
130 .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
131 .rx_cons.count = conn->local_rx_ctrl.cons.count,
132
133 .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
134 .tx_prod.count = conn->local_tx_ctrl.prod.count,
135 .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
136 .tx_cons.count = conn->local_tx_ctrl.cons.count,
137
138 .tx_prod_flags =
139 *(u8 *)&conn->local_tx_ctrl.prod_flags,
140 .tx_conn_state_flags =
141 *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
142 .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
143 .rx_conn_state_flags =
144 *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
145
146 .tx_prep.wrap = conn->tx_curs_prep.wrap,
147 .tx_prep.count = conn->tx_curs_prep.count,
148 .tx_sent.wrap = conn->tx_curs_sent.wrap,
149 .tx_sent.count = conn->tx_curs_sent.count,
150 .tx_fin.wrap = conn->tx_curs_fin.wrap,
151 .tx_fin.count = conn->tx_curs_fin.count,
152 };
153
154 if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
155 goto errout;
156 }
157
158 if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
159 (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
160 !list_empty(&smc->conn.lgr->list)) {
161 struct smc_diag_lgrinfo linfo = {
162 .role = smc->conn.lgr->role,
163 .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
164 .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
165 };
166
167 memcpy(linfo.lnk[0].ibname,
168 smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
169 sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
170 smc_gid_be16_convert(linfo.lnk[0].gid,
171 smc->conn.lgr->lnk[0].gid);
172 smc_gid_be16_convert(linfo.lnk[0].peer_gid,
173 smc->conn.lgr->lnk[0].peer_gid);
174
175 if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
176 goto errout;
177 }
178 if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
179 (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
180 !list_empty(&smc->conn.lgr->list)) {
181 struct smc_connection *conn = &smc->conn;
182 struct smcd_diag_dmbinfo dinfo;
183
184 memset(&dinfo, 0, sizeof(dinfo));
185
186 dinfo.linkid = *((u32 *)conn->lgr->id);
187 dinfo.peer_gid = conn->lgr->peer_gid;
188 dinfo.my_gid = conn->lgr->smcd->local_gid;
189 dinfo.token = conn->rmb_desc->token;
190 dinfo.peer_token = conn->peer_token;
191
192 if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
193 goto errout;
194 }
195
196 nlmsg_end(skb, nlh);
197 return 0;
198
199errout:
200 nlmsg_cancel(skb, nlh);
201 return -EMSGSIZE;
202}
203
204static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
205 struct netlink_callback *cb, int p_type)
206{
207 struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb);
208 struct net *net = sock_net(skb->sk);
209 int snum = cb_ctx->pos[p_type];
210 struct nlattr *bc = NULL;
211 struct hlist_head *head;
212 int rc = 0, num = 0;
213 struct sock *sk;
214
215 read_lock(&prot->h.smc_hash->lock);
216 head = &prot->h.smc_hash->ht;
217 if (hlist_empty(head))
218 goto out;
219
220 sk_for_each(sk, head) {
221 if (!net_eq(sock_net(sk), net))
222 continue;
223 if (num < snum)
224 goto next;
225 rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
226 if (rc < 0)
227 goto out;
228next:
229 num++;
230 }
231
232out:
233 read_unlock(&prot->h.smc_hash->lock);
234 cb_ctx->pos[p_type] = num;
235 return rc;
236}
237
238static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
239{
240 int rc = 0;
241
242 rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC);
243 if (!rc)
244 smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
245 return skb->len;
246}
247
248static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
249{
250 struct net *net = sock_net(skb->sk);
251
252 if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
253 h->nlmsg_flags & NLM_F_DUMP) {
254 {
255 struct netlink_dump_control c = {
256 .dump = smc_diag_dump,
257 .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
258 };
259 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
260 }
261 }
262 return 0;
263}
264
265static const struct sock_diag_handler smc_diag_handler = {
266 .family = AF_SMC,
267 .dump = smc_diag_handler_dump,
268};
269
270static int __init smc_diag_init(void)
271{
272 return sock_diag_register(&smc_diag_handler);
273}
274
275static void __exit smc_diag_exit(void)
276{
277 sock_diag_unregister(&smc_diag_handler);
278}
279
280module_init(smc_diag_init);
281module_exit(smc_diag_exit);
282MODULE_LICENSE("GPL");
283MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
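A monitoring tool such as smcss consumes this interface by sending a SOCK_DIAG_BY_FAMILY dump request for AF_SMC over a NETLINK_SOCK_DIAG socket. The following user-space sketch is not part of this patch; it assumes the uapi headers linux/smc_diag.h and linux/inet_diag.h are installed and omits most error handling:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/sock_diag.h>
	#include <linux/inet_diag.h>
	#include <linux/smc_diag.h>

	#ifndef AF_SMC
	#define AF_SMC 43		/* matches the MODULE_ALIAS above */
	#endif

	int main(void)
	{
		struct {
			struct nlmsghdr nlh;
			struct smc_diag_req req;
		} msg = {
			.nlh = {
				.nlmsg_len = sizeof(msg),
				.nlmsg_type = SOCK_DIAG_BY_FAMILY,
				.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
			},
			.req = {
				.diag_family = AF_SMC,
				.diag_ext = 1 << (SMC_DIAG_CONNINFO - 1),
			},
		};
		char buf[32768];
		ssize_t len;
		int fd;

		fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
		if (fd < 0 || send(fd, &msg, sizeof(msg), 0) < 0)
			return 1;
		/* walk the multipart dump until NLMSG_DONE arrives */
		while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
			struct nlmsghdr *h = (struct nlmsghdr *)buf;

			for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
				struct smc_diag_msg *r = NLMSG_DATA(h);

				if (h->nlmsg_type == NLMSG_DONE) {
					close(fd);
					return 0;
				}
				if (h->nlmsg_type != SOCK_DIAG_BY_FAMILY)
					continue;
				printf("state %u mode %u inode %llu\n",
				       r->diag_state, r->diag_mode,
				       (unsigned long long)r->diag_inode);
			}
		}
		close(fd);
		return 0;
	}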
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000..f1ffbd414
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,643 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * IB infrastructure:
6 * Establish SMC-R as an Infiniband Client to be notified about added and
7 * removed IB devices of type RDMA.
8 * Determine device and port characteristics for these IB devices.
9 *
10 * Copyright IBM Corp. 2016
11 *
12 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
13 */
14
15#include <linux/random.h>
16#include <linux/workqueue.h>
17#include <linux/scatterlist.h>
18#include <linux/wait.h>
19#include <linux/mutex.h>
20#include <rdma/ib_verbs.h>
21#include <rdma/ib_cache.h>
22
23#include "smc_pnet.h"
24#include "smc_ib.h"
25#include "smc_core.h"
26#include "smc_wr.h"
27#include "smc.h"
28
29#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
30
31#define SMC_QP_MIN_RNR_TIMER 5
32#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
33#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
34#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
35
36struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
37 .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
38 .list = LIST_HEAD_INIT(smc_ib_devices.list),
39};
40
41u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
42
43static int smc_ib_modify_qp_init(struct smc_link *lnk)
44{
45 struct ib_qp_attr qp_attr;
46
47 memset(&qp_attr, 0, sizeof(qp_attr));
48 qp_attr.qp_state = IB_QPS_INIT;
49 qp_attr.pkey_index = 0;
50 qp_attr.port_num = lnk->ibport;
51 qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
52 | IB_ACCESS_REMOTE_WRITE;
53 return ib_modify_qp(lnk->roce_qp, &qp_attr,
54 IB_QP_STATE | IB_QP_PKEY_INDEX |
55 IB_QP_ACCESS_FLAGS | IB_QP_PORT);
56}
57
58static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
59{
60 enum ib_qp_attr_mask qp_attr_mask =
61 IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
62 IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
63 struct ib_qp_attr qp_attr;
64
65 memset(&qp_attr, 0, sizeof(qp_attr));
66 qp_attr.qp_state = IB_QPS_RTR;
67 qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
68 qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
69 rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
70 rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
71 rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
72 memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
73 sizeof(lnk->peer_mac));
74 qp_attr.dest_qp_num = lnk->peer_qpn;
75 qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
76 qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
77 * requests
78 */
79 qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
80
81 return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
82}
83
84int smc_ib_modify_qp_rts(struct smc_link *lnk)
85{
86 struct ib_qp_attr qp_attr;
87
88 memset(&qp_attr, 0, sizeof(qp_attr));
89 qp_attr.qp_state = IB_QPS_RTS;
90 qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
91 qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
92 qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
93 qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
94 qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
95 * atomic ops allowed
96 */
97 return ib_modify_qp(lnk->roce_qp, &qp_attr,
98 IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
99 IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
100 IB_QP_MAX_QP_RD_ATOMIC);
101}
102
103int smc_ib_modify_qp_error(struct smc_link *lnk)
104{
105 struct ib_qp_attr qp_attr;
106
107 memset(&qp_attr, 0, sizeof(qp_attr));
108 qp_attr.qp_state = IB_QPS_ERR;
109 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
110}
111
112int smc_ib_ready_link(struct smc_link *lnk)
113{
114 struct smc_link_group *lgr = smc_get_lgr(lnk);
115 int rc = 0;
116
117 rc = smc_ib_modify_qp_init(lnk);
118 if (rc)
119 goto out;
120
121 rc = smc_ib_modify_qp_rtr(lnk);
122 if (rc)
123 goto out;
124 smc_wr_remember_qp_attr(lnk);
125 rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
126 IB_CQ_SOLICITED_MASK);
127 if (rc)
128 goto out;
129 rc = smc_wr_rx_post_init(lnk);
130 if (rc)
131 goto out;
132 smc_wr_remember_qp_attr(lnk);
133
134 if (lgr->role == SMC_SERV) {
135 rc = smc_ib_modify_qp_rts(lnk);
136 if (rc)
137 goto out;
138 smc_wr_remember_qp_attr(lnk);
139 }
140out:
141 return rc;
142}
143
144static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
145{
146 const struct ib_gid_attr *attr;
147 int rc;
148
149 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
150 if (IS_ERR(attr))
151 return -ENODEV;
152
153 rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
154 rdma_put_gid_attr(attr);
155 return rc;
156}
157
158/* Create an identifier unique for this instance of SMC-R.
159 * The MAC-address of the first active registered IB device
160 * plus a random 2-byte number is used to create this identifier.
161 * This name is delivered to the peer during connection initialization.
162 */
163static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
164 u8 ibport)
165{
166 memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
167 sizeof(smcibdev->mac[ibport - 1]));
168}
169
170bool smc_ib_is_valid_local_systemid(void)
171{
172 return !is_zero_ether_addr(&local_systemid[2]);
173}
174
175static void smc_ib_init_local_systemid(void)
176{
177 get_random_bytes(&local_systemid[0], 2);
178}
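The resulting identifier is thus 8 bytes: a random 2-byte prefix followed by the 6-byte MAC of the first active RoCE port. A stand-alone illustration of the layout with made-up values (user space sketch, not the kernel code path):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define SMC_SYSTEMID_LEN 8
	#define ETH_ALEN 6

	int main(void)
	{
		unsigned char systemid[SMC_SYSTEMID_LEN];
		/* hypothetical MAC of the first active port */
		unsigned char mac[ETH_ALEN] = { 0x02, 0x00, 0x00,
						0xab, 0xcd, 0xef };
		int i;

		srand((unsigned int)time(NULL));
		systemid[0] = rand() & 0xff;	/* random 2-byte prefix, */
		systemid[1] = rand() & 0xff;	/* cf. smc_ib_init_local_systemid() */
		memcpy(&systemid[2], mac, ETH_ALEN);
						/* cf. smc_ib_define_local_systemid() */

		for (i = 0; i < SMC_SYSTEMID_LEN; i++)
			printf("%02x", systemid[i]);
		printf("\n");
		return 0;
	}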
179
180bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
181{
182 return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
183}
184
185/* determine the gid for an ib-device port and vlan id */
186int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
187 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
188{
189 const struct ib_gid_attr *attr;
190 const struct net_device *ndev;
191 int i;
192
193 for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
194 attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
195 if (IS_ERR(attr))
196 continue;
197
198 rcu_read_lock();
199 ndev = rdma_read_gid_attr_ndev_rcu(attr);
200 if (!IS_ERR(ndev) &&
201 ((!vlan_id && !is_vlan_dev(ndev)) ||
202 (vlan_id && is_vlan_dev(ndev) &&
203 vlan_dev_vlan_id(ndev) == vlan_id)) &&
204 attr->gid_type == IB_GID_TYPE_ROCE) {
205 rcu_read_unlock();
206 if (gid)
207 memcpy(gid, &attr->gid, SMC_GID_SIZE);
208 if (sgid_index)
209 *sgid_index = attr->index;
210 rdma_put_gid_attr(attr);
211 return 0;
212 }
213 rcu_read_unlock();
214 rdma_put_gid_attr(attr);
215 }
216 return -ENODEV;
217}
218
219static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
220{
221 int rc;
222
223 memset(&smcibdev->pattr[ibport - 1], 0,
224 sizeof(smcibdev->pattr[ibport - 1]));
225 rc = ib_query_port(smcibdev->ibdev, ibport,
226 &smcibdev->pattr[ibport - 1]);
227 if (rc)
228 goto out;
229 /* the SMC protocol requires specification of the RoCE MAC address */
230 rc = smc_ib_fill_mac(smcibdev, ibport);
231 if (rc)
232 goto out;
233 if (!smc_ib_is_valid_local_systemid() &&
234 smc_ib_port_active(smcibdev, ibport))
235 /* create unique system identifier */
236 smc_ib_define_local_systemid(smcibdev, ibport);
237out:
238 return rc;
239}
240
241/* process context wrapper for might_sleep smc_ib_remember_port_attr */
242static void smc_ib_port_event_work(struct work_struct *work)
243{
244 struct smc_ib_device *smcibdev = container_of(
245 work, struct smc_ib_device, port_event_work);
246 u8 port_idx;
247
248 for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
249 smc_ib_remember_port_attr(smcibdev, port_idx + 1);
250 clear_bit(port_idx, &smcibdev->port_event_mask);
251 if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
252 set_bit(port_idx, smcibdev->ports_going_away);
253 smcr_port_err(smcibdev, port_idx + 1);
254 } else {
255 clear_bit(port_idx, smcibdev->ports_going_away);
256 smcr_port_add(smcibdev, port_idx + 1);
257 }
258 }
259}
260
261/* can be called in IRQ context */
262static void smc_ib_global_event_handler(struct ib_event_handler *handler,
263 struct ib_event *ibevent)
264{
265 struct smc_ib_device *smcibdev;
266 bool schedule = false;
267 u8 port_idx;
268
269 smcibdev = container_of(handler, struct smc_ib_device, event_handler);
270
271 switch (ibevent->event) {
272 case IB_EVENT_DEVICE_FATAL:
273 /* terminate all ports on device */
274 for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
275 set_bit(port_idx, &smcibdev->port_event_mask);
276 if (!test_and_set_bit(port_idx,
277 smcibdev->ports_going_away))
278 schedule = true;
279 }
280 if (schedule)
281 schedule_work(&smcibdev->port_event_work);
282 break;
283 case IB_EVENT_PORT_ACTIVE:
284 port_idx = ibevent->element.port_num - 1;
285 if (port_idx >= SMC_MAX_PORTS)
286 break;
287 set_bit(port_idx, &smcibdev->port_event_mask);
288 if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
289 schedule_work(&smcibdev->port_event_work);
290 break;
291 case IB_EVENT_PORT_ERR:
292 port_idx = ibevent->element.port_num - 1;
293 if (port_idx >= SMC_MAX_PORTS)
294 break;
295 set_bit(port_idx, &smcibdev->port_event_mask);
296 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
297 schedule_work(&smcibdev->port_event_work);
298 break;
299 case IB_EVENT_GID_CHANGE:
300 port_idx = ibevent->element.port_num - 1;
301 if (port_idx >= SMC_MAX_PORTS)
302 break;
303 set_bit(port_idx, &smcibdev->port_event_mask);
304 schedule_work(&smcibdev->port_event_work);
305 break;
306 default:
307 break;
308 }
309}
310
311void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
312{
313 if (lnk->roce_pd)
314 ib_dealloc_pd(lnk->roce_pd);
315 lnk->roce_pd = NULL;
316}
317
318int smc_ib_create_protection_domain(struct smc_link *lnk)
319{
320 int rc;
321
322 lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
323 rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
324 if (IS_ERR(lnk->roce_pd))
325 lnk->roce_pd = NULL;
326 return rc;
327}
328
329static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
330{
331 struct smc_link *lnk = (struct smc_link *)priv;
332 struct smc_ib_device *smcibdev = lnk->smcibdev;
333 u8 port_idx;
334
335 switch (ibevent->event) {
336 case IB_EVENT_QP_FATAL:
337 case IB_EVENT_QP_ACCESS_ERR:
338 port_idx = ibevent->element.qp->port - 1;
339 if (port_idx >= SMC_MAX_PORTS)
340 break;
341 set_bit(port_idx, &smcibdev->port_event_mask);
342 if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
343 schedule_work(&smcibdev->port_event_work);
344 break;
345 default:
346 break;
347 }
348}
349
350void smc_ib_destroy_queue_pair(struct smc_link *lnk)
351{
352 if (lnk->roce_qp)
353 ib_destroy_qp(lnk->roce_qp);
354 lnk->roce_qp = NULL;
355}
356
357/* create a queue pair within the protection domain for a link */
358int smc_ib_create_queue_pair(struct smc_link *lnk)
359{
360 struct ib_qp_init_attr qp_attr = {
361 .event_handler = smc_ib_qp_event_handler,
362 .qp_context = lnk,
363 .send_cq = lnk->smcibdev->roce_cq_send,
364 .recv_cq = lnk->smcibdev->roce_cq_recv,
365 .srq = NULL,
366 .cap = {
367 /* include unsolicited rdma_writes as well,
368 * there are max. 2 RDMA_WRITE per 1 WR_SEND
369 */
370 .max_send_wr = SMC_WR_BUF_CNT * 3,
371 .max_recv_wr = SMC_WR_BUF_CNT * 3,
372 .max_send_sge = SMC_IB_MAX_SEND_SGE,
373 .max_recv_sge = 1,
374 },
375 .sq_sig_type = IB_SIGNAL_REQ_WR,
376 .qp_type = IB_QPT_RC,
377 };
378 int rc;
379
380 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
381 rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
382 if (IS_ERR(lnk->roce_qp))
383 lnk->roce_qp = NULL;
384 else
385 smc_wr_remember_qp_attr(lnk);
386 return rc;
387}
388
389void smc_ib_put_memory_region(struct ib_mr *mr)
390{
391 ib_dereg_mr(mr);
392}
393
394static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
395{
396 unsigned int offset = 0;
397 int sg_num;
398
399 /* map the largest prefix of a dma mapped SG list */
400 sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx],
401 buf_slot->sgt[link_idx].sgl,
402 buf_slot->sgt[link_idx].orig_nents,
403 &offset, PAGE_SIZE);
404
405 return sg_num;
406}
407
408/* Allocate a memory region and map the dma mapped SG list of buf_slot */
409int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
410 struct smc_buf_desc *buf_slot, u8 link_idx)
411{
412 if (buf_slot->mr_rx[link_idx])
413 return 0; /* already done */
414
415 buf_slot->mr_rx[link_idx] =
416 ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
417 if (IS_ERR(buf_slot->mr_rx[link_idx])) {
418 int rc;
419
420 rc = PTR_ERR(buf_slot->mr_rx[link_idx]);
421 buf_slot->mr_rx[link_idx] = NULL;
422 return rc;
423 }
424
425 if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1)
426 return -EINVAL;
427
428 return 0;
429}
430
431/* synchronize buffer usage for cpu access */
432void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
433 struct smc_buf_desc *buf_slot,
434 enum dma_data_direction data_direction)
435{
436 struct scatterlist *sg;
437 unsigned int i;
438
439 /* for now there is just one DMA address */
440 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
441 buf_slot->sgt[lnk->link_idx].nents, i) {
442 if (!sg_dma_len(sg))
443 break;
444 ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
445 sg_dma_address(sg),
446 sg_dma_len(sg),
447 data_direction);
448 }
449}
450
451/* synchronize buffer usage for device access */
452void smc_ib_sync_sg_for_device(struct smc_link *lnk,
453 struct smc_buf_desc *buf_slot,
454 enum dma_data_direction data_direction)
455{
456 struct scatterlist *sg;
457 unsigned int i;
458
459 /* for now there is just one DMA address */
460 for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
461 buf_slot->sgt[lnk->link_idx].nents, i) {
462 if (!sg_dma_len(sg))
463 break;
464 ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
465 sg_dma_address(sg),
466 sg_dma_len(sg),
467 data_direction);
468 }
469}
470
471/* Map a new TX or RX buffer SG-table to DMA */
472int smc_ib_buf_map_sg(struct smc_link *lnk,
473 struct smc_buf_desc *buf_slot,
474 enum dma_data_direction data_direction)
475{
476 int mapped_nents;
477
478 mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
479 buf_slot->sgt[lnk->link_idx].sgl,
480 buf_slot->sgt[lnk->link_idx].orig_nents,
481 data_direction);
482 if (!mapped_nents)
483 return -ENOMEM;
484
485 return mapped_nents;
486}
487
488void smc_ib_buf_unmap_sg(struct smc_link *lnk,
489 struct smc_buf_desc *buf_slot,
490 enum dma_data_direction data_direction)
491{
492 if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
493 return; /* already unmapped */
494
495 ib_dma_unmap_sg(lnk->smcibdev->ibdev,
496 buf_slot->sgt[lnk->link_idx].sgl,
497 buf_slot->sgt[lnk->link_idx].orig_nents,
498 data_direction);
499 buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
500}
501
502long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
503{
504 struct ib_cq_init_attr cqattr = {
505 .cqe = SMC_MAX_CQE, .comp_vector = 0 };
506 int cqe_size_order, smc_order;
507 long rc;
508
509 mutex_lock(&smcibdev->mutex);
510 rc = 0;
511 if (smcibdev->initialized)
512 goto out;
513 /* the calculated number of cq entries fits to mlx5 cq allocation */
514 cqe_size_order = cache_line_size() == 128 ? 7 : 6;
515 smc_order = MAX_ORDER - cqe_size_order - 1;
516 if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
517 cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
518 smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
519 smc_wr_tx_cq_handler, NULL,
520 smcibdev, &cqattr);
521 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
522 if (IS_ERR(smcibdev->roce_cq_send)) {
523 smcibdev->roce_cq_send = NULL;
524 goto out;
525 }
526 smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
527 smc_wr_rx_cq_handler, NULL,
528 smcibdev, &cqattr);
529 rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
530 if (IS_ERR(smcibdev->roce_cq_recv)) {
531 smcibdev->roce_cq_recv = NULL;
532 goto err;
533 }
534 smc_wr_add_dev(smcibdev);
535 smcibdev->initialized = 1;
536 goto out;
537
538err:
539 ib_destroy_cq(smcibdev->roce_cq_send);
540out:
541 mutex_unlock(&smcibdev->mutex);
542 return rc;
543}
544
545static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
546{
547 mutex_lock(&smcibdev->mutex);
548 if (!smcibdev->initialized)
549 goto out;
550 smcibdev->initialized = 0;
551 ib_destroy_cq(smcibdev->roce_cq_recv);
552 ib_destroy_cq(smcibdev->roce_cq_send);
553 smc_wr_remove_dev(smcibdev);
554out:
555 mutex_unlock(&smcibdev->mutex);
556}
557
558static struct ib_client smc_ib_client;
559
560/* callback function for ib_register_client() */
561static int smc_ib_add_dev(struct ib_device *ibdev)
562{
563 struct smc_ib_device *smcibdev;
564 u8 port_cnt;
565 int i;
566
567 if (ibdev->node_type != RDMA_NODE_IB_CA)
568 return -EOPNOTSUPP;
569
570 smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
571 if (!smcibdev)
572 return -ENOMEM;
573
574 smcibdev->ibdev = ibdev;
575 INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
576 atomic_set(&smcibdev->lnk_cnt, 0);
577 init_waitqueue_head(&smcibdev->lnks_deleted);
578 mutex_init(&smcibdev->mutex);
579 mutex_lock(&smc_ib_devices.mutex);
580 list_add_tail(&smcibdev->list, &smc_ib_devices.list);
581 mutex_unlock(&smc_ib_devices.mutex);
582 ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
583 INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
584 smc_ib_global_event_handler);
585 ib_register_event_handler(&smcibdev->event_handler);
586
587 /* trigger reading of the port attributes */
588 port_cnt = smcibdev->ibdev->phys_port_cnt;
589 pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
590 smcibdev->ibdev->name, port_cnt);
591 for (i = 0;
592 i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
593 i++) {
594 set_bit(i, &smcibdev->port_event_mask);
595 /* determine pnetids of the port */
596 if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
597 smcibdev->pnetid[i]))
598 smc_pnetid_by_table_ib(smcibdev, i + 1);
599 pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
600 "%.16s%s\n",
601 smcibdev->ibdev->name, i + 1,
602 smcibdev->pnetid[i],
603 smcibdev->pnetid_by_user[i] ?
604 " (user defined)" :
605 "");
606 }
607 schedule_work(&smcibdev->port_event_work);
608 return 0;
609}
610
611/* callback function for ib_unregister_client() */
612static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
613{
614 struct smc_ib_device *smcibdev = client_data;
615
616 mutex_lock(&smc_ib_devices.mutex);
617 list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
618 mutex_unlock(&smc_ib_devices.mutex);
619 pr_warn_ratelimited("smc: removing ib device %s\n",
620 smcibdev->ibdev->name);
621 smc_smcr_terminate_all(smcibdev);
622 smc_ib_cleanup_per_ibdev(smcibdev);
623 ib_unregister_event_handler(&smcibdev->event_handler);
624 cancel_work_sync(&smcibdev->port_event_work);
625 kfree(smcibdev);
626}
627
628static struct ib_client smc_ib_client = {
629 .name = "smc_ib",
630 .add = smc_ib_add_dev,
631 .remove = smc_ib_remove_dev,
632};
633
634int __init smc_ib_register_client(void)
635{
636 smc_ib_init_local_systemid();
637 return ib_register_client(&smc_ib_client);
638}
639
640void smc_ib_unregister_client(void)
641{
642 ib_unregister_client(&smc_ib_client);
643}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000..f90d15eae
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,91 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Definitions for IB environment
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef _SMC_IB_H
13#define _SMC_IB_H
14
15#include <linux/interrupt.h>
16#include <linux/if_ether.h>
17#include <linux/mutex.h>
18#include <linux/wait.h>
19#include <rdma/ib_verbs.h>
20#include <net/smc.h>
21
22#define SMC_MAX_PORTS 2 /* Max # of ports */
23#define SMC_GID_SIZE sizeof(union ib_gid)
24
25#define SMC_IB_MAX_SEND_SGE 2
26
27struct smc_ib_devices { /* list of smc ib devices definition */
28 struct list_head list;
29 struct mutex mutex; /* protects list of smc ib devices */
30};
31
32extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
33
34struct smc_ib_device { /* ib-device infos for smc */
35 struct list_head list;
36 struct ib_device *ibdev;
37 struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
38 struct ib_event_handler event_handler; /* global ib_event handler */
39 struct ib_cq *roce_cq_send; /* send completion queue */
40 struct ib_cq *roce_cq_recv; /* recv completion queue */
41 struct tasklet_struct send_tasklet; /* called by send cq handler */
42 struct tasklet_struct recv_tasklet; /* called by recv cq handler */
43 char mac[SMC_MAX_PORTS][ETH_ALEN];
44 /* mac address per port*/
45 u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
46 /* pnetid per port */
47 bool pnetid_by_user[SMC_MAX_PORTS];
48 /* pnetid defined by user? */
49 u8 initialized : 1; /* ib dev CQ, evthdl done */
50 struct work_struct port_event_work;
51 unsigned long port_event_mask;
52 DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
53 atomic_t lnk_cnt; /* number of links on ibdev */
54 wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
55 struct mutex mutex; /* protect dev setup+cleanup */
56};
57
58struct smc_buf_desc;
59struct smc_link;
60
61int smc_ib_register_client(void) __init;
62void smc_ib_unregister_client(void);
63bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
64int smc_ib_buf_map_sg(struct smc_link *lnk,
65 struct smc_buf_desc *buf_slot,
66 enum dma_data_direction data_direction);
67void smc_ib_buf_unmap_sg(struct smc_link *lnk,
68 struct smc_buf_desc *buf_slot,
69 enum dma_data_direction data_direction);
70void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
71int smc_ib_create_protection_domain(struct smc_link *lnk);
72void smc_ib_destroy_queue_pair(struct smc_link *lnk);
73int smc_ib_create_queue_pair(struct smc_link *lnk);
74int smc_ib_ready_link(struct smc_link *lnk);
75int smc_ib_modify_qp_rts(struct smc_link *lnk);
76int smc_ib_modify_qp_reset(struct smc_link *lnk);
77int smc_ib_modify_qp_error(struct smc_link *lnk);
78long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
79int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
80 struct smc_buf_desc *buf_slot, u8 link_idx);
81void smc_ib_put_memory_region(struct ib_mr *mr);
82void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
83 struct smc_buf_desc *buf_slot,
84 enum dma_data_direction data_direction);
85void smc_ib_sync_sg_for_device(struct smc_link *lnk,
86 struct smc_buf_desc *buf_slot,
87 enum dma_data_direction data_direction);
88int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
89 unsigned short vlan_id, u8 gid[], u8 *sgid_index);
90bool smc_ib_is_valid_local_systemid(void);
91#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000..8e33c0128
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,439 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Shared Memory Communications Direct over ISM devices (SMC-D)
3 *
4 * Functions for ISM device.
5 *
6 * Copyright IBM Corp. 2018
7 */
8
9#include <linux/spinlock.h>
10#include <linux/mutex.h>
11#include <linux/slab.h>
12#include <asm/page.h>
13
14#include "smc.h"
15#include "smc_core.h"
16#include "smc_ism.h"
17#include "smc_pnet.h"
18
19struct smcd_dev_list smcd_dev_list = {
20 .list = LIST_HEAD_INIT(smcd_dev_list.list),
21 .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex)
22};
23
24bool smc_ism_v2_capable;
25
26/* Test if an ISM communication is possible - same CPC */
27int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
28{
29 return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
30 vlan_id);
31}
32
33int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
34 void *data, size_t len)
35{
36 int rc;
37
38 rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal,
39 pos->offset, data, len);
40
41 return rc < 0 ? rc : 0;
42}
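Callers describe the target DMB and the offset within it via struct smc_ism_position (declared in smc_ism.h). As a rough sketch of how a transmit path might fill it, assuming a connection that already knows the peer's DMB token and DMBE index (illustrative only, not a compilable excerpt of this patch):

	/* illustrative sketch only: write 'len' bytes of 'data' into the
	 * peer's DMBE at 'offset', raising a peer interrupt when 'signal'
	 * is set
	 */
	static int smcd_write_sketch(struct smc_connection *conn, void *data,
				     size_t len, unsigned int offset,
				     bool signal)
	{
		struct smc_ism_position pos;

		memset(&pos, 0, sizeof(pos));
		pos.token = conn->peer_token;		/* DMB token of peer */
		pos.index = conn->peer_rmbe_idx;	/* DMBE index in DMB */
		pos.offset = offset;			/* offset inside DMBE */
		pos.signal = signal;			/* interrupt the peer */
		return smc_ism_write(conn->lgr->smcd, &pos, data, len);
	}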
43
44void smc_ism_get_system_eid(struct smcd_dev *smcd, u8 **eid)
45{
46 smcd->ops->get_system_eid(smcd, eid);
47}
48
49u16 smc_ism_get_chid(struct smcd_dev *smcd)
50{
51 return smcd->ops->get_chid(smcd);
52}
53
54/* Set a connection using this DMBE. */
55void smc_ism_set_conn(struct smc_connection *conn)
56{
57 unsigned long flags;
58
59 spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
60 conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn;
61 spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
62}
63
64/* Unset a connection using this DMBE. */
65void smc_ism_unset_conn(struct smc_connection *conn)
66{
67 unsigned long flags;
68
69 if (!conn->rmb_desc)
70 return;
71
72 spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
73 conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL;
74 spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
75}
76
77/* Register a VLAN identifier with the ISM device. Use a reference count
78 * and add a VLAN identifier only when the first DMB using this VLAN is
79 * registered.
80 */
81int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
82{
83 struct smc_ism_vlanid *new_vlan, *vlan;
84 unsigned long flags;
85 int rc = 0;
86
87 if (!vlanid) /* No valid vlan id */
88 return -EINVAL;
89
90 /* create new vlan entry, in case we need it */
91 new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
92 if (!new_vlan)
93 return -ENOMEM;
94 new_vlan->vlanid = vlanid;
95 refcount_set(&new_vlan->refcnt, 1);
96
97 /* if there is an existing entry, increase count and return */
98 spin_lock_irqsave(&smcd->lock, flags);
99 list_for_each_entry(vlan, &smcd->vlan, list) {
100 if (vlan->vlanid == vlanid) {
101 refcount_inc(&vlan->refcnt);
102 kfree(new_vlan);
103 goto out;
104 }
105 }
106
107 /* no existing entry found.
108 * add new entry to device; might fail, e.g., if HW limit reached
109 */
110 if (smcd->ops->add_vlan_id(smcd, vlanid)) {
111 kfree(new_vlan);
112 rc = -EIO;
113 goto out;
114 }
115 list_add_tail(&new_vlan->list, &smcd->vlan);
116out:
117 spin_unlock_irqrestore(&smcd->lock, flags);
118 return rc;
119}
120
121/* Unregister a VLAN identifier with the ISM device. Use a reference count
122 * and remove a VLAN identifier only when the last DMB using this VLAN is
123 * unregistered.
124 */
125int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
126{
127 struct smc_ism_vlanid *vlan;
128 unsigned long flags;
129 bool found = false;
130 int rc = 0;
131
132 if (!vlanid) /* No valid vlan id */
133 return -EINVAL;
134
135 spin_lock_irqsave(&smcd->lock, flags);
136 list_for_each_entry(vlan, &smcd->vlan, list) {
137 if (vlan->vlanid == vlanid) {
138 if (!refcount_dec_and_test(&vlan->refcnt))
139 goto out;
140 found = true;
141 break;
142 }
143 }
144 if (!found) {
145 rc = -ENOENT;
146 goto out; /* VLAN id not in table */
147 }
148
149 /* Found and the last reference just gone */
150 if (smcd->ops->del_vlan_id(smcd, vlanid))
151 rc = -EIO;
152 list_del(&vlan->list);
153 kfree(vlan);
154out:
155 spin_unlock_irqrestore(&smcd->lock, flags);
156 return rc;
157}
158
159int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
160{
161 struct smcd_dmb dmb;
162 int rc = 0;
163
164 if (!dmb_desc->dma_addr)
165 return rc;
166
167 memset(&dmb, 0, sizeof(dmb));
168 dmb.dmb_tok = dmb_desc->token;
169 dmb.sba_idx = dmb_desc->sba_idx;
170 dmb.cpu_addr = dmb_desc->cpu_addr;
171 dmb.dma_addr = dmb_desc->dma_addr;
172 dmb.dmb_len = dmb_desc->len;
173 rc = smcd->ops->unregister_dmb(smcd, &dmb);
174 if (!rc || rc == ISM_ERROR) {
175 dmb_desc->cpu_addr = NULL;
176 dmb_desc->dma_addr = 0;
177 }
178
179 return rc;
180}
181
182int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
183 struct smc_buf_desc *dmb_desc)
184{
185 struct smcd_dmb dmb;
186 int rc;
187
188 memset(&dmb, 0, sizeof(dmb));
189 dmb.dmb_len = dmb_len;
190 dmb.sba_idx = dmb_desc->sba_idx;
191 dmb.vlan_id = lgr->vlan_id;
192 dmb.rgid = lgr->peer_gid;
193 rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb);
194 if (!rc) {
195 dmb_desc->sba_idx = dmb.sba_idx;
196 dmb_desc->token = dmb.dmb_tok;
197 dmb_desc->cpu_addr = dmb.cpu_addr;
198 dmb_desc->dma_addr = dmb.dma_addr;
199 dmb_desc->len = dmb.dmb_len;
200 }
201 return rc;
202}
203
204struct smc_ism_event_work {
205 struct work_struct work;
206 struct smcd_dev *smcd;
207 struct smcd_event event;
208};
209
210#define ISM_EVENT_REQUEST 0x0001
211#define ISM_EVENT_RESPONSE 0x0002
212#define ISM_EVENT_REQUEST_IR 0x00000001
213#define ISM_EVENT_CODE_SHUTDOWN 0x80
214#define ISM_EVENT_CODE_TESTLINK 0x83
215
216union smcd_sw_event_info {
217 u64 info;
218 struct {
219 u8 uid[SMC_LGR_ID_SIZE];
220 unsigned short vlan_id;
221 u16 code;
222 };
223};
224
225static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
226{
227 union smcd_sw_event_info ev_info;
228
229 ev_info.info = wrk->event.info;
230 switch (wrk->event.code) {
231 case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */
232 smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id);
233 break;
234 case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
235 if (ev_info.code == ISM_EVENT_REQUEST) {
236 ev_info.code = ISM_EVENT_RESPONSE;
237 wrk->smcd->ops->signal_event(wrk->smcd,
238 wrk->event.tok,
239 ISM_EVENT_REQUEST_IR,
240 ISM_EVENT_CODE_TESTLINK,
241 ev_info.info);
242 }
243 break;
244 }
245}
246
247int smc_ism_signal_shutdown(struct smc_link_group *lgr)
248{
249 int rc;
250 union smcd_sw_event_info ev_info;
251
252 if (lgr->peer_shutdown)
253 return 0;
254
255 memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
256 ev_info.vlan_id = lgr->vlan_id;
257 ev_info.code = ISM_EVENT_REQUEST;
258 rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid,
259 ISM_EVENT_REQUEST_IR,
260 ISM_EVENT_CODE_SHUTDOWN,
261 ev_info.info);
262 return rc;
263}
264
265/* worker for SMC-D events */
266static void smc_ism_event_work(struct work_struct *work)
267{
268 struct smc_ism_event_work *wrk =
269 container_of(work, struct smc_ism_event_work, work);
270
271 switch (wrk->event.type) {
272 case ISM_EVENT_GID: /* GID event, token is peer GID */
273 smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK);
274 break;
275 case ISM_EVENT_DMB:
276 break;
277 case ISM_EVENT_SWR: /* Software defined event */
278 smcd_handle_sw_event(wrk);
279 break;
280 }
281 kfree(wrk);
282}
283
284static void smcd_release(struct device *dev)
285{
286 struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev);
287
288 kfree(smcd->conn);
289 kfree(smcd);
290}
291
292struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
293 const struct smcd_ops *ops, int max_dmbs)
294{
295 struct smcd_dev *smcd;
296
297 smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
298 if (!smcd)
299 return NULL;
300 smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
301 GFP_KERNEL);
302 if (!smcd->conn) {
303 kfree(smcd);
304 return NULL;
305 }
306
307 smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
308 WQ_MEM_RECLAIM, name);
309 if (!smcd->event_wq) {
310 kfree(smcd->conn);
311 kfree(smcd);
312 return NULL;
313 }
314
315 smcd->dev.parent = parent;
316 smcd->dev.release = smcd_release;
317 device_initialize(&smcd->dev);
318 dev_set_name(&smcd->dev, name);
319 smcd->ops = ops;
320 if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid))
321 smc_pnetid_by_table_smcd(smcd);
322
323 spin_lock_init(&smcd->lock);
324 spin_lock_init(&smcd->lgr_lock);
325 INIT_LIST_HEAD(&smcd->vlan);
326 INIT_LIST_HEAD(&smcd->lgr_list);
327 init_waitqueue_head(&smcd->lgrs_deleted);
328 return smcd;
329}
330EXPORT_SYMBOL_GPL(smcd_alloc_dev);
331
332int smcd_register_dev(struct smcd_dev *smcd)
333{
334 int rc;
335
336 mutex_lock(&smcd_dev_list.mutex);
337 if (list_empty(&smcd_dev_list.list)) {
338 u8 *system_eid = NULL;
339
340 smc_ism_get_system_eid(smcd, &system_eid);
341 if (system_eid[24] != '0' || system_eid[28] != '0')
342 smc_ism_v2_capable = true;
343 }
344 /* sort list: devices without pnetid before devices with pnetid */
345 if (smcd->pnetid[0])
346 list_add_tail(&smcd->list, &smcd_dev_list.list);
347 else
348 list_add(&smcd->list, &smcd_dev_list.list);
349 mutex_unlock(&smcd_dev_list.mutex);
350
351 pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
352 dev_name(&smcd->dev), smcd->pnetid,
353 smcd->pnetid_by_user ? " (user defined)" : "");
354
355 rc = device_add(&smcd->dev);
356 if (rc) {
357 mutex_lock(&smcd_dev_list.mutex);
358 list_del(&smcd->list);
359 mutex_unlock(&smcd_dev_list.mutex);
360 }
361
362 return rc;
363}
364EXPORT_SYMBOL_GPL(smcd_register_dev);
365
366void smcd_unregister_dev(struct smcd_dev *smcd)
367{
368 pr_warn_ratelimited("smc: removing smcd device %s\n",
369 dev_name(&smcd->dev));
370 mutex_lock(&smcd_dev_list.mutex);
371 list_del_init(&smcd->list);
372 mutex_unlock(&smcd_dev_list.mutex);
373 smcd->going_away = 1;
374 smc_smcd_terminate_all(smcd);
375 flush_workqueue(smcd->event_wq);
376 destroy_workqueue(smcd->event_wq);
377
378 device_del(&smcd->dev);
379}
380EXPORT_SYMBOL_GPL(smcd_unregister_dev);
381
382void smcd_free_dev(struct smcd_dev *smcd)
383{
384 put_device(&smcd->dev);
385}
386EXPORT_SYMBOL_GPL(smcd_free_dev);
387
388/* SMCD Device event handler. Called from ISM device interrupt handler.
389 * Parameters are smcd device pointer,
390 * - event->type (0 --> DMB, 1 --> GID),
391 * - event->code (event code),
392 * - event->tok (either DMB token when event type 0, or GID when event type 1)
393 * - event->time (time of day)
394 * - event->info (debug info).
395 *
396 * Context:
397 * - Function called in IRQ context from ISM device driver event handler.
398 */
399void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
400{
401 struct smc_ism_event_work *wrk;
402
403 if (smcd->going_away)
404 return;
405 /* copy event to event work queue, and let it be handled there */
406 wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
407 if (!wrk)
408 return;
409 INIT_WORK(&wrk->work, smc_ism_event_work);
410 wrk->smcd = smcd;
411 wrk->event = *event;
412 queue_work(smcd->event_wq, &wrk->work);
413}
414EXPORT_SYMBOL_GPL(smcd_handle_event);
415
416/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
417 * Parameters are smcd device pointer and DMB number. Find the connection and
418 * schedule the tasklet for this connection.
419 *
420 * Context:
421 * - Function called in IRQ context from ISM device driver IRQ handler.
422 */
423void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
424{
425 struct smc_connection *conn = NULL;
426 unsigned long flags;
427
428 spin_lock_irqsave(&smcd->lock, flags);
429 conn = smcd->conn[dmbno];
430 if (conn && !conn->killed)
431 tasklet_schedule(&conn->rx_tsklet);
432 spin_unlock_irqrestore(&smcd->lock, flags);
433}
434EXPORT_SYMBOL_GPL(smcd_handle_irq);
435
436void __init smc_ism_init(void)
437{
438 smc_ism_v2_capable = false;
439}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000..8048e09dd
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,56 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Shared Memory Communications Direct over ISM devices (SMC-D)
3 *
4 * SMC-D ISM device structure definitions.
5 *
6 * Copyright IBM Corp. 2018
7 */
8
9#ifndef SMCD_ISM_H
10#define SMCD_ISM_H
11
12#include <linux/uio.h>
13#include <linux/mutex.h>
14
15#include "smc.h"
16
17struct smcd_dev_list { /* List of SMCD devices */
18 struct list_head list;
19 struct mutex mutex; /* Protects list of devices */
20};
21
22extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
23extern bool smc_ism_v2_capable; /* HW supports ISM V2 and thus
24 * System EID is defined
25 */
26
27struct smc_ism_vlanid { /* VLAN id set on ISM device */
28 struct list_head list;
29 unsigned short vlanid; /* Vlan id */
30 refcount_t refcnt; /* Reference count */
31};
32
33struct smc_ism_position { /* ISM device position to write to */
34 u64 token; /* Token of DMB */
35 u32 offset; /* Offset into DMBE */
36 u8 index; /* Index of DMBE */
37 u8 signal; /* Generate interrupt on owner side */
38};
39
40struct smcd_dev;
41
42int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
43void smc_ism_set_conn(struct smc_connection *conn);
44void smc_ism_unset_conn(struct smc_connection *conn);
45int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
46int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
47int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
48 struct smc_buf_desc *dmb_desc);
49int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
50int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
51 void *data, size_t len);
52int smc_ism_signal_shutdown(struct smc_link_group *lgr);
53void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid);
54u16 smc_ism_get_chid(struct smcd_dev *dev);
55void smc_ism_init(void);
56#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000..0ef15f8fb
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,1974 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Link Layer Control (LLC)
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
10 * Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#include <net/tcp.h>
14#include <rdma/ib_verbs.h>
15
16#include "smc.h"
17#include "smc_core.h"
18#include "smc_clc.h"
19#include "smc_llc.h"
20#include "smc_pnet.h"
21
22#define SMC_LLC_DATA_LEN 40
23
24struct smc_llc_hdr {
25 struct smc_wr_rx_hdr common;
26 u8 length; /* 44 */
27#if defined(__BIG_ENDIAN_BITFIELD)
28 u8 reserved:4,
29 add_link_rej_rsn:4;
30#elif defined(__LITTLE_ENDIAN_BITFIELD)
31 u8 add_link_rej_rsn:4,
32 reserved:4;
33#endif
34 u8 flags;
35};
36
37#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03
38
39struct smc_llc_msg_confirm_link { /* type 0x01 */
40 struct smc_llc_hdr hd;
41 u8 sender_mac[ETH_ALEN];
42 u8 sender_gid[SMC_GID_SIZE];
43 u8 sender_qp_num[3];
44 u8 link_num;
45 u8 link_uid[SMC_LGR_ID_SIZE];
46 u8 max_links;
47 u8 reserved[9];
48};
49
50#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40
51#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1
52
53#define SMC_LLC_ADD_LNK_MAX_LINKS 2
54
55struct smc_llc_msg_add_link { /* type 0x02 */
56 struct smc_llc_hdr hd;
57 u8 sender_mac[ETH_ALEN];
58 u8 reserved2[2];
59 u8 sender_gid[SMC_GID_SIZE];
60 u8 sender_qp_num[3];
61 u8 link_num;
62#if defined(__BIG_ENDIAN_BITFIELD)
63 u8 reserved3 : 4,
64 qp_mtu : 4;
65#elif defined(__LITTLE_ENDIAN_BITFIELD)
66 u8 qp_mtu : 4,
67 reserved3 : 4;
68#endif
69 u8 initial_psn[3];
70 u8 reserved[8];
71};
72
73struct smc_llc_msg_add_link_cont_rt {
74 __be32 rmb_key;
75 __be32 rmb_key_new;
76 __be64 rmb_vaddr_new;
77};
78
79#define SMC_LLC_RKEYS_PER_CONT_MSG 2
80
81struct smc_llc_msg_add_link_cont { /* type 0x03 */
82 struct smc_llc_hdr hd;
83 u8 link_num;
84 u8 num_rkeys;
85 u8 reserved2[2];
86 struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG];
87 u8 reserved[4];
88} __packed; /* format defined in RFC7609 */
89
90#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40
91#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20
92
93struct smc_llc_msg_del_link { /* type 0x04 */
94 struct smc_llc_hdr hd;
95 u8 link_num;
96 __be32 reason;
97 u8 reserved[35];
98} __packed; /* format defined in RFC7609 */
99
100struct smc_llc_msg_test_link { /* type 0x07 */
101 struct smc_llc_hdr hd;
102 u8 user_data[16];
103 u8 reserved[24];
104};
105
106struct smc_rmb_rtoken {
107 union {
108 u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */
109 /* is actually the num of rtokens, first */
110 /* rtoken is always for the current link */
111 u8 link_id; /* link id of the rtoken */
112 };
113 __be32 rmb_key;
114 __be64 rmb_vaddr;
115} __packed; /* format defined in RFC7609 */
116
117#define SMC_LLC_RKEYS_PER_MSG 3
118
119struct smc_llc_msg_confirm_rkey { /* type 0x06 */
120 struct smc_llc_hdr hd;
121 struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
122 u8 reserved;
123};
124
125#define SMC_LLC_DEL_RKEY_MAX 8
126#define SMC_LLC_FLAG_RKEY_RETRY 0x10
127#define SMC_LLC_FLAG_RKEY_NEG 0x20
128
129struct smc_llc_msg_delete_rkey { /* type 0x09 */
130 struct smc_llc_hdr hd;
131 u8 num_rkeys;
132 u8 err_mask;
133 u8 reserved[2];
134 __be32 rkey[8];
135 u8 reserved2[4];
136};
137
138union smc_llc_msg {
139 struct smc_llc_msg_confirm_link confirm_link;
140 struct smc_llc_msg_add_link add_link;
141 struct smc_llc_msg_add_link_cont add_link_cont;
142 struct smc_llc_msg_del_link delete_link;
143
144 struct smc_llc_msg_confirm_rkey confirm_rkey;
145 struct smc_llc_msg_delete_rkey delete_rkey;
146
147 struct smc_llc_msg_test_link test_link;
148 struct {
149 struct smc_llc_hdr hdr;
150 u8 data[SMC_LLC_DATA_LEN];
151 } raw;
152};
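/* Editor's note - illustrative sketch, not part of the original file: all
 * LLC messages share one 44-byte wire format, the 4-byte smc_llc_hdr
 * (1 byte type, 1 byte length, 1 byte reserved/reject-reason bitfield,
 * 1 byte flags) followed by SMC_LLC_DATA_LEN (40) bytes of type-specific
 * payload.  Assuming SMC_WR_TX_SIZE is 44 (as defined in smc_wr.h), the
 * sizing could be spelled out as compile-time checks such as:
 *
 *	BUILD_BUG_ON(sizeof(struct smc_llc_hdr) != 4);
 *	BUILD_BUG_ON(sizeof(union smc_llc_msg) != 4 + SMC_LLC_DATA_LEN);
 *	BUILD_BUG_ON(sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE);
 *
 * smc_llc_add_pending_send() below performs equivalent checks.
 */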
153
154#define SMC_LLC_FLAG_RESP 0x80
155
156struct smc_llc_qentry {
157 struct list_head list;
158 struct smc_link *link;
159 union smc_llc_msg msg;
160};
161
162static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc);
163
164struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow)
165{
166 struct smc_llc_qentry *qentry = flow->qentry;
167
168 flow->qentry = NULL;
169 return qentry;
170}
171
172void smc_llc_flow_qentry_del(struct smc_llc_flow *flow)
173{
174 struct smc_llc_qentry *qentry;
175
176 if (flow->qentry) {
177 qentry = flow->qentry;
178 flow->qentry = NULL;
179 kfree(qentry);
180 }
181}
182
183static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow,
184 struct smc_llc_qentry *qentry)
185{
186 flow->qentry = qentry;
187}
188
189static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
190 struct smc_llc_qentry *qentry)
191{
192 u8 msg_type = qentry->msg.raw.hdr.common.type;
193
194 if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) &&
195 flow_type != msg_type && !lgr->delayed_event) {
196 lgr->delayed_event = qentry;
197 return;
198 }
199 /* drop parallel or already-in-progress llc requests */
200 if (flow_type != msg_type)
201 pr_warn_once("smc: SMC-R lg %*phN dropped parallel "
202 "LLC msg: msg %d flow %d role %d\n",
203 SMC_LGR_ID_SIZE, &lgr->id,
204 qentry->msg.raw.hdr.common.type,
205 flow_type, lgr->role);
206 kfree(qentry);
207}
208
209/* try to start a new llc flow, initiated by an incoming llc msg */
210static bool smc_llc_flow_start(struct smc_llc_flow *flow,
211 struct smc_llc_qentry *qentry)
212{
213 struct smc_link_group *lgr = qentry->link->lgr;
214
215 spin_lock_bh(&lgr->llc_flow_lock);
216 if (flow->type) {
217 /* a flow is already active */
218 smc_llc_flow_parallel(lgr, flow->type, qentry);
219 spin_unlock_bh(&lgr->llc_flow_lock);
220 return false;
221 }
222 switch (qentry->msg.raw.hdr.common.type) {
223 case SMC_LLC_ADD_LINK:
224 flow->type = SMC_LLC_FLOW_ADD_LINK;
225 break;
226 case SMC_LLC_DELETE_LINK:
227 flow->type = SMC_LLC_FLOW_DEL_LINK;
228 break;
229 case SMC_LLC_CONFIRM_RKEY:
230 case SMC_LLC_DELETE_RKEY:
231 flow->type = SMC_LLC_FLOW_RKEY;
232 break;
233 default:
234 flow->type = SMC_LLC_FLOW_NONE;
235 }
236 smc_llc_flow_qentry_set(flow, qentry);
237 spin_unlock_bh(&lgr->llc_flow_lock);
238 return true;
239}
240
241/* start a new local llc flow, wait till current flow finished */
242int smc_llc_flow_initiate(struct smc_link_group *lgr,
243 enum smc_llc_flowtype type)
244{
245 enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE;
246 int rc;
247
248 /* all flows except confirm_rkey and delete_rkey are exclusive,
249 * confirm/delete rkey flows can run concurrently (local and remote)
250 */
251 if (type == SMC_LLC_FLOW_RKEY)
252 allowed_remote = SMC_LLC_FLOW_RKEY;
253again:
254 if (list_empty(&lgr->list))
255 return -ENODEV;
256 spin_lock_bh(&lgr->llc_flow_lock);
257 if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
258 (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
259 lgr->llc_flow_rmt.type == allowed_remote)) {
260 lgr->llc_flow_lcl.type = type;
261 spin_unlock_bh(&lgr->llc_flow_lock);
262 return 0;
263 }
264 spin_unlock_bh(&lgr->llc_flow_lock);
265 rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) ||
266 (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
267 (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
268 lgr->llc_flow_rmt.type == allowed_remote))),
269 SMC_LLC_WAIT_TIME * 10);
270 if (!rc)
271 return -ETIMEDOUT;
272 goto again;
273}
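/* Editor's sketch - a hypothetical caller, not part of the original file:
 * a local flow is always bracketed by smc_llc_flow_initiate() and
 * smc_llc_flow_stop(), with the actual LLC exchange in between, e.g. for
 * a CONFIRM_RKEY flow:
 *
 *	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
 *	if (rc)
 *		return rc;
 *	rc = smc_llc_do_confirm_rkey(send_link, rmb_desc);
 *	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
 */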
274
275/* finish the current llc flow */
276void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow)
277{
278 spin_lock_bh(&lgr->llc_flow_lock);
279 memset(flow, 0, sizeof(*flow));
280 flow->type = SMC_LLC_FLOW_NONE;
281 spin_unlock_bh(&lgr->llc_flow_lock);
282 if (!list_empty(&lgr->list) && lgr->delayed_event &&
283 flow == &lgr->llc_flow_lcl)
284 schedule_work(&lgr->llc_event_work);
285 else
286 wake_up(&lgr->llc_flow_waiter);
287}
288
289/* lnk is optional and used for early wakeup when link goes down, useful in
290 * cases where we wait for a response on the link after we sent a request
291 */
292struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
293 struct smc_link *lnk,
294 int time_out, u8 exp_msg)
295{
296 struct smc_llc_flow *flow = &lgr->llc_flow_lcl;
297 u8 rcv_msg;
298
299 wait_event_timeout(lgr->llc_msg_waiter,
300 (flow->qentry ||
301 (lnk && !smc_link_usable(lnk)) ||
302 list_empty(&lgr->list)),
303 time_out);
304 if (!flow->qentry ||
305 (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) {
306 smc_llc_flow_qentry_del(flow);
307 goto out;
308 }
309 rcv_msg = flow->qentry->msg.raw.hdr.common.type;
310 if (exp_msg && rcv_msg != exp_msg) {
311 if (exp_msg == SMC_LLC_ADD_LINK &&
312 rcv_msg == SMC_LLC_DELETE_LINK) {
313 /* flow_start will delay the unexpected msg */
314 smc_llc_flow_start(&lgr->llc_flow_lcl,
315 smc_llc_flow_qentry_clr(flow));
316 return NULL;
317 }
318 pr_warn_once("smc: SMC-R lg %*phN dropped unexpected LLC msg: "
319 "msg %d exp %d flow %d role %d flags %x\n",
320 SMC_LGR_ID_SIZE, &lgr->id, rcv_msg, exp_msg,
321 flow->type, lgr->role,
322 flow->qentry->msg.raw.hdr.flags);
323 smc_llc_flow_qentry_del(flow);
324 }
325out:
326 return flow->qentry;
327}
328
329/********************************** send *************************************/
330
331struct smc_llc_tx_pend {
332};
333
334/* handler for send/transmission completion of an LLC msg */
335static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
336 struct smc_link *link,
337 enum ib_wc_status wc_status)
338{
339 /* future work: handle wc_status error for recovery and failover */
340}
341
342/**
343 * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
344 * @link: Pointer to SMC link used for sending LLC control message.
345 * @wr_buf: Out variable returning pointer to work request payload buffer.
346 * @pend: Out variable returning pointer to private pending WR tracking.
347 * It's the context the transmit complete handler will get.
348 *
349 * Reserves and pre-fills an entry for a pending work request send/tx.
350 * Used by mid-level smc_llc_send_message() to prepare for later actual send/tx.
351 * Can sleep due to smc_wr_tx_get_free_slot() (if not in softirq context).
352 *
353 * Return: 0 on success, otherwise an error value.
354 */
355static int smc_llc_add_pending_send(struct smc_link *link,
356 struct smc_wr_buf **wr_buf,
357 struct smc_wr_tx_pend_priv **pend)
358{
359 int rc;
360
361 rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL,
362 pend);
363 if (rc < 0)
364 return rc;
365 BUILD_BUG_ON_MSG(
366 sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
367 "must increase SMC_WR_BUF_SIZE to at least sizeof(union smc_llc_msg)");
368 BUILD_BUG_ON_MSG(
369 sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
370 "must adapt SMC_WR_TX_SIZE to sizeof(union smc_llc_msg); if smc_wr upper layer protocols no longer all use the same message size, link->wr_tx_sges[i].length must be set on each individual smc_wr_tx_send()");
371 BUILD_BUG_ON_MSG(
372 sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
373 "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
374 return 0;
375}
376
377/* high-level API to send LLC confirm link */
378int smc_llc_send_confirm_link(struct smc_link *link,
379 enum smc_llc_reqresp reqresp)
380{
381 struct smc_llc_msg_confirm_link *confllc;
382 struct smc_wr_tx_pend_priv *pend;
383 struct smc_wr_buf *wr_buf;
384 int rc;
385
386 if (!smc_wr_tx_link_hold(link))
387 return -ENOLINK;
388 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
389 if (rc)
390 goto put_out;
391 confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
392 memset(confllc, 0, sizeof(*confllc));
393 confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
394 confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
395 confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
396 if (reqresp == SMC_LLC_RESP)
397 confllc->hd.flags |= SMC_LLC_FLAG_RESP;
398 memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1],
399 ETH_ALEN);
400 memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
401 hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
402 confllc->link_num = link->link_id;
403 memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE);
404 confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS;
405 /* send llc message */
406 rc = smc_wr_tx_send(link, pend);
407put_out:
408 smc_wr_tx_link_put(link);
409 return rc;
410}
411
412/* send LLC confirm rkey request */
413static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
414 struct smc_buf_desc *rmb_desc)
415{
416 struct smc_llc_msg_confirm_rkey *rkeyllc;
417 struct smc_wr_tx_pend_priv *pend;
418 struct smc_wr_buf *wr_buf;
419 struct smc_link *link;
420 int i, rc, rtok_ix;
421
422 if (!smc_wr_tx_link_hold(send_link))
423 return -ENOLINK;
424 rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend);
425 if (rc)
426 goto put_out;
427 rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
428 memset(rkeyllc, 0, sizeof(*rkeyllc));
429 rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
430 rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
431
432 rtok_ix = 1;
433 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
434 link = &send_link->lgr->lnk[i];
435 if (smc_link_active(link) && link != send_link) {
436 rkeyllc->rtoken[rtok_ix].link_id = link->link_id;
437 rkeyllc->rtoken[rtok_ix].rmb_key =
438 htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
439 rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64(
440 (u64)sg_dma_address(
441 rmb_desc->sgt[link->link_idx].sgl));
442 rtok_ix++;
443 }
444 }
445 /* rkey of send_link is in rtoken[0] */
446 rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1;
447 rkeyllc->rtoken[0].rmb_key =
448 htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey);
449 rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
450 (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl));
451 /* send llc message */
452 rc = smc_wr_tx_send(send_link, pend);
453put_out:
454 smc_wr_tx_link_put(send_link);
455 return rc;
456}
457
458/* send LLC delete rkey request */
459static int smc_llc_send_delete_rkey(struct smc_link *link,
460 struct smc_buf_desc *rmb_desc)
461{
462 struct smc_llc_msg_delete_rkey *rkeyllc;
463 struct smc_wr_tx_pend_priv *pend;
464 struct smc_wr_buf *wr_buf;
465 int rc;
466
467 if (!smc_wr_tx_link_hold(link))
468 return -ENOLINK;
469 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
470 if (rc)
471 goto put_out;
472 rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
473 memset(rkeyllc, 0, sizeof(*rkeyllc));
474 rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
475 rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey);
476 rkeyllc->num_rkeys = 1;
477 rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
478 /* send llc message */
479 rc = smc_wr_tx_send(link, pend);
480put_out:
481 smc_wr_tx_link_put(link);
482 return rc;
483}
484
485/* send ADD LINK request or response */
486int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
487 struct smc_link *link_new,
488 enum smc_llc_reqresp reqresp)
489{
490 struct smc_llc_msg_add_link *addllc;
491 struct smc_wr_tx_pend_priv *pend;
492 struct smc_wr_buf *wr_buf;
493 int rc;
494
495 if (!smc_wr_tx_link_hold(link))
496 return -ENOLINK;
497 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
498 if (rc)
499 goto put_out;
500 addllc = (struct smc_llc_msg_add_link *)wr_buf;
501
502 memset(addllc, 0, sizeof(*addllc));
503 addllc->hd.common.type = SMC_LLC_ADD_LINK;
504 addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
505 if (reqresp == SMC_LLC_RESP)
506 addllc->hd.flags |= SMC_LLC_FLAG_RESP;
507 memcpy(addllc->sender_mac, mac, ETH_ALEN);
508 memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
509 if (link_new) {
510 addllc->link_num = link_new->link_id;
511 hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num);
512 hton24(addllc->initial_psn, link_new->psn_initial);
513 if (reqresp == SMC_LLC_REQ)
514 addllc->qp_mtu = link_new->path_mtu;
515 else
516 addllc->qp_mtu = min(link_new->path_mtu,
517 link_new->peer_mtu);
518 }
519 /* send llc message */
520 rc = smc_wr_tx_send(link, pend);
521put_out:
522 smc_wr_tx_link_put(link);
523 return rc;
524}
525
526/* send DELETE LINK request or response */
527int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
528 enum smc_llc_reqresp reqresp, bool orderly,
529 u32 reason)
530{
531 struct smc_llc_msg_del_link *delllc;
532 struct smc_wr_tx_pend_priv *pend;
533 struct smc_wr_buf *wr_buf;
534 int rc;
535
536 if (!smc_wr_tx_link_hold(link))
537 return -ENOLINK;
538 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
539 if (rc)
540 goto put_out;
541 delllc = (struct smc_llc_msg_del_link *)wr_buf;
542
543 memset(delllc, 0, sizeof(*delllc));
544 delllc->hd.common.type = SMC_LLC_DELETE_LINK;
545 delllc->hd.length = sizeof(struct smc_llc_msg_del_link);
546 if (reqresp == SMC_LLC_RESP)
547 delllc->hd.flags |= SMC_LLC_FLAG_RESP;
548 if (orderly)
549 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
550 if (link_del_id)
551 delllc->link_num = link_del_id;
552 else
553 delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
554 delllc->reason = htonl(reason);
555 /* send llc message */
556 rc = smc_wr_tx_send(link, pend);
557put_out:
558 smc_wr_tx_link_put(link);
559 return rc;
560}
561
562/* send LLC test link request */
563static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
564{
565 struct smc_llc_msg_test_link *testllc;
566 struct smc_wr_tx_pend_priv *pend;
567 struct smc_wr_buf *wr_buf;
568 int rc;
569
570 if (!smc_wr_tx_link_hold(link))
571 return -ENOLINK;
572 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
573 if (rc)
574 goto put_out;
575 testllc = (struct smc_llc_msg_test_link *)wr_buf;
576 memset(testllc, 0, sizeof(*testllc));
577 testllc->hd.common.type = SMC_LLC_TEST_LINK;
578 testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
579 memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
580 /* send llc message */
581 rc = smc_wr_tx_send(link, pend);
582put_out:
583 smc_wr_tx_link_put(link);
584 return rc;
585}
586
587/* schedule an llc send on link, may wait for buffers */
588static int smc_llc_send_message(struct smc_link *link, void *llcbuf)
589{
590 struct smc_wr_tx_pend_priv *pend;
591 struct smc_wr_buf *wr_buf;
592 int rc;
593
594 if (!smc_wr_tx_link_hold(link))
595 return -ENOLINK;
596 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
597 if (rc)
598 goto put_out;
599 memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
600 rc = smc_wr_tx_send(link, pend);
601put_out:
602 smc_wr_tx_link_put(link);
603 return rc;
604}
605
606/* schedule an llc send on link, may wait for buffers,
607 * and wait for send completion notification.
608 * @return 0 on success
609 */
610static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf)
611{
612 struct smc_wr_tx_pend_priv *pend;
613 struct smc_wr_buf *wr_buf;
614 int rc;
615
616 if (!smc_wr_tx_link_hold(link))
617 return -ENOLINK;
618 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
619 if (rc)
620 goto put_out;
621 memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
622 rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME);
623put_out:
624 smc_wr_tx_link_put(link);
625 return rc;
626}
627
628/********************************* receive ***********************************/
629
630static int smc_llc_alloc_alt_link(struct smc_link_group *lgr,
631 enum smc_lgr_type lgr_new_t)
632{
633 int i;
634
635 if (lgr->type == SMC_LGR_SYMMETRIC ||
636 (lgr->type != SMC_LGR_SINGLE &&
637 (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
638 lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)))
639 return -EMLINK;
640
641 if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
642 lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) {
643 for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--)
644 if (lgr->lnk[i].state == SMC_LNK_UNUSED)
645 return i;
646 } else {
647 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
648 if (lgr->lnk[i].state == SMC_LNK_UNUSED)
649 return i;
650 }
651 return -EMLINK;
652}
653
654/* return first buffer from any of the next buf lists */
655static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
656 int *buf_lst)
657{
658 struct smc_buf_desc *buf_pos;
659
660 while (*buf_lst < SMC_RMBE_SIZES) {
661 buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
662 struct smc_buf_desc, list);
663 if (buf_pos)
664 return buf_pos;
665 (*buf_lst)++;
666 }
667 return NULL;
668}
669
670/* return next rmb from buffer lists */
671static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
672 int *buf_lst,
673 struct smc_buf_desc *buf_pos)
674{
675 struct smc_buf_desc *buf_next;
676
677 if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) {
678 (*buf_lst)++;
679 return _smc_llc_get_next_rmb(lgr, buf_lst);
680 }
681 buf_next = list_next_entry(buf_pos, list);
682 return buf_next;
683}
684
685static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
686 int *buf_lst)
687{
688 *buf_lst = 0;
689 return smc_llc_get_next_rmb(lgr, buf_lst, NULL);
690}
691
692/* send one add_link_continue msg */
693static int smc_llc_add_link_cont(struct smc_link *link,
694 struct smc_link *link_new, u8 *num_rkeys_todo,
695 int *buf_lst, struct smc_buf_desc **buf_pos)
696{
697 struct smc_llc_msg_add_link_cont *addc_llc;
698 struct smc_link_group *lgr = link->lgr;
699 int prim_lnk_idx, lnk_idx, i, rc;
700 struct smc_wr_tx_pend_priv *pend;
701 struct smc_wr_buf *wr_buf;
702 struct smc_buf_desc *rmb;
703 u8 n;
704
705 if (!smc_wr_tx_link_hold(link))
706 return -ENOLINK;
707 rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
708 if (rc)
709 goto put_out;
710 addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf;
711 memset(addc_llc, 0, sizeof(*addc_llc));
712
713 prim_lnk_idx = link->link_idx;
714 lnk_idx = link_new->link_idx;
715 addc_llc->link_num = link_new->link_id;
716 addc_llc->num_rkeys = *num_rkeys_todo;
717 n = *num_rkeys_todo;
718 for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) {
719 if (!*buf_pos) {
720 addc_llc->num_rkeys = addc_llc->num_rkeys -
721 *num_rkeys_todo;
722 *num_rkeys_todo = 0;
723 break;
724 }
725 rmb = *buf_pos;
726
727 addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey);
728 addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey);
729 addc_llc->rt[i].rmb_vaddr_new =
730 cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
731
732 (*num_rkeys_todo)--;
733 *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
734 while (*buf_pos && !(*buf_pos)->used)
735 *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
736 }
737 addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT;
738 addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
739 if (lgr->role == SMC_CLNT)
740 addc_llc->hd.flags |= SMC_LLC_FLAG_RESP;
741 rc = smc_wr_tx_send(link, pend);
742put_out:
743 smc_wr_tx_link_put(link);
744 return rc;
745}
746
747static int smc_llc_cli_rkey_exchange(struct smc_link *link,
748 struct smc_link *link_new)
749{
750 struct smc_llc_msg_add_link_cont *addc_llc;
751 struct smc_link_group *lgr = link->lgr;
752 u8 max, num_rkeys_send, num_rkeys_recv;
753 struct smc_llc_qentry *qentry;
754 struct smc_buf_desc *buf_pos;
755 int buf_lst;
756 int rc = 0;
757 int i;
758
759 mutex_lock(&lgr->rmbs_lock);
760 num_rkeys_send = lgr->conns_num;
761 buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
762 do {
763 qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME,
764 SMC_LLC_ADD_LINK_CONT);
765 if (!qentry) {
766 rc = -ETIMEDOUT;
767 break;
768 }
769 addc_llc = &qentry->msg.add_link_cont;
770 num_rkeys_recv = addc_llc->num_rkeys;
771 max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
772 for (i = 0; i < max; i++) {
773 smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
774 addc_llc->rt[i].rmb_key,
775 addc_llc->rt[i].rmb_vaddr_new,
776 addc_llc->rt[i].rmb_key_new);
777 num_rkeys_recv--;
778 }
779 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
780 rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
781 &buf_lst, &buf_pos);
782 if (rc)
783 break;
784 } while (num_rkeys_send || num_rkeys_recv);
785
786 mutex_unlock(&lgr->rmbs_lock);
787 return rc;
788}
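/* Editor's note - illustrative arithmetic, not part of the original file:
 * each side announces one rtoken per connection (num_rkeys_send starts at
 * lgr->conns_num) and every ADD_LINK_CONT carries at most
 * SMC_LLC_RKEYS_PER_CONT_MSG (2) rtokens, so roughly
 * DIV_ROUND_UP(conns_num, 2) continuation messages flow in each
 * direction.  With 5 connections, for example, client and server each
 * send three ADD_LINK_CONT messages carrying 2 + 2 + 1 rtokens, until
 * both num_rkeys_send and num_rkeys_recv reach zero.
 */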
789
790/* prepare and send an add link reject response */
791static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry)
792{
793 qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
794 qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
795 qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
796 return smc_llc_send_message(qentry->link, &qentry->msg);
797}
798
799static int smc_llc_cli_conf_link(struct smc_link *link,
800 struct smc_init_info *ini,
801 struct smc_link *link_new,
802 enum smc_lgr_type lgr_new_t)
803{
804 struct smc_link_group *lgr = link->lgr;
805 struct smc_llc_qentry *qentry = NULL;
806 int rc = 0;
807
808 /* receive CONFIRM LINK request over RoCE fabric */
809 qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0);
810 if (!qentry) {
811 rc = smc_llc_send_delete_link(link, link_new->link_id,
812 SMC_LLC_REQ, false,
813 SMC_LLC_DEL_LOST_PATH);
814 return -ENOLINK;
815 }
816 if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
817 /* received DELETE_LINK instead */
818 qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
819 smc_llc_send_message(link, &qentry->msg);
820 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
821 return -ENOLINK;
822 }
823 smc_llc_save_peer_uid(qentry);
824 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
825
826 rc = smc_ib_modify_qp_rts(link_new);
827 if (rc) {
828 smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
829 false, SMC_LLC_DEL_LOST_PATH);
830 return -ENOLINK;
831 }
832 smc_wr_remember_qp_attr(link_new);
833
834 rc = smcr_buf_reg_lgr(link_new);
835 if (rc) {
836 smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
837 false, SMC_LLC_DEL_LOST_PATH);
838 return -ENOLINK;
839 }
840
841 /* send CONFIRM LINK response over RoCE fabric */
842 rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP);
843 if (rc) {
844 smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
845 false, SMC_LLC_DEL_LOST_PATH);
846 return -ENOLINK;
847 }
848 smc_llc_link_active(link_new);
849 if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
850 lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
851 smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
852 else
853 smcr_lgr_set_type(lgr, lgr_new_t);
854 return 0;
855}
856
857static void smc_llc_save_add_link_info(struct smc_link *link,
858 struct smc_llc_msg_add_link *add_llc)
859{
860 link->peer_qpn = ntoh24(add_llc->sender_qp_num);
861 memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE);
862 memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN);
863 link->peer_psn = ntoh24(add_llc->initial_psn);
864 link->peer_mtu = add_llc->qp_mtu;
865}
866
867/* as an SMC client, process an add link request */
868int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
869{
870 struct smc_llc_msg_add_link *llc = &qentry->msg.add_link;
871 enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
872 struct smc_link_group *lgr = smc_get_lgr(link);
873 struct smc_link *lnk_new = NULL;
874 struct smc_init_info ini;
875 int lnk_idx, rc = 0;
876
877 if (!llc->qp_mtu)
878 goto out_reject;
879
880 ini.vlan_id = lgr->vlan_id;
881 smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
882 if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
883 !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) {
884 if (!ini.ib_dev)
885 goto out_reject;
886 lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
887 }
888 if (!ini.ib_dev) {
889 lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
890 ini.ib_dev = link->smcibdev;
891 ini.ib_port = link->ibport;
892 }
893 lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
894 if (lnk_idx < 0)
895 goto out_reject;
896 lnk_new = &lgr->lnk[lnk_idx];
897 rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini);
898 if (rc)
899 goto out_reject;
900 smc_llc_save_add_link_info(lnk_new, llc);
901 lnk_new->link_id = llc->link_num; /* SMC server assigns link id */
902 smc_llc_link_set_uid(lnk_new);
903
904 rc = smc_ib_ready_link(lnk_new);
905 if (rc)
906 goto out_clear_lnk;
907
908 rc = smcr_buf_map_lgr(lnk_new);
909 if (rc)
910 goto out_clear_lnk;
911
912 rc = smc_llc_send_add_link(link,
913 lnk_new->smcibdev->mac[ini.ib_port - 1],
914 lnk_new->gid, lnk_new, SMC_LLC_RESP);
915 if (rc)
916 goto out_clear_lnk;
917 rc = smc_llc_cli_rkey_exchange(link, lnk_new);
918 if (rc) {
919 rc = 0;
920 goto out_clear_lnk;
921 }
922 rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t);
923 if (!rc)
924 goto out;
925out_clear_lnk:
926 lnk_new->state = SMC_LNK_INACTIVE;
927 smcr_link_clear(lnk_new, false);
928out_reject:
929 smc_llc_cli_add_link_reject(qentry);
930out:
931 kfree(qentry);
932 return rc;
933}
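/* Editor's summary - illustrative, derived from the function above: the
 * resulting link group type reflects which side found an alternate path:
 *
 *	peer reuses its gid/mac, local alt device found -> ASYMMETRIC_PEER
 *	peer offers a new path, no local alt device     -> ASYMMETRIC_LOCAL
 *	peer offers a new path, local alt device found  -> SYMMETRIC
 *	peer reuses its gid/mac, no local alt device    -> reject (no alt path)
 */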
934
935/* as an SMC client, invite server to start the add_link processing */
936static void smc_llc_cli_add_link_invite(struct smc_link *link,
937 struct smc_llc_qentry *qentry)
938{
939 struct smc_link_group *lgr = smc_get_lgr(link);
940 struct smc_init_info ini;
941
942 if (lgr->type == SMC_LGR_SYMMETRIC ||
943 lgr->type == SMC_LGR_ASYMMETRIC_PEER)
944 goto out;
945
946 ini.vlan_id = lgr->vlan_id;
947 smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
948 if (!ini.ib_dev)
949 goto out;
950
951 smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1],
952 ini.ib_gid, NULL, SMC_LLC_REQ);
953out:
954 kfree(qentry);
955}
956
957static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
958{
959 int i;
960
961 for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++)
962 if (llc->raw.data[i])
963 return false;
964 return true;
965}
966
967static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
968{
969 if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK &&
970 smc_llc_is_empty_llc_message(llc))
971 return true;
972 return false;
973}
974
975static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
976{
977 struct smc_llc_qentry *qentry;
978
979 qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
980
981 mutex_lock(&lgr->llc_conf_mutex);
982 if (smc_llc_is_local_add_link(&qentry->msg))
983 smc_llc_cli_add_link_invite(qentry->link, qentry);
984 else
985 smc_llc_cli_add_link(qentry->link, qentry);
986 mutex_unlock(&lgr->llc_conf_mutex);
987}
988
989static int smc_llc_active_link_count(struct smc_link_group *lgr)
990{
991 int i, link_count = 0;
992
993 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
994 if (!smc_link_active(&lgr->lnk[i]))
995 continue;
996 link_count++;
997 }
998 return link_count;
999}
1000
1001/* find the asymmetric link when 3 links are established */
1002static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr)
1003{
1004 int asym_idx = -ENOENT;
1005 int i, j, k;
1006 bool found;
1007
1008 /* determine asymmetric link */
1009 found = false;
1010 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1011 for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
1012 if (!smc_link_usable(&lgr->lnk[i]) ||
1013 !smc_link_usable(&lgr->lnk[j]))
1014 continue;
1015 if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid,
1016 SMC_GID_SIZE)) {
1017 found = true; /* asym_lnk is i or j */
1018 break;
1019 }
1020 }
1021 if (found)
1022 break;
1023 }
1024 if (!found)
1025 goto out; /* no asymmetric link */
1026 for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) {
1027 if (!smc_link_usable(&lgr->lnk[k]))
1028 continue;
1029 if (k != i &&
1030 !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid,
1031 SMC_GID_SIZE)) {
1032 asym_idx = i;
1033 break;
1034 }
1035 if (k != j &&
1036 !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid,
1037 SMC_GID_SIZE)) {
1038 asym_idx = j;
1039 break;
1040 }
1041 }
1042out:
1043 return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx];
1044}
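/* Editor's note - illustrative example, not part of the original file:
 * an asymmetric setup means two local links share the same local gid
 * (same local port); the one of the pair whose peer_gid is also served
 * by a third link is the redundant one and gets returned here, e.g.
 *
 *	lnk[0]: gid A, peer_gid X
 *	lnk[1]: gid B, peer_gid X   <- asymmetric, X is also reached via lnk[0]
 *	lnk[2]: gid B, peer_gid Y
 */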
1045
1046static void smc_llc_delete_asym_link(struct smc_link_group *lgr)
1047{
1048 struct smc_link *lnk_new = NULL, *lnk_asym;
1049 struct smc_llc_qentry *qentry;
1050 int rc;
1051
1052 lnk_asym = smc_llc_find_asym_link(lgr);
1053 if (!lnk_asym)
1054 return; /* no asymmetric link */
1055 if (!smc_link_downing(&lnk_asym->state))
1056 return;
1057 lnk_new = smc_switch_conns(lgr, lnk_asym, false);
1058 smc_wr_tx_wait_no_pending_sends(lnk_asym);
1059 if (!lnk_new)
1060 goto out_free;
1061 /* change flow type from ADD_LINK into DEL_LINK */
1062 lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK;
1063 rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ,
1064 true, SMC_LLC_DEL_NO_ASYM_NEEDED);
1065 if (rc) {
1066 smcr_link_down_cond(lnk_new);
1067 goto out_free;
1068 }
1069 qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME,
1070 SMC_LLC_DELETE_LINK);
1071 if (!qentry) {
1072 smcr_link_down_cond(lnk_new);
1073 goto out_free;
1074 }
1075 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1076out_free:
1077 smcr_link_clear(lnk_asym, true);
1078}
1079
1080static int smc_llc_srv_rkey_exchange(struct smc_link *link,
1081 struct smc_link *link_new)
1082{
1083 struct smc_llc_msg_add_link_cont *addc_llc;
1084 struct smc_link_group *lgr = link->lgr;
1085 u8 max, num_rkeys_send, num_rkeys_recv;
1086 struct smc_llc_qentry *qentry = NULL;
1087 struct smc_buf_desc *buf_pos;
1088 int buf_lst;
1089 int rc = 0;
1090 int i;
1091
1092 mutex_lock(&lgr->rmbs_lock);
1093 num_rkeys_send = lgr->conns_num;
1094 buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
1095 do {
1096 smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
1097 &buf_lst, &buf_pos);
1098 qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME,
1099 SMC_LLC_ADD_LINK_CONT);
1100 if (!qentry) {
1101 rc = -ETIMEDOUT;
1102 goto out;
1103 }
1104 addc_llc = &qentry->msg.add_link_cont;
1105 num_rkeys_recv = addc_llc->num_rkeys;
1106 max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
1107 for (i = 0; i < max; i++) {
1108 smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
1109 addc_llc->rt[i].rmb_key,
1110 addc_llc->rt[i].rmb_vaddr_new,
1111 addc_llc->rt[i].rmb_key_new);
1112 num_rkeys_recv--;
1113 }
1114 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1115 } while (num_rkeys_send || num_rkeys_recv);
1116out:
1117 mutex_unlock(&lgr->rmbs_lock);
1118 return rc;
1119}
1120
1121static int smc_llc_srv_conf_link(struct smc_link *link,
1122 struct smc_link *link_new,
1123 enum smc_lgr_type lgr_new_t)
1124{
1125 struct smc_link_group *lgr = link->lgr;
1126 struct smc_llc_qentry *qentry = NULL;
1127 int rc;
1128
1129 /* send CONFIRM LINK request over the RoCE fabric */
1130 rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ);
1131 if (rc)
1132 return -ENOLINK;
1133 /* receive CONFIRM LINK response over the RoCE fabric */
1134 qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
1135 if (!qentry ||
1136 qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
1137 /* send DELETE LINK */
1138 smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
1139 false, SMC_LLC_DEL_LOST_PATH);
1140 if (qentry)
1141 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1142 return -ENOLINK;
1143 }
1144 smc_llc_save_peer_uid(qentry);
1145 smc_llc_link_active(link_new);
1146 if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
1147 lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
1148 smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
1149 else
1150 smcr_lgr_set_type(lgr, lgr_new_t);
1151 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1152 return 0;
1153}
1154
1155int smc_llc_srv_add_link(struct smc_link *link)
1156{
1157 enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
1158 struct smc_link_group *lgr = link->lgr;
1159 struct smc_llc_msg_add_link *add_llc;
1160 struct smc_llc_qentry *qentry = NULL;
1161 struct smc_link *link_new;
1162 struct smc_init_info ini;
1163 int lnk_idx, rc = 0;
1164
1165 /* ignore client add link recommendation, start new flow */
1166 ini.vlan_id = lgr->vlan_id;
1167 smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
1168 if (!ini.ib_dev) {
1169 lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
1170 ini.ib_dev = link->smcibdev;
1171 ini.ib_port = link->ibport;
1172 }
1173 lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
1174 if (lnk_idx < 0)
1175 return 0;
1176
1177 rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini);
1178 if (rc)
1179 return rc;
1180 link_new = &lgr->lnk[lnk_idx];
1181 rc = smc_llc_send_add_link(link,
1182 link_new->smcibdev->mac[ini.ib_port - 1],
1183 link_new->gid, link_new, SMC_LLC_REQ);
1184 if (rc)
1185 goto out_err;
1186 /* receive ADD LINK response over the RoCE fabric */
1187 qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK);
1188 if (!qentry) {
1189 rc = -ETIMEDOUT;
1190 goto out_err;
1191 }
1192 add_llc = &qentry->msg.add_link;
1193 if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) {
1194 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1195 rc = -ENOLINK;
1196 goto out_err;
1197 }
1198 if (lgr->type == SMC_LGR_SINGLE &&
1199 (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
1200 !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) {
1201 lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
1202 }
1203 smc_llc_save_add_link_info(link_new, add_llc);
1204 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1205
1206 rc = smc_ib_ready_link(link_new);
1207 if (rc)
1208 goto out_err;
1209 rc = smcr_buf_map_lgr(link_new);
1210 if (rc)
1211 goto out_err;
1212 rc = smcr_buf_reg_lgr(link_new);
1213 if (rc)
1214 goto out_err;
1215 rc = smc_llc_srv_rkey_exchange(link, link_new);
1216 if (rc)
1217 goto out_err;
1218 rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t);
1219 if (rc)
1220 goto out_err;
1221 return 0;
1222out_err:
1223 link_new->state = SMC_LNK_INACTIVE;
1224 smcr_link_clear(link_new, false);
1225 return rc;
1226}
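/* Editor's summary - illustrative, derived from the function above; the
 * server-side sequence mirrors the client path:
 *
 *	smcr_link_init()            - set up the alternate link
 *	smc_llc_send_add_link(REQ)  - invite the client, wait for its response
 *	smc_ib_ready_link()         - bring the new QP to RTS
 *	smcr_buf_map_lgr() + smcr_buf_reg_lgr()
 *	smc_llc_srv_rkey_exchange() - ADD_LINK_CONT rtoken ping-pong
 *	smc_llc_srv_conf_link()     - CONFIRM_LINK request/response, activate
 *
 * Any failure falls through to out_err and clears the half-built link.
 */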
1227
1228static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
1229{
1230 struct smc_link *link = lgr->llc_flow_lcl.qentry->link;
1231 int rc;
1232
1233 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1234
1235 mutex_lock(&lgr->llc_conf_mutex);
1236 rc = smc_llc_srv_add_link(link);
1237 if (!rc && lgr->type == SMC_LGR_SYMMETRIC) {
1238 /* delete any asymmetric link */
1239 smc_llc_delete_asym_link(lgr);
1240 }
1241 mutex_unlock(&lgr->llc_conf_mutex);
1242}
1243
1244/* enqueue a local add_link req to trigger a new add_link flow */
1245void smc_llc_add_link_local(struct smc_link *link)
1246{
1247 struct smc_llc_msg_add_link add_llc = {};
1248
1249 add_llc.hd.length = sizeof(add_llc);
1250 add_llc.hd.common.type = SMC_LLC_ADD_LINK;
1251 /* no dev and port needed */
1252 smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc);
1253}
1254
1255/* worker to process an add link message */
1256static void smc_llc_add_link_work(struct work_struct *work)
1257{
1258 struct smc_link_group *lgr = container_of(work, struct smc_link_group,
1259 llc_add_link_work);
1260
1261 if (list_empty(&lgr->list)) {
1262 /* link group is terminating */
1263 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1264 goto out;
1265 }
1266
1267 if (lgr->role == SMC_CLNT)
1268 smc_llc_process_cli_add_link(lgr);
1269 else
1270 smc_llc_process_srv_add_link(lgr);
1271out:
1272 smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
1273}
1274
1275/* enqueue a local del_link msg to trigger a new del_link flow,
1276 * called only for role SMC_SERV
1277 */
1278void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
1279{
1280 struct smc_llc_msg_del_link del_llc = {};
1281
1282 del_llc.hd.length = sizeof(del_llc);
1283 del_llc.hd.common.type = SMC_LLC_DELETE_LINK;
1284 del_llc.link_num = del_link_id;
1285 del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH);
1286 del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
1287 smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc);
1288}
1289
1290static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
1291{
1292 struct smc_link *lnk_del = NULL, *lnk_asym, *lnk;
1293 struct smc_llc_msg_del_link *del_llc;
1294 struct smc_llc_qentry *qentry;
1295 int active_links;
1296 int lnk_idx;
1297
1298 qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
1299 lnk = qentry->link;
1300 del_llc = &qentry->msg.delete_link;
1301
1302 if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
1303 smc_lgr_terminate_sched(lgr);
1304 goto out;
1305 }
1306 mutex_lock(&lgr->llc_conf_mutex);
1307 /* delete single link */
1308 for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) {
1309 if (lgr->lnk[lnk_idx].link_id != del_llc->link_num)
1310 continue;
1311 lnk_del = &lgr->lnk[lnk_idx];
1312 break;
1313 }
1314 del_llc->hd.flags |= SMC_LLC_FLAG_RESP;
1315 if (!lnk_del) {
1316 /* link was not found */
1317 del_llc->reason = htonl(SMC_LLC_DEL_NOLNK);
1318 smc_llc_send_message(lnk, &qentry->msg);
1319 goto out_unlock;
1320 }
1321 lnk_asym = smc_llc_find_asym_link(lgr);
1322
1323 del_llc->reason = 0;
1324 smc_llc_send_message(lnk, &qentry->msg); /* response */
1325
1326 if (smc_link_downing(&lnk_del->state))
1327 smc_switch_conns(lgr, lnk_del, false);
1328 smcr_link_clear(lnk_del, true);
1329
1330 active_links = smc_llc_active_link_count(lgr);
1331 if (lnk_del == lnk_asym) {
1332 /* expected deletion of asym link, don't change lgr state */
1333 } else if (active_links == 1) {
1334 smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
1335 } else if (!active_links) {
1336 smcr_lgr_set_type(lgr, SMC_LGR_NONE);
1337 smc_lgr_terminate_sched(lgr);
1338 }
1339out_unlock:
1340 mutex_unlock(&lgr->llc_conf_mutex);
1341out:
1342 kfree(qentry);
1343}
1344
1345/* try to send a DELETE LINK ALL request on any active link,
1346 * waiting for send completion
1347 */
1348void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
1349{
1350 struct smc_llc_msg_del_link delllc = {};
1351 int i;
1352
1353 delllc.hd.common.type = SMC_LLC_DELETE_LINK;
1354 delllc.hd.length = sizeof(delllc);
1355 if (ord)
1356 delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
1357 delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
1358 delllc.reason = htonl(rsn);
1359
1360 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1361 if (!smc_link_sendable(&lgr->lnk[i]))
1362 continue;
1363 if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc))
1364 break;
1365 }
1366}
1367
1368static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
1369{
1370 struct smc_llc_msg_del_link *del_llc;
1371 struct smc_link *lnk, *lnk_del;
1372 struct smc_llc_qentry *qentry;
1373 int active_links;
1374 int i;
1375
1376 mutex_lock(&lgr->llc_conf_mutex);
1377 qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
1378 lnk = qentry->link;
1379 del_llc = &qentry->msg.delete_link;
1380
1381 if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
1382 /* delete entire lgr */
1383 smc_llc_send_link_delete_all(lgr, true, ntohl(
1384 qentry->msg.delete_link.reason));
1385 smc_lgr_terminate_sched(lgr);
1386 goto out;
1387 }
1388 /* delete single link */
1389 lnk_del = NULL;
1390 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1391 if (lgr->lnk[i].link_id == del_llc->link_num) {
1392 lnk_del = &lgr->lnk[i];
1393 break;
1394 }
1395 }
1396 if (!lnk_del)
1397 goto out; /* asymmetric link already deleted */
1398
1399 if (smc_link_downing(&lnk_del->state)) {
1400 if (smc_switch_conns(lgr, lnk_del, false))
1401 smc_wr_tx_wait_no_pending_sends(lnk_del);
1402 }
1403 if (!list_empty(&lgr->list)) {
1404 /* qentry is either a request from peer (send it back to
1405 * initiate the DELETE_LINK processing), or a locally
1406 * enqueued DELETE_LINK request (forward it)
1407 */
1408 if (!smc_llc_send_message(lnk, &qentry->msg)) {
1409 struct smc_llc_qentry *qentry2;
1410
1411 qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME,
1412 SMC_LLC_DELETE_LINK);
1413 if (qentry2)
1414 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1415 }
1416 }
1417 smcr_link_clear(lnk_del, true);
1418
1419 active_links = smc_llc_active_link_count(lgr);
1420 if (active_links == 1) {
1421 smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
1422 } else if (!active_links) {
1423 smcr_lgr_set_type(lgr, SMC_LGR_NONE);
1424 smc_lgr_terminate_sched(lgr);
1425 }
1426
1427 if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) {
1428 /* trigger setup of asymm alt link */
1429 smc_llc_add_link_local(lnk);
1430 }
1431out:
1432 mutex_unlock(&lgr->llc_conf_mutex);
1433 kfree(qentry);
1434}
1435
1436static void smc_llc_delete_link_work(struct work_struct *work)
1437{
1438 struct smc_link_group *lgr = container_of(work, struct smc_link_group,
1439 llc_del_link_work);
1440
1441 if (list_empty(&lgr->list)) {
1442 /* link group is terminating */
1443 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1444 goto out;
1445 }
1446
1447 if (lgr->role == SMC_CLNT)
1448 smc_llc_process_cli_delete_link(lgr);
1449 else
1450 smc_llc_process_srv_delete_link(lgr);
1451out:
1452 smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
1453}
1454
1455/* process a confirm_rkey request from peer, remote flow */
1456static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
1457{
1458 struct smc_llc_msg_confirm_rkey *llc;
1459 struct smc_llc_qentry *qentry;
1460 struct smc_link *link;
1461 int num_entries;
1462 int rk_idx;
1463 int i;
1464
1465 qentry = lgr->llc_flow_rmt.qentry;
1466 llc = &qentry->msg.confirm_rkey;
1467 link = qentry->link;
1468
1469 num_entries = llc->rtoken[0].num_rkeys;
1470 /* first rkey entry is for receiving link */
1471 rk_idx = smc_rtoken_add(link,
1472 llc->rtoken[0].rmb_vaddr,
1473 llc->rtoken[0].rmb_key);
1474 if (rk_idx < 0)
1475 goto out_err;
1476
1477 for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++)
1478 smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id,
1479 llc->rtoken[i].rmb_vaddr,
1480 llc->rtoken[i].rmb_key);
1481 /* max links is 3 so there is no need to support conf_rkey_cont msgs */
1482 goto out;
1483out_err:
1484 llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
1485 llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY;
1486out:
1487 llc->hd.flags |= SMC_LLC_FLAG_RESP;
1488 smc_llc_send_message(link, &qentry->msg);
1489 smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
1490}
1491
1492/* process a delete_rkey request from peer, remote flow */
1493static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
1494{
1495 struct smc_llc_msg_delete_rkey *llc;
1496 struct smc_llc_qentry *qentry;
1497 struct smc_link *link;
1498 u8 err_mask = 0;
1499 int i, max;
1500
1501 qentry = lgr->llc_flow_rmt.qentry;
1502 llc = &qentry->msg.delete_rkey;
1503 link = qentry->link;
1504
1505 max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
1506 for (i = 0; i < max; i++) {
1507 if (smc_rtoken_delete(link, llc->rkey[i]))
1508 err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
1509 }
1510 if (err_mask) {
1511 llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
1512 llc->err_mask = err_mask;
1513 }
1514 llc->hd.flags |= SMC_LLC_FLAG_RESP;
1515 smc_llc_send_message(link, &qentry->msg);
1516 smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
1517}
1518
1519static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type)
1520{
1521 pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: "
1522 "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type);
1523 smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL);
1524 smc_lgr_terminate_sched(lgr);
1525}
1526
1527/* flush the llc event queue */
1528static void smc_llc_event_flush(struct smc_link_group *lgr)
1529{
1530 struct smc_llc_qentry *qentry, *q;
1531
1532 spin_lock_bh(&lgr->llc_event_q_lock);
1533 list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) {
1534 list_del_init(&qentry->list);
1535 kfree(qentry);
1536 }
1537 spin_unlock_bh(&lgr->llc_event_q_lock);
1538}
1539
1540static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
1541{
1542 union smc_llc_msg *llc = &qentry->msg;
1543 struct smc_link *link = qentry->link;
1544 struct smc_link_group *lgr = link->lgr;
1545
1546 if (!smc_link_usable(link))
1547 goto out;
1548
1549 switch (llc->raw.hdr.common.type) {
1550 case SMC_LLC_TEST_LINK:
1551 llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP;
1552 smc_llc_send_message(link, llc);
1553 break;
1554 case SMC_LLC_ADD_LINK:
1555 if (list_empty(&lgr->list))
1556 goto out; /* lgr is terminating */
1557 if (lgr->role == SMC_CLNT) {
1558 if (smc_llc_is_local_add_link(llc)) {
1559 if (lgr->llc_flow_lcl.type ==
1560 SMC_LLC_FLOW_ADD_LINK)
1561 break; /* add_link in progress */
1562 if (smc_llc_flow_start(&lgr->llc_flow_lcl,
1563 qentry)) {
1564 schedule_work(&lgr->llc_add_link_work);
1565 }
1566 return;
1567 }
1568 if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
1569 !lgr->llc_flow_lcl.qentry) {
1570 /* a flow is waiting for this message */
1571 smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
1572 qentry);
1573 wake_up(&lgr->llc_msg_waiter);
1574 } else if (smc_llc_flow_start(&lgr->llc_flow_lcl,
1575 qentry)) {
1576 schedule_work(&lgr->llc_add_link_work);
1577 }
1578 } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
1579 /* as smc server, handle client suggestion */
1580 schedule_work(&lgr->llc_add_link_work);
1581 }
1582 return;
1583 case SMC_LLC_CONFIRM_LINK:
1584 case SMC_LLC_ADD_LINK_CONT:
1585 if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
1586 /* a flow is waiting for this message */
1587 smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
1588 wake_up(&lgr->llc_msg_waiter);
1589 return;
1590 }
1591 break;
1592 case SMC_LLC_DELETE_LINK:
1593 if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
1594 !lgr->llc_flow_lcl.qentry) {
1595 /* DEL LINK REQ during ADD LINK SEQ */
1596 smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
1597 wake_up(&lgr->llc_msg_waiter);
1598 } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
1599 schedule_work(&lgr->llc_del_link_work);
1600 }
1601 return;
1602 case SMC_LLC_CONFIRM_RKEY:
1603 /* new request from remote, assign to remote flow */
1604 if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
1605 /* process here, does not wait for more llc msgs */
1606 smc_llc_rmt_conf_rkey(lgr);
1607 smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
1608 }
1609 return;
1610 case SMC_LLC_CONFIRM_RKEY_CONT:
1611 /* not used because max links is 3, and 3 rkeys fit into
1612 * one CONFIRM_RKEY message
1613 */
1614 break;
1615 case SMC_LLC_DELETE_RKEY:
1616 /* new request from remote, assign to remote flow */
1617 if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
1618 /* process here, does not wait for more llc msgs */
1619 smc_llc_rmt_delete_rkey(lgr);
1620 smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
1621 }
1622 return;
1623 default:
1624 smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type);
1625 break;
1626 }
1627out:
1628 kfree(qentry);
1629}
1630
1631/* worker to process llc messages on the event queue */
1632static void smc_llc_event_work(struct work_struct *work)
1633{
1634 struct smc_link_group *lgr = container_of(work, struct smc_link_group,
1635 llc_event_work);
1636 struct smc_llc_qentry *qentry;
1637
1638 if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
1639 qentry = lgr->delayed_event;
1640 lgr->delayed_event = NULL;
1641 if (smc_link_usable(qentry->link))
1642 smc_llc_event_handler(qentry);
1643 else
1644 kfree(qentry);
1645 }
1646
1647again:
1648 spin_lock_bh(&lgr->llc_event_q_lock);
1649 if (!list_empty(&lgr->llc_event_q)) {
1650 qentry = list_first_entry(&lgr->llc_event_q,
1651 struct smc_llc_qentry, list);
1652 list_del_init(&qentry->list);
1653 spin_unlock_bh(&lgr->llc_event_q_lock);
1654 smc_llc_event_handler(qentry);
1655 goto again;
1656 }
1657 spin_unlock_bh(&lgr->llc_event_q_lock);
1658}
1659
1660/* process llc responses in tasklet context */
1661static void smc_llc_rx_response(struct smc_link *link,
1662 struct smc_llc_qentry *qentry)
1663{
1664 enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
1665 struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
1666 u8 llc_type = qentry->msg.raw.hdr.common.type;
1667
1668 switch (llc_type) {
1669 case SMC_LLC_TEST_LINK:
1670 if (smc_link_active(link))
1671 complete(&link->llc_testlink_resp);
1672 break;
1673 case SMC_LLC_ADD_LINK:
1674 case SMC_LLC_ADD_LINK_CONT:
1675 case SMC_LLC_CONFIRM_LINK:
1676 if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry)
1677 break; /* drop out-of-flow response */
1678 goto assign;
1679 case SMC_LLC_DELETE_LINK:
1680 if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry)
1681 break; /* drop out-of-flow response */
1682 goto assign;
1683 case SMC_LLC_CONFIRM_RKEY:
1684 case SMC_LLC_DELETE_RKEY:
1685 if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry)
1686 break; /* drop out-of-flow response */
1687 goto assign;
1688 case SMC_LLC_CONFIRM_RKEY_CONT:
1689 /* not used because max links is 3 */
1690 break;
1691 default:
1692 smc_llc_protocol_violation(link->lgr, llc_type);
1693 break;
1694 }
1695 kfree(qentry);
1696 return;
1697assign:
1698 /* assign responses to the local flow, we requested them */
1699 smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
1700 wake_up(&link->lgr->llc_msg_waiter);
1701}
1702
1703static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
1704{
1705 struct smc_link_group *lgr = link->lgr;
1706 struct smc_llc_qentry *qentry;
1707 unsigned long flags;
1708
1709 qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC);
1710 if (!qentry)
1711 return;
1712 qentry->link = link;
1713 INIT_LIST_HEAD(&qentry->list);
1714 memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg));
1715
1716 /* process responses immediately */
1717 if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) {
1718 smc_llc_rx_response(link, qentry);
1719 return;
1720 }
1721
1722 /* add requests to event queue */
1723 spin_lock_irqsave(&lgr->llc_event_q_lock, flags);
1724 list_add_tail(&qentry->list, &lgr->llc_event_q);
1725 spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags);
1726 queue_work(system_highpri_wq, &lgr->llc_event_work);
1727}
1728
1729/* copy received msg and add it to the event queue */
1730static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
1731{
1732 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
1733 union smc_llc_msg *llc = buf;
1734
1735 if (wc->byte_len < sizeof(*llc))
1736 return; /* short message */
1737 if (llc->raw.hdr.length != sizeof(*llc))
1738 return; /* invalid message */
1739
1740 smc_llc_enqueue(link, llc);
1741}
1742
1743/***************************** worker, utils *********************************/
1744
1745static void smc_llc_testlink_work(struct work_struct *work)
1746{
1747 struct smc_link *link = container_of(to_delayed_work(work),
1748 struct smc_link, llc_testlink_wrk);
1749 unsigned long next_interval;
1750 unsigned long expire_time;
1751 u8 user_data[16] = { 0 };
1752 int rc;
1753
1754 if (!smc_link_active(link))
1755 return; /* don't reschedule worker */
1756 expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
1757 if (time_is_after_jiffies(expire_time)) {
1758 next_interval = expire_time - jiffies;
1759 goto out;
1760 }
1761 reinit_completion(&link->llc_testlink_resp);
1762 smc_llc_send_test_link(link, user_data);
1763 /* receive TEST LINK response over RoCE fabric */
1764 rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
1765 SMC_LLC_WAIT_TIME);
1766 if (!smc_link_active(link))
1767 return; /* link state changed */
1768 if (rc <= 0) {
1769 smcr_link_down_cond_sched(link);
1770 return;
1771 }
1772 next_interval = link->llc_testlink_time;
1773out:
1774 schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
1775}
1776
1777void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
1778{
1779 struct net *net = sock_net(smc->clcsock->sk);
1780
1781 INIT_WORK(&lgr->llc_event_work, smc_llc_event_work);
1782 INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work);
1783 INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work);
1784 INIT_LIST_HEAD(&lgr->llc_event_q);
1785 spin_lock_init(&lgr->llc_event_q_lock);
1786 spin_lock_init(&lgr->llc_flow_lock);
1787 init_waitqueue_head(&lgr->llc_flow_waiter);
1788 init_waitqueue_head(&lgr->llc_msg_waiter);
1789 mutex_init(&lgr->llc_conf_mutex);
1790 lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
1791}
1792
1793/* called after lgr was removed from lgr_list */
1794void smc_llc_lgr_clear(struct smc_link_group *lgr)
1795{
1796 smc_llc_event_flush(lgr);
1797 wake_up_all(&lgr->llc_flow_waiter);
1798 wake_up_all(&lgr->llc_msg_waiter);
1799 cancel_work_sync(&lgr->llc_event_work);
1800 cancel_work_sync(&lgr->llc_add_link_work);
1801 cancel_work_sync(&lgr->llc_del_link_work);
1802 if (lgr->delayed_event) {
1803 kfree(lgr->delayed_event);
1804 lgr->delayed_event = NULL;
1805 }
1806}
1807
1808int smc_llc_link_init(struct smc_link *link)
1809{
1810 init_completion(&link->llc_testlink_resp);
1811 INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
1812 return 0;
1813}
1814
1815void smc_llc_link_active(struct smc_link *link)
1816{
1817 pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, "
1818 "peerid %*phN, ibdev %s, ibport %d\n",
1819 SMC_LGR_ID_SIZE, &link->lgr->id,
1820 SMC_LGR_ID_SIZE, &link->link_uid,
1821 SMC_LGR_ID_SIZE, &link->peer_link_uid,
1822 link->smcibdev->ibdev->name, link->ibport);
1823 link->state = SMC_LNK_ACTIVE;
1824 if (link->lgr->llc_testlink_time) {
1825 link->llc_testlink_time = link->lgr->llc_testlink_time;
1826 schedule_delayed_work(&link->llc_testlink_wrk,
1827 link->llc_testlink_time);
1828 }
1829}
1830
1831/* called in worker context */
1832void smc_llc_link_clear(struct smc_link *link, bool log)
1833{
1834 if (log)
1835 pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN"
1836 ", peerid %*phN, ibdev %s, ibport %d\n",
1837 SMC_LGR_ID_SIZE, &link->lgr->id,
1838 SMC_LGR_ID_SIZE, &link->link_uid,
1839 SMC_LGR_ID_SIZE, &link->peer_link_uid,
1840 link->smcibdev->ibdev->name, link->ibport);
1841 complete(&link->llc_testlink_resp);
1842 cancel_delayed_work_sync(&link->llc_testlink_wrk);
1843}
1844
1845/* register a new rtoken at the remote peer (for all links) */
1846int smc_llc_do_confirm_rkey(struct smc_link *send_link,
1847 struct smc_buf_desc *rmb_desc)
1848{
1849 struct smc_link_group *lgr = send_link->lgr;
1850 struct smc_llc_qentry *qentry = NULL;
1851 int rc = 0;
1852
1853 rc = smc_llc_send_confirm_rkey(send_link, rmb_desc);
1854 if (rc)
1855 goto out;
1856 /* receive CONFIRM RKEY response from server over RoCE fabric */
1857 qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
1858 SMC_LLC_CONFIRM_RKEY);
1859 if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
1860 rc = -EFAULT;
1861out:
1862 if (qentry)
1863 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1864 return rc;
1865}
1866
1867/* unregister an rtoken at the remote peer */
1868int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
1869 struct smc_buf_desc *rmb_desc)
1870{
1871 struct smc_llc_qentry *qentry = NULL;
1872 struct smc_link *send_link;
1873 int rc = 0;
1874
1875 send_link = smc_llc_usable_link(lgr);
1876 if (!send_link)
1877 return -ENOLINK;
1878
1879 /* protected by llc_flow control */
1880 rc = smc_llc_send_delete_rkey(send_link, rmb_desc);
1881 if (rc)
1882 goto out;
1883 /* receive DELETE RKEY response from server over RoCE fabric */
1884 qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
1885 SMC_LLC_DELETE_RKEY);
1886 if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
1887 rc = -EFAULT;
1888out:
1889 if (qentry)
1890 smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
1891 return rc;
1892}
1893
1894void smc_llc_link_set_uid(struct smc_link *link)
1895{
1896 __be32 link_uid;
1897
1898 link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id);
1899 memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE);
1900}
1901
1902/* save peers link user id, used for debug purposes */
1903void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry)
1904{
1905 memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid,
1906 SMC_LGR_ID_SIZE);
1907}
1908
1909/* evaluate confirm link request or response */
1910int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
1911 enum smc_llc_reqresp type)
1912{
1913 if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */
1914 qentry->link->link_id = qentry->msg.confirm_link.link_num;
1915 smc_llc_link_set_uid(qentry->link);
1916 }
1917 if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC))
1918 return -ENOTSUPP;
1919 return 0;
1920}
1921
1922/***************************** init, exit, misc ******************************/
1923
1924static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
1925 {
1926 .handler = smc_llc_rx_handler,
1927 .type = SMC_LLC_CONFIRM_LINK
1928 },
1929 {
1930 .handler = smc_llc_rx_handler,
1931 .type = SMC_LLC_TEST_LINK
1932 },
1933 {
1934 .handler = smc_llc_rx_handler,
1935 .type = SMC_LLC_ADD_LINK
1936 },
1937 {
1938 .handler = smc_llc_rx_handler,
1939 .type = SMC_LLC_ADD_LINK_CONT
1940 },
1941 {
1942 .handler = smc_llc_rx_handler,
1943 .type = SMC_LLC_DELETE_LINK
1944 },
1945 {
1946 .handler = smc_llc_rx_handler,
1947 .type = SMC_LLC_CONFIRM_RKEY
1948 },
1949 {
1950 .handler = smc_llc_rx_handler,
1951 .type = SMC_LLC_CONFIRM_RKEY_CONT
1952 },
1953 {
1954 .handler = smc_llc_rx_handler,
1955 .type = SMC_LLC_DELETE_RKEY
1956 },
1957 {
1958 .handler = NULL,
1959 }
1960};
1961
1962int __init smc_llc_init(void)
1963{
1964 struct smc_wr_rx_handler *handler;
1965 int rc = 0;
1966
1967 for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
1968 INIT_HLIST_NODE(&handler->list);
1969 rc = smc_wr_rx_register_handler(handler);
1970 if (rc)
1971 break;
1972 }
1973 return rc;
1974}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000..cc00a2ec4
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,109 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Definitions for LLC (link layer control) message handling
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
10 * Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#ifndef SMC_LLC_H
14#define SMC_LLC_H
15
16#include "smc_wr.h"
17
18#define SMC_LLC_FLAG_RESP 0x80
19
20#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
21#define SMC_LLC_WAIT_TIME (2 * HZ)
22
23enum smc_llc_reqresp {
24 SMC_LLC_REQ,
25 SMC_LLC_RESP
26};
27
28enum smc_llc_msg_type {
29 SMC_LLC_CONFIRM_LINK = 0x01,
30 SMC_LLC_ADD_LINK = 0x02,
31 SMC_LLC_ADD_LINK_CONT = 0x03,
32 SMC_LLC_DELETE_LINK = 0x04,
33 SMC_LLC_CONFIRM_RKEY = 0x06,
34 SMC_LLC_TEST_LINK = 0x07,
35 SMC_LLC_CONFIRM_RKEY_CONT = 0x08,
36 SMC_LLC_DELETE_RKEY = 0x09,
37};
38
39#define smc_link_downing(state) \
40 (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE)
41
42/* LLC DELETE LINK Request Reason Codes */
43#define SMC_LLC_DEL_LOST_PATH 0x00010000
44#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000
45#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000
46#define SMC_LLC_DEL_PROT_VIOL 0x00040000
47#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000
48/* LLC DELETE LINK Response Reason Codes */
49#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */
50#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */
51
52/* returns a usable link of the link group, or NULL */
53static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr)
54{
55 int i;
56
57 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
58 if (smc_link_usable(&lgr->lnk[i]))
59 return &lgr->lnk[i];
60 return NULL;
61}
62
63/* set the termination reason code for the link group */
64static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr,
65 u32 rsn)
66{
67 if (!lgr->llc_termination_rsn)
68 lgr->llc_termination_rsn = rsn;
69}
70
71/* transmit */
72int smc_llc_send_confirm_link(struct smc_link *lnk,
73 enum smc_llc_reqresp reqresp);
74int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
75 struct smc_link *link_new,
76 enum smc_llc_reqresp reqresp);
77int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
78 enum smc_llc_reqresp reqresp, bool orderly,
79 u32 reason);
80void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id);
81void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc);
82void smc_llc_lgr_clear(struct smc_link_group *lgr);
83int smc_llc_link_init(struct smc_link *link);
84void smc_llc_link_active(struct smc_link *link);
85void smc_llc_link_clear(struct smc_link *link, bool log);
86int smc_llc_do_confirm_rkey(struct smc_link *send_link,
87 struct smc_buf_desc *rmb_desc);
88int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
89 struct smc_buf_desc *rmb_desc);
90int smc_llc_flow_initiate(struct smc_link_group *lgr,
91 enum smc_llc_flowtype type);
92void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow);
93int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
94 enum smc_llc_reqresp type);
95void smc_llc_link_set_uid(struct smc_link *link);
96void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry);
97struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
98 struct smc_link *lnk,
99 int time_out, u8 exp_msg);
100struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow);
101void smc_llc_flow_qentry_del(struct smc_llc_flow *flow);
102void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
103 u32 rsn);
104int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
105int smc_llc_srv_add_link(struct smc_link *link);
106void smc_llc_add_link_local(struct smc_link *link);
107int smc_llc_init(void) __init;
108
109#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h
new file mode 100644
index 000000000..0f4f35aa4
--- /dev/null
+++ b/net/smc/smc_netns.h
@@ -0,0 +1,21 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Shared Memory Communications
3 *
4 * Network namespace definitions.
5 *
6 * Copyright IBM Corp. 2018
7 */
8
9#ifndef SMC_NETNS_H
10#define SMC_NETNS_H
11
12#include "smc_pnet.h"
13
14extern unsigned int smc_net_id;
15
16/* per-network namespace private data */
17struct smc_net {
18 struct smc_pnettable pnettable;
19 struct smc_pnetids_ndev pnetids_ndev;
20};
21#endif
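struct smc_net is allocated once per network namespace by the pernet core and looked up with net_generic(net, smc_net_id) throughout smc_pnet.c. A minimal sketch of how such per-namespace state is typically wired up (the actual registration lives in the SMC module init code; the ops name below is illustrative):

/* Sketch only: typical pernet registration for struct smc_net. */
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"

unsigned int smc_net_id;	/* the real definition lives elsewhere in SMC */

static __net_init int smc_net_init(struct net *net)
{
	/* net_generic(net, smc_net_id) now returns a zeroed struct smc_net */
	return smc_pnet_net_init(net);
}

static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}

static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),	/* per-namespace allocation size */
};

/* registered once at module init: register_pernet_subsys(&smc_net_ops); */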
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000..30bae60d6
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,1174 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Generic netlink support functions to configure an SMC-R PNET table
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
10 */
11
12#include <linux/module.h>
13#include <linux/list.h>
14#include <linux/ctype.h>
15#include <linux/mutex.h>
16#include <net/netlink.h>
17#include <net/genetlink.h>
18
19#include <uapi/linux/if.h>
20#include <uapi/linux/smc.h>
21
22#include <rdma/ib_verbs.h>
23
24#include <net/netns/generic.h>
25#include "smc_netns.h"
26
27#include "smc_pnet.h"
28#include "smc_ib.h"
29#include "smc_ism.h"
30#include "smc_core.h"
31
32static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
33static struct net_device *pnet_find_base_ndev(struct net_device *ndev);
34
35static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
36 [SMC_PNETID_NAME] = {
37 .type = NLA_NUL_STRING,
38 .len = SMC_MAX_PNETID_LEN
39 },
40 [SMC_PNETID_ETHNAME] = {
41 .type = NLA_NUL_STRING,
42 .len = IFNAMSIZ - 1
43 },
44 [SMC_PNETID_IBNAME] = {
45 .type = NLA_NUL_STRING,
46 .len = IB_DEVICE_NAME_MAX - 1
47 },
48 [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
49};
50
51static struct genl_family smc_pnet_nl_family;
52
53enum smc_pnet_nametype {
54 SMC_PNET_ETH = 1,
55 SMC_PNET_IB = 2,
56};
57
58/* pnet entry stored in pnet table */
59struct smc_pnetentry {
60 struct list_head list;
61 char pnet_name[SMC_MAX_PNETID_LEN + 1];
62 enum smc_pnet_nametype type;
63 union {
64 struct {
65 char eth_name[IFNAMSIZ + 1];
66 struct net_device *ndev;
67 };
68 struct {
69 char ib_name[IB_DEVICE_NAME_MAX + 1];
70 u8 ib_port;
71 };
72 };
73};
74
75/* Check if the pnetid is set */
76bool smc_pnet_is_pnetid_set(u8 *pnetid)
77{
78 if (pnetid[0] == 0 || pnetid[0] == _S)
79 return false;
80 return true;
81}
82
83/* Check if two given pnetids match */
84static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
85{
86 int i;
87
88 for (i = 0; i < SMC_MAX_PNETID_LEN; i++) {
89 if ((pnetid1[i] == 0 || pnetid1[i] == _S) &&
90 (pnetid2[i] == 0 || pnetid2[i] == _S))
91 break;
92 if (pnetid1[i] != pnetid2[i])
93 return false;
94 }
95 return true;
96}
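PNETIDs are fixed-width 16-byte fields that may be padded with blanks or NUL bytes, so the comparison above treats a blank and a NUL alike as an end marker. A stand-alone user-space re-implementation with two illustrative inputs:

/* User-space copy of the padded compare, for illustration only. */
#include <stdbool.h>
#include <stdio.h>

#define PNETID_LEN 16
#define BLANK 0x20

static bool pnet_match(const unsigned char *a, const unsigned char *b)
{
	for (int i = 0; i < PNETID_LEN; i++) {
		if ((a[i] == 0 || a[i] == BLANK) && (b[i] == 0 || b[i] == BLANK))
			break;			/* both ids ended here: match */
		if (a[i] != b[i])
			return false;
	}
	return true;
}

int main(void)
{
	/* blank-padded vs NUL-padded spelling of the same id: match (1) */
	printf("%d\n", pnet_match((const unsigned char *)"PNET1           ",
				  (const unsigned char *)"PNET1"));
	/* different ids: no match (0) */
	printf("%d\n", pnet_match((const unsigned char *)"PNET1           ",
				  (const unsigned char *)"PNET10          "));
	return 0;
}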
97
98/* Remove a pnetid from the pnet table.
99 */
100static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
101{
102 struct smc_pnetentry *pnetelem, *tmp_pe;
103 struct smc_pnettable *pnettable;
104 struct smc_ib_device *ibdev;
105 struct smcd_dev *smcd_dev;
106 struct smc_net *sn;
107 int rc = -ENOENT;
108 int ibport;
109
110 /* get pnettable for namespace */
111 sn = net_generic(net, smc_net_id);
112 pnettable = &sn->pnettable;
113
114 /* remove table entry */
115 mutex_lock(&pnettable->lock);
116 list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist,
117 list) {
118 if (!pnet_name ||
119 smc_pnet_match(pnetelem->pnet_name, pnet_name)) {
120 list_del(&pnetelem->list);
121 if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) {
122 dev_put(pnetelem->ndev);
123 pr_warn_ratelimited("smc: net device %s "
124 "erased user defined "
125 "pnetid %.16s\n",
126 pnetelem->eth_name,
127 pnetelem->pnet_name);
128 }
129 kfree(pnetelem);
130 rc = 0;
131 }
132 }
133 mutex_unlock(&pnettable->lock);
134
135 /* if this is not the initial namespace, stop here */
136 if (net != &init_net)
137 return rc;
138
139 /* remove ib devices */
140 mutex_lock(&smc_ib_devices.mutex);
141 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
142 for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) {
143 if (ibdev->pnetid_by_user[ibport] &&
144 (!pnet_name ||
145 smc_pnet_match(pnet_name,
146 ibdev->pnetid[ibport]))) {
147 pr_warn_ratelimited("smc: ib device %s ibport "
148 "%d erased user defined "
149 "pnetid %.16s\n",
150 ibdev->ibdev->name,
151 ibport + 1,
152 ibdev->pnetid[ibport]);
153 memset(ibdev->pnetid[ibport], 0,
154 SMC_MAX_PNETID_LEN);
155 ibdev->pnetid_by_user[ibport] = false;
156 rc = 0;
157 }
158 }
159 }
160 mutex_unlock(&smc_ib_devices.mutex);
161 /* remove smcd devices */
162 mutex_lock(&smcd_dev_list.mutex);
163 list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
164 if (smcd_dev->pnetid_by_user &&
165 (!pnet_name ||
166 smc_pnet_match(pnet_name, smcd_dev->pnetid))) {
167 pr_warn_ratelimited("smc: smcd device %s "
168 "erased user defined pnetid "
169 "%.16s\n", dev_name(&smcd_dev->dev),
170 smcd_dev->pnetid);
171 memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN);
172 smcd_dev->pnetid_by_user = false;
173 rc = 0;
174 }
175 }
176 mutex_unlock(&smcd_dev_list.mutex);
177 return rc;
178}
179
180/* Add the reference to a given network device to the pnet table.
181 */
182static int smc_pnet_add_by_ndev(struct net_device *ndev)
183{
184 struct smc_pnetentry *pnetelem, *tmp_pe;
185 struct smc_pnettable *pnettable;
186 struct net *net = dev_net(ndev);
187 struct smc_net *sn;
188 int rc = -ENOENT;
189
190 /* get pnettable for namespace */
191 sn = net_generic(net, smc_net_id);
192 pnettable = &sn->pnettable;
193
194 mutex_lock(&pnettable->lock);
195 list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
196 if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
197 !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
198 dev_hold(ndev);
199 pnetelem->ndev = ndev;
200 rc = 0;
201 pr_warn_ratelimited("smc: adding net device %s with "
202 "user defined pnetid %.16s\n",
203 pnetelem->eth_name,
204 pnetelem->pnet_name);
205 break;
206 }
207 }
208 mutex_unlock(&pnettable->lock);
209 return rc;
210}
211
212/* Remove the reference to a given network device from the pnet table.
213 */
214static int smc_pnet_remove_by_ndev(struct net_device *ndev)
215{
216 struct smc_pnetentry *pnetelem, *tmp_pe;
217 struct smc_pnettable *pnettable;
218 struct net *net = dev_net(ndev);
219 struct smc_net *sn;
220 int rc = -ENOENT;
221
222 /* get pnettable for namespace */
223 sn = net_generic(net, smc_net_id);
224 pnettable = &sn->pnettable;
225
226 mutex_lock(&pnettable->lock);
227 list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
228 if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
229 dev_put(pnetelem->ndev);
230 pnetelem->ndev = NULL;
231 rc = 0;
232 pr_warn_ratelimited("smc: removing net device %s with "
233 "user defined pnetid %.16s\n",
234 pnetelem->eth_name,
235 pnetelem->pnet_name);
236 break;
237 }
238 }
239 mutex_unlock(&pnettable->lock);
240 return rc;
241}
242
243/* Apply pnetid to ib device when no pnetid is set.
244 */
245static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
246 char *pnet_name)
247{
248 bool applied = false;
249
250 mutex_lock(&smc_ib_devices.mutex);
251 if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) {
252 memcpy(ib_dev->pnetid[ib_port - 1], pnet_name,
253 SMC_MAX_PNETID_LEN);
254 ib_dev->pnetid_by_user[ib_port - 1] = true;
255 applied = true;
256 }
257 mutex_unlock(&smc_ib_devices.mutex);
258 return applied;
259}
260
261/* Apply pnetid to smcd device when no pnetid is set.
262 */
263static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
264{
265 bool applied = false;
266
267 mutex_lock(&smcd_dev_list.mutex);
268 if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) {
269 memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
270 smcd_dev->pnetid_by_user = true;
271 applied = true;
272 }
273 mutex_unlock(&smcd_dev_list.mutex);
274 return applied;
275}
276
277/* The limit for pnetid is 16 characters.
278 * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
279 * Lower case letters are converted to upper case.
280 * Interior blanks should not be used.
281 */
282static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
283{
284 char *bf = skip_spaces(pnet_name);
285 size_t len = strlen(bf);
286 char *end = bf + len;
287
288 if (!len)
289 return false;
290 while (--end >= bf && isspace(*end))
291 ;
292 if (end - bf >= SMC_MAX_PNETID_LEN)
293 return false;
294 while (bf <= end) {
295 if (!isalnum(*bf))
296 return false;
297 *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
298 bf++;
299 }
300 *pnetid = '\0';
301 return true;
302}
303
304/* Find an infiniband device by a given name. The device might not exist. */
305static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
306{
307 struct smc_ib_device *ibdev;
308
309 mutex_lock(&smc_ib_devices.mutex);
310 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
311 if (!strncmp(ibdev->ibdev->name, ib_name,
312 sizeof(ibdev->ibdev->name)) ||
313 (ibdev->ibdev->dev.parent &&
314 !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
315 IB_DEVICE_NAME_MAX - 1))) {
316 goto out;
317 }
318 }
319 ibdev = NULL;
320out:
321 mutex_unlock(&smc_ib_devices.mutex);
322 return ibdev;
323}
324
325/* Find an smcd device by a given name. The device might not exist. */
326static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
327{
328 struct smcd_dev *smcd_dev;
329
330 mutex_lock(&smcd_dev_list.mutex);
331 list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
332 if (!strncmp(dev_name(&smcd_dev->dev), smcd_name,
333 IB_DEVICE_NAME_MAX - 1))
334 goto out;
335 }
336 smcd_dev = NULL;
337out:
338 mutex_unlock(&smcd_dev_list.mutex);
339 return smcd_dev;
340}
341
342static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
343 char *eth_name, char *pnet_name)
344{
345 struct smc_pnetentry *tmp_pe, *new_pe;
346 struct net_device *ndev, *base_ndev;
347 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
348 bool new_netdev;
349 int rc;
350
351 /* check if (base) netdev already has a pnetid. If there is one, we do
352 * not want to add a pnet table entry
353 */
354 rc = -EEXIST;
355 ndev = dev_get_by_name(net, eth_name); /* dev_hold() */
356 if (ndev) {
357 base_ndev = pnet_find_base_ndev(ndev);
358 if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
359 base_ndev->dev_port, ndev_pnetid))
360 goto out_put;
361 }
362
363 /* add a new netdev entry to the pnet table if there isn't one */
364 rc = -ENOMEM;
365 new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
366 if (!new_pe)
367 goto out_put;
368 new_pe->type = SMC_PNET_ETH;
369 memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
370 strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
371 new_pe->ndev = ndev;
372
373 rc = -EEXIST;
374 new_netdev = true;
375 mutex_lock(&pnettable->lock);
376 list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
377 if (tmp_pe->type == SMC_PNET_ETH &&
378 !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
379 new_netdev = false;
380 break;
381 }
382 }
383 if (new_netdev) {
384 list_add_tail(&new_pe->list, &pnettable->pnetlist);
385 mutex_unlock(&pnettable->lock);
386 } else {
387 mutex_unlock(&pnettable->lock);
388 kfree(new_pe);
389 goto out_put;
390 }
391 if (ndev)
392 pr_warn_ratelimited("smc: net device %s "
393 "applied user defined pnetid %.16s\n",
394 new_pe->eth_name, new_pe->pnet_name);
395 return 0;
396
397out_put:
398 if (ndev)
399 dev_put(ndev);
400 return rc;
401}
402
403static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
404 u8 ib_port, char *pnet_name)
405{
406 struct smc_pnetentry *tmp_pe, *new_pe;
407 struct smc_ib_device *ib_dev;
408 bool smcddev_applied = true;
409 bool ibdev_applied = true;
410 struct smcd_dev *smcd_dev;
411 bool new_ibdev;
412
413 /* try to apply the pnetid to active devices */
414 ib_dev = smc_pnet_find_ib(ib_name);
415 if (ib_dev) {
416 ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name);
417 if (ibdev_applied)
418 pr_warn_ratelimited("smc: ib device %s ibport %d "
419 "applied user defined pnetid "
420 "%.16s\n", ib_dev->ibdev->name,
421 ib_port,
422 ib_dev->pnetid[ib_port - 1]);
423 }
424 smcd_dev = smc_pnet_find_smcd(ib_name);
425 if (smcd_dev) {
426 smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name);
427 if (smcddev_applied)
428 pr_warn_ratelimited("smc: smcd device %s "
429 "applied user defined pnetid "
430 "%.16s\n", dev_name(&smcd_dev->dev),
431 smcd_dev->pnetid);
432 }
433 /* Apply fails when a device has a hardware-defined pnetid set, do not
434 * add a pnet table entry in that case.
435 */
436 if (!ibdev_applied || !smcddev_applied)
437 return -EEXIST;
438
439 /* add a new ib entry to the pnet table if there isn't one */
440 new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
441 if (!new_pe)
442 return -ENOMEM;
443 new_pe->type = SMC_PNET_IB;
444 memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
445 strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
446 new_pe->ib_port = ib_port;
447
448 new_ibdev = true;
449 mutex_lock(&pnettable->lock);
450 list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
451 if (tmp_pe->type == SMC_PNET_IB &&
452 !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
453 new_ibdev = false;
454 break;
455 }
456 }
457 if (new_ibdev) {
458 list_add_tail(&new_pe->list, &pnettable->pnetlist);
459 mutex_unlock(&pnettable->lock);
460 } else {
461 mutex_unlock(&pnettable->lock);
462 kfree(new_pe);
463 }
464 return (new_ibdev) ? 0 : -EEXIST;
465}
466
467/* Append a pnetid to the end of the pnet table if not already on this list.
468 */
469static int smc_pnet_enter(struct net *net, struct nlattr *tb[])
470{
471 char pnet_name[SMC_MAX_PNETID_LEN + 1];
472 struct smc_pnettable *pnettable;
473 bool new_netdev = false;
474 bool new_ibdev = false;
475 struct smc_net *sn;
476 u8 ibport = 1;
477 char *string;
478 int rc;
479
480 /* get pnettable for namespace */
481 sn = net_generic(net, smc_net_id);
482 pnettable = &sn->pnettable;
483
484 rc = -EINVAL;
485 if (!tb[SMC_PNETID_NAME])
486 goto error;
487 string = (char *)nla_data(tb[SMC_PNETID_NAME]);
488 if (!smc_pnetid_valid(string, pnet_name))
489 goto error;
490
491 if (tb[SMC_PNETID_ETHNAME]) {
492 string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
493 rc = smc_pnet_add_eth(pnettable, net, string, pnet_name);
494 if (!rc)
495 new_netdev = true;
496 else if (rc != -EEXIST)
497 goto error;
498 }
499
500 /* if this is not the initial namespace, stop here */
501 if (net != &init_net)
502 return new_netdev ? 0 : -EEXIST;
503
504 rc = -EINVAL;
505 if (tb[SMC_PNETID_IBNAME]) {
506 string = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
507 string = strim(string);
508 if (tb[SMC_PNETID_IBPORT]) {
509 ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]);
510 if (ibport < 1 || ibport > SMC_MAX_PORTS)
511 goto error;
512 }
513 rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name);
514 if (!rc)
515 new_ibdev = true;
516 else if (rc != -EEXIST)
517 goto error;
518 }
519 return (new_netdev || new_ibdev) ? 0 : -EEXIST;
520
521error:
522 return rc;
523}
524
525/* Convert an smc_pnetentry to a netlink attribute sequence */
526static int smc_pnet_set_nla(struct sk_buff *msg,
527 struct smc_pnetentry *pnetelem)
528{
529 if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name))
530 return -1;
531 if (pnetelem->type == SMC_PNET_ETH) {
532 if (nla_put_string(msg, SMC_PNETID_ETHNAME,
533 pnetelem->eth_name))
534 return -1;
535 } else {
536 if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a"))
537 return -1;
538 }
539 if (pnetelem->type == SMC_PNET_IB) {
540 if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) ||
541 nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
542 return -1;
543 } else {
544 if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") ||
545 nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff))
546 return -1;
547 }
548
549 return 0;
550}
551
552static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
553{
554 struct net *net = genl_info_net(info);
555
556 return smc_pnet_enter(net, info->attrs);
557}
558
559static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
560{
561 struct net *net = genl_info_net(info);
562
563 if (!info->attrs[SMC_PNETID_NAME])
564 return -EINVAL;
565 return smc_pnet_remove_by_pnetid(net,
566 (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
567}
568
569static int smc_pnet_dump_start(struct netlink_callback *cb)
570{
571 cb->args[0] = 0;
572 return 0;
573}
574
575static int smc_pnet_dumpinfo(struct sk_buff *skb,
576 u32 portid, u32 seq, u32 flags,
577 struct smc_pnetentry *pnetelem)
578{
579 void *hdr;
580
581 hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
582 flags, SMC_PNETID_GET);
583 if (!hdr)
584 return -ENOMEM;
585 if (smc_pnet_set_nla(skb, pnetelem) < 0) {
586 genlmsg_cancel(skb, hdr);
587 return -EMSGSIZE;
588 }
589 genlmsg_end(skb, hdr);
590 return 0;
591}
592
593static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
594 u32 seq, u8 *pnetid, int start_idx)
595{
596 struct smc_pnettable *pnettable;
597 struct smc_pnetentry *pnetelem;
598 struct smc_net *sn;
599 int idx = 0;
600
601 /* get pnettable for namespace */
602 sn = net_generic(net, smc_net_id);
603 pnettable = &sn->pnettable;
604
605 /* dump pnettable entries */
606 mutex_lock(&pnettable->lock);
607 list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
608 if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid))
609 continue;
610 if (idx++ < start_idx)
611 continue;
612 /* if this is not the initial namespace, dump only netdev */
613 if (net != &init_net && pnetelem->type != SMC_PNET_ETH)
614 continue;
615 if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI,
616 pnetelem)) {
617 --idx;
618 break;
619 }
620 }
621 mutex_unlock(&pnettable->lock);
622 return idx;
623}
624
625static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
626{
627 struct net *net = sock_net(skb->sk);
628 int idx;
629
630 idx = _smc_pnet_dump(net, skb, NETLINK_CB(cb->skb).portid,
631 cb->nlh->nlmsg_seq, NULL, cb->args[0]);
632
633 cb->args[0] = idx;
634 return skb->len;
635}
636
637/* Retrieve one PNETID entry */
638static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
639{
640 struct net *net = genl_info_net(info);
641 struct sk_buff *msg;
642 void *hdr;
643
644 if (!info->attrs[SMC_PNETID_NAME])
645 return -EINVAL;
646
647 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
648 if (!msg)
649 return -ENOMEM;
650
651 _smc_pnet_dump(net, msg, info->snd_portid, info->snd_seq,
652 nla_data(info->attrs[SMC_PNETID_NAME]), 0);
653
654 /* finish multi part message and send it */
655 hdr = nlmsg_put(msg, info->snd_portid, info->snd_seq, NLMSG_DONE, 0,
656 NLM_F_MULTI);
657 if (!hdr) {
658 nlmsg_free(msg);
659 return -EMSGSIZE;
660 }
661 return genlmsg_reply(msg, info);
662}
663
664/* Remove and delete all pnetids from pnet table.
665 */
666static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
667{
668 struct net *net = genl_info_net(info);
669
670 smc_pnet_remove_by_pnetid(net, NULL);
671 return 0;
672}
673
674/* SMC_PNETID generic netlink operation definition */
675static const struct genl_ops smc_pnet_ops[] = {
676 {
677 .cmd = SMC_PNETID_GET,
678 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
679 /* can be retrieved by unprivileged users */
680 .doit = smc_pnet_get,
681 .dumpit = smc_pnet_dump,
682 .start = smc_pnet_dump_start
683 },
684 {
685 .cmd = SMC_PNETID_ADD,
686 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
687 .flags = GENL_ADMIN_PERM,
688 .doit = smc_pnet_add
689 },
690 {
691 .cmd = SMC_PNETID_DEL,
692 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
693 .flags = GENL_ADMIN_PERM,
694 .doit = smc_pnet_del
695 },
696 {
697 .cmd = SMC_PNETID_FLUSH,
698 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
699 .flags = GENL_ADMIN_PERM,
700 .doit = smc_pnet_flush
701 }
702};
703
704/* SMC_PNETID family definition */
705static struct genl_family smc_pnet_nl_family __ro_after_init = {
706 .hdrsize = 0,
707 .name = SMCR_GENL_FAMILY_NAME,
708 .version = SMCR_GENL_FAMILY_VERSION,
709 .maxattr = SMC_PNETID_MAX,
710 .policy = smc_pnet_policy,
711 .netnsok = true,
712 .module = THIS_MODULE,
713 .ops = smc_pnet_ops,
714 .n_ops = ARRAY_SIZE(smc_pnet_ops)
715};
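For completeness, a user-space sketch of driving this generic netlink family with libnl-3. The constants come from <linux/smc.h>; the pnet, interface and device names are purely illustrative; lower-case pnet names are upper-cased by smc_pnetid_valid(); and the ADD/DEL/FLUSH commands need CAP_NET_ADMIN. Tools such as smc_pnet from smc-tools wrap the same interface. Error handling is trimmed:

/* Sketch: add one PNET table entry (SMC_PNETID_ADD) via libnl-3. */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <netlink/attr.h>
#include <linux/smc.h>

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, SMCR_GENL_FAMILY_NAME);	/* "SMC_PNETID" */

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    SMC_PNETID_ADD, SMCR_GENL_FAMILY_VERSION);
	nla_put_string(msg, SMC_PNETID_NAME, "pnet1");		/* stored as "PNET1" */
	nla_put_string(msg, SMC_PNETID_ETHNAME, "eth0");	/* example netdev */
	nla_put_string(msg, SMC_PNETID_IBNAME, "mlx5_0");	/* example RoCE device */
	nla_put_u8(msg, SMC_PNETID_IBPORT, 1);

	nl_send_auto(sk, msg);
	nl_recvmsgs_default(sk);	/* pick up the netlink ACK */

	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}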
716
717bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
718{
719 struct smc_net *sn = net_generic(net, smc_net_id);
720 struct smc_pnetids_ndev_entry *pe;
721 bool rc = false;
722
723 read_lock(&sn->pnetids_ndev.lock);
724 list_for_each_entry(pe, &sn->pnetids_ndev.list, list) {
725 if (smc_pnet_match(pnetid, pe->pnetid)) {
726 rc = true;
727 goto unlock;
728 }
729 }
730
731unlock:
732 read_unlock(&sn->pnetids_ndev.lock);
733 return rc;
734}
735
736static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
737{
738 struct smc_net *sn = net_generic(net, smc_net_id);
739 struct smc_pnetids_ndev_entry *pe, *pi;
740
741 pe = kzalloc(sizeof(*pe), GFP_KERNEL);
742 if (!pe)
743 return -ENOMEM;
744
745 write_lock(&sn->pnetids_ndev.lock);
746 list_for_each_entry(pi, &sn->pnetids_ndev.list, list) {
747  if (smc_pnet_match(pnetid, pi->pnetid)) {
748 refcount_inc(&pi->refcnt);
749 kfree(pe);
750 goto unlock;
751 }
752 }
753 refcount_set(&pe->refcnt, 1);
754 memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN);
755 list_add_tail(&pe->list, &sn->pnetids_ndev.list);
756
757unlock:
758 write_unlock(&sn->pnetids_ndev.lock);
759 return 0;
760}
761
762static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
763{
764 struct smc_net *sn = net_generic(net, smc_net_id);
765 struct smc_pnetids_ndev_entry *pe, *pe2;
766
767 write_lock(&sn->pnetids_ndev.lock);
768 list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) {
769 if (smc_pnet_match(pnetid, pe->pnetid)) {
770 if (refcount_dec_and_test(&pe->refcnt)) {
771 list_del(&pe->list);
772 kfree(pe);
773 }
774 break;
775 }
776 }
777 write_unlock(&sn->pnetids_ndev.lock);
778}
779
780static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
781 u8 *ndev_pnetid)
782{
783 struct net_device *base_dev;
784
785 base_dev = __pnet_find_base_ndev(dev);
786 if (base_dev->flags & IFF_UP &&
787 !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
788 ndev_pnetid)) {
789 /* add to PNETIDs list */
790 smc_pnet_add_pnetid(net, ndev_pnetid);
791 }
792}
793
794/* create initial list of netdevice pnetids */
795static void smc_pnet_create_pnetids_list(struct net *net)
796{
797 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
798 struct net_device *dev;
799
800 rtnl_lock();
801 for_each_netdev(net, dev)
802 smc_pnet_add_base_pnetid(net, dev, ndev_pnetid);
803 rtnl_unlock();
804}
805
806/* clean up list of netdevice pnetids */
807static void smc_pnet_destroy_pnetids_list(struct net *net)
808{
809 struct smc_net *sn = net_generic(net, smc_net_id);
810 struct smc_pnetids_ndev_entry *pe, *temp_pe;
811
812 write_lock(&sn->pnetids_ndev.lock);
813 list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) {
814 list_del(&pe->list);
815 kfree(pe);
816 }
817 write_unlock(&sn->pnetids_ndev.lock);
818}
819
820static int smc_pnet_netdev_event(struct notifier_block *this,
821 unsigned long event, void *ptr)
822{
823 struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
824 struct net *net = dev_net(event_dev);
825 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
826
827 switch (event) {
828 case NETDEV_REBOOT:
829 case NETDEV_UNREGISTER:
830 smc_pnet_remove_by_ndev(event_dev);
831 return NOTIFY_OK;
832 case NETDEV_REGISTER:
833 smc_pnet_add_by_ndev(event_dev);
834 return NOTIFY_OK;
835 case NETDEV_UP:
836 smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
837 return NOTIFY_OK;
838 case NETDEV_DOWN:
839 event_dev = __pnet_find_base_ndev(event_dev);
840 if (!smc_pnetid_by_dev_port(event_dev->dev.parent,
841 event_dev->dev_port, ndev_pnetid)) {
842 /* remove from PNETIDs list */
843 smc_pnet_remove_pnetid(net, ndev_pnetid);
844 }
845 return NOTIFY_OK;
846 default:
847 return NOTIFY_DONE;
848 }
849}
850
851static struct notifier_block smc_netdev_notifier = {
852 .notifier_call = smc_pnet_netdev_event
853};
854
855/* init network namespace */
856int smc_pnet_net_init(struct net *net)
857{
858 struct smc_net *sn = net_generic(net, smc_net_id);
859 struct smc_pnettable *pnettable = &sn->pnettable;
860 struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
861
862 INIT_LIST_HEAD(&pnettable->pnetlist);
863 mutex_init(&pnettable->lock);
864 INIT_LIST_HEAD(&pnetids_ndev->list);
865 rwlock_init(&pnetids_ndev->lock);
866
867 smc_pnet_create_pnetids_list(net);
868
869 return 0;
870}
871
872int __init smc_pnet_init(void)
873{
874 int rc;
875
876 rc = genl_register_family(&smc_pnet_nl_family);
877 if (rc)
878 return rc;
879 rc = register_netdevice_notifier(&smc_netdev_notifier);
880 if (rc)
881 genl_unregister_family(&smc_pnet_nl_family);
882
883 return rc;
884}
885
886/* exit network namespace */
887void smc_pnet_net_exit(struct net *net)
888{
889 /* flush pnet table */
890 smc_pnet_remove_by_pnetid(net, NULL);
891 smc_pnet_destroy_pnetids_list(net);
892}
893
894void smc_pnet_exit(void)
895{
896 unregister_netdevice_notifier(&smc_netdev_notifier);
897 genl_unregister_family(&smc_pnet_nl_family);
898}
899
900static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
901{
902 int i, nest_lvl;
903
904 ASSERT_RTNL();
905 nest_lvl = ndev->lower_level;
906 for (i = 0; i < nest_lvl; i++) {
907 struct list_head *lower = &ndev->adj_list.lower;
908
909 if (list_empty(lower))
910 break;
911 lower = lower->next;
912 ndev = netdev_lower_get_next(ndev, &lower);
913 }
914 return ndev;
915}
916
917/* Determine one base device for stacked net devices.
918 * If the lower device level contains more than one device
919 * (for instance with bonding slaves), just the first device
920 * is used to reach a base device.
921 */
922static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
923{
924 rtnl_lock();
925 ndev = __pnet_find_base_ndev(ndev);
926 rtnl_unlock();
927 return ndev;
928}
929
930static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
931 u8 *pnetid)
932{
933 struct smc_pnettable *pnettable;
934 struct net *net = dev_net(ndev);
935 struct smc_pnetentry *pnetelem;
936 struct smc_net *sn;
937 int rc = -ENOENT;
938
939 /* get pnettable for namespace */
940 sn = net_generic(net, smc_net_id);
941 pnettable = &sn->pnettable;
942
943 mutex_lock(&pnettable->lock);
944 list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
945 if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) {
946 /* get pnetid of netdev device */
947 memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN);
948 rc = 0;
949 break;
950 }
951 }
952 mutex_unlock(&pnettable->lock);
953 return rc;
954}
955
956/* find a roce device for the given pnetid */
957static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
958 struct smc_init_info *ini,
959 struct smc_ib_device *known_dev)
960{
961 struct smc_ib_device *ibdev;
962 int i;
963
964 ini->ib_dev = NULL;
965 mutex_lock(&smc_ib_devices.mutex);
966 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
967 if (ibdev == known_dev)
968 continue;
969 for (i = 1; i <= SMC_MAX_PORTS; i++) {
970 if (!rdma_is_port_valid(ibdev->ibdev, i))
971 continue;
972 if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) &&
973 smc_ib_port_active(ibdev, i) &&
974 !test_bit(i - 1, ibdev->ports_going_away) &&
975 !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
976 ini->ib_gid, NULL)) {
977 ini->ib_dev = ibdev;
978 ini->ib_port = i;
979 goto out;
980 }
981 }
982 }
983out:
984 mutex_unlock(&smc_ib_devices.mutex);
985}
986
987/* find alternate roce device with same pnet_id and vlan_id */
988void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
989 struct smc_init_info *ini,
990 struct smc_ib_device *known_dev)
991{
992 _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev);
993}
994
995/* if handshake network device belongs to a roce device, return its
996 * IB device and port
997 */
998static void smc_pnet_find_rdma_dev(struct net_device *netdev,
999 struct smc_init_info *ini)
1000{
1001 struct smc_ib_device *ibdev;
1002
1003 mutex_lock(&smc_ib_devices.mutex);
1004 list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
1005 struct net_device *ndev;
1006 int i;
1007
1008 for (i = 1; i <= SMC_MAX_PORTS; i++) {
1009 if (!rdma_is_port_valid(ibdev->ibdev, i))
1010 continue;
1011 if (!ibdev->ibdev->ops.get_netdev)
1012 continue;
1013 ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i);
1014 if (!ndev)
1015 continue;
1016 dev_put(ndev);
1017 if (netdev == ndev &&
1018 smc_ib_port_active(ibdev, i) &&
1019 !test_bit(i - 1, ibdev->ports_going_away) &&
1020 !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
1021 ini->ib_gid, NULL)) {
1022 ini->ib_dev = ibdev;
1023 ini->ib_port = i;
1024 break;
1025 }
1026 }
1027 }
1028 mutex_unlock(&smc_ib_devices.mutex);
1029}
1030
1031/* Determine the corresponding IB device port based on the hardware PNETID.
1032 * Searching stops at the first matching active IB device port with vlan_id
1033 * configured.
1034 * If nothing found, check pnetid table.
1035 * If nothing found, try to use handshake device
1036 */
1037static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
1038 struct smc_init_info *ini)
1039{
1040 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
1041
1042 ndev = pnet_find_base_ndev(ndev);
1043 if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
1044 ndev_pnetid) &&
1045 smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
1046 smc_pnet_find_rdma_dev(ndev, ini);
1047 return; /* pnetid could not be determined */
1048 }
1049 _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL);
1050}
1051
1052static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
1053 struct smc_init_info *ini)
1054{
1055 u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
1056 struct smcd_dev *ismdev;
1057
1058 ndev = pnet_find_base_ndev(ndev);
1059 if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
1060 ndev_pnetid) &&
1061 smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid))
1062 return; /* pnetid could not be determined */
1063
1064 mutex_lock(&smcd_dev_list.mutex);
1065 list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
1066 if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
1067 !ismdev->going_away &&
1068 (!ini->ism_peer_gid[0] ||
1069 !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
1070 ismdev))) {
1071 ini->ism_dev[0] = ismdev;
1072 break;
1073 }
1074 }
1075 mutex_unlock(&smcd_dev_list.mutex);
1076}
1077
1078/* PNET table analysis for a given sock:
1079 * determine ib_device and port belonging to used internal TCP socket
1080 * ethernet interface.
1081 */
1082void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
1083{
1084 struct dst_entry *dst = sk_dst_get(sk);
1085
1086 ini->ib_dev = NULL;
1087 ini->ib_port = 0;
1088 if (!dst)
1089 goto out;
1090 if (!dst->dev)
1091 goto out_rel;
1092
1093 smc_pnet_find_roce_by_pnetid(dst->dev, ini);
1094
1095out_rel:
1096 dst_release(dst);
1097out:
1098 return;
1099}
1100
1101void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
1102{
1103 struct dst_entry *dst = sk_dst_get(sk);
1104
1105 ini->ism_dev[0] = NULL;
1106 if (!dst)
1107 goto out;
1108 if (!dst->dev)
1109 goto out_rel;
1110
1111 smc_pnet_find_ism_by_pnetid(dst->dev, ini);
1112
1113out_rel:
1114 dst_release(dst);
1115out:
1116 return;
1117}
1118
1119/* Lookup and apply a pnet table entry to the given ib device.
1120 */
1121int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
1122{
1123 char *ib_name = smcibdev->ibdev->name;
1124 struct smc_pnettable *pnettable;
1125 struct smc_pnetentry *tmp_pe;
1126 struct smc_net *sn;
1127 int rc = -ENOENT;
1128
1129 /* get pnettable for init namespace */
1130 sn = net_generic(&init_net, smc_net_id);
1131 pnettable = &sn->pnettable;
1132
1133 mutex_lock(&pnettable->lock);
1134 list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
1135 if (tmp_pe->type == SMC_PNET_IB &&
1136 !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) &&
1137 tmp_pe->ib_port == ib_port) {
1138 smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name);
1139 rc = 0;
1140 break;
1141 }
1142 }
1143 mutex_unlock(&pnettable->lock);
1144
1145 return rc;
1146}
1147
1148/* Lookup and apply a pnet table entry to the given smcd device.
1149 */
1150int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
1151{
1152 const char *ib_name = dev_name(&smcddev->dev);
1153 struct smc_pnettable *pnettable;
1154 struct smc_pnetentry *tmp_pe;
1155 struct smc_net *sn;
1156 int rc = -ENOENT;
1157
1158 /* get pnettable for init namespace */
1159 sn = net_generic(&init_net, smc_net_id);
1160 pnettable = &sn->pnettable;
1161
1162 mutex_lock(&pnettable->lock);
1163 list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
1164 if (tmp_pe->type == SMC_PNET_IB &&
1165 !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
1166 smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name);
1167 rc = 0;
1168 break;
1169 }
1170 }
1171 mutex_unlock(&pnettable->lock);
1172
1173 return rc;
1174}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000..80a88eea4
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,70 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * PNET table queries
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
10 */
11
12#ifndef _SMC_PNET_H
13#define _SMC_PNET_H
14
15#include <net/smc.h>
16
17#if IS_ENABLED(CONFIG_HAVE_PNETID)
18#include <asm/pnet.h>
19#endif
20
21struct smc_ib_device;
22struct smcd_dev;
23struct smc_init_info;
24struct smc_link_group;
25
26/**
27 * struct smc_pnettable - SMC PNET table anchor
28 * @lock: Lock for list action
29 * @pnetlist: List of PNETIDs
30 */
31struct smc_pnettable {
32 struct mutex lock;
33 struct list_head pnetlist;
34};
35
36struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/
37 struct list_head list;
38 rwlock_t lock;
39};
40
41struct smc_pnetids_ndev_entry {
42 struct list_head list;
43 u8 pnetid[SMC_MAX_PNETID_LEN];
44 refcount_t refcnt;
45};
46
47static inline int smc_pnetid_by_dev_port(struct device *dev,
48 unsigned short port, u8 *pnetid)
49{
50#if IS_ENABLED(CONFIG_HAVE_PNETID)
51 return pnet_id_by_dev_port(dev, port, pnetid);
52#else
53 return -ENOENT;
54#endif
55}
56
57int smc_pnet_init(void) __init;
58int smc_pnet_net_init(struct net *net);
59void smc_pnet_exit(void);
60void smc_pnet_net_exit(struct net *net);
61void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini);
62void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini);
63int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port);
64int smc_pnetid_by_table_smcd(struct smcd_dev *smcd);
65void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
66 struct smc_init_info *ini,
67 struct smc_ib_device *known_dev);
68bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid);
69bool smc_pnet_is_pnetid_set(u8 *pnetid);
70#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000..7f7e983e4
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,444 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Manage RMBE
6 * copy new RMBE data into user space
7 *
8 * Copyright IBM Corp. 2016
9 *
10 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
11 */
12
13#include <linux/net.h>
14#include <linux/rcupdate.h>
15#include <linux/sched/signal.h>
16
17#include <net/sock.h>
18
19#include "smc.h"
20#include "smc_core.h"
21#include "smc_cdc.h"
22#include "smc_tx.h" /* smc_tx_consumer_update() */
23#include "smc_rx.h"
24
25/* callback implementation to wake up consumers blocked with smc_rx_wait().
26 * indirectly called by smc_cdc_msg_recv_action().
27 */
28static void smc_rx_wake_up(struct sock *sk)
29{
30 struct socket_wq *wq;
31
32 /* derived from sock_def_readable() */
33 /* called already in smc_listen_work() */
34 rcu_read_lock();
35 wq = rcu_dereference(sk->sk_wq);
36 if (skwq_has_sleeper(wq))
37 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
38 EPOLLRDNORM | EPOLLRDBAND);
39 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
40 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
41 (sk->sk_state == SMC_CLOSED))
42 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
43 rcu_read_unlock();
44}
45
46/* Update consumer cursor
47 * @conn connection to update
48 * @cons consumer cursor
49 * @len number of Bytes consumed
50 * Returns:
51 * 1 if we should end our receive, 0 otherwise
52 */
53static int smc_rx_update_consumer(struct smc_sock *smc,
54 union smc_host_cursor cons, size_t len)
55{
56 struct smc_connection *conn = &smc->conn;
57 struct sock *sk = &smc->sk;
58 bool force = false;
59 int diff, rc = 0;
60
61 smc_curs_add(conn->rmb_desc->len, &cons, len);
62
63 /* did we process urgent data? */
64 if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
65 diff = smc_curs_comp(conn->rmb_desc->len, &cons,
66 &conn->urg_curs);
67 if (sock_flag(sk, SOCK_URGINLINE)) {
68 if (diff == 0) {
69 force = true;
70 rc = 1;
71 conn->urg_state = SMC_URG_READ;
72 }
73 } else {
74 if (diff == 1) {
75 /* skip urgent byte */
76 force = true;
77 smc_curs_add(conn->rmb_desc->len, &cons, 1);
78 conn->urg_rx_skip_pend = false;
79 } else if (diff < -1)
80 /* we read past urgent byte */
81 conn->urg_state = SMC_URG_READ;
82 }
83 }
84
85 smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
86
87 /* send consumer cursor update if required */
88 /* similar to advertising new TCP rcv_wnd if required */
89 smc_tx_consumer_update(conn, force);
90
91 return rc;
92}
93
94static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
95{
96 struct smc_connection *conn = &smc->conn;
97 union smc_host_cursor cons;
98
99 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
100 smc_rx_update_consumer(smc, cons, len);
101}
102
103struct smc_spd_priv {
104 struct smc_sock *smc;
105 size_t len;
106};
107
108static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
109 struct pipe_buffer *buf)
110{
111 struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
112 struct smc_sock *smc = priv->smc;
113 struct smc_connection *conn;
114 struct sock *sk = &smc->sk;
115
116 if (sk->sk_state == SMC_CLOSED ||
117 sk->sk_state == SMC_PEERFINCLOSEWAIT ||
118 sk->sk_state == SMC_APPFINCLOSEWAIT)
119 goto out;
120 conn = &smc->conn;
121 lock_sock(sk);
122 smc_rx_update_cons(smc, priv->len);
123 release_sock(sk);
124 if (atomic_sub_and_test(priv->len, &conn->splice_pending))
125 smc_rx_wake_up(sk);
126out:
127 kfree(priv);
128 put_page(buf->page);
129 sock_put(sk);
130}
131
132static const struct pipe_buf_operations smc_pipe_ops = {
133 .release = smc_rx_pipe_buf_release,
134 .get = generic_pipe_buf_get
135};
136
137static void smc_rx_spd_release(struct splice_pipe_desc *spd,
138 unsigned int i)
139{
140 put_page(spd->pages[i]);
141}
142
143static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
144 struct smc_sock *smc)
145{
146 struct splice_pipe_desc spd;
147 struct partial_page partial;
148 struct smc_spd_priv *priv;
149 int bytes;
150
151 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
152 if (!priv)
153 return -ENOMEM;
154 priv->len = len;
155 priv->smc = smc;
156 partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
157 partial.len = len;
158 partial.private = (unsigned long)priv;
159
160 spd.nr_pages_max = 1;
161 spd.nr_pages = 1;
162 spd.pages = &smc->conn.rmb_desc->pages;
163 spd.partial = &partial;
164 spd.ops = &smc_pipe_ops;
165 spd.spd_release = smc_rx_spd_release;
166
167 bytes = splice_to_pipe(pipe, &spd);
168 if (bytes > 0) {
169 sock_hold(&smc->sk);
170 get_page(smc->conn.rmb_desc->pages);
171 atomic_add(bytes, &smc->conn.splice_pending);
172 }
173
174 return bytes;
175}
176
177static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
178{
179 return atomic_read(&conn->bytes_to_rcv) &&
180 !atomic_read(&conn->splice_pending);
181}
182
183/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
184 * @smc smc socket
185 * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
186 * @fcrit add'l criterion to evaluate as function pointer
187 * Returns:
188 * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
189 * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
190 */
191int smc_rx_wait(struct smc_sock *smc, long *timeo,
192 int (*fcrit)(struct smc_connection *conn))
193{
194 DEFINE_WAIT_FUNC(wait, woken_wake_function);
195 struct smc_connection *conn = &smc->conn;
196 struct smc_cdc_conn_state_flags *cflags =
197 &conn->local_tx_ctrl.conn_state_flags;
198 struct sock *sk = &smc->sk;
199 int rc;
200
201 if (fcrit(conn))
202 return 1;
203 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
204 add_wait_queue(sk_sleep(sk), &wait);
205 rc = sk_wait_event(sk, timeo,
206 sk->sk_err ||
207 cflags->peer_conn_abort ||
208 sk->sk_shutdown & RCV_SHUTDOWN ||
209 conn->killed ||
210 fcrit(conn),
211 &wait);
212 remove_wait_queue(sk_sleep(sk), &wait);
213 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
214 return rc;
215}
216
217static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
218 int flags)
219{
220 struct smc_connection *conn = &smc->conn;
221 union smc_host_cursor cons;
222 struct sock *sk = &smc->sk;
223 int rc = 0;
224
225 if (sock_flag(sk, SOCK_URGINLINE) ||
226 !(conn->urg_state == SMC_URG_VALID) ||
227 conn->urg_state == SMC_URG_READ)
228 return -EINVAL;
229
230 if (conn->urg_state == SMC_URG_VALID) {
231 if (!(flags & MSG_PEEK))
232 smc->conn.urg_state = SMC_URG_READ;
233 msg->msg_flags |= MSG_OOB;
234 if (len > 0) {
235 if (!(flags & MSG_TRUNC))
236 rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
237 len = 1;
238 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
239 if (smc_curs_diff(conn->rmb_desc->len, &cons,
240 &conn->urg_curs) > 1)
241 conn->urg_rx_skip_pend = true;
242 /* Urgent Byte was already accounted for, but trigger
243 * skipping the urgent byte in non-inline case
244 */
245 if (!(flags & MSG_PEEK))
246 smc_rx_update_consumer(smc, cons, 0);
247 } else {
248 msg->msg_flags |= MSG_TRUNC;
249 }
250
251 return rc ? -EFAULT : len;
252 }
253
254 if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
255 return 0;
256
257 return -EAGAIN;
258}
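To relate this to the socket API: with SO_OOBINLINE unset, an application retrieves the single urgent byte with MSG_OOB, which lands in the SMC_URG_VALID branch above; with SO_OOBINLINE set, the byte stays in the normal data stream and an MSG_OOB read fails with -EINVAL. A user-space sketch (error handling trimmed):

/* Sketch: fetch the pending urgent byte out of band on a (SMC or TCP) socket. */
#include <sys/socket.h>
#include <stdio.h>

int read_urgent_byte(int fd)
{
	char urg;
	ssize_t n = recv(fd, &urg, 1, MSG_OOB);

	if (n != 1)
		return -1;	/* no valid urgent byte pending */
	printf("urgent byte: 0x%02x\n", (unsigned char)urg);
	return 0;
}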
259
260static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
261{
262 struct smc_connection *conn = &smc->conn;
263
264 if (smc_rx_data_available(conn))
265 return true;
266 else if (conn->urg_state == SMC_URG_VALID)
267 /* we received a single urgent Byte - skip */
268 smc_rx_update_cons(smc, 0);
269 return false;
270}
271
272/* smc_rx_recvmsg - receive data from RMBE
273 * @msg: copy data to receive buffer
274 * @pipe: copy data to pipe if set - indicates splice() call
275 *
276 * rcvbuf consumer: main API called by socket layer.
277 * Called under sk lock.
278 */
279int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
280 struct pipe_inode_info *pipe, size_t len, int flags)
281{
282 size_t copylen, read_done = 0, read_remaining = len;
283 size_t chunk_len, chunk_off, chunk_len_sum;
284 struct smc_connection *conn = &smc->conn;
285 int (*func)(struct smc_connection *conn);
286 union smc_host_cursor cons;
287 int readable, chunk;
288 char *rcvbuf_base;
289 struct sock *sk;
290 int splbytes;
291 long timeo;
292 int target; /* Read at least these many bytes */
293 int rc;
294
295 if (unlikely(flags & MSG_ERRQUEUE))
296 return -EINVAL; /* future work for sk.sk_family == AF_SMC */
297
298 sk = &smc->sk;
299 if (sk->sk_state == SMC_LISTEN)
300 return -ENOTCONN;
301 if (flags & MSG_OOB)
302 return smc_rx_recv_urg(smc, msg, len, flags);
303 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
304 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
305
306 /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
307 rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
308
309 do { /* while (read_remaining) */
310 if (read_done >= target || (pipe && read_done))
311 break;
312
313 if (conn->killed)
314 break;
315
316 if (smc_rx_recvmsg_data_available(smc))
317 goto copy;
318
319 if (sk->sk_shutdown & RCV_SHUTDOWN) {
320 /* smc_cdc_msg_recv_action() could have run after
321 * above smc_rx_recvmsg_data_available()
322 */
323 if (smc_rx_recvmsg_data_available(smc))
324 goto copy;
325 break;
326 }
327
328 if (read_done) {
329 if (sk->sk_err ||
330 sk->sk_state == SMC_CLOSED ||
331 !timeo ||
332 signal_pending(current))
333 break;
334 } else {
335 if (sk->sk_err) {
336 read_done = sock_error(sk);
337 break;
338 }
339 if (sk->sk_state == SMC_CLOSED) {
340 if (!sock_flag(sk, SOCK_DONE)) {
341 /* This occurs when user tries to read
342 * from never connected socket.
343 */
344 read_done = -ENOTCONN;
345 break;
346 }
347 break;
348 }
349 if (!timeo)
350 return -EAGAIN;
351 if (signal_pending(current)) {
352 read_done = sock_intr_errno(timeo);
353 break;
354 }
355 }
356
357 if (!smc_rx_data_available(conn)) {
358 smc_rx_wait(smc, &timeo, smc_rx_data_available);
359 continue;
360 }
361
362copy:
363 /* initialize variables for 1st iteration of subsequent loop */
364 /* could be just 1 byte, even after waiting on data above */
365 readable = atomic_read(&conn->bytes_to_rcv);
366 splbytes = atomic_read(&conn->splice_pending);
367 if (!readable || (msg && splbytes)) {
368 if (splbytes)
369 func = smc_rx_data_available_and_no_splice_pend;
370 else
371 func = smc_rx_data_available;
372 smc_rx_wait(smc, &timeo, func);
373 continue;
374 }
375
376 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
377 /* subsequent splice() calls pick up where previous left */
378 if (splbytes)
379 smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
380 if (conn->urg_state == SMC_URG_VALID &&
381 sock_flag(&smc->sk, SOCK_URGINLINE) &&
382 readable > 1)
383 readable--; /* always stop at urgent Byte */
384 /* not more than what user space asked for */
385 copylen = min_t(size_t, read_remaining, readable);
386 /* determine chunks where to read from rcvbuf */
387 /* either unwrapped case, or 1st chunk of wrapped case */
388 chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
389 cons.count);
390 chunk_len_sum = chunk_len;
391 chunk_off = cons.count;
392 smc_rmb_sync_sg_for_cpu(conn);
393 for (chunk = 0; chunk < 2; chunk++) {
394 if (!(flags & MSG_TRUNC)) {
395 if (msg) {
396 rc = memcpy_to_msg(msg, rcvbuf_base +
397 chunk_off,
398 chunk_len);
399 } else {
400 rc = smc_rx_splice(pipe, rcvbuf_base +
401 chunk_off, chunk_len,
402 smc);
403 }
404 if (rc < 0) {
405 if (!read_done)
406 read_done = -EFAULT;
407 smc_rmb_sync_sg_for_device(conn);
408 goto out;
409 }
410 }
411 read_remaining -= chunk_len;
412 read_done += chunk_len;
413
414 if (chunk_len_sum == copylen)
415 break; /* either on 1st or 2nd iteration */
416 /* prepare next (== 2nd) iteration */
417 chunk_len = copylen - chunk_len; /* remainder */
418 chunk_len_sum += chunk_len;
419 chunk_off = 0; /* modulo offset in recv ring buffer */
420 }
421 smc_rmb_sync_sg_for_device(conn);
422
423 /* update cursors */
424 if (!(flags & MSG_PEEK)) {
425 /* increased in recv tasklet smc_cdc_msg_rcv() */
426 smp_mb__before_atomic();
427 atomic_sub(copylen, &conn->bytes_to_rcv);
428 /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
429 smp_mb__after_atomic();
430 if (msg && smc_rx_update_consumer(smc, cons, copylen))
431 goto out;
432 }
433 } while (read_remaining);
434out:
435 return read_done;
436}
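The copy loop above walks the receive ring in at most two chunks: one from the consumer cursor up to the end of the RMB and, when the readable data wraps, a second one starting at offset zero. A stand-alone sketch of that arithmetic with assumed numbers (RMB of 16 bytes, cursor at offset 12, 10 bytes to copy):

/* Illustrative only: the wrap-around chunking of smc_rx_recvmsg()
 * (mirrored by the sndbuf producer in smc_tx.c). */
#include <stdio.h>

int main(void)
{
	size_t rmb_len = 16;	/* ring (RMB) size */
	size_t cons = 12;	/* consumer cursor offset */
	size_t copylen = 10;	/* bytes to copy in this pass */

	size_t chunk1_len = copylen < rmb_len - cons ? copylen : rmb_len - cons;
	size_t chunk2_len = copylen - chunk1_len;	/* 0 when nothing wraps */

	printf("chunk1: off=%zu len=%zu\n", cons, chunk1_len);	/* off=12 len=4 */
	printf("chunk2: off=0  len=%zu\n", chunk2_len);		/* len=6 */
	return 0;
}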
437
438/* Initialize receive properties on connection establishment. NB: not __init! */
439void smc_rx_init(struct smc_sock *smc)
440{
441 smc->sk.sk_data_ready = smc_rx_wake_up;
442 atomic_set(&smc->conn.splice_pending, 0);
443 smc->conn.urg_state = SMC_URG_READ;
444}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000..db823c97d
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,31 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Manage RMBE
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_RX_H
13#define SMC_RX_H
14
15#include <linux/socket.h>
16#include <linux/types.h>
17
18#include "smc.h"
19
20void smc_rx_init(struct smc_sock *smc);
21
22int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
23 struct pipe_inode_info *pipe, size_t len, int flags);
24int smc_rx_wait(struct smc_sock *smc, long *timeo,
25 int (*fcrit)(struct smc_connection *conn));
26static inline int smc_rx_data_available(struct smc_connection *conn)
27{
28 return atomic_read(&conn->bytes_to_rcv);
29}
30
31#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000..52ef1fca0
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,646 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Manage send buffer.
6 * Producer:
7 * Copy user space data into send buffer, if send buffer space available.
8 * Consumer:
9 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
10 *
11 * Copyright IBM Corp. 2016
12 *
13 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
14 */
15
16#include <linux/net.h>
17#include <linux/rcupdate.h>
18#include <linux/workqueue.h>
19#include <linux/sched/signal.h>
20
21#include <net/sock.h>
22#include <net/tcp.h>
23
24#include "smc.h"
25#include "smc_wr.h"
26#include "smc_cdc.h"
27#include "smc_close.h"
28#include "smc_ism.h"
29#include "smc_tx.h"
30
31#define SMC_TX_WORK_DELAY 0
32#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
33
34/***************************** sndbuf producer *******************************/
35
36/* callback implementation for sk.sk_write_space()
37 * to wake up sndbuf producers that blocked with smc_tx_wait().
38 * called under sk_socket lock.
39 */
40static void smc_tx_write_space(struct sock *sk)
41{
42 struct socket *sock = sk->sk_socket;
43 struct smc_sock *smc = smc_sk(sk);
44 struct socket_wq *wq;
45
46 /* similar to sk_stream_write_space */
47 if (atomic_read(&smc->conn.sndbuf_space) && sock) {
48 clear_bit(SOCK_NOSPACE, &sock->flags);
49 rcu_read_lock();
50 wq = rcu_dereference(sk->sk_wq);
51 if (skwq_has_sleeper(wq))
52 wake_up_interruptible_poll(&wq->wait,
53 EPOLLOUT | EPOLLWRNORM |
54 EPOLLWRBAND);
55 if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
56 sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
57 rcu_read_unlock();
58 }
59}
60
61/* Wake up sndbuf producers that blocked with smc_tx_wait().
62 * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
63 */
64void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
65{
66 if (smc->sk.sk_socket &&
67 test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
68 smc->sk.sk_write_space(&smc->sk);
69}
70
71/* blocks sndbuf producer until at least one byte of free space available
72 * or urgent Byte was consumed
73 */
74static int smc_tx_wait(struct smc_sock *smc, int flags)
75{
76 DEFINE_WAIT_FUNC(wait, woken_wake_function);
77 struct smc_connection *conn = &smc->conn;
78 struct sock *sk = &smc->sk;
79 long timeo;
80 int rc = 0;
81
82 /* similar to sk_stream_wait_memory */
83 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
84 add_wait_queue(sk_sleep(sk), &wait);
85 while (1) {
86 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
87 if (sk->sk_err ||
88 (sk->sk_shutdown & SEND_SHUTDOWN) ||
89 conn->killed ||
90 conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
91 rc = -EPIPE;
92 break;
93 }
94 if (smc_cdc_rxed_any_close(conn)) {
95 rc = -ECONNRESET;
96 break;
97 }
98 if (!timeo) {
99 /* ensure EPOLLOUT is subsequently generated */
100 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
101 rc = -EAGAIN;
102 break;
103 }
104 if (signal_pending(current)) {
105 rc = sock_intr_errno(timeo);
106 break;
107 }
108 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
109 if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
110 break; /* at least 1 byte of free & no urgent data */
111 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
112 sk_wait_event(sk, &timeo,
113 sk->sk_err ||
114 (sk->sk_shutdown & SEND_SHUTDOWN) ||
115 smc_cdc_rxed_any_close(conn) ||
116 (atomic_read(&conn->sndbuf_space) &&
117 !conn->urg_tx_pend),
118 &wait);
119 }
120 remove_wait_queue(sk_sleep(sk), &wait);
121 return rc;
122}
123
124static bool smc_tx_is_corked(struct smc_sock *smc)
125{
126 struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
127
128 return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
129}
130
131/* sndbuf producer: main API called by socket layer.
132 * called under sock lock.
133 */
134int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
135{
136 size_t copylen, send_done = 0, send_remaining = len;
137 size_t chunk_len, chunk_off, chunk_len_sum;
138 struct smc_connection *conn = &smc->conn;
139 union smc_host_cursor prep;
140 struct sock *sk = &smc->sk;
141 char *sndbuf_base;
142 int tx_cnt_prep;
143 int writespace;
144 int rc, chunk;
145
146 /* This should be in poll */
147 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
148
149 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
150 rc = -EPIPE;
151 goto out_err;
152 }
153
154 while (msg_data_left(msg)) {
155 if (sk->sk_state == SMC_INIT)
156 return -ENOTCONN;
157 if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
158 (smc->sk.sk_err == ECONNABORTED) ||
159 conn->killed)
160 return -EPIPE;
161 if (smc_cdc_rxed_any_close(conn))
162 return send_done ?: -ECONNRESET;
163
164 if (msg->msg_flags & MSG_OOB)
165 conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
166
167 if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
168 if (send_done)
169 return send_done;
170 rc = smc_tx_wait(smc, msg->msg_flags);
171 if (rc)
172 goto out_err;
173 continue;
174 }
175
176 /* initialize variables for 1st iteration of subsequent loop */
177 /* could be just 1 byte, even after smc_tx_wait above */
178 writespace = atomic_read(&conn->sndbuf_space);
179 /* not more than what user space asked for */
180 copylen = min_t(size_t, send_remaining, writespace);
181 /* determine start of sndbuf */
182 sndbuf_base = conn->sndbuf_desc->cpu_addr;
183 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
184 tx_cnt_prep = prep.count;
185 /* determine chunks where to write into sndbuf */
186 /* either unwrapped case, or 1st chunk of wrapped case */
187 chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
188 tx_cnt_prep);
189 chunk_len_sum = chunk_len;
190 chunk_off = tx_cnt_prep;
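		/* Worked example with assumed values (illustrative only):
		 * sndbuf_desc->len == 16384, tx_cnt_prep == 12288 and
		 * copylen == 6144. The first chunk copies 4096 bytes at
		 * offset 12288 (up to the end of the ring); the loop below
		 * then copies the remaining 2048 bytes at offset 0, i.e.
		 * the copy wraps around the send ring buffer.
		 */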
191 smc_sndbuf_sync_sg_for_cpu(conn);
192 for (chunk = 0; chunk < 2; chunk++) {
193 rc = memcpy_from_msg(sndbuf_base + chunk_off,
194 msg, chunk_len);
195 if (rc) {
196 smc_sndbuf_sync_sg_for_device(conn);
197 if (send_done)
198 return send_done;
199 goto out_err;
200 }
201 send_done += chunk_len;
202 send_remaining -= chunk_len;
203
204 if (chunk_len_sum == copylen)
205 break; /* either on 1st or 2nd iteration */
206 /* prepare next (== 2nd) iteration */
207 chunk_len = copylen - chunk_len; /* remainder */
208 chunk_len_sum += chunk_len;
209 chunk_off = 0; /* modulo offset in send ring buffer */
210 }
211 smc_sndbuf_sync_sg_for_device(conn);
212 /* update cursors */
213 smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
214 smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
215 /* increased in send tasklet smc_cdc_tx_handler() */
216 smp_mb__before_atomic();
217 atomic_sub(copylen, &conn->sndbuf_space);
218 /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
219 smp_mb__after_atomic();
220 /* since we just produced more new data into sndbuf,
221 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
222 */
223 if ((msg->msg_flags & MSG_OOB) && !send_remaining)
224 conn->urg_tx_pend = true;
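		/* Example of the corking heuristic below, sizes assumed for
		 * illustration: with a 32768 byte sndbuf, RDMA writes are
		 * deferred (by SMC_TX_CORK_DELAY, ~250 ms) only while
		 * MSG_MORE/TCP_CORK is set and more than 16384 bytes of
		 * sndbuf_space remain free; otherwise the data is pushed to
		 * the peer immediately.
		 */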
225 if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
226 (atomic_read(&conn->sndbuf_space) >
227 (conn->sndbuf_desc->len >> 1)))
228 /* for a corked socket defer the RDMA writes if there
229 * is still sufficient sndbuf_space available
230 */
231 queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
232 SMC_TX_CORK_DELAY);
233 else
234 smc_tx_sndbuf_nonempty(conn);
235 } /* while (msg_data_left(msg)) */
236
237 return send_done;
238
239out_err:
240 rc = sk_stream_error(sk, msg->msg_flags, rc);
241 /* make sure we wake any epoll edge trigger waiter */
242 if (unlikely(rc == -EAGAIN))
243 sk->sk_write_space(sk);
244 return rc;
245}
246
247/***************************** sndbuf consumer *******************************/
248
249/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
250int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
251 u32 offset, int signal)
252{
253 struct smc_ism_position pos;
254 int rc;
255
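	/* For SMC-D there is no RDMA work request: smc_ism_write() moves the
	 * data through the ISM device directly into the peer's DMB selected
	 * by peer_token/peer_rmbe_idx, with conn->tx_off locating this
	 * connection's RMBE within that DMB and @offset the position inside
	 * it.
	 */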
256 memset(&pos, 0, sizeof(pos));
257 pos.token = conn->peer_token;
258 pos.index = conn->peer_rmbe_idx;
259 pos.offset = conn->tx_off + offset;
260 pos.signal = signal;
261 rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
262 if (rc)
263 conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
264 return rc;
265}
266
267/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
268static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
269 int num_sges, struct ib_rdma_wr *rdma_wr)
270{
271 struct smc_link_group *lgr = conn->lgr;
272 struct smc_link *link = conn->lnk;
273 int rc;
274
275 rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
276 rdma_wr->wr.num_sge = num_sges;
277 rdma_wr->remote_addr =
278 lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
279 /* RMBE within RMB */
280 conn->tx_off +
281 /* offset within RMBE */
282 peer_rmbe_offset;
283 rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
284 rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
285 if (rc)
286 smcr_link_down_cond_sched(link);
287 return rc;
288}
289
290/* sndbuf consumer */
291static inline void smc_tx_advance_cursors(struct smc_connection *conn,
292 union smc_host_cursor *prod,
293 union smc_host_cursor *sent,
294 size_t len)
295{
296 smc_curs_add(conn->peer_rmbe_size, prod, len);
297 /* increased in recv tasklet smc_cdc_msg_rcv() */
298 smp_mb__before_atomic();
299 /* data in flight reduces usable snd_wnd */
300 atomic_sub(len, &conn->peer_rmbe_space);
301 /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
302 smp_mb__after_atomic();
303 smc_curs_add(conn->sndbuf_desc->len, sent, len);
304}
305
306/* SMC-R helper for smc_tx_rdma_writes() */
307static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
308 size_t src_off, size_t src_len,
309 size_t dst_off, size_t dst_len,
310 struct smc_rdma_wr *wr_rdma_buf)
311{
312 struct smc_link *link = conn->lnk;
313
314 dma_addr_t dma_addr =
315 sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
316 int src_len_sum = src_len, dst_len_sum = dst_len;
317 int sent_count = src_off;
318 int srcchunk, dstchunk;
319 int num_sges;
320 int rc;
321
322 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
323 struct ib_sge *sge =
324 wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
325
326 num_sges = 0;
327 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
328 sge[srcchunk].addr = dma_addr + src_off;
329 sge[srcchunk].length = src_len;
330 num_sges++;
331
332 src_off += src_len;
333 if (src_off >= conn->sndbuf_desc->len)
334 src_off -= conn->sndbuf_desc->len;
335 /* modulo in send ring */
336 if (src_len_sum == dst_len)
337 break; /* either on 1st or 2nd iteration */
338 /* prepare next (== 2nd) iteration */
339 src_len = dst_len - src_len; /* remainder */
340 src_len_sum += src_len;
341 }
342 rc = smc_tx_rdma_write(conn, dst_off, num_sges,
343 &wr_rdma_buf->wr_tx_rdma[dstchunk]);
344 if (rc)
345 return rc;
346 if (dst_len_sum == len)
347 break; /* either on 1st or 2nd iteration */
348 /* prepare next (== 2nd) iteration */
349 dst_off = 0; /* modulo offset in RMBE ring buffer */
350 dst_len = len - dst_len; /* remainder */
351 dst_len_sum += dst_len;
352 src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
353 sent_count);
354 src_len_sum = src_len;
355 }
356 return 0;
357}
358
359/* SMC-D helper for smc_tx_rdma_writes() */
360static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
361 size_t src_off, size_t src_len,
362 size_t dst_off, size_t dst_len)
363{
364 int src_len_sum = src_len, dst_len_sum = dst_len;
365 int srcchunk, dstchunk;
366 int rc;
367
368 for (dstchunk = 0; dstchunk < 2; dstchunk++) {
369 for (srcchunk = 0; srcchunk < 2; srcchunk++) {
370 void *data = conn->sndbuf_desc->cpu_addr + src_off;
371
372 rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
373 sizeof(struct smcd_cdc_msg), 0);
374 if (rc)
375 return rc;
376 dst_off += src_len;
377 src_off += src_len;
378 if (src_off >= conn->sndbuf_desc->len)
379 src_off -= conn->sndbuf_desc->len;
380 /* modulo in send ring */
381 if (src_len_sum == dst_len)
382 break; /* either on 1st or 2nd iteration */
383 /* prepare next (== 2nd) iteration */
384 src_len = dst_len - src_len; /* remainder */
385 src_len_sum += src_len;
386 }
387 if (dst_len_sum == len)
388 break; /* either on 1st or 2nd iteration */
389 /* prepare next (== 2nd) iteration */
390 dst_off = 0; /* modulo offset in RMBE ring buffer */
391 dst_len = len - dst_len; /* remainder */
392 dst_len_sum += dst_len;
393 src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
394 src_len_sum = src_len;
395 }
396 return 0;
397}
398
399/* sndbuf consumer: prepare all necessary (src & dst) chunks of the data to
400 * transmit; the usable snd_wnd limits the maximum transmit size
401 */
402static int smc_tx_rdma_writes(struct smc_connection *conn,
403 struct smc_rdma_wr *wr_rdma_buf)
404{
405 size_t len, src_len, dst_off, dst_len; /* current chunk values */
406 union smc_host_cursor sent, prep, prod, cons;
407 struct smc_cdc_producer_flags *pflags;
408 int to_send, rmbespace;
409 int rc;
410
411 /* source: sndbuf */
412 smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
413 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
414 /* cf. wmem_alloc - (snd_max - snd_una) */
415 to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
416 if (to_send <= 0)
417 return 0;
418
419 /* destination: RMBE */
420 /* cf. snd_wnd */
421 rmbespace = atomic_read(&conn->peer_rmbe_space);
422 if (rmbespace <= 0)
423 return 0;
424 smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
425 smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
426
427 /* if usable snd_wnd closes ask peer to advertise once it opens again */
428 pflags = &conn->local_tx_ctrl.prod_flags;
429 pflags->write_blocked = (to_send >= rmbespace);
430 /* cf. usable snd_wnd */
431 len = min(to_send, rmbespace);
432
433 /* initialize variables for first iteration of subsequent nested loop */
434 dst_off = prod.count;
435 if (prod.wrap == cons.wrap) {
436 /* the filled destination area is unwrapped,
437 * hence the available free destination space is wrapped
438 * and we need 2 destination chunks of sum len; start with 1st
439 * which is limited by what's available in sndbuf
440 */
441 dst_len = min_t(size_t,
442 conn->peer_rmbe_size - prod.count, len);
443 } else {
444 /* the filled destination area is wrapped,
445 * hence the available free destination space is unwrapped
446 * and we need a single destination chunk of entire len
447 */
448 dst_len = len;
449 }
450 /* dst_len determines the maximum src_len */
451 if (sent.count + dst_len <= conn->sndbuf_desc->len) {
452 /* unwrapped src case: single chunk of entire dst_len */
453 src_len = dst_len;
454 } else {
455 /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
456 src_len = conn->sndbuf_desc->len - sent.count;
457 }
458
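	/* At this point dst_off/dst_len and sent.count/src_len describe the
	 * first chunk pair; the helpers below handle a possible second chunk.
	 * Assumed-value example: peer_rmbe_size == 16384, prod.count == 12288,
	 * prod.wrap == cons.wrap and len == 6144 gives a first destination
	 * chunk of 4096 bytes at offset 12288 and a second one of 2048 bytes
	 * at offset 0.
	 */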
459 if (conn->lgr->is_smcd)
460 rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
461 dst_off, dst_len);
462 else
463 rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
464 dst_off, dst_len, wr_rdma_buf);
465 if (rc)
466 return rc;
467
468 if (conn->urg_tx_pend && len == to_send)
469 pflags->urg_data_present = 1;
470 smc_tx_advance_cursors(conn, &prod, &sent, len);
471 /* update connection's cursors with advanced local cursors */
472 smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
473 /* dst: peer RMBE */
474 smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
475
476 return 0;
477}
478
479/* Wake up sndbuf consumers from any context (IRQ or process)
480 * since there is more data to transmit; the usable snd_wnd limits the maximum transmit size
481 */
482static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
483{
484 struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
485 struct smc_link *link = conn->lnk;
486 struct smc_rdma_wr *wr_rdma_buf;
487 struct smc_cdc_tx_pend *pend;
488 struct smc_wr_buf *wr_buf;
489 int rc;
490
491 if (!link || !smc_wr_tx_link_hold(link))
492 return -ENOLINK;
493 rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
494 if (rc < 0) {
495 smc_wr_tx_link_put(link);
496 if (rc == -EBUSY) {
497 struct smc_sock *smc =
498 container_of(conn, struct smc_sock, conn);
499
500 if (smc->sk.sk_err == ECONNABORTED)
501 return sock_error(&smc->sk);
502 if (conn->killed)
503 return -EPIPE;
504 rc = 0;
505 mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
506 SMC_TX_WORK_DELAY);
507 }
508 return rc;
509 }
510
511 spin_lock_bh(&conn->send_lock);
512 if (link != conn->lnk) {
513 /* link of connection changed, tx_work will restart */
514 smc_wr_tx_put_slot(link,
515 (struct smc_wr_tx_pend_priv *)pend);
516 rc = -ENOLINK;
517 goto out_unlock;
518 }
519 if (!pflags->urg_data_present) {
520 rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
521 if (rc) {
522 smc_wr_tx_put_slot(link,
523 (struct smc_wr_tx_pend_priv *)pend);
524 goto out_unlock;
525 }
526 }
527
528 rc = smc_cdc_msg_send(conn, wr_buf, pend);
529 if (!rc && pflags->urg_data_present) {
530 pflags->urg_data_pending = 0;
531 pflags->urg_data_present = 0;
532 }
533
534out_unlock:
535 spin_unlock_bh(&conn->send_lock);
536 smc_wr_tx_link_put(link);
537 return rc;
538}
539
540static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
541{
542 struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
543 int rc = 0;
544
545 spin_lock_bh(&conn->send_lock);
546 if (!pflags->urg_data_present)
547 rc = smc_tx_rdma_writes(conn, NULL);
548 if (!rc)
549 rc = smcd_cdc_msg_send(conn);
550
551 if (!rc && pflags->urg_data_present) {
552 pflags->urg_data_pending = 0;
553 pflags->urg_data_present = 0;
554 }
555 spin_unlock_bh(&conn->send_lock);
556 return rc;
557}
558
559int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
560{
561 int rc;
562
563 if (conn->killed ||
564 conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
565 return -EPIPE; /* connection being aborted */
566 if (conn->lgr->is_smcd)
567 rc = smcd_tx_sndbuf_nonempty(conn);
568 else
569 rc = smcr_tx_sndbuf_nonempty(conn);
570
571 if (!rc) {
572 /* trigger socket release if connection is closing */
573 struct smc_sock *smc = container_of(conn, struct smc_sock,
574 conn);
575 smc_close_wake_tx_prepared(smc);
576 }
577 return rc;
578}
579
580/* Wake up sndbuf consumers from process context
581 * since there is more data to transmit
582 */
583void smc_tx_work(struct work_struct *work)
584{
585 struct smc_connection *conn = container_of(to_delayed_work(work),
586 struct smc_connection,
587 tx_work);
588 struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
589 int rc;
590
591 lock_sock(&smc->sk);
592 if (smc->sk.sk_err)
593 goto out;
594
595 rc = smc_tx_sndbuf_nonempty(conn);
596 if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
597 !atomic_read(&conn->bytes_to_rcv))
598 conn->local_rx_ctrl.prod_flags.write_blocked = 0;
599
600out:
601 release_sock(&smc->sk);
602}
603
604void smc_tx_consumer_update(struct smc_connection *conn, bool force)
605{
606 union smc_host_cursor cfed, cons, prod;
607 int sender_free = conn->rmb_desc->len;
608 int to_confirm;
609
610 smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
611 smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
612 to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
613 if (to_confirm > conn->rmbe_update_limit) {
614 smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
615 sender_free = conn->rmb_desc->len -
616 smc_curs_diff_large(conn->rmb_desc->len,
617 &cfed, &prod);
618 }
619
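	/* A CDC message carrying the new consumer cursor is sent when the
	 * peer asked for it (cons_curs_upd_req), when @force is set, or when
	 * more than rmbe_update_limit bytes have been consumed since the last
	 * update and the peer either sees at most half of the RMB as free or
	 * reported write_blocked; rmbe_update_limit itself is set up
	 * elsewhere at connection creation.
	 */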
620 if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
621 force ||
622 ((to_confirm > conn->rmbe_update_limit) &&
623 ((sender_free <= (conn->rmb_desc->len / 2)) ||
624 conn->local_rx_ctrl.prod_flags.write_blocked))) {
625 if (conn->killed ||
626 conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
627 return;
628 if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
629 !conn->killed) {
630 queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
631 SMC_TX_WORK_DELAY);
632 return;
633 }
634 }
635 if (conn->local_rx_ctrl.prod_flags.write_blocked &&
636 !atomic_read(&conn->bytes_to_rcv))
637 conn->local_rx_ctrl.prod_flags.write_blocked = 0;
638}
639
640/***************************** send initialize *******************************/
641
642/* Initialize send properties on connection establishment. NB: not __init! */
643void smc_tx_init(struct smc_sock *smc)
644{
645 smc->sk.sk_write_space = smc_tx_write_space;
646}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000..07e6ad762
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,39 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Manage send buffer
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_TX_H
13#define SMC_TX_H
14
15#include <linux/socket.h>
16#include <linux/types.h>
17
18#include "smc.h"
19#include "smc_cdc.h"
20
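/* Bytes already copied into the local sndbuf (tx_curs_prep) but not yet
 * written to the peer (tx_curs_sent); e.g., assuming an unwrapped ring with
 * sent.count == 4096 and prep.count == 10240, this returns 6144.
 */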
21static inline int smc_tx_prepared_sends(struct smc_connection *conn)
22{
23 union smc_host_cursor sent, prep;
24
25 smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
26 smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
27 return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
28}
29
30void smc_tx_work(struct work_struct *work);
31void smc_tx_init(struct smc_sock *smc);
32int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
33int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
34void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
35void smc_tx_consumer_update(struct smc_connection *conn, bool force);
36int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
37 u32 offset, int signal);
38
39#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000..5a81f8c9e
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,720 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Work Requests exploiting Infiniband API
6 *
7 * Work requests (WRs) are submitted via ib_post_send or ib_post_recv
8 * to the reliably connected send queue (RC SQ) or
9 * receive queue (RC RQ), respectively,
10 * and become work queue entries (WQEs).
11 * While an SQ WR/WQE is pending, we track it until transmission completion.
12 * Through the corresponding send or receive completion queue (CQ),
13 * we get completion queue entries (CQEs), also known as work completions (WCs).
14 * Since the CQ callback is called from IRQ context, we split work by using
15 * bottom halves implemented by tasklets.
16 *
17 * SMC uses this to exchange LLC (link layer control)
18 * and CDC (connection data control) messages.
19 *
20 * Copyright IBM Corp. 2016
21 *
22 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
23 */
24
25#include <linux/atomic.h>
26#include <linux/hashtable.h>
27#include <linux/wait.h>
28#include <rdma/ib_verbs.h>
29#include <asm/div64.h>
30
31#include "smc.h"
32#include "smc_wr.h"
33
34#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
35
36#define SMC_WR_RX_HASH_BITS 4
37static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
38static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
39
40struct smc_wr_tx_pend { /* control data for a pending send request */
41 u64 wr_id; /* work request id sent */
42 smc_wr_tx_handler handler;
43 enum ib_wc_status wc_status; /* CQE status */
44 struct smc_link *link;
45 u32 idx;
46 struct smc_wr_tx_pend_priv priv;
47 u8 compl_requested;
48};
49
50/******************************** send queue *********************************/
51
52/*------------------------------- completion --------------------------------*/
53
54/* returns true if at least one tx work request is pending on the given link */
55static inline bool smc_wr_is_tx_pend(struct smc_link *link)
56{
57 if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
58 link->wr_tx_cnt) {
59 return true;
60 }
61 return false;
62}
63
64/* wait till all pending tx work requests on the given link are completed */
65void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
66{
67 wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
68}
69
70static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
71{
72 u32 i;
73
74 for (i = 0; i < link->wr_tx_cnt; i++) {
75 if (link->wr_tx_pends[i].wr_id == wr_id)
76 return i;
77 }
78 return link->wr_tx_cnt;
79}
80
81static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
82{
83 struct smc_wr_tx_pend pnd_snd;
84 struct smc_link *link;
85 u32 pnd_snd_idx;
86
87 link = wc->qp->qp_context;
88
89 if (wc->opcode == IB_WC_REG_MR) {
90 if (wc->status)
91 link->wr_reg_state = FAILED;
92 else
93 link->wr_reg_state = CONFIRMED;
94 smc_wr_wakeup_reg_wait(link);
95 return;
96 }
97
98 pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
99 if (pnd_snd_idx == link->wr_tx_cnt)
100 return;
101 link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
102 if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
103 complete(&link->wr_tx_compl[pnd_snd_idx]);
104 memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
105 /* clear the full struct smc_wr_tx_pend including .priv */
106 memset(&link->wr_tx_pends[pnd_snd_idx], 0,
107 sizeof(link->wr_tx_pends[pnd_snd_idx]));
108 memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
109 sizeof(link->wr_tx_bufs[pnd_snd_idx]));
110 if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
111 return;
112 if (wc->status) {
113 /* terminate link */
114 smcr_link_down_cond_sched(link);
115 }
116 if (pnd_snd.handler)
117 pnd_snd.handler(&pnd_snd.priv, link, wc->status);
118 wake_up(&link->wr_tx_wait);
119}
120
121static void smc_wr_tx_tasklet_fn(unsigned long data)
122{
123 struct smc_ib_device *dev = (struct smc_ib_device *)data;
124 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
125 int i = 0, rc;
126 int polled = 0;
127
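	/* Poll the send CQ until it is empty; the CQ is re-armed with
	 * IB_CQ_REPORT_MISSED_EVENTS during the first round, and a second
	 * full round of polling follows so that completions arriving while
	 * the CQ was being re-armed are not missed.
	 */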
128again:
129 polled++;
130 do {
131 memset(&wc, 0, sizeof(wc));
132 rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
133 if (polled == 1) {
134 ib_req_notify_cq(dev->roce_cq_send,
135 IB_CQ_NEXT_COMP |
136 IB_CQ_REPORT_MISSED_EVENTS);
137 }
138 if (!rc)
139 break;
140 for (i = 0; i < rc; i++)
141 smc_wr_tx_process_cqe(&wc[i]);
142 } while (rc > 0);
143 if (polled == 1)
144 goto again;
145}
146
147void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
148{
149 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
150
151 tasklet_schedule(&dev->send_tasklet);
152}
153
154/*---------------------------- request submission ---------------------------*/
155
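/* Each link has up to SMC_WR_BUF_CNT (16) send buffers; wr_tx_mask is the
 * bitmap of slots currently in use. A free slot is claimed by atomically
 * setting its bit; -EBUSY means all slots are taken, -ENOLINK that the link
 * is no longer sendable.
 */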
156static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
157{
158 *idx = link->wr_tx_cnt;
159 if (!smc_link_sendable(link))
160 return -ENOLINK;
161 for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
162 if (!test_and_set_bit(*idx, link->wr_tx_mask))
163 return 0;
164 }
165 *idx = link->wr_tx_cnt;
166 return -EBUSY;
167}
168
169/**
170 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
171 * and sets info for pending transmit tracking
172 * @link: Pointer to smc_link used to later send the message.
173 * @handler: Send completion handler function pointer.
174 * @wr_buf: Out value returns pointer to message buffer.
175 * @wr_rdma_buf: Out value returns pointer to rdma work request.
176 * @wr_pend_priv: Out value returns pointer serving as handler context.
177 *
178 * Return: 0 on success, or -errno on error.
179 */
180int smc_wr_tx_get_free_slot(struct smc_link *link,
181 smc_wr_tx_handler handler,
182 struct smc_wr_buf **wr_buf,
183 struct smc_rdma_wr **wr_rdma_buf,
184 struct smc_wr_tx_pend_priv **wr_pend_priv)
185{
186 struct smc_link_group *lgr = smc_get_lgr(link);
187 struct smc_wr_tx_pend *wr_pend;
188 u32 idx = link->wr_tx_cnt;
189 struct ib_send_wr *wr_ib;
190 u64 wr_id;
191 int rc;
192
193 *wr_buf = NULL;
194 *wr_pend_priv = NULL;
195 if (in_softirq() || lgr->terminating) {
196 rc = smc_wr_tx_get_free_slot_index(link, &idx);
197 if (rc)
198 return rc;
199 } else {
200 rc = wait_event_interruptible_timeout(
201 link->wr_tx_wait,
202 !smc_link_sendable(link) ||
203 lgr->terminating ||
204 (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
205 SMC_WR_TX_WAIT_FREE_SLOT_TIME);
206 if (!rc) {
207 /* timeout - terminate link */
208 smcr_link_down_cond_sched(link);
209 return -EPIPE;
210 }
211 if (idx == link->wr_tx_cnt)
212 return -EPIPE;
213 }
214 wr_id = smc_wr_tx_get_next_wr_id(link);
215 wr_pend = &link->wr_tx_pends[idx];
216 wr_pend->wr_id = wr_id;
217 wr_pend->handler = handler;
218 wr_pend->link = link;
219 wr_pend->idx = idx;
220 wr_ib = &link->wr_tx_ibs[idx];
221 wr_ib->wr_id = wr_id;
222 *wr_buf = &link->wr_tx_bufs[idx];
223 if (wr_rdma_buf)
224 *wr_rdma_buf = &link->wr_tx_rdmas[idx];
225 *wr_pend_priv = &wr_pend->priv;
226 return 0;
227}
228
229int smc_wr_tx_put_slot(struct smc_link *link,
230 struct smc_wr_tx_pend_priv *wr_pend_priv)
231{
232 struct smc_wr_tx_pend *pend;
233
234 pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
235 if (pend->idx < link->wr_tx_cnt) {
236 u32 idx = pend->idx;
237
238 /* clear the full struct smc_wr_tx_pend including .priv */
239 memset(&link->wr_tx_pends[idx], 0,
240 sizeof(link->wr_tx_pends[idx]));
241 memset(&link->wr_tx_bufs[idx], 0,
242 sizeof(link->wr_tx_bufs[idx]));
243 test_and_clear_bit(idx, link->wr_tx_mask);
244 wake_up(&link->wr_tx_wait);
245 return 1;
246 }
247
248 return 0;
249}
250
251/* Send prepared WR slot via ib_post_send.
252 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
253 */
254int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
255{
256 struct smc_wr_tx_pend *pend;
257 int rc;
258
259 ib_req_notify_cq(link->smcibdev->roce_cq_send,
260 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
261 pend = container_of(priv, struct smc_wr_tx_pend, priv);
262 rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
263 if (rc) {
264 smc_wr_tx_put_slot(link, priv);
265 smcr_link_down_cond_sched(link);
266 }
267 return rc;
268}
269
270/* Send prepared WR slot via ib_post_send and wait for send completion
271 * notification.
272 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
273 */
274int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
275 unsigned long timeout)
276{
277 struct smc_wr_tx_pend *pend;
278 u32 pnd_idx;
279 int rc;
280
281 pend = container_of(priv, struct smc_wr_tx_pend, priv);
282 pend->compl_requested = 1;
283 pnd_idx = pend->idx;
284 init_completion(&link->wr_tx_compl[pnd_idx]);
285
286 rc = smc_wr_tx_send(link, priv);
287 if (rc)
288 return rc;
289 /* wait for completion by smc_wr_tx_process_cqe() */
290 rc = wait_for_completion_interruptible_timeout(
291 &link->wr_tx_compl[pnd_idx], timeout);
292 if (rc <= 0)
293 rc = -ENODATA;
294 if (rc > 0)
295 rc = 0;
296 return rc;
297}
298
299/* Register a memory region and wait for result. */
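/* The registration is posted as a signalled IB_WR_REG_MR work request;
 * wr_reg_state moves from POSTED to CONFIRMED or FAILED in
 * smc_wr_tx_process_cqe() when the completion arrives, which wakes
 * wr_reg_wait. On timeout (SMC_WR_REG_MR_WAIT_TIME) the link is brought
 * down.
 */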
300int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
301{
302 int rc;
303
304 ib_req_notify_cq(link->smcibdev->roce_cq_send,
305 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
306 link->wr_reg_state = POSTED;
307 link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
308 link->wr_reg.mr = mr;
309 link->wr_reg.key = mr->rkey;
310 rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
311 if (rc)
312 return rc;
313
314 atomic_inc(&link->wr_reg_refcnt);
315 rc = wait_event_interruptible_timeout(link->wr_reg_wait,
316 (link->wr_reg_state != POSTED),
317 SMC_WR_REG_MR_WAIT_TIME);
318 if (atomic_dec_and_test(&link->wr_reg_refcnt))
319 wake_up_all(&link->wr_reg_wait);
320 if (!rc) {
321 /* timeout - terminate link */
322 smcr_link_down_cond_sched(link);
323 return -EPIPE;
324 }
325 if (rc == -ERESTARTSYS)
326 return -EINTR;
327 switch (link->wr_reg_state) {
328 case CONFIRMED:
329 rc = 0;
330 break;
331 case FAILED:
332 rc = -EIO;
333 break;
334 case POSTED:
335 rc = -EPIPE;
336 break;
337 }
338 return rc;
339}
340
341/****************************** receive queue ********************************/
342
343int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
344{
345 struct smc_wr_rx_handler *h_iter;
346 int rc = 0;
347
348 spin_lock(&smc_wr_rx_hash_lock);
349 hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
350 if (h_iter->type == handler->type) {
351 rc = -EEXIST;
352 goto out_unlock;
353 }
354 }
355 hash_add(smc_wr_rx_hash, &handler->list, handler->type);
356out_unlock:
357 spin_unlock(&smc_wr_rx_hash_lock);
358 return rc;
359}
360
361/* Demultiplex a received work request to its handler based on the message type.
362 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs are
363 * posted, and not being modified afterwards, so we don't need to lock it.
364 */
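/* Handlers are keyed by the one-byte message type: elsewhere in the SMC code
 * the CDC and LLC layers each register a struct smc_wr_rx_handler via
 * smc_wr_rx_register_handler(), and the type byte at the start of every
 * received buffer (struct smc_wr_rx_hdr) selects which of them is called.
 */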
365static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
366{
367 struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
368 struct smc_wr_rx_handler *handler;
369 struct smc_wr_rx_hdr *wr_rx;
370 u64 temp_wr_id;
371 u32 index;
372
373 if (wc->byte_len < sizeof(*wr_rx))
374 return; /* short message */
375 temp_wr_id = wc->wr_id;
376 index = do_div(temp_wr_id, link->wr_rx_cnt);
377 wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
378 hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
379 if (handler->type == wr_rx->type)
380 handler->handler(wc, wr_rx);
381 }
382}
383
384static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
385{
386 struct smc_link *link;
387 int i;
388
389 for (i = 0; i < num; i++) {
390 link = wc[i].qp->qp_context;
391 if (wc[i].status == IB_WC_SUCCESS) {
392 link->wr_rx_tstamp = jiffies;
393 smc_wr_rx_demultiplex(&wc[i]);
394 smc_wr_rx_post(link); /* refill WR RX */
395 } else {
396 /* handle status errors */
397 switch (wc[i].status) {
398 case IB_WC_RETRY_EXC_ERR:
399 case IB_WC_RNR_RETRY_EXC_ERR:
400 case IB_WC_WR_FLUSH_ERR:
401 smcr_link_down_cond_sched(link);
402 break;
403 default:
404 smc_wr_rx_post(link); /* refill WR RX */
405 break;
406 }
407 }
408 }
409}
410
411static void smc_wr_rx_tasklet_fn(unsigned long data)
412{
413 struct smc_ib_device *dev = (struct smc_ib_device *)data;
414 struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
415 int polled = 0;
416 int rc;
417
418again:
419 polled++;
420 do {
421 memset(&wc, 0, sizeof(wc));
422 rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
423 if (polled == 1) {
424 ib_req_notify_cq(dev->roce_cq_recv,
425 IB_CQ_SOLICITED_MASK
426 | IB_CQ_REPORT_MISSED_EVENTS);
427 }
428 if (!rc)
429 break;
430 smc_wr_rx_process_cqes(&wc[0], rc);
431 } while (rc > 0);
432 if (polled == 1)
433 goto again;
434}
435
436void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
437{
438 struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
439
440 tasklet_schedule(&dev->recv_tasklet);
441}
442
443int smc_wr_rx_post_init(struct smc_link *link)
444{
445 u32 i;
446 int rc = 0;
447
448 for (i = 0; i < link->wr_rx_cnt; i++)
449 rc = smc_wr_rx_post(link);
450 return rc;
451}
452
453/***************************** init, exit, misc ******************************/
454
455void smc_wr_remember_qp_attr(struct smc_link *lnk)
456{
457 struct ib_qp_attr *attr = &lnk->qp_attr;
458 struct ib_qp_init_attr init_attr;
459
460 memset(attr, 0, sizeof(*attr));
461 memset(&init_attr, 0, sizeof(init_attr));
462 ib_query_qp(lnk->roce_qp, attr,
463 IB_QP_STATE |
464 IB_QP_CUR_STATE |
465 IB_QP_PKEY_INDEX |
466 IB_QP_PORT |
467 IB_QP_QKEY |
468 IB_QP_AV |
469 IB_QP_PATH_MTU |
470 IB_QP_TIMEOUT |
471 IB_QP_RETRY_CNT |
472 IB_QP_RNR_RETRY |
473 IB_QP_RQ_PSN |
474 IB_QP_ALT_PATH |
475 IB_QP_MIN_RNR_TIMER |
476 IB_QP_SQ_PSN |
477 IB_QP_PATH_MIG_STATE |
478 IB_QP_CAP |
479 IB_QP_DEST_QPN,
480 &init_attr);
481
482 lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
483 lnk->qp_attr.cap.max_send_wr);
484 lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
485 lnk->qp_attr.cap.max_recv_wr);
486}
487
488static void smc_wr_init_sge(struct smc_link *lnk)
489{
490 u32 i;
491
492 for (i = 0; i < lnk->wr_tx_cnt; i++) {
493 lnk->wr_tx_sges[i].addr =
494 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
495 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
496 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
497 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
498 lnk->roce_pd->local_dma_lkey;
499 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
500 lnk->roce_pd->local_dma_lkey;
501 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
502 lnk->roce_pd->local_dma_lkey;
503 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
504 lnk->roce_pd->local_dma_lkey;
505 lnk->wr_tx_ibs[i].next = NULL;
506 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
507 lnk->wr_tx_ibs[i].num_sge = 1;
508 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
509 lnk->wr_tx_ibs[i].send_flags =
510 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
511 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
512 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
513 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
514 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
515 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
516 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
517 }
518 for (i = 0; i < lnk->wr_rx_cnt; i++) {
519 lnk->wr_rx_sges[i].addr =
520 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
521 lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
522 lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
523 lnk->wr_rx_ibs[i].next = NULL;
524 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
525 lnk->wr_rx_ibs[i].num_sge = 1;
526 }
527 lnk->wr_reg.wr.next = NULL;
528 lnk->wr_reg.wr.num_sge = 0;
529 lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
530 lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
531 lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
532}
533
534void smc_wr_free_link(struct smc_link *lnk)
535{
536 struct ib_device *ibdev;
537
538 if (!lnk->smcibdev)
539 return;
540 ibdev = lnk->smcibdev->ibdev;
541
542 smc_wr_wakeup_reg_wait(lnk);
543 smc_wr_wakeup_tx_wait(lnk);
544
545 smc_wr_tx_wait_no_pending_sends(lnk);
546 wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
547 wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
548
549 if (lnk->wr_rx_dma_addr) {
550 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
551 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
552 DMA_FROM_DEVICE);
553 lnk->wr_rx_dma_addr = 0;
554 }
555 if (lnk->wr_tx_dma_addr) {
556 ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
557 SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
558 DMA_TO_DEVICE);
559 lnk->wr_tx_dma_addr = 0;
560 }
561}
562
563void smc_wr_free_link_mem(struct smc_link *lnk)
564{
565 kfree(lnk->wr_tx_compl);
566 lnk->wr_tx_compl = NULL;
567 kfree(lnk->wr_tx_pends);
568 lnk->wr_tx_pends = NULL;
569 kfree(lnk->wr_tx_mask);
570 lnk->wr_tx_mask = NULL;
571 kfree(lnk->wr_tx_sges);
572 lnk->wr_tx_sges = NULL;
573 kfree(lnk->wr_tx_rdma_sges);
574 lnk->wr_tx_rdma_sges = NULL;
575 kfree(lnk->wr_rx_sges);
576 lnk->wr_rx_sges = NULL;
577 kfree(lnk->wr_tx_rdmas);
578 lnk->wr_tx_rdmas = NULL;
579 kfree(lnk->wr_rx_ibs);
580 lnk->wr_rx_ibs = NULL;
581 kfree(lnk->wr_tx_ibs);
582 lnk->wr_tx_ibs = NULL;
583 kfree(lnk->wr_tx_bufs);
584 lnk->wr_tx_bufs = NULL;
585 kfree(lnk->wr_rx_bufs);
586 lnk->wr_rx_bufs = NULL;
587}
588
589int smc_wr_alloc_link_mem(struct smc_link *link)
590{
591 /* allocate link related memory */
592 link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
593 if (!link->wr_tx_bufs)
594 goto no_mem;
595 link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
596 GFP_KERNEL);
597 if (!link->wr_rx_bufs)
598 goto no_mem_wr_tx_bufs;
599 link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
600 GFP_KERNEL);
601 if (!link->wr_tx_ibs)
602 goto no_mem_wr_rx_bufs;
603 link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
604 sizeof(link->wr_rx_ibs[0]),
605 GFP_KERNEL);
606 if (!link->wr_rx_ibs)
607 goto no_mem_wr_tx_ibs;
608 link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
609 sizeof(link->wr_tx_rdmas[0]),
610 GFP_KERNEL);
611 if (!link->wr_tx_rdmas)
612 goto no_mem_wr_rx_ibs;
613 link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
614 sizeof(link->wr_tx_rdma_sges[0]),
615 GFP_KERNEL);
616 if (!link->wr_tx_rdma_sges)
617 goto no_mem_wr_tx_rdmas;
618 link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
619 GFP_KERNEL);
620 if (!link->wr_tx_sges)
621 goto no_mem_wr_tx_rdma_sges;
622 link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
623 sizeof(link->wr_rx_sges[0]),
624 GFP_KERNEL);
625 if (!link->wr_rx_sges)
626 goto no_mem_wr_tx_sges;
627 link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT),
628 sizeof(*link->wr_tx_mask),
629 GFP_KERNEL);
630 if (!link->wr_tx_mask)
631 goto no_mem_wr_rx_sges;
632 link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
633 sizeof(link->wr_tx_pends[0]),
634 GFP_KERNEL);
635 if (!link->wr_tx_pends)
636 goto no_mem_wr_tx_mask;
637 link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
638 sizeof(link->wr_tx_compl[0]),
639 GFP_KERNEL);
640 if (!link->wr_tx_compl)
641 goto no_mem_wr_tx_pends;
642 return 0;
643
644no_mem_wr_tx_pends:
645 kfree(link->wr_tx_pends);
646no_mem_wr_tx_mask:
647 kfree(link->wr_tx_mask);
648no_mem_wr_rx_sges:
649 kfree(link->wr_rx_sges);
650no_mem_wr_tx_sges:
651 kfree(link->wr_tx_sges);
652no_mem_wr_tx_rdma_sges:
653 kfree(link->wr_tx_rdma_sges);
654no_mem_wr_tx_rdmas:
655 kfree(link->wr_tx_rdmas);
656no_mem_wr_rx_ibs:
657 kfree(link->wr_rx_ibs);
658no_mem_wr_tx_ibs:
659 kfree(link->wr_tx_ibs);
660no_mem_wr_rx_bufs:
661 kfree(link->wr_rx_bufs);
662no_mem_wr_tx_bufs:
663 kfree(link->wr_tx_bufs);
664no_mem:
665 return -ENOMEM;
666}
667
668void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
669{
670 tasklet_kill(&smcibdev->recv_tasklet);
671 tasklet_kill(&smcibdev->send_tasklet);
672}
673
674void smc_wr_add_dev(struct smc_ib_device *smcibdev)
675{
676 tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
677 (unsigned long)smcibdev);
678 tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
679 (unsigned long)smcibdev);
680}
681
682int smc_wr_create_link(struct smc_link *lnk)
683{
684 struct ib_device *ibdev = lnk->smcibdev->ibdev;
685 int rc = 0;
686
687 smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
688 lnk->wr_rx_id = 0;
689 lnk->wr_rx_dma_addr = ib_dma_map_single(
690 ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
691 DMA_FROM_DEVICE);
692 if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
693 lnk->wr_rx_dma_addr = 0;
694 rc = -EIO;
695 goto out;
696 }
697 lnk->wr_tx_dma_addr = ib_dma_map_single(
698 ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
699 DMA_TO_DEVICE);
700 if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
701 rc = -EIO;
702 goto dma_unmap;
703 }
704 smc_wr_init_sge(lnk);
705 memset(lnk->wr_tx_mask, 0,
706 BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
707 init_waitqueue_head(&lnk->wr_tx_wait);
708 atomic_set(&lnk->wr_tx_refcnt, 0);
709 init_waitqueue_head(&lnk->wr_reg_wait);
710 atomic_set(&lnk->wr_reg_refcnt, 0);
711 return rc;
712
713dma_unmap:
714 ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
715 SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
716 DMA_FROM_DEVICE);
717 lnk->wr_rx_dma_addr = 0;
718out:
719 return rc;
720}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000..cb58e6007
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,131 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Work Requests exploiting Infiniband API
6 *
7 * Copyright IBM Corp. 2016
8 *
9 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
10 */
11
12#ifndef SMC_WR_H
13#define SMC_WR_H
14
15#include <linux/atomic.h>
16#include <rdma/ib_verbs.h>
17#include <asm/div64.h>
18
19#include "smc.h"
20#include "smc_core.h"
21
22#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
23
24#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25
26#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
27
28#define SMC_WR_TX_PEND_PRIV_SIZE 32
29
30struct smc_wr_tx_pend_priv {
31 u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
32};
33
34typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
35 struct smc_link *,
36 enum ib_wc_status);
37
38typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
39 unsigned long);
40
41typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
42
43struct smc_wr_rx_handler {
44 struct hlist_node list; /* hash table collision resolution */
45 void (*handler)(struct ib_wc *, void *);
46 u8 type;
47};
48
49/* Only used by RDMA write WRs.
50 * All other WRs (CDC/LLC) use smc_wr_tx_send(), which handles the WR_ID implicitly.
51 */
52static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
53{
54 return atomic_long_inc_return(&link->wr_tx_id);
55}
56
57static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
58{
59 atomic_long_set(wr_tx_id, val);
60}
61
62static inline bool smc_wr_tx_link_hold(struct smc_link *link)
63{
64 if (!smc_link_sendable(link))
65 return false;
66 atomic_inc(&link->wr_tx_refcnt);
67 return true;
68}
69
70static inline void smc_wr_tx_link_put(struct smc_link *link)
71{
72 if (atomic_dec_and_test(&link->wr_tx_refcnt))
73 wake_up_all(&link->wr_tx_wait);
74}
75
76static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
77{
78 wake_up_all(&lnk->wr_tx_wait);
79}
80
81static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
82{
83 wake_up(&lnk->wr_reg_wait);
84}
85
86/* post a new receive work request to refill the slot of a completed old one */
87static inline int smc_wr_rx_post(struct smc_link *link)
88{
89 int rc;
90 u64 wr_id, temp_wr_id;
91 u32 index;
92
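	/* wr_id counts up per link and is mapped onto the receive buffer ring
	 * by the modulo below; e.g., assuming wr_rx_cnt == 48, a wr_id of 50
	 * selects buffer index 2.
	 */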
93 wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
94 temp_wr_id = wr_id;
95 index = do_div(temp_wr_id, link->wr_rx_cnt);
96 link->wr_rx_ibs[index].wr_id = wr_id;
97 rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
98 return rc;
99}
100
101int smc_wr_create_link(struct smc_link *lnk);
102int smc_wr_alloc_link_mem(struct smc_link *lnk);
103void smc_wr_free_link(struct smc_link *lnk);
104void smc_wr_free_link_mem(struct smc_link *lnk);
105void smc_wr_remember_qp_attr(struct smc_link *lnk);
106void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
107void smc_wr_add_dev(struct smc_ib_device *smcibdev);
108
109int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
110 struct smc_wr_buf **wr_buf,
111 struct smc_rdma_wr **wrs,
112 struct smc_wr_tx_pend_priv **wr_pend_priv);
113int smc_wr_tx_put_slot(struct smc_link *link,
114 struct smc_wr_tx_pend_priv *wr_pend_priv);
115int smc_wr_tx_send(struct smc_link *link,
116 struct smc_wr_tx_pend_priv *wr_pend_priv);
117int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
118 unsigned long timeout);
119void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
120void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
121 smc_wr_tx_filter filter,
122 smc_wr_tx_dismisser dismisser,
123 unsigned long data);
124void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
125
126int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
127int smc_wr_rx_post_init(struct smc_link *link);
128void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
129int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
130
131#endif /* SMC_WR_H */