| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2017, Microsoft Corporation. |
| * |
| * Author(s): Long Li <longli@microsoft.com> |
| */ |
| |
| #include "smbdirect.h" |
| #include "cifs_debug.h" |
| #include "cifsproto.h" |
| #include "smb2proto.h" |
| #include "../common/smbdirect/smbdirect_public.h" |
| |
/* Ports an SMBD transport connection is attempted on */
#define SMB_PORT			445
#define SMBD_PORT			5445

/*
 * Timeout in milliseconds, used for address resolution, route resolution
 * and the RDMA connect step
 */
#define RDMA_RESOLVE_TIMEOUT		5000

/* How long the SMBD negotiation may take, in seconds */
#define SMBD_NEGOTIATE_TIMEOUT		120

/* How long to wait for the peer's keepalive message, in seconds */
#define KEEPALIVE_RECV_TIMEOUT		5

/*
 * Default upper bound for outstanding RDMA read/write operations on a
 * connection. The value may be lowered during QP creation when the
 * hardware supports less.
 */
#define SMBD_CM_RESPONDER_RESOURCES	32
| |
/*
 * Initial per-connection SMBD values, see [MS-SMBD] 3.1.1.1.
 * They are user configurable and only serve as a starting point:
 * the SMBD negotiation may change the values actually in effect.
 */

/* Upper limit of credits the local peer grants to the remote peer */
int smbd_receive_credit_max = 255;

/* Number of credits the remote peer requests from the local peer */
int smbd_send_credit_target = 255;

/* Largest single message that can be sent to the remote peer */
int smbd_max_send_size = 1364;

/*
 * Largest fragmented upper-layer payload that can be received.
 *
 * With max_payload_per_credit being
 *   smbd_max_receive_size - 24 = 1340
 * the theoretical maximum is
 *   smbd_receive_credit_max * max_payload_per_credit
 *   = 255 * 1340 = 341700 (0x536C4)
 *
 * The spec requires at least 131072 (0x20000).
 *
 * For now the value follows the logic previously used in ksmbd:
 *   (1364 * 255) / 2 = 173910 (0x2A756)
 */
int smbd_max_fragmented_recv_size = (1364 * 255) / 2;

/* Largest single message that can be received */
int smbd_max_receive_size = 1364;

/* Idle time after which sending a keepalive message is initiated */
int smbd_keep_alive_interval = 120;

/*
 * Initial RDMA transport values, user configurable.
 * The effective values may be lower because they are capped by what the
 * hardware is capable of.
 */

/* Default maximum number of pages covered by one RDMA read/write */
int smbd_max_frmr_depth = 2048;

/* Payloads smaller than this many bytes use RDMA send/recv, not read/write */
int rdma_readwrite_threshold = 4096;
| |
| /* Transport logging functions |
| * Logging are defined as classes. They can be OR'ed to define the actual |
| * logging level via module parameter smbd_logging_class |
| * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and |
| * log_rdma_event() |
| */ |
| #define LOG_OUTGOING 0x1 |
| #define LOG_INCOMING 0x2 |
| #define LOG_READ 0x4 |
| #define LOG_WRITE 0x8 |
| #define LOG_RDMA_SEND 0x10 |
| #define LOG_RDMA_RECV 0x20 |
| #define LOG_KEEP_ALIVE 0x40 |
| #define LOG_RDMA_EVENT 0x80 |
| #define LOG_RDMA_MR 0x100 |
| static unsigned int smbd_logging_class; |
| module_param(smbd_logging_class, uint, 0644); |
| MODULE_PARM_DESC(smbd_logging_class, |
| "Logging class for SMBD transport 0x0 to 0x100"); |
| |
| #define ERR 0x0 |
| #define INFO 0x1 |
| static unsigned int smbd_logging_level = ERR; |
| module_param(smbd_logging_level, uint, 0644); |
| MODULE_PARM_DESC(smbd_logging_level, |
| "Logging level for SMBD transport, 0 (default): error, 1: info"); |
| |
| static bool smbd_logging_needed(struct smbdirect_socket *sc, |
| void *private_ptr, |
| unsigned int lvl, |
| unsigned int cls) |
| { |
| #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x) |
| BUILD_BUG_SAME(ERR); |
| BUILD_BUG_SAME(INFO); |
| #undef BUILD_BUG_SAME |
| #define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x) |
| BUILD_BUG_SAME(LOG_OUTGOING); |
| BUILD_BUG_SAME(LOG_INCOMING); |
| BUILD_BUG_SAME(LOG_READ); |
| BUILD_BUG_SAME(LOG_WRITE); |
| BUILD_BUG_SAME(LOG_RDMA_SEND); |
| BUILD_BUG_SAME(LOG_RDMA_RECV); |
| BUILD_BUG_SAME(LOG_KEEP_ALIVE); |
| BUILD_BUG_SAME(LOG_RDMA_EVENT); |
| BUILD_BUG_SAME(LOG_RDMA_MR); |
| #undef BUILD_BUG_SAME |
| |
| if (lvl <= smbd_logging_level || cls & smbd_logging_class) |
| return true; |
| return false; |
| } |
| |
| static void smbd_logging_vaprintf(struct smbdirect_socket *sc, |
| const char *func, |
| unsigned int line, |
| void *private_ptr, |
| unsigned int lvl, |
| unsigned int cls, |
| struct va_format *vaf) |
| { |
| cifs_dbg(VFS, "%s:%u %pV", func, line, vaf); |
| } |
| |
/*
 * log_rdma() - emit a transport log message if it is enabled
 * @level: ERR or INFO
 * @class: one or more LOG_* class bits
 * @fmt:   printf-style format, followed by its arguments
 *
 * The message is printed through cifs_dbg(VFS, ...) with the calling
 * function name and line number prepended. It is emitted when @level is at
 * or below smbd_logging_level, or when any bit of @class is set in
 * smbd_logging_class.
 *
 * @level and @class are parenthesized so that compound expressions such as
 * "LOG_READ | LOG_WRITE" are tested as a whole against the enabled classes
 * instead of being split up by operator precedence.
 */
#define log_rdma(level, class, fmt, args...)				\
do {									\
	if ((level) <= smbd_logging_level ||				\
	    ((class) & smbd_logging_class))				\
		cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
} while (0)

/* Per-class convenience wrappers around log_rdma() */
#define log_outgoing(level, fmt, args...) \
		log_rdma(level, LOG_OUTGOING, fmt, ##args)
#define log_incoming(level, fmt, args...) \
		log_rdma(level, LOG_INCOMING, fmt, ##args)
#define log_read(level, fmt, args...)	log_rdma(level, LOG_READ, fmt, ##args)
#define log_write(level, fmt, args...)	log_rdma(level, LOG_WRITE, fmt, ##args)
#define log_rdma_send(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
#define log_rdma_recv(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
#define log_keep_alive(level, fmt, args...) \
		log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
#define log_rdma_event(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
#define log_rdma_mr(level, fmt, args...) \
		log_rdma(level, LOG_RDMA_MR, fmt, ##args)
| |
| static int smbd_post_send_full_iter(struct smbdirect_socket *sc, |
| struct smbdirect_send_batch *batch, |
| struct iov_iter *iter, |
| u32 remaining_data_length) |
| { |
| int bytes = 0; |
| |
| /* |
| * smbdirect_connection_send_single_iter() respects the |
| * negotiated max_send_size, so we need to |
| * loop until the full iter is posted |
| */ |
| |
| while (iov_iter_count(iter) > 0) { |
| int rc; |
| |
| rc = smbdirect_connection_send_single_iter(sc, |
| batch, |
| iter, |
| 0, /* flags */ |
| remaining_data_length); |
| if (rc < 0) |
| return rc; |
| remaining_data_length -= rc; |
| bytes += rc; |
| } |
| |
| return bytes; |
| } |
| |
| /* |
| * Destroy the transport and related RDMA and memory resources |
| * Need to go through all the pending counters and make sure on one is using |
| * the transport while it is destroyed |
| */ |
| void smbd_destroy(struct TCP_Server_Info *server) |
| { |
| struct smbd_connection *info = server->smbd_conn; |
| |
| if (!info) { |
| log_rdma_event(INFO, "rdma session already destroyed\n"); |
| return; |
| } |
| |
| smbdirect_socket_release(info->socket); |
| |
| kfree(info); |
| server->smbd_conn = NULL; |
| } |
| |
| /* |
| * Reconnect this SMBD connection, called from upper layer |
| * return value: 0 on success, or actual error code |
| */ |
| int smbd_reconnect(struct TCP_Server_Info *server) |
| { |
| log_rdma_event(INFO, "reconnecting rdma session\n"); |
| |
| if (!server->smbd_conn) { |
| log_rdma_event(INFO, "rdma session already destroyed\n"); |
| goto create_conn; |
| } |
| |
| /* |
| * This is possible if transport is disconnected and we haven't received |
| * notification from RDMA, but upper layer has detected timeout |
| */ |
| log_rdma_event(INFO, "disconnecting transport\n"); |
| smbd_destroy(server); |
| |
| create_conn: |
| log_rdma_event(INFO, "creating rdma session\n"); |
| server->smbd_conn = smbd_get_connection( |
| server, (struct sockaddr *) &server->dstaddr); |
| |
| if (server->smbd_conn) { |
| cifs_dbg(VFS, "RDMA transport re-established\n"); |
| trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); |
| return 0; |
| } |
| trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); |
| return -ENOENT; |
| } |
| |
| /* Create a SMBD connection, called by upper layer */ |
| static struct smbd_connection *_smbd_get_connection( |
| struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port) |
| { |
| struct net *net = cifs_net_ns(server); |
| struct smbd_connection *info; |
| struct smbdirect_socket *sc; |
| struct smbdirect_socket_parameters init_params = {}; |
| struct smbdirect_socket_parameters *sp; |
| __be16 *sport; |
| u64 port_flags = 0; |
| int ret; |
| |
| switch (port) { |
| case SMBD_PORT: |
| /* |
| * only allow iWarp devices |
| * for port 5445. |
| */ |
| port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW; |
| break; |
| case SMB_PORT: |
| /* |
| * only allow InfiniBand, RoCEv1 or RoCEv2 |
| * devices for port 445. |
| * |
| * (Basically don't allow iWarp devices) |
| */ |
| port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB; |
| break; |
| } |
| |
| /* |
| * Create the initial parameters |
| */ |
| sp = &init_params; |
| sp->flags = port_flags; |
| sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; |
| sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; |
| sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; |
| sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000; |
| sp->initiator_depth = 1; |
| sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES; |
| sp->recv_credit_max = smbd_receive_credit_max; |
| sp->send_credit_target = smbd_send_credit_target; |
| sp->max_send_size = smbd_max_send_size; |
| sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; |
| sp->max_recv_size = smbd_max_receive_size; |
| sp->max_frmr_depth = smbd_max_frmr_depth; |
| sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; |
| sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; |
| |
| info = kzalloc_obj(*info); |
| if (!info) |
| return NULL; |
| ret = smbdirect_socket_create_kern(net, &sc); |
| if (ret) |
| goto socket_init_failed; |
| smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf); |
| ret = smbdirect_socket_set_initial_parameters(sc, sp); |
| if (ret) |
| goto set_params_failed; |
| ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL); |
| if (ret) |
| goto set_settings_failed; |
| |
| if (dstaddr->sa_family == AF_INET6) |
| sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; |
| else |
| sport = &((struct sockaddr_in *)dstaddr)->sin_port; |
| |
| *sport = htons(port); |
| |
| ret = smbdirect_connect_sync(sc, dstaddr); |
| if (ret) { |
| log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n", |
| dstaddr, ERR_PTR(ret)); |
| goto connect_failed; |
| } |
| |
| info->socket = sc; |
| return info; |
| |
| connect_failed: |
| set_settings_failed: |
| set_params_failed: |
| smbdirect_socket_release(sc); |
| socket_init_failed: |
| kfree(info); |
| return NULL; |
| } |
| |
| const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) |
| { |
| if (unlikely(!conn->socket)) { |
| static const struct smbdirect_socket_parameters zero_params; |
| |
| return &zero_params; |
| } |
| |
| return smbdirect_socket_get_current_parameters(conn->socket); |
| } |
| |
| struct smbd_connection *smbd_get_connection( |
| struct TCP_Server_Info *server, struct sockaddr *dstaddr) |
| { |
| struct smbd_connection *ret; |
| const struct smbdirect_socket_parameters *sp; |
| int port = SMBD_PORT; |
| |
| try_again: |
| ret = _smbd_get_connection(server, dstaddr, port); |
| |
| /* Try SMB_PORT if SMBD_PORT doesn't work */ |
| if (!ret && port == SMBD_PORT) { |
| port = SMB_PORT; |
| goto try_again; |
| } |
| if (!ret) |
| return NULL; |
| |
| sp = smbd_get_parameters(ret); |
| |
| server->rdma_readwrite_threshold = |
| rdma_readwrite_threshold > sp->max_fragmented_send_size ? |
| sp->max_fragmented_send_size : |
| rdma_readwrite_threshold; |
| |
| return ret; |
| } |
| |
| /* |
| * Receive data from the transport's receive reassembly queue |
| * All the incoming data packets are placed in reassembly queue |
| * iter: the buffer to read data into |
| * size: the length of data to read |
| * return value: actual data read |
| * |
| * Note: this implementation copies the data from reassembly queue to receive |
| * buffers used by upper layer. This is not the optimal code path. A better way |
| * to do it is to not have upper layer allocate its receive buffers but rather |
| * borrow the buffer from reassembly queue, and return it after data is |
| * consumed. But this will require more changes to upper layer code, and also |
| * need to consider packet boundaries while they still being reassembled. |
| */ |
| int smbd_recv(struct smbd_connection *info, struct msghdr *msg) |
| { |
| struct smbdirect_socket *sc = info->socket; |
| |
| if (!smbdirect_connection_is_connected(sc)) |
| return -ENOTCONN; |
| |
| return smbdirect_connection_recvmsg(sc, msg, 0); |
| } |
| |
| /* |
| * Send data to transport |
| * Each rqst is transported as a SMBDirect payload |
| * rqst: the data to write |
| * return value: 0 if successfully write, otherwise error code |
| */ |
| int smbd_send(struct TCP_Server_Info *server, |
| int num_rqst, struct smb_rqst *rqst_array) |
| { |
| struct smbd_connection *info = server->smbd_conn; |
| struct smbdirect_socket *sc = info->socket; |
| const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info); |
| struct smb_rqst *rqst; |
| struct iov_iter iter; |
| struct smbdirect_send_batch_storage bstorage; |
| struct smbdirect_send_batch *batch; |
| unsigned int remaining_data_length, klen; |
| int rc, i, rqst_idx; |
| int error = 0; |
| |
| if (!smbdirect_connection_is_connected(sc)) |
| return -EAGAIN; |
| |
| /* |
| * Add in the page array if there is one. The caller needs to set |
| * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and |
| * ends at page boundary |
| */ |
| remaining_data_length = 0; |
| for (i = 0; i < num_rqst; i++) |
| remaining_data_length += smb_rqst_len(server, &rqst_array[i]); |
| |
| if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) { |
| /* assertion: payload never exceeds negotiated maximum */ |
| log_write(ERR, "payload size %d > max size %d\n", |
| remaining_data_length, sp->max_fragmented_send_size); |
| return -EINVAL; |
| } |
| |
| log_write(INFO, "num_rqst=%d total length=%u\n", |
| num_rqst, remaining_data_length); |
| |
| rqst_idx = 0; |
| batch = smbdirect_init_send_batch_storage(&bstorage, false, 0); |
| do { |
| rqst = &rqst_array[rqst_idx]; |
| |
| cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", |
| rqst_idx, smb_rqst_len(server, rqst)); |
| for (i = 0; i < rqst->rq_nvec; i++) |
| dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); |
| |
| log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", |
| rqst_idx, rqst->rq_nvec, remaining_data_length, |
| iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); |
| |
| /* Send the metadata pages. */ |
| klen = 0; |
| for (i = 0; i < rqst->rq_nvec; i++) |
| klen += rqst->rq_iov[i].iov_len; |
| iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); |
| |
| rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length); |
| if (rc < 0) { |
| error = rc; |
| break; |
| } |
| remaining_data_length -= rc; |
| |
| if (iov_iter_count(&rqst->rq_iter) > 0) { |
| /* And then the data pages if there are any */ |
| rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter, |
| remaining_data_length); |
| if (rc < 0) { |
| error = rc; |
| break; |
| } |
| remaining_data_length -= rc; |
| } |
| |
| } while (++rqst_idx < num_rqst); |
| |
| rc = smbdirect_connection_send_batch_flush(sc, batch, true); |
| if (unlikely(!rc && error)) |
| rc = error; |
| |
| /* |
| * As an optimization, we don't wait for individual I/O to finish |
| * before sending the next one. |
| * Send them all and wait for pending send count to get to 0 |
| * that means all the I/Os have been out and we are good to return |
| */ |
| |
| error = rc; |
| rc = smbdirect_connection_send_wait_zero_pending(sc); |
| if (unlikely(rc && !error)) |
| error = -EAGAIN; |
| |
| if (unlikely(error)) |
| return error; |
| |
| return 0; |
| } |
| |
| /* |
| * Register memory for RDMA read/write |
| * iter: the buffer to register memory with |
| * writing: true if this is a RDMA write (SMB read), false for RDMA read |
| * need_invalidate: true if this MR needs to be locally invalidated after I/O |
| * return value: the MR registered, NULL if failed. |
| */ |
| struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, |
| struct iov_iter *iter, |
| bool writing, bool need_invalidate) |
| { |
| struct smbdirect_socket *sc = info->socket; |
| |
| if (!smbdirect_connection_is_connected(sc)) |
| return NULL; |
| |
| return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate); |
| } |
| |
/*
 * Fill the buffer descriptor @v1 from the registered memory region @mr.
 * Thin wrapper around smbdirect_mr_io_fill_buffer_descriptor().
 */
void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
				    struct smbdirect_buffer_descriptor_v1 *v1)
{
	smbdirect_mr_io_fill_buffer_descriptor(mr, v1);
}
| |
/*
 * Deregister the memory region @mr once its I/O is done.
 *
 * This may wait when remote invalidation is not used: the buffer then has
 * to be invalidated locally, so that the remote peer can no longer modify
 * the data after the upper layer has consumed it.
 */
void smbd_deregister_mr(struct smbdirect_mr_io *mr)
{
	smbdirect_connection_deregister_mr_io(mr);
}
| |
| void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m) |
| { |
| if (!server->rdma) |
| return; |
| |
| if (!server->smbd_conn) { |
| seq_puts(m, "\nSMBDirect transport not available"); |
| return; |
| } |
| |
| smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket, |
| server->rdma_readwrite_threshold, |
| m); |
| } |