The credit calculation in virtio_transport_get_credit() uses unsigned arithmetic: ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); If the peer shrinks its advertised buffer (peer_buf_alloc) while bytes are in flight, the subtraction can underflow and produce a large positive value, potentially allowing more data to be queued than the peer can handle. Use s64 arithmetic for the subtraction and clamp negative results to zero, matching the approach already used in virtio_transport_has_space(). Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew --- net/vmw_vsock/virtio_transport_common.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index dcc8a1d5851e..d692b227912d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -494,14 +494,25 @@ EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) { u32 ret; + u32 inflight; + s64 bytes; if (!credit) return 0; spin_lock_bh(&vvs->tx_lock); - ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); - if (ret > credit) - ret = credit; + + /* + * Compute available space using s64 to avoid underflow if + * peer_buf_alloc < inflight bytes (can happen if peer shrinks + * its advertised buffer while data is in flight). + */ + inflight = vvs->tx_cnt - vvs->peer_fwd_cnt; + bytes = (s64)vvs->peer_buf_alloc - inflight; + if (bytes < 0) + bytes = 0; + + ret = (bytes > credit) ? credit : (u32)bytes; vvs->tx_cnt += ret; vvs->bytes_unsent += ret; spin_unlock_bh(&vvs->tx_lock); -- 2.34.1 The virtio vsock transport derives its TX credit directly from peer_buf_alloc, which is set from the remote endpoint's SO_VM_SOCKETS_BUFFER_SIZE value. On the host side this means that the amount of data we are willing to queue for a connection is scaled by a guest-chosen buffer size, rather than the host's own vsock configuration. A malicious guest can advertise a large buffer and read slowly, causing the host to allocate a correspondingly large amount of sk_buff memory. Introduce a small helper, virtio_transport_tx_buf_alloc(), that returns min(peer_buf_alloc, buf_alloc), and use it wherever we consume peer_buf_alloc: - virtio_transport_get_credit() - virtio_transport_has_space() - virtio_transport_seqpacket_enqueue() This ensures the effective TX window is bounded by both the peer's advertised buffer and our own buf_alloc (already clamped to buffer_max_size via SO_VM_SOCKETS_BUFFER_MAX_SIZE), so a remote guest cannot force the host to queue more data than allowed by the host's own vsock settings. On an unpatched Ubuntu 22.04 host (~64 GiB RAM), running a PoC with 32 guest vsock connections advertising 2 GiB each and reading slowly drove Slab/SUnreclaim from ~0.5 GiB to ~57 GiB; the system only recovered after killing the QEMU process. With this patch applied: Before: MemFree: ~61.6 GiB Slab: ~142 MiB SUnreclaim: ~117 MiB After 32 high-credit connections: MemFree: ~61.5 GiB Slab: ~178 MiB SUnreclaim: ~152 MiB Only ~35 MiB increase in Slab/SUnreclaim, no host OOM, and the guest remains responsive. Compatibility with non-virtio transports: - VMCI uses the AF_VSOCK buffer knobs to size its queue pairs per socket based on the local vsk->buffer_* values; the remote side cannot enlarge those queues beyond what the local endpoint configured. - Hyper-V's vsock transport uses fixed-size VMBus ring buffers and an MTU bound; there is no peer-controlled credit field comparable to peer_buf_alloc, and the remote endpoint cannot drive in-flight kernel memory above those ring sizes. - The loopback path reuses virtio_transport_common.c, so it naturally follows the same semantics as the virtio transport. This change is limited to virtio_transport_common.c and thus affects virtio and loopback, bringing them in line with the "remote window intersected with local policy" behaviour that VMCI and Hyper-V already effectively have. Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew --- net/vmw_vsock/virtio_transport_common.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d692b227912d..92575e9d02cd 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -491,6 +491,18 @@ void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume) } EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent); +/* + * Return the effective peer buffer size for TX credit. + * + * The peer advertises its receive buffer via peer_buf_alloc, but we cap + * it to our local buf_alloc so a remote peer cannot force us to queue + * more data than our own buffer configuration allows. + */ +static u32 virtio_transport_tx_buf_alloc(struct virtio_vsock_sock *vvs) +{ + return min(vvs->peer_buf_alloc, vvs->buf_alloc); +} + u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) { u32 ret; @@ -508,7 +520,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) * its advertised buffer while data is in flight). */ inflight = vvs->tx_cnt - vvs->peer_fwd_cnt; - bytes = (s64)vvs->peer_buf_alloc - inflight; + bytes = (s64)virtio_transport_tx_buf_alloc(vvs) - inflight; if (bytes < 0) bytes = 0; @@ -842,7 +854,7 @@ virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, spin_lock_bh(&vvs->tx_lock); - if (len > vvs->peer_buf_alloc) { + if (len > virtio_transport_tx_buf_alloc(vvs)) { spin_unlock_bh(&vvs->tx_lock); return -EMSGSIZE; } @@ -893,7 +905,7 @@ static s64 virtio_transport_has_space(struct vsock_sock *vsk) struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; - bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + bytes = (s64)virtio_transport_tx_buf_alloc(vvs) - + (vvs->tx_cnt - vvs->peer_fwd_cnt); if (bytes < 0) bytes = 0; -- 2.34.1 The test requires the sender (client) to send all messages before waking up the receiver (server). Since virtio-vsock had a bug and did not respect the size of the TX buffer, this test worked, but now that we have fixed the bug, it hangs because the sender fills the TX buffer before waking up the receiver. Set the buffer size in the sender (client) as well, as we already do for the receiver (server). Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew --- tools/testing/vsock/vsock_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 9e1250790f33..0e8e173dfbdc 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -351,6 +351,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { + unsigned long long sock_buf_size; unsigned long curr_hash; size_t max_msg_size; int page_size; @@ -363,6 +364,16 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) exit(EXIT_FAILURE); } + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + /* Wait, until receiver sets buffer size. */ control_expectln("SRVREADY"); -- 2.34.1 Add a regression test for the TX credit bounds fix. The test verifies that a sender with a small local buffer size cannot queue excessive data even when the peer advertises a large receive buffer. The client: - Sets a small buffer size (64 KiB) - Connects to server (which advertises 2 MiB buffer) - Sends in non-blocking mode until EAGAIN - Verifies total queued data is bounded This guards against the original vulnerability where a remote peer could cause unbounded kernel memory allocation by advertising a large buffer and reading slowly. Suggested-by: Stefano Garzarella Signed-off-by: Melbin K Mathew --- tools/testing/vsock/vsock_test.c | 103 +++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 0e8e173dfbdc..9f4598ee45f9 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -347,6 +347,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) +#define SMALL_SOCK_BUF_SIZE (64 * 1024ULL) #define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) @@ -2203,6 +2204,103 @@ static void test_stream_nolinger_server(const struct test_opts *opts) close(fd); } +static void test_stream_tx_credit_bounds_client(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + char buf[4096]; + size_t total = 0; + ssize_t sent; + int fd; + int flags; + + memset(buf, 'A', sizeof(buf)); + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + sock_buf_size = SMALL_SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + flags = fcntl(fd, F_GETFL); + if (flags < 0) { + perror("fcntl(F_GETFL)"); + exit(EXIT_FAILURE); + } + + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("fcntl(F_SETFL)"); + exit(EXIT_FAILURE); + } + + control_expectln("SRVREADY"); + + for (;;) { + sent = send(fd, buf, sizeof(buf), 0); + if (sent > 0) { + total += sent; + continue; + } + if (sent < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + break; + + perror("send"); + exit(EXIT_FAILURE); + } + + /* + * With TX credit bounded by local buffer size, sending should + * stall quickly. Allow some overhead but fail if we queued an + * unreasonable amount. + */ + if (total > (size_t)(SMALL_SOCK_BUF_SIZE * 4)) { + fprintf(stderr, + "TX credit too large: queued %zu bytes (expected <= %llu)\n", + total, (unsigned long long)(SMALL_SOCK_BUF_SIZE * 4)); + exit(EXIT_FAILURE); + } + + control_writeln("CLIDONE"); + close(fd); +} + +static void test_stream_tx_credit_bounds_server(const struct test_opts *opts) +{ + unsigned long long sock_buf_size; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + /* Server advertises large buffer; client should still be bounded */ + sock_buf_size = SOCK_BUF_SIZE; + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); + + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + sock_buf_size, + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + + control_writeln("SRVREADY"); + control_expectln("CLIDONE"); + + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -2382,6 +2480,11 @@ static struct test_case test_cases[] = { .run_client = test_seqpacket_unread_bytes_client, .run_server = test_seqpacket_unread_bytes_server, }, + { + .name = "SOCK_STREAM TX credit bounds", + .run_client = test_stream_tx_credit_bounds_client, + .run_server = test_stream_tx_credit_bounds_server, + }, {}, }; -- 2.34.1