Migrate the NFS Direct I/O path from the legacy iov_iter_get_pages_alloc2() API to the modern iov_iter_extract_pages() API. This migration enables support for PCI Peer-to-Peer DMA (P2PDMA) by allowing the ITER_ALLOW_P2PDMA flag to be set. Pass ITER_ALLOW_P2PDMA to iov_iter_extract_pages() only if the local mount indicates support via the NFS_CAP_P2PDMA capability bit (detected at mount time for RDMA transports). Fix the memory safety bug in the Direct I/O loop where pages were being unpinned immediately after request creation. Instead, we now leverage pin-aware nfs_page structures to hold the pins until the I/O is complete. The manual release in the loop is updated to only clean up pages that failed to be handed over to an nfs_page request. Signed-off-by: Pranjal Shrivastava --- fs/nfs/direct.c | 51 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c8429b430181..6916541af9db 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -165,11 +165,17 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) return 0; } -static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +static void nfs_direct_release_pages(struct page **pages, unsigned int npages, + bool pinned) { unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + + if (pinned) { + unpin_user_pages(pages, npages); + } else { + for (i = 0; i < npages; i++) + put_page(pages[i]); + } } void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, @@ -354,23 +360,30 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, inode_dio_begin(inode); while (iov_iter_count(iter)) { - struct page **pagevec; + /* Tell extract pages to allocate the page array */ + struct page **pagevec = NULL; size_t bytes; size_t pgbase; unsigned npages, i; + bool pinned = iov_iter_extract_will_pin(iter); + iov_iter_extraction_t extraction_flags = 0; + + if 
(NFS_SERVER(inode)->caps & NFS_CAP_P2PDMA) + extraction_flags |= ITER_ALLOW_P2PDMA; - result = iov_iter_get_pages_alloc2(iter, &pagevec, - rsize, &pgbase); + result = iov_iter_extract_pages(iter, &pagevec, + rsize, ~0U, + extraction_flags, &pgbase); if (result < 0) break; - + bytes = result; npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_page_create_from_page(dreq->ctx, pagevec[i], false, + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], pinned, pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -386,7 +399,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, requested_bytes += req_len; pos += req_len; } - nfs_direct_release_pages(pagevec, npages); + if (i < npages) + nfs_direct_release_pages(pagevec + i, npages - i, pinned); kvfree(pagevec); if (result < 0) break; @@ -882,13 +896,21 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, NFS_I(inode)->write_io += iov_iter_count(iter); while (iov_iter_count(iter)) { - struct page **pagevec; + + /* Tell extract pages to allocate the page array */ + struct page **pagevec = NULL; size_t bytes; size_t pgbase; unsigned npages, i; + bool pinned = iov_iter_extract_will_pin(iter); + iov_iter_extraction_t extraction_flags = 0; + + if (NFS_SERVER(inode)->caps & NFS_CAP_P2PDMA) + extraction_flags |= ITER_ALLOW_P2PDMA; - result = iov_iter_get_pages_alloc2(iter, &pagevec, - wsize, &pgbase); + result = iov_iter_extract_pages(iter, &pagevec, + wsize, ~0U, + extraction_flags, &pgbase); if (result < 0) break; @@ -898,7 +920,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_page_create_from_page(dreq->ctx, pagevec[i], false, + req 
= nfs_page_create_from_page(dreq->ctx, pagevec[i], pinned, pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -942,7 +964,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, desc.pg_error = 0; defer = true; } - nfs_direct_release_pages(pagevec, npages); + if (i < npages) + nfs_direct_release_pages(pagevec + i, npages - i, pinned); kvfree(pagevec); if (result < 0) break; -- 2.53.0.1185.g05d4b7b318-goog