This reverts commit 7ffa8f3d30236e0ab897c30bdb01224ff1fe1c89. --- fs/ntfs/aops.c | 1744 ++++++++++++++++++++++++ fs/ntfs/aops.h | 88 ++ fs/ntfs/attrib.c | 2624 ++++++++++++++++++++++++++++++++++++ fs/ntfs/attrib.h | 102 ++ fs/ntfs/bitmap.c | 179 +++ fs/ntfs/bitmap.h | 104 ++ fs/ntfs/collate.c | 110 ++ fs/ntfs/collate.h | 36 + fs/ntfs/compress.c | 950 +++++++++++++ fs/ntfs/debug.c | 159 +++ fs/ntfs/debug.h | 57 + fs/ntfs/dir.c | 1540 +++++++++++++++++++++ fs/ntfs/dir.h | 34 + fs/ntfs/endian.h | 79 ++ fs/ntfs/file.c | 1997 +++++++++++++++++++++++++++ fs/ntfs/index.c | 440 ++++++ fs/ntfs/index.h | 134 ++ fs/ntfs/inode.c | 3102 ++++++++++++++++++++++++++++++++++++++++++ fs/ntfs/inode.h | 310 +++++ fs/ntfs/layout.h | 2421 +++++++++++++++++++++++++++++++++ fs/ntfs/lcnalloc.c | 1000 ++++++++++++++ fs/ntfs/lcnalloc.h | 131 ++ fs/ntfs/logfile.c | 849 ++++++++++++ fs/ntfs/logfile.h | 295 ++++ fs/ntfs/malloc.h | 77 ++ fs/ntfs/mft.c | 2907 ++++++++++++++++++++++++++++++++++++++++ fs/ntfs/mft.h | 110 ++ fs/ntfs/mst.c | 189 +++ fs/ntfs/namei.c | 392 ++++++ fs/ntfs/ntfs.h | 150 +++ fs/ntfs/quota.c | 103 ++ fs/ntfs/quota.h | 21 + fs/ntfs/runlist.c | 1893 ++++++++++++++++++++++++++ fs/ntfs/runlist.h | 88 ++ fs/ntfs/super.c | 3202 ++++++++++++++++++++++++++++++++++++++++++++ fs/ntfs/sysctl.c | 58 + fs/ntfs/sysctl.h | 27 + fs/ntfs/time.h | 89 ++ fs/ntfs/types.h | 55 + fs/ntfs/unistr.c | 384 ++++++ fs/ntfs/upcase.c | 73 + fs/ntfs/volume.h | 164 +++ 42 files changed, 28467 insertions(+) create mode 100644 fs/ntfs/aops.c create mode 100644 fs/ntfs/aops.h create mode 100644 fs/ntfs/attrib.c create mode 100644 fs/ntfs/attrib.h create mode 100644 fs/ntfs/bitmap.c create mode 100644 fs/ntfs/bitmap.h create mode 100644 fs/ntfs/collate.c create mode 100644 fs/ntfs/collate.h create mode 100644 fs/ntfs/compress.c create mode 100644 fs/ntfs/debug.c create mode 100644 fs/ntfs/debug.h create mode 100644 fs/ntfs/dir.c create mode 100644 fs/ntfs/dir.h create mode 100644 fs/ntfs/endian.h create mode 100644 fs/ntfs/file.c create mode 100644 fs/ntfs/index.c create mode 100644 fs/ntfs/index.h create mode 100644 fs/ntfs/inode.c create mode 100644 fs/ntfs/inode.h create mode 100644 fs/ntfs/layout.h create mode 100644 fs/ntfs/lcnalloc.c create mode 100644 fs/ntfs/lcnalloc.h create mode 100644 fs/ntfs/logfile.c create mode 100644 fs/ntfs/logfile.h create mode 100644 fs/ntfs/malloc.h create mode 100644 fs/ntfs/mft.c create mode 100644 fs/ntfs/mft.h create mode 100644 fs/ntfs/mst.c create mode 100644 fs/ntfs/namei.c create mode 100644 fs/ntfs/ntfs.h create mode 100644 fs/ntfs/quota.c create mode 100644 fs/ntfs/quota.h create mode 100644 fs/ntfs/runlist.c create mode 100644 fs/ntfs/runlist.h create mode 100644 fs/ntfs/super.c create mode 100644 fs/ntfs/sysctl.c create mode 100644 fs/ntfs/sysctl.h create mode 100644 fs/ntfs/time.h create mode 100644 fs/ntfs/types.h create mode 100644 fs/ntfs/unistr.c create mode 100644 fs/ntfs/upcase.c create mode 100644 fs/ntfs/volume.h diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c new file mode 100644 index 000000000000..2d01517a2d59 --- /dev/null +++ b/fs/ntfs/aops.c @@ -0,0 +1,1744 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * aops.c - NTFS kernel address space operations and page cache handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. 
+ * Copyright (c) 2002 Richard Russon + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "debug.h" +#include "inode.h" +#include "mft.h" +#include "runlist.h" +#include "types.h" +#include "ntfs.h" + +/** + * ntfs_end_buffer_async_read - async io completion for reading attributes + * @bh: buffer head on which io is completed + * @uptodate: whether @bh is now uptodate or not + * + * Asynchronous I/O completion handler for reading pages belonging to the + * attribute address space of an inode. The inodes can either be files or + * directories or they can be fake inodes describing some attribute. + * + * If NInoMstProtected(), perform the post read mst fixups when all IO on the + * page has been completed and mark the page uptodate or set the error bit on + * the page. To determine the size of the records that need fixing up, we + * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs + * record size, and index_block_size_bits, to the log(base 2) of the ntfs + * record size. + */ +static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first, *tmp; + struct page *page; + struct inode *vi; + ntfs_inode *ni; + int page_uptodate = 1; + + page = bh->b_page; + vi = page->mapping->host; + ni = NTFS_I(vi); + + if (likely(uptodate)) { + loff_t i_size; + s64 file_ofs, init_size; + + set_buffer_uptodate(bh); + + file_ofs = ((s64)page->index << PAGE_SHIFT) + + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + /* Check for the current buffer head overflowing. */ + if (unlikely(file_ofs + bh->b_size > init_size)) { + int ofs; + void *kaddr; + + ofs = 0; + if (file_ofs < init_size) + ofs = init_size - file_ofs; + kaddr = kmap_atomic(page); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + } + } else { + clear_buffer_uptodate(bh); + SetPageError(page); + ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); + spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + if (likely(buffer_locked(tmp))) + goto still_busy; + /* Async buffers must be locked. */ + BUG(); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the + * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. + * Note we ignore fixup errors as those are detected when + * map_mft_record() is called which gives us per record granularity + * rather than per page granularity. + */ + if (!NInoMstProtected(ni)) { + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } else { + u8 *kaddr; + unsigned int i, recs; + u32 rec_size; + + rec_size = ni->itype.index.block_size; + recs = PAGE_SIZE / rec_size; + /* Should have been verified before we got here... 
*/ + BUG_ON(!recs); + kaddr = kmap_atomic(page); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } + unlock_page(page); + return; +still_busy: + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; +} + +/** + * ntfs_read_block - fill a @folio of an address space with data + * @folio: page cache folio to fill with data + * + * We read each buffer asynchronously and when all buffers are read in, our io + * completion handler ntfs_end_buffer_read_async(), if required, automatically + * applies the mst fixups to the folio before finally marking it uptodate and + * unlocking it. + * + * We only enforce allocated_size limit because i_size is checked for in + * generic_file_read(). + * + * Return 0 on success and -errno on error. + * + * Contains an adapted version of fs/buffer.c::block_read_full_folio(). + */ +static int ntfs_read_block(struct folio *folio) +{ + loff_t i_size; + VCN vcn; + LCN lcn; + s64 init_size; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + sector_t iblock, lblock, zblock; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int i, nr; + unsigned char blocksize_bits; + + vi = folio->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + /* $MFT/$DATA must have its complete runlist in memory at all times. */ + BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); + + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + bh = head; + + /* + * We may be racing with truncate. To avoid some of the problems we + * now take a snapshot of the various sizes and use those for the whole + * of the function. In case of an extending truncate it just means we + * may leave some buffers unmapped which are now allocated. This is + * not a problem since these buffers will just get mapped when a write + * occurs. In case of a shrinking truncate, we will detect this later + * on due to the runlist being incomplete and if the folio is being + * fully truncated, truncate will throw it away as soon as we unlock + * it so no need to worry what we do with it. + */ + iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + zblock = (init_size + blocksize - 1) >> blocksize_bits; + + /* Loop through all the buffers in the folio. */ + rl = NULL; + nr = i = 0; + do { + int err = 0; + + if (unlikely(buffer_uptodate(bh))) + continue; + if (unlikely(buffer_mapped(bh))) { + arr[nr++] = bh; + continue; + } + bh->b_bdev = vol->sb->s_bdev; + /* Is the block within the allowed limits? */ + if (iblock < lblock) { + bool is_retry = false; + + /* Convert iblock into corresponding vcn and offset. 
*/ + vcn = (VCN)iblock << blocksize_bits >> + vol->cluster_size_bits; + vcn_ofs = ((VCN)iblock << blocksize_bits) & + vol->cluster_size_mask; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + /* Only read initialized data blocks. */ + if (iblock < zblock) { + arr[nr++] = bh; + continue; + } + /* Fully non-initialized data block, zero it. */ + goto handle_zblock; + } + /* It is a hole, need to zero it. */ + if (lcn == LCN_HOLE) + goto handle_hole; + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, treat it as a + * hole. This can happen due to concurrent truncate + * for example. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + err = 0; + goto handle_hole; + } + /* Hard error, zero out region. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + folio_set_error(folio); + ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "offset 0x%x because its location on " + "disk could not be determined%s " + "(error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + } + /* + * Either iblock was outside lblock limits or + * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion + * of the folio and set the buffer uptodate. + */ +handle_hole: + bh->b_blocknr = -1UL; + clear_buffer_mapped(bh); +handle_zblock: + folio_zero_range(folio, i * blocksize, blocksize); + if (likely(!err)) + set_buffer_uptodate(bh); + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Check we have at least one buffer ready for i/o. */ + if (nr) { + struct buffer_head *tbh; + + /* Lock the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + lock_buffer(tbh); + tbh->b_end_io = ntfs_end_buffer_async_read; + set_buffer_async_read(tbh); + } + /* Finally, start i/o on the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + if (likely(!buffer_uptodate(tbh))) + submit_bh(REQ_OP_READ, tbh); + else + ntfs_end_buffer_async_read(tbh, 1); + } + return 0; + } + /* No i/o was scheduled on any of the buffers. */ + if (likely(!folio_test_error(folio))) + folio_mark_uptodate(folio); + else /* Signal synchronous i/o error. 
*/ + nr = -EIO; + folio_unlock(folio); + return nr; +} + +/** + * ntfs_read_folio - fill a @folio of a @file with data from the device + * @file: open file to which the folio @folio belongs or NULL + * @folio: page cache folio to fill with data + * + * For non-resident attributes, ntfs_read_folio() fills the @folio of the open + * file @file by calling the ntfs version of the generic block_read_full_folio() + * function, ntfs_read_block(), which in turn creates and reads in the buffers + * associated with the folio asynchronously. + * + * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the + * data from the mft record (which at this stage is most likely in memory) and + * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as + * even if the mft record is not cached at this point in time, we need to wait + * for it to be read in before we can do the copy. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + u8 *addr; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + unsigned long flags; + u32 attr_len; + int err = 0; + +retry_readpage: + BUG_ON(!PageLocked(page)); + vi = page->mapping->host; + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { + zero_user(page, 0, PAGE_SIZE); + ntfs_debug("Read outside i_size - truncated?"); + goto done; + } + /* + * This can potentially happen because we clear PageUptodate() during + * ntfs_writepage() of MstProtected() attributes. + */ + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + ni = NTFS_I(vi); + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If attribute is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + BUG_ON(ni->type != AT_DATA); + err = -EACCES; + goto err_out; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + return ntfs_read_compressed_block(page); + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* Normal, non-resident data stream. */ + return ntfs_read_block(folio); + } + /* + * Attribute is resident, implying it is not compressed or encrypted. + * This also means the attribute is smaller than an mft record and + * hence smaller than a page, so can simply zero out any pages with + * index above 0. Note the attribute can actually be marked compressed + * but if it is resident the actual data is not compressed so we are + * ok to ignore the compressed flag here. + */ + if (unlikely(page->index > 0)) { + zero_user(page, 0, PAGE_SIZE); + goto done; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. 
*/ + mrec = map_mft_record(base_ni); + if (IS_ERR(mrec)) { + err = PTR_ERR(mrec); + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the read_folio. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto put_unm_err_out; + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate. */ + attr_len = i_size; + } + addr = kmap_atomic(page); + /* Copy the data to the page. */ + memcpy(addr, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + attr_len); + /* Zero the remainder of the page. */ + memset(addr + attr_len, 0, PAGE_SIZE - attr_len); + flush_dcache_page(page); + kunmap_atomic(addr); +put_unm_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(base_ni); +done: + SetPageUptodate(page); +err_out: + unlock_page(page); + return err; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_block - write a @folio to the backing store + * @folio: page cache folio to write out + * @wbc: writeback control structure + * + * This function is for writing folios belonging to non-resident, non-mst + * protected attributes to their backing store. + * + * For a folio with buffers, map and write the dirty buffers asynchronously + * under folio writeback. For a folio without buffers, create buffers for the + * folio, then proceed as above. + * + * If a folio doesn't have buffers the folio dirty state is definitive. If + * a folio does have buffers, the folio dirty state is just a hint, + * and the buffer dirty state is definitive. (A hint which has rules: + * dirty buffers against a clean folio is illegal. Other combinations are + * legal and need to be handled. In particular a dirty folio containing + * clean buffers for example.) + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_read_block() and __block_write_full_folio(). + */ +static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) +{ + VCN vcn; + LCN lcn; + s64 initialized_size; + loff_t i_size; + sector_t block, dblock, iblock; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int err; + bool need_end_writeback; + unsigned char blocksize_bits; + + vi = folio->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", ni->mft_no, ni->type, folio->index); + + BUG_ON(!NInoNonResident(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + head = folio_buffers(folio); + if (!head) { + BUG_ON(!folio_test_uptodate(folio)); + head = create_empty_buffers(folio, blocksize, + (1 << BH_Uptodate) | (1 << BH_Dirty)); + } + bh = head; + + /* NOTE: Different naming scheme to ntfs_read_block()! */ + + /* The first block in the folio. 
*/ + block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); + + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + + /* The first out of bounds block for the data size. */ + dblock = (i_size + blocksize - 1) >> blocksize_bits; + + /* The last (fully or partially) initialized block. */ + iblock = initialized_size >> blocksize_bits; + + /* + * Be very careful. We have no exclusion from block_dirty_folio + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the folio stays dirty. + * + * Buffers outside i_size may be dirtied by block_dirty_folio; + * handle that here by just cleaning them. + */ + + /* + * Loop through all the buffers in the folio, mapping all the dirty + * buffers to disk addresses and handling any aliases from the + * underlying block device's mapping. + */ + rl = NULL; + err = 0; + do { + bool is_retry = false; + + if (unlikely(block >= dblock)) { + /* + * Mapped buffers outside i_size will occur, because + * this folio can be outside i_size when there is a + * truncate in progress. The contents of such buffers + * were zeroed by ntfs_writepage(). + * + * FIXME: What about the small race window where + * ntfs_writepage() has not done any clearing because + * the folio was within i_size but before we get here, + * vmtruncate() modifies i_size? + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + + /* Clean buffers are not written out, so no need to map them. */ + if (!buffer_dirty(bh)) + continue; + + /* Make sure we have enough initialized size. */ + if (unlikely((block >= iblock) && + (initialized_size < i_size))) { + /* + * If this folio is fully outside initialized + * size, zero out all folios between the current + * initialized size and the current folio. Just + * use ntfs_read_folio() to do the zeroing + * transparently. + */ + if (block > iblock) { + // TODO: + // For each folio do: + // - read_cache_folio() + // Again for each folio do: + // - wait_on_folio_locked() + // - Check (folio_test_uptodate(folio) && + // !folio_test_error(folio)) + // Update initialized size in the attribute and + // in the inode. + // Again, for each folio do: + // block_dirty_folio(); + // folio_put() + // We don't need to wait on the writes. + // Update iblock. + } + /* + * The current folio straddles initialized size. Zero + * all non-uptodate buffers and set them uptodate (and + * dirty?). Note, there aren't any non-uptodate buffers + * if the folio is uptodate. + * FIXME: For an uptodate folio, the buffers may need to + * be written out because they were not initialized on + * disk before. + */ + if (!folio_test_uptodate(folio)) { + // TODO: + // Zero any non-uptodate buffers up to i_size. + // Set them uptodate and dirty. + } + // TODO: + // Update initialized size in the attribute and in the + // inode (up to i_size). + // Update iblock. + // FIXME: This is inefficient. Try to batch the two + // size changes to happen in one go. + ntfs_error(vol->sb, "Writing beyond initialized size " + "is not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + // Do NOT set_buffer_new() BUT DO clear buffer range + // outside write request range. + // set_buffer_uptodate() on complete buffers as well as + // set_buffer_dirty(). + } + + /* No need to map buffers that are already mapped. 
*/ + if (buffer_mapped(bh)) + continue; + + /* Unmapped, dirty buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + + /* Convert block into corresponding vcn and offset. */ + vcn = (VCN)block << blocksize_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to point to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + continue; + } + /* It is a hole, need to instantiate it. */ + if (lcn == LCN_HOLE) { + u8 *kaddr; + unsigned long *bpos, *bend; + + /* Check if the buffer is zero. */ + kaddr = kmap_local_folio(folio, bh_offset(bh)); + bpos = (unsigned long *)kaddr; + bend = (unsigned long *)(kaddr + blocksize); + do { + if (unlikely(*bpos)) + break; + } while (likely(++bpos < bend)); + kunmap_local(kaddr); + if (bpos == bend) { + /* + * Buffer is zero and sparse, no need to write + * it. + */ + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + continue; + } + // TODO: Instantiate the hole. + // clear_buffer_new(bh); + // clean_bdev_bh_alias(bh); + ntfs_error(vol->sb, "Writing into sparse regions is " + "not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + } + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, truncate has cut it out + * of the runlist. Just clean and clear the buffer and set it + * uptodate so it can get discarded by the VM. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + folio_zero_range(folio, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + err = 0; + continue; + } + /* Failed to map the buffer, even after retrying. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, offset 0x%x " + "because its location on disk could not be " + "determined%s (error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + break; + } while (block++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* For the error case, need to reset bh to the beginning. */ + bh = head; + + /* Just an optimization, so ->read_folio() is not called later. */ + if (unlikely(!folio_test_uptodate(folio))) { + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + bh = head; + break; + } + } while ((bh = bh->b_this_page) != head); + if (uptodate) + folio_mark_uptodate(folio); + } + + /* Setup all mapped, dirty buffers for async write i/o. 
*/ + do { + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + BUG_ON(!buffer_uptodate(bh)); + mark_buffer_async_write(bh); + } else + unlock_buffer(bh); + } else if (unlikely(err)) { + /* + * For the error case. The buffer may have been set + * dirty during attachment to a dirty folio. + */ + if (err != -ENOMEM) + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (unlikely(err)) { + // TODO: Remove the -EOPNOTSUPP check later on... + if (unlikely(err == -EOPNOTSUPP)) + err = 0; + else if (err == -ENOMEM) { + ntfs_warning(vol->sb, "Error allocating memory. " + "Redirtying folio so we try again " + "later."); + /* + * Put the folio back on mapping->dirty_pages, but + * leave its buffer's dirty state as-is. + */ + folio_redirty_for_writepage(wbc, folio); + err = 0; + } else + folio_set_error(folio); + } + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ + + /* Submit the prepared buffers for i/o. */ + need_end_writeback = true; + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(REQ_OP_WRITE, bh); + need_end_writeback = false; + } + bh = next; + } while (bh != head); + folio_unlock(folio); + + /* If no i/o was started, need to end writeback here. */ + if (unlikely(need_end_writeback)) + folio_end_writeback(folio); + + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_write_mst_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, mst protected + * attributes to their backing store. The only supported attributes are index + * allocation and $MFT/$DATA. Both directory inodes and index inodes are + * supported for the index allocation case. + * + * The page must remain locked for the duration of the write because we apply + * the mst fixups, write, and then undo the fixups, so if we were to unlock the + * page before undoing the fixups, any other user of the page will see the + * page contents as corrupt. + * + * We clear the page uptodate flag for the duration of the function to ensure + * exclusion for the $MFT/$DATA case against someone mapping an mft record we + * are about to apply the mst fixups to. + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_write_block(), ntfs_mft_writepage(), and + * write_mft_record_nolock(). 
+ */ +static int ntfs_write_mst_block(struct page *page, + struct writeback_control *wbc) +{ + sector_t block, dblock, rec_block; + struct inode *vi = page->mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + u8 *kaddr; + unsigned int rec_size = ni->itype.index.block_size; + ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; + struct buffer_head *bh, *head, *tbh, *rec_start_bh; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + runlist_element *rl; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; + bool sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; + + if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) + return -EINVAL; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(!NInoMstProtected(ni)); + is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); + /* + * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page + * in its page cache were to be marked dirty. However this should + * never happen with the current driver and considering we do not + * handle this case here we do want to BUG(), at least for now. + */ + BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || + (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; + max_bhs = PAGE_SIZE / bh_size; + BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); + + /* Were we called for sync purposes? */ + sync = (wbc->sync_mode == WB_SYNC_ALL); + + /* Make sure we have mapped buffers. */ + bh = head = page_buffers(page); + BUG_ON(!bh); + + rec_size_bits = ni->itype.index.block_size_bits; + BUG_ON(!(PAGE_SIZE >> rec_size_bits)); + bhs_per_rec = rec_size >> bh_size_bits; + BUG_ON(!bhs_per_rec); + + /* The first block in the page. */ + rec_block = block = (sector_t)page->index << + (PAGE_SHIFT - bh_size_bits); + + /* The first out of bounds block for the data size. */ + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; + + rl = NULL; + err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; + page_is_dirty = rec_is_dirty = false; + rec_start_bh = NULL; + do { + bool is_retry = false; + + if (likely(block < rec_block)) { + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + /* + * This block is not the first one in the record. We + * ignore the buffer's dirty state because we could + * have raced with a parallel mark_ntfs_record_dirty(). + */ + if (!rec_is_dirty) + continue; + if (unlikely(err2)) { + if (err2 != -ENOMEM) + clear_buffer_dirty(bh); + continue; + } + } else /* if (block == rec_block) */ { + BUG_ON(block > rec_block); + /* This block is the first one in the record. */ + rec_block += bhs_per_rec; + err2 = 0; + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + continue; + } + if (!buffer_dirty(bh)) { + /* Clean records are not written out. */ + rec_is_dirty = false; + continue; + } + rec_is_dirty = true; + rec_start_bh = bh; + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. 
*/ + vcn = (VCN)block << bh_size_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> bh_size_bits; + set_buffer_mapped(bh); + } else { + /* + * Remap failed. Retry to map the runlist once + * unless we are working on $MFT which always + * has the whole of its runlist in memory. + */ + if (!is_mft && !is_retry && + lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping + * lock for the duration. + */ + up_read(&ni->runlist.lock); + err2 = ntfs_map_runlist(ni, vcn); + if (likely(!err2)) + goto lock_retry_remap; + if (err2 == -ENOMEM) + page_is_dirty = true; + lcn = err2; + } else { + err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } + /* Hard error. Abort writing this record. */ + if (!err || err == -ENOMEM) + err = err2; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write ntfs record " + "0x%llx (inode 0x%lx, " + "attribute type 0x%x) because " + "its location on disk could " + "not be determined (error " + "code %lli).", + (long long)block << + bh_size_bits >> + vol->mft_record_size_bits, + ni->mft_no, ni->type, + (long long)lcn); + /* + * If this is not the first buffer, remove the + * buffers in this record from the list of + * buffers to write and clear their dirty bit + * if not error -ENOMEM. + */ + if (rec_start_bh != bh) { + while (bhs[--nr_bhs] != rec_start_bh) + ; + if (err2 != -ENOMEM) { + do { + clear_buffer_dirty( + rec_start_bh); + } while ((rec_start_bh = + rec_start_bh-> + b_this_page) != + bh); + } + } + continue; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + } while (block++, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&ni->runlist.lock); + /* If there were no dirty buffers, we are done. */ + if (!nr_bhs) + goto done; + /* Map the page so we can access its contents. */ + kaddr = kmap(page); + /* Clear the page uptodate flag whilst the mst fixups are applied. */ + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + for (i = 0; i < nr_bhs; i++) { + unsigned int ofs; + + /* Skip buffers which are not at the beginning of records. */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + ofs = bh_offset(tbh); + if (is_mft) { + ntfs_inode *tni; + unsigned long mft_no; + + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) + >> rec_size_bits; + /* Check whether to write this mft record. */ + tni = NULL; + if (!ntfs_may_write_mft_record(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), &tni)) { + /* + * The record should not be written. This + * means we need to redirty the page before + * returning. + */ + page_is_dirty = true; + /* + * Remove the buffers in this mft record from + * the list of buffers to write. + */ + do { + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + /* + * The record should be written. If a locked ntfs + * inode was returned, add it to the array of locked + * ntfs inodes. + */ + if (tni) + locked_nis[nr_locked_nis++] = tni; + } + /* Apply the mst protection fixups. 
*/ + err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), + rec_size); + if (unlikely(err2)) { + if (!err || err == -ENOMEM) + err = -EIO; + ntfs_error(vol->sb, "Failed to apply mst fixups " + "(inode 0x%lx, attribute type 0x%x, " + "page index 0x%lx, page offset 0x%x)!" + " Unmount and run chkdsk.", vi->i_ino, + ni->type, page->index, ofs); + /* + * Mark all the buffers in this record clean as we do + * not want to write corrupt data to disk. + */ + do { + clear_buffer_dirty(bhs[i]); + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + nr_recs++; + } + /* If no records are to be written out, we are done. */ + if (!nr_recs) + goto unm_done; + flush_dcache_page(page); + /* Lock buffers and start synchronous write i/o on them. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + if (!trylock_buffer(tbh)) + BUG(); + /* The buffer dirty state is now irrelevant, just clean it. */ + clear_buffer_dirty(tbh); + BUG_ON(!buffer_uptodate(tbh)); + BUG_ON(!buffer_mapped(tbh)); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (is_mft && !sync) + goto do_mirror; +do_wait: + /* Wait on i/o completion of buffers. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_error(vol->sb, "I/O error while writing ntfs " + "record buffer (inode 0x%lx, " + "attribute type 0x%x, page index " + "0x%lx, page offset 0x%lx)! Unmount " + "and run chkdsk.", vi->i_ino, ni->type, + page->index, bh_offset(tbh)); + if (!err || err == -ENOMEM) + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (is_mft && sync) { +do_mirror: + for (i = 0; i < nr_bhs; i++) { + unsigned long mft_no; + unsigned int ofs; + + /* + * Skip buffers which are not at the beginning of + * records. + */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + /* Skip removed buffers (and hence records). */ + if (!tbh) + continue; + ofs = bh_offset(tbh); + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) + >> rec_size_bits; + if (mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), + sync); + } + if (!sync) + goto do_wait; + } + /* Remove the mst protection fixups again. */ + for (i = 0; i < nr_bhs; i++) { + if (!(i % bhs_per_rec)) { + tbh = bhs[i]; + if (!tbh) + continue; + post_write_mst_fixup((NTFS_RECORD*)(kaddr + + bh_offset(tbh))); + } + } + flush_dcache_page(page); +unm_done: + /* Unlock any locked inodes. */ + while (nr_locked_nis-- > 0) { + ntfs_inode *tni, *base_tni; + + tni = locked_nis[nr_locked_nis]; + /* Get the base inode. */ + mutex_lock(&tni->extent_lock); + if (tni->nr_extents >= 0) + base_tni = tni; + else { + base_tni = tni->ext.base_ntfs_ino; + BUG_ON(!base_tni); + } + mutex_unlock(&tni->extent_lock); + ntfs_debug("Unlocking %s inode 0x%lx.", + tni == base_tni ? "base" : "extent", + tni->mft_no); + mutex_unlock(&tni->mrec_lock); + atomic_dec(&tni->count); + iput(VFS_I(base_tni)); + } + SetPageUptodate(page); + kunmap(page); +done: + if (unlikely(err && err != -ENOMEM)) { + /* + * Set page error if there is only one ntfs record in the page. + * Otherwise we would loose per-record granularity. 
+ */ + if (ni->itype.index.block_size == PAGE_SIZE) + SetPageError(page); + NVolSetErrors(vol); + } + if (page_is_dirty) { + ntfs_debug("Page still contains one or more dirty ntfs " + "records. Redirtying the page starting at " + "record 0x%lx.", page->index << + (PAGE_SHIFT - rec_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy. This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + if (likely(!err)) + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_writepage - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty ntfs page cache + * page cleaned. The VM has already locked the page and marked it clean. + * + * For non-resident attributes, ntfs_writepage() writes the @page by calling + * the ntfs version of the generic block_write_full_folio() function, + * ntfs_write_block(), which in turn if necessary creates and writes the + * buffers associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying + * the data to the mft record (which at this stage is most likely in memory). + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path for the inode the mft record belongs to or via the + * vm page dirty code path for the page the mft record is in. + * + * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct folio *folio = page_folio(page); + loff_t i_size; + struct inode *vi = folio->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); + char *addr; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + u32 attr_len; + int err; + +retry_writepage: + BUG_ON(!folio_test_locked(folio)); + i_size = i_size_read(vi); + /* Is the folio fully outside i_size? (truncate in progress) */ + if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { + /* + * The folio may have dirty, unmapped buffers. Make them + * freeable here, so the page does not leak. + */ + block_invalidate_folio(folio, 0, folio_size(folio)); + folio_unlock(folio); + ntfs_debug("Write outside i_size - truncated?"); + return 0; + } + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + folio_unlock(folio); + BUG_ON(ni->type != AT_DATA); + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + /* Compressed data streams are handled in compress.c. 
*/ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + // TODO: Implement and replace this with + // return ntfs_write_compressed_block(page); + folio_unlock(folio); + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not supported yet. Sorry."); + return -EOPNOTSUPP; + } + // TODO: Implement and remove this check. + if (NInoNonResident(ni) && NInoSparse(ni)) { + folio_unlock(folio); + ntfs_error(vi->i_sb, "Writing to sparse files is not " + "supported yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* We have to zero every time due to mmap-at-end-of-file. */ + if (folio->index >= (i_size >> PAGE_SHIFT)) { + /* The folio straddles i_size. */ + unsigned int ofs = i_size & (folio_size(folio) - 1); + folio_zero_segment(folio, ofs, folio_size(folio)); + } + /* Handle mst protected attributes. */ + if (NInoMstProtected(ni)) + return ntfs_write_mst_block(page, wbc); + /* Normal, non-resident data stream. */ + return ntfs_write_block(folio, wbc); + } + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * mst protected. This also means the attribute is smaller than an mft + * record and hence smaller than a folio, so can simply return error on + * any folios with index above 0. Note the attribute can actually be + * marked compressed but if it is resident the actual data is not + * compressed so we are ok to ignore the compressed flag here. + */ + BUG_ON(folio_buffers(folio)); + BUG_ON(!folio_test_uptodate(folio)); + if (unlikely(folio->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " + "Aborting write.", folio->index); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); + return -EIO; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto err_out; + /* + * Keep the VM happy. This must be done otherwise + * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. + */ + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate or a failed truncate. */ + attr_len = i_size; + /* + * If the truncate failed, fix it up now. If a concurrent + * truncate, we do its job, so it does not have to do anything. + */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + attr_len); + /* Shrinking cannot fail. */ + BUG_ON(err); + } + addr = kmap_local_folio(folio, 0); + /* Copy the data from the folio to the mft record. */ + memcpy((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + addr, attr_len); + /* Zero out of bounds area in the page cache folio. 
*/ + memset(addr + attr_len, 0, folio_size(folio) - attr_len); + kunmap_local(addr); + flush_dcache_folio(folio); + flush_dcache_mft_record_page(ctx->ntfs_ino); + /* We are done with the folio. */ + folio_end_writeback(folio); + /* Finally, mark the mft record dirty, so it gets written back. */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " + "page so we try again later."); + /* + * Put the folio back on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + folio_redirty_for_writepage(wbc, folio); + err = 0; + } else { + ntfs_error(vi->i_sb, "Resident attribute write failed with " + "error %i.", err); + folio_set_error(folio); + NVolSetErrors(ni->vol); + } + folio_unlock(folio); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +#endif /* NTFS_RW */ + +/** + * ntfs_bmap - map logical file block to physical device block + * @mapping: address space mapping to which the block to be mapped belongs + * @block: logical block to map to its physical device block + * + * For regular, non-resident files (i.e. not compressed and not encrypted), map + * the logical @block belonging to the file described by the address space + * mapping @mapping to its physical device block. + * + * The size of the block is equal to the @s_blocksize field of the super block + * of the mounted file system which is guaranteed to be smaller than or equal + * to the cluster size thus the block is guaranteed to fit entirely inside the + * cluster which means we do not need to care how many contiguous bytes are + * available after the beginning of the block. + * + * Return the physical device block if the mapping succeeded or 0 if the block + * is sparse or there was an error. + * + * Note: This is a problem if someone tries to run bmap() on $Boot system file + * as that really is in block zero but there is nothing we can do. bmap() is + * just broken in that respect (just like it cannot distinguish sparse from + * not available or error). + */ +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + s64 ofs, size; + loff_t i_size; + LCN lcn; + unsigned long blocksize, flags; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + unsigned delta; + unsigned char blocksize_bits, cluster_size_shift; + + ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", + ni->mft_no, (unsigned long long)block); + if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { + ntfs_error(vol->sb, "BMAP does not make sense for %s " + "attributes, returning 0.", + (ni->type != AT_DATA) ? "non-data" : + (!NInoNonResident(ni) ? "resident" : + "encrypted")); + return 0; + } + /* None of these can happen. */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + ofs = (s64)block << blocksize_bits; + read_lock_irqsave(&ni->size_lock, flags); + size = ni->initialized_size; + i_size = i_size_read(VFS_I(ni)); + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If the offset is outside the initialized size or the block straddles + * the initialized size then pretend it is a hole unless the + * initialized size equals the file size. 
+ */ + if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) + goto hole; + cluster_size_shift = vol->cluster_size_bits; + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + /* + * Step down to an integer to avoid gcc doing a long long + * comparision in the switch when we know @lcn is between + * LCN_HOLE and LCN_EIO (i.e. -1 to -5). + * + * Otherwise older gcc (at least on some architectures) will + * try to use __cmpdi2() which is of course not available in + * the kernel. + */ + switch ((int)lcn) { + case LCN_ENOENT: + /* + * If the offset is out of bounds then pretend it is a + * hole. + */ + goto hole; + case LCN_ENOMEM: + ntfs_error(vol->sb, "Not enough memory to complete " + "mapping for inode 0x%lx. " + "Returning 0.", ni->mft_no); + break; + default: + ntfs_error(vol->sb, "Failed to complete mapping for " + "inode 0x%lx. Run chkdsk. " + "Returning 0.", ni->mft_no); + break; + } + return 0; + } + if (lcn < 0) { + /* It is a hole. */ +hole: + ntfs_debug("Done (returning hole)."); + return 0; + } + /* + * The block is really allocated and fullfils all our criteria. + * Convert the cluster to units of block size and return the result. + */ + delta = ofs & vol->cluster_size_mask; + if (unlikely(sizeof(block) < sizeof(lcn))) { + block = lcn = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + /* If the block number was truncated return 0. */ + if (unlikely(block != lcn)) { + ntfs_error(vol->sb, "Physical block 0x%llx is too " + "large to be returned, returning 0.", + (long long)lcn); + return 0; + } + } else + block = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); + return block; +} + +/* + * ntfs_normal_aops - address space operations for normal inodes and attributes + * + * Note these are not used for compressed or mst protected inodes and + * attributes. + */ +const struct address_space_operations ntfs_normal_aops = { + .read_folio = ntfs_read_folio, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .dirty_folio = block_dirty_folio, +#endif /* NTFS_RW */ + .bmap = ntfs_bmap, + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, +}; + +/* + * ntfs_compressed_aops - address space operations for compressed inodes + */ +const struct address_space_operations ntfs_compressed_aops = { + .read_folio = ntfs_read_folio, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .dirty_folio = block_dirty_folio, +#endif /* NTFS_RW */ + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, +}; + +/* + * ntfs_mst_aops - general address space operations for mst protecteed inodes + * and attributes + */ +const struct address_space_operations ntfs_mst_aops = { + .read_folio = ntfs_read_folio, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. 
*/ + .dirty_folio = filemap_dirty_folio, +#endif /* NTFS_RW */ + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, +}; + +#ifdef NTFS_RW + +/** + * mark_ntfs_record_dirty - mark an ntfs record dirty + * @page: page containing the ntfs record to mark dirty + * @ofs: byte offset within @page at which the ntfs record begins + * + * Set the buffers and the page in which the ntfs record is located dirty. + * + * The latter also marks the vfs inode the ntfs record belongs to dirty + * (I_DIRTY_PAGES only). + * + * If the page does not have buffers, we create them and set them uptodate. + * The page may not be locked which is why we need to handle the buffers under + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer + * need the lock since try_to_free_buffers() does not free dirty buffers. + */ +void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + struct buffer_head *bh, *head, *buffers_to_free = NULL; + unsigned int end, bh_size, bh_ofs; + + BUG_ON(!PageUptodate(page)); + end = ofs + ni->itype.index.block_size; + bh_size = VFS_I(ni)->i_sb->s_blocksize; + spin_lock(&mapping->i_private_lock); + if (unlikely(!page_has_buffers(page))) { + spin_unlock(&mapping->i_private_lock); + bh = head = alloc_page_buffers(page, bh_size, true); + spin_lock(&mapping->i_private_lock); + if (likely(!page_has_buffers(page))) { + struct buffer_head *tail; + + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_private(page, head); + } else + buffers_to_free = bh; + } + bh = head = page_buffers(page); + BUG_ON(!bh); + do { + bh_ofs = bh_offset(bh); + if (bh_ofs + bh_size <= ofs) + continue; + if (unlikely(bh_ofs >= end)) + break; + set_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&mapping->i_private_lock); + filemap_dirty_folio(mapping, page_folio(page)); + if (unlikely(buffers_to_free)) { + do { + bh = buffers_to_free->b_this_page; + free_buffer_head(buffers_to_free); + buffers_to_free = bh; + } while (buffers_to_free); + } +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h new file mode 100644 index 000000000000..8d0958a149cb --- /dev/null +++ b/fs/ntfs/aops.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * aops.h - Defines for NTFS kernel address space operations and page cache + * handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_AOPS_H +#define _LINUX_NTFS_AOPS_H + +#include +#include +#include +#include + +#include "inode.h" + +/** + * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() + * @page: the page to release + * + * Unpin, unmap and release a page that was obtained from ntfs_map_page(). + */ +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + put_page(page); +} + +/** + * ntfs_map_page - map a page into accessible memory, reading it if necessary + * @mapping: address space for which to obtain the page + * @index: index into the page cache for @mapping of the page to map + * + * Read a page from the page cache of the address space @mapping at position + * @index, where @index is in units of PAGE_SIZE, and not in bytes. 
+ * + * If the page is not in memory it is loaded from disk first using the + * read_folio method defined in the address space operations of @mapping + * and the page is added to the page cache of @mapping in the process. + * + * If the page belongs to an mst protected attribute and it is marked as such + * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no + * error checking is performed. This means the caller has to verify whether + * the ntfs record(s) contained in the page are valid or not using one of the + * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are + * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) + * + * If the page is in high memory it is mapped into memory directly addressible + * by the kernel. + * + * Finally the page count is incremented, thus pinning the page into place. + * + * The above means that page_address(page) can be used on all pages obtained + * with ntfs_map_page() to get the kernel virtual address of the page. + * + * When finished with the page, the caller has to call ntfs_unmap_page() to + * unpin, unmap and release the page. + * + * Note this does not grant exclusive access. If such is desired, the caller + * must provide it independently of the ntfs_{un}map_page() calls by using + * a {rw_}semaphore or other means of serialization. A spin lock cannot be + * used as ntfs_map_page() can block. + * + * The unlocked and uptodate page is returned on success or an encoded error + * on failure. Caller has to test for error using the IS_ERR() macro on the + * return value. If that evaluates to 'true', the negative error code can be + * obtained using PTR_ERR() on the return value of ntfs_map_page(). + */ +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) + kmap(page); + return page; +} + +#ifdef NTFS_RW + +extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c new file mode 100644 index 000000000000..f79408f9127a --- /dev/null +++ b/fs/ntfs/attrib.c @@ -0,0 +1,2624 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + */ + +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" +#include "types.h" + +/** + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * @ctx: active attribute search context if present or NULL if not + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and + * restores it before returning. 
Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_map_runlist_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Note the runlist can be NULL after this function returns if @vcn is zero and + * the attribute has zero allocated size, i.e. there simply is no runlist. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist will be modified. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +{ + VCN end_vcn; + unsigned long flags; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + runlist_element *rl; + struct page *put_this_page = NULL; + int err = 0; + bool ctx_is_temporary, ctx_needs_reset; + ntfs_attr_search_ctx old_ctx = { NULL, }; + + ntfs_debug("Mapping runlist part containing vcn 0x%llx.", + (unsigned long long)vcn); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + if (!ctx) { + ctx_is_temporary = ctx_needs_reset = true; + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + } else { + VCN allocated_size_vcn; + + BUG_ON(IS_ERR(ctx->mrec)); + a = ctx->attr; + BUG_ON(!a->non_resident); + ctx_is_temporary = false; + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + read_lock_irqsave(&ni->size_lock, flags); + allocated_size_vcn = ni->allocated_size >> + ni->vol->cluster_size_bits; + read_unlock_irqrestore(&ni->size_lock, flags); + if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) + end_vcn = allocated_size_vcn - 1; + /* + * If we already have the attribute extent containing @vcn in + * @ctx, no need to look it up again. We slightly cheat in + * that if vcn exceeds the allocated size, we will refuse to + * map the runlist below, so there is definitely no need to get + * the right attribute extent. + */ + if (vcn >= allocated_size_vcn || (a->type == ni->type && + a->name_length == ni->name_len && + !memcmp((u8*)a + le16_to_cpu(a->name_offset), + ni->name, ni->name_len) && + sle64_to_cpu(a->data.non_resident.lowest_vcn) + <= vcn && end_vcn >= vcn)) + ctx_needs_reset = false; + else { + /* Save the old search context. 
*/ + old_ctx = *ctx; + /* + * If the currently mapped (extent) inode is not the + * base inode we will unmap it when we reinitialize the + * search context which means we need to get a + * reference to the page containing the mapped mft + * record so we do not accidentally drop changes to the + * mft record when it has not been marked dirty yet. + */ + if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { + put_this_page = old_ctx.ntfs_ino->page; + get_page(put_this_page); + } + /* + * Reinitialize the search context so we can lookup the + * needed attribute extent. + */ + ntfs_attr_reinit_search_ctx(ctx); + ctx_needs_reset = true; + } + } + if (ctx_needs_reset) { + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + BUG_ON(!ctx->attr->non_resident); + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. + */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(vcn && vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (ctx_is_temporary) { + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } else if (ctx_needs_reset) { + /* + * If there is no attribute list, restoring the search context + * is accomplished simply by copying the saved context back over + * the caller supplied context. If there is an attribute list, + * things are more complicated as we need to deal with mapping + * of mft records and resulting potential changes in pointers. + */ + if (NInoAttrList(base_ni)) { + /* + * If the currently mapped (extent) inode is not the + * one we had before, we need to unmap it and map the + * old one. + */ + if (ctx->ntfs_ino != old_ctx.ntfs_ino) { + /* + * If the currently mapped inode is not the + * base inode, unmap it. + */ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != + ctx->base_ntfs_ino) { + unmap_extent_mft_record(ctx->ntfs_ino); + ctx->mrec = ctx->base_mrec; + BUG_ON(!ctx->mrec); + } + /* + * If the old mapped inode is not the base + * inode, map it. + */ + if (old_ctx.base_ntfs_ino && + old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { +retry_map: + ctx->mrec = map_mft_record( + old_ctx.ntfs_ino); + /* + * Something bad has happened. If out + * of memory retry till it succeeds. + * Any other errors are fatal and we + * return the error code in ctx->mrec. + * Let the caller deal with it... We + * just need to fudge things so the + * caller can reinit and/or put the + * search context safely. + */ + if (IS_ERR(ctx->mrec)) { + if (PTR_ERR(ctx->mrec) == + -ENOMEM) { + schedule(); + goto retry_map; + } else + old_ctx.ntfs_ino = + old_ctx. + base_ntfs_ino; + } + } + } + /* Update the changed pointers in the saved context. */ + if (ctx->mrec != old_ctx.mrec) { + if (!IS_ERR(ctx->mrec)) + old_ctx.attr = (ATTR_RECORD*)( + (u8*)ctx->mrec + + ((u8*)old_ctx.attr - + (u8*)old_ctx.mrec)); + old_ctx.mrec = ctx->mrec; + } + } + /* Restore the search context to the saved one. */ + *ctx = old_ctx; + /* + * We drop the reference on the page we took earlier. 
In the + * case that IS_ERR(ctx->mrec) is true this means we might lose + * some changes to the mft record that had been made between + * the last time it was marked dirty/written out and now. This + * at this stage is not a problem as the mapping error is fatal + * enough that the mft record cannot be written out anyway and + * the caller is very likely to shutdown the whole inode + * immediately and mark the volume dirty for chkdsk to pick up + * the pieces anyway. + */ + if (put_this_page) + put_page(put_this_page); + } + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and may modify + * the runlist. + */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; + + down_write(&ni->runlist.lock); + /* Make sure someone else didn't do the work while we were sleeping. */ + if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + up_write(&ni->runlist.lock); + return err; +} + +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). + * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is 'false', i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked) +{ + LCN lcn; + unsigned long flags; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return LCN_ENOENT; + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + /* Convert vcn to lcn. 
If that fails map the runlist and retry once. */ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; + } + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; + + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; +} + +/** + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @ctx: active attribute search context if present or NULL if not + * + * Find the virtual cluster number @vcn in the runlist described by the ntfs + * inode @ni and return the address of the runlist element containing the @vcn. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_attr_find_vcn_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Note you need to distinguish between the lcn of the returned runlist element + * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on + * read and allocate clusters on write. + * + * Return the runlist element containing the @vcn on success and + * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() + * to decide if the return is success or failure and PTR_ERR() to get to the + * error code if IS_ERR() is true. + * + * The possible error return codes are: + * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. + * -ENOMEM - Not enough memory to map runlist. + * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. 
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + ntfs_attr_search_ctx *ctx) +{ + unsigned long flags; + runlist_element *rl; + int err = 0; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", + ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return ERR_PTR(-ENOENT); + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + rl = ni->runlist.rl; + if (likely(rl && vcn >= rl[0].vcn)) { + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) { + ntfs_debug("Done."); + return rl; + } + break; + } + rl++; + } + if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { + if (likely(rl->lcn == LCN_ENOENT)) + err = -ENOENT; + else + err = -EIO; + } + } + if (!err && !is_retry) { + /* + * If the search context is invalid we cannot map the unmapped + * region. + */ + if (IS_ERR(ctx->mrec)) + err = PTR_ERR(ctx->mrec); + else { + /* + * The @vcn is in an unmapped region, map the runlist + * and retry. + */ + err = ntfs_map_runlist_nolock(ni, vcn, ctx); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + } + if (err == -EINVAL) + err = -EIO; + } else if (!err) + err = -EIO; + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + return ERR_PTR(err); +} + +/** + * ntfs_attr_find - find (next) attribute in mft record + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * ntfs_attr_find() takes a search context @ctx as parameter and searches the + * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an + * attribute of @type, optionally @name and @val. + * + * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will + * point to the found attribute. + * + * If the attribute is not found, ntfs_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute before which the attribute being + * searched for would need to be inserted if such an action were to be desired. + * + * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is + * undefined and in particular do not rely on it not changing. + * + * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it + * is 'false', the search begins after @ctx->attr. 
+ * + * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and + * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record + * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at + * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case + * sensitive. When @name is present, @name_len is the @name length in Unicode + * characters. + * + * If @name is not present (NULL), we assume that the unnamed attribute is + * being searched for. + * + * Finally, the resident attribute value @val is looked for, if present. If + * @val is not present (NULL), @val_len is ignored. + * + * ntfs_attr_find() only searches the specified mft record and it ignores the + * presence of an attribute list attribute (unless it is the one being searched + * for, obviously). If you need to take attribute lists into consideration, + * use ntfs_attr_lookup() instead (see below). This also means that you cannot + * use ntfs_attr_find() to search for extent records of non-resident + * attributes, as extents with lowest_vcn != 0 are usually described by the + * attribute list attribute only. - Note that it is possible that the first + * extent is only in the attribute list while the last extent is in the base + * mft record, so do not rely on being able to find the first extent in the + * base mft record. + * + * Warning: Never use @val when looking for attribute types which can be + * non-resident as this most likely will result in a crash! + */ +static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ATTR_RECORD *a; + ntfs_volume *vol = ctx->ntfs_ino->vol; + ntfschar *upcase = vol->upcase; + u32 upcase_len = vol->upcase_len; + + /* + * Iterate over attributes in mft record starting at @ctx->attr, or the + * attribute following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + a = ctx->attr; + ctx->is_first = false; + } else + a = (ATTR_RECORD*)((u8*)ctx->attr + + le32_to_cpu(ctx->attr->length)); + for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + + ctx->attr = a; + if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || + a->type == AT_END)) + return -ENOENT; + if (unlikely(!a->length)) + break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + + if (a->type != type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + if (!name) { + /* The search failed if the found attribute is named. 
*/ + if (a->name_length) + return -ENOENT; + } else if (!ntfs_are_names_equal(name, name_len, + (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), + a->name_length, ic, upcase, upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, IGNORE_CASE, + upcase, upcase_len); + /* + * If @name collates before a->name, there is no + * matching attribute. + */ + if (rc == -1) + return -ENOENT; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, CASE_SENSITIVE, + upcase, upcase_len); + if (rc == -1) + return -ENOENT; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. If no @val specified, we have found the attribute + * and are done. + */ + if (!val) + return 0; + /* @val is present; compare values. */ + else { + register int rc; + + rc = memcmp(val, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + min_t(u32, val_len, le32_to_cpu( + a->data.resident.value_length))); + /* + * If @val collates before the current attribute's + * value, there is no matching attribute. + */ + if (!rc) { + register u32 avl; + + avl = le32_to_cpu( + a->data.resident.value_length); + if (val_len == avl) + return 0; + if (val_len < avl) + return -ENOENT; + } else if (rc < 0) + return -ENOENT; + } + } + ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); + NVolSetErrors(vol); + return -EIO; +} + +/** + * load_attribute_list - load an attribute list into memory + * @vol: ntfs volume from which to read + * @runlist: runlist of the attribute list + * @al_start: destination buffer + * @size: size of the destination buffer in bytes + * @initialized_size: initialized size of the attribute list + * + * Walk the runlist @runlist and load all clusters from it copying them into + * the linear buffer @al. The maximum number of bytes copied to @al is @size + * bytes. Note, @size does not need to be a multiple of the cluster size. If + * @initialized_size is less than @size, the region in @al between + * @initialized_size and @size will be zeroed and not read from disk. + * + * Return 0 on success or -errno on error. + */ +int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, + const s64 size, const s64 initialized_size) +{ + LCN lcn; + u8 *al = al_start; + u8 *al_end = al + initialized_size; + runlist_element *rl; + struct buffer_head *bh; + struct super_block *sb; + unsigned long block_size; + unsigned long block, max_block; + int err = 0; + unsigned char block_size_bits; + + ntfs_debug("Entering."); + if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || + initialized_size > size) + return -EINVAL; + if (!initialized_size) { + memset(al, 0, size); + return 0; + } + sb = vol->sb; + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + down_read(&runlist->lock); + rl = runlist->rl; + if (!rl) { + ntfs_error(sb, "Cannot read attribute list since runlist is " + "missing."); + goto err_out; + } + /* Read all clusters specified by the runlist one run at a time. */ + while (rl->length) { + lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)rl->vcn, + (unsigned long long)lcn); + /* The attribute list cannot be sparse. */ + if (lcn < 0) { + ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. 
Cannot " + "read attribute list."); + goto err_out; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the run from device in chunks of block_size bytes. */ + max_block = block + (rl->length << vol->cluster_size_bits >> + block_size_bits); + ntfs_debug("max_block = 0x%lx.", max_block); + do { + ntfs_debug("Reading block = 0x%lx.", block); + bh = sb_bread(sb, block); + if (!bh) { + ntfs_error(sb, "sb_bread() failed. Cannot " + "read attribute list."); + goto err_out; + } + if (al + block_size >= al_end) + goto do_final; + memcpy(al, bh->b_data, block_size); + brelse(bh); + al += block_size; + } while (++block < max_block); + rl++; + } + if (initialized_size < size) { +initialize: + memset(al_start + initialized_size, 0, size - initialized_size); + } +done: + up_read(&runlist->lock); + return err; +do_final: + if (al < al_end) { + /* + * Partial block. + * + * Note: The attribute list can be smaller than its allocation + * by multiple clusters. This has been encountered by at least + * two people running Windows XP, thus we cannot do any + * truncation sanity checking here. (AIA) + */ + memcpy(al, bh->b_data, al_end - al); + brelse(bh); + if (initialized_size < size) + goto initialize; + goto done; + } + brelse(bh); + /* Real overflow! */ + ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " + "is truncated."); +err_out: + err = -EIO; + goto done; +} + +/** + * ntfs_external_attr_find - find an attribute in the attribute list of an inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * Find an attribute by searching the attribute list for the corresponding + * attribute list entry. Having found the entry, map the mft record if the + * attribute is in a different mft record/inode, ntfs_attr_find() the attribute + * in there and return it. + * + * On first search @ctx->ntfs_ino must be the base mft record and @ctx must + * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent + * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is + * then the base inode). + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * If the attribute is found, ntfs_external_attr_find() returns 0 and + * @ctx->attr will point to the found attribute. @ctx->mrec will point to the + * mft record in which @ctx->attr is located and @ctx->al_entry will point to + * the attribute list entry for the attribute. + * + * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute in the base mft record before which + * the attribute being searched for would need to be inserted if such an action + * were to be desired. 
@ctx->mrec will point to the mft record in which + * @ctx->attr is located and @ctx->al_entry will point to the attribute list + * entry of the attribute before which the attribute being searched for would + * need to be inserted if such an action were to be desired. + * + * Thus to insert the not found attribute, one wants to add the attribute to + * @ctx->mrec (the base mft record) and if there is not enough space, the + * attribute should be placed in a newly allocated extent mft record. The + * attribute list entry for the inserted attribute should be inserted in the + * attribute list attribute at @ctx->al_entry. + * + * On actual error, ntfs_external_attr_find() returns -EIO. In this case + * @ctx->attr is undefined and in particular do not rely on it not changing. + */ +static int ntfs_external_attr_find(const ATTR_TYPE type, + const ntfschar *name, const u32 name_len, + const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni, *ni; + ntfs_volume *vol; + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_start, *al_end; + ATTR_RECORD *a; + ntfschar *al_name; + u32 al_name_len; + int err = 0; + static const char *es = " Unmount and run chkdsk."; + + ni = ctx->ntfs_ino; + base_ni = ctx->base_ntfs_ino; + ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); + if (!base_ni) { + /* First call happens with the base mft record. */ + base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; + ctx->base_mrec = ctx->mrec; + } + if (ni == base_ni) + ctx->base_attr = ctx->attr; + if (type == AT_END) + goto not_found; + vol = base_ni->vol; + al_start = base_ni->attr_list; + al_end = al_start + base_ni->attr_list_size; + if (!ctx->al_entry) + ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + /* + * Iterate over entries in attribute list starting at @ctx->al_entry, + * or the entry following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + al_entry = ctx->al_entry; + ctx->is_first = false; + } else + al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + le16_to_cpu(ctx->al_entry->length)); + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < base_ni->attr_list || + (u8*)al_entry > al_end) + break; /* Inode is corrupt. */ + ctx->al_entry = al_entry; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto not_found; + if (!al_entry->length) + break; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + break; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + al_name_len = al_entry->name_length; + al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); + if (!name) { + if (al_name_len) + goto not_found; + } else if (!ntfs_are_names_equal(al_name, al_name_len, name, + name_len, ic, vol->upcase, vol->upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, IGNORE_CASE, + vol->upcase, vol->upcase_len); + /* + * If @name collates before al_name, there is no + * matching attribute. + */ + if (rc == -1) + goto not_found; + /* If the strings are not equal, continue search. 
*/ + if (rc) + continue; + /* + * FIXME: Reverse engineering showed 0, IGNORE_CASE but + * that is inconsistent with ntfs_attr_find(). The + * subsequent rc checks were also different. Perhaps I + * made a mistake in one of the two. Need to recheck + * which is correct or at least see what is going on... + * (AIA) + */ + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, CASE_SENSITIVE, + vol->upcase, vol->upcase_len); + if (rc == -1) + goto not_found; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. Now check @lowest_vcn. Continue search if the + * next attribute list entry still fits @lowest_vcn. Otherwise + * we have reached the right one or the search has failed. + */ + if (lowest_vcn && (u8*)next_al_entry >= al_start && + (u8*)next_al_entry + 6 < al_end && + (u8*)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + sle64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((ntfschar*)((u8*) + next_al_entry + + next_al_entry->name_offset), + next_al_entry->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + continue; + if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { + if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { + ntfs_error(vol->sb, "Found stale mft " + "reference in attribute list " + "of base inode 0x%lx.%s", + base_ni->mft_no, es); + err = -EIO; + break; + } + } else { /* Mft references do not match. */ + /* If there is a mapped record unmap it first. */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + /* Do we want the base record back? */ + if (MREF_LE(al_entry->mft_reference) == + base_ni->mft_no) { + ni = ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + } else { + /* We want an extent record. */ + ctx->mrec = map_extent_mft_record(base_ni, + le64_to_cpu( + al_entry->mft_reference), &ni); + if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to map " + "extent mft record " + "0x%lx of base inode " + "0x%lx.%s", + MREF_LE(al_entry-> + mft_reference), + base_ni->mft_no, es); + err = PTR_ERR(ctx->mrec); + if (err == -ENOENT) + err = -EIO; + /* Cause @ctx to be sanitized below. */ + ni = NULL; + break; + } + ctx->ntfs_ino = ni; + } + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + } + /* + * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the + * mft record containing the attribute represented by the + * current al_entry. + */ + /* + * We could call into ntfs_attr_find() to find the right + * attribute in this mft record but this would be less + * efficient and not quite accurate as ntfs_attr_find() ignores + * the attribute instance numbers for example which become + * important when one plays with attribute lists. Also, + * because a proper match has been found in the attribute list + * entry above, the comparison can now be optimized. So it is + * worth re-implementing a simplified ntfs_attr_find() here. + */ + a = ctx->attr; + /* + * Use a manual loop so we can still use break and continue + * with the same meanings as above. 
+ */ +do_next_attr_loop: + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + if (a->type == AT_END) + break; + if (!a->length) + break; + if (al_entry->instance != a->instance) + goto do_next_attr; + /* + * If the type and/or the name are mismatched between the + * attribute list entry and the attribute record, there is + * corruption so we break and return error EIO. + */ + if (al_entry->type != a->type) + break; + if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), a->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + break; + ctx->attr = a; + /* + * If no @val specified or @val specified and it matches, we + * have found it! + */ + if (!val || (!a->non_resident && le32_to_cpu( + a->data.resident.value_length) == val_len && + !memcmp((u8*)a + + le16_to_cpu(a->data.resident.value_offset), + val, val_len))) { + ntfs_debug("Done, found."); + return 0; + } +do_next_attr: + /* Proceed to the next attribute in the current mft record. */ + a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + goto do_next_attr_loop; + } + if (!err) { + ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " + "attribute list attribute.%s", base_ni->mft_no, + es); + err = -EIO; + } + if (ni != base_ni) { + if (ni) + unmap_extent_mft_record(ni); + ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + ctx->attr = ctx->base_attr; + } + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +not_found: + /* + * If we were looking for AT_END, we reset the search context @ctx and + * use ntfs_attr_find() to seek to the end of the base mft record. + */ + if (type == AT_END) { + ntfs_attr_reinit_search_ctx(ctx); + return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, + ctx); + } + /* + * The attribute was not found. Before we return, we want to ensure + * @ctx->mrec and @ctx->attr indicate the position at which the + * attribute should be inserted in the base mft record. Since we also + * want to preserve @ctx->al_entry we cannot reinitialize the search + * context using ntfs_attr_reinit_search_ctx() as this would set + * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see + * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve + * @ctx->al_entry as the remaining fields (base_*) are identical to + * their non base_ counterparts and we cannot set @ctx->base_attr + * correctly yet as we do not know what @ctx->attr will be set to by + * the call to ntfs_attr_find() below. + */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + ctx->mrec = ctx->base_mrec; + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = base_ni; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + /* + * In case there are multiple matches in the base mft record, need to + * keep enumerating until we get an attribute not found response (or + * another error), otherwise we would keep returning the same attribute + * over and over again and all programs using us for enumeration would + * lock up in a tight loop. + */ + do { + err = ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + } while (!err); + ntfs_debug("Done, not found."); + return err; +} + +/** + * ntfs_attr_lookup - find an attribute in an ntfs inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. 
NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must + * be the base mft record and @ctx must have been obtained from a call to + * ntfs_attr_get_search_ctx(). + * + * This function transparently handles attribute lists and @ctx is used to + * continue searches where they were left off at. + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * Return 0 if the search was successful and -errno if not. + * + * When 0, @ctx->attr is the found attribute and it is in mft record + * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is + * the attribute list entry of the found attribute. + * + * When -ENOENT, @ctx->attr is the attribute which collates just after the + * attribute being searched for, i.e. if one wants to add the attribute to the + * mft record this is the correct place to insert it into. If an attribute + * list attribute is present, @ctx->al_entry is the attribute list entry which + * collates just after the attribute list entry of the attribute being searched + * for, i.e. if one wants to add the attribute to the mft record this is the + * correct place to insert its attribute list entry into. + * + * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is + * then undefined and in particular you should not rely on it not changing. + */ +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering."); + BUG_ON(IS_ERR(ctx->mrec)); + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + /* Sanity check, just for debugging really. */ + BUG_ON(!base_ni); + if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + return ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, + val, val_len, ctx); +} + +/** + * ntfs_attr_init_search_ctx - initialize an attribute search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Initialize the attribute search context @ctx with @ni and @mrec. + */ +static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, + ntfs_inode *ni, MFT_RECORD *mrec) +{ + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = true, + .ntfs_ino = ni, + }; +} + +/** + * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context + * @ctx: attribute search context to reinitialize + * + * Reinitialize the attribute search context @ctx, unmapping an associated + * extent mft record if present, and initialize the search context again. 
+ * + * This is used when a search for a new attribute is being started to reset + * the search context to the beginning. + */ +void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (likely(!ctx->base_ntfs_ino)) { + /* No attribute list. */ + ctx->is_first = true; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + /* + * This needs resetting due to ntfs_external_attr_find() which + * can leave it set despite having zeroed ctx->base_ntfs_ino. + */ + ctx->al_entry = NULL; + return; + } /* Attribute list. */ + if (ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); + return; +} + +/** + * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Allocate a new attribute search context, initialize it with @ni and @mrec, + * and return it. Return NULL if allocation failed. + */ +ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +{ + ntfs_attr_search_ctx *ctx; + + ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); + if (ctx) + ntfs_attr_init_search_ctx(ctx, ni, mrec); + return ctx; +} + +/** + * ntfs_attr_put_search_ctx - release an attribute search context + * @ctx: attribute search context to free + * + * Release the attribute search context @ctx, unmapping an associated extent + * mft record if present. + */ +void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + return; +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to find + * + * Search for the attribute definition record corresponding to the attribute + * @type in the $AttrDef system file. + * + * Return the attribute type definition record if found and NULL if not found. + */ +static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, + const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + BUG_ON(!vol->attrdef); + BUG_ON(!type); + for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + vol->attrdef_size && ad->type; ++ad) { + /* We have not found it yet, carry on searching. */ + if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) + continue; + /* We found the attribute; return it. */ + if (likely(ad->type == type)) + return ad; + /* We have gone too far already. No point in continuing. */ + break; + } + /* Attribute not found. */ + ntfs_debug("Attribute type 0x%x not found in $AttrDef.", + le32_to_cpu(type)); + return NULL; +} + +/** + * ntfs_attr_size_bounds_check - check a size of an attribute type for validity + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * @size: size which to check + * + * Check whether the @size in bytes is valid for an attribute of @type on the + * ntfs volume @vol. This information is obtained from $AttrDef system file. + * + * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not + * listed in $AttrDef. 
+ */ +int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, + const s64 size) +{ + ATTR_DEF *ad; + + BUG_ON(size < 0); + /* + * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not + * listed in $AttrDef. + */ + if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) + return -ERANGE; + /* Get the $AttrDef entry for the attribute @type. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Do the bounds check. */ + if (((sle64_to_cpu(ad->min_size) > 0) && + size < sle64_to_cpu(ad->min_size)) || + ((sle64_to_cpu(ad->max_size) > 0) && size > + sle64_to_cpu(ad->max_size))) + return -ERANGE; + return 0; +} + +/** + * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be non-resident. This information is obtained from $AttrDef system file. + * + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and + * -ENOENT if the attribute is not listed in $AttrDef. + */ +int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + /* Find the attribute definition record in $AttrDef. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Check the flags and return the result. */ + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_can_be_resident - check if an attribute can be resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be resident. This information is derived from our ntfs knowledge and may + * not be completely accurate, especially when user defined attributes are + * present. Basically we allow everything to be resident except for index + * allocation and $EA attributes. + * + * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. + * + * Warning: In the system file $MFT the attribute $Bitmap must be non-resident + * otherwise windows will not boot (blue screen of death)! We cannot + * check for this here as we do not know which inode's $Bitmap is + * being asked about so the caller needs to special case this. + */ +int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + if (type == AT_INDEX_ALLOCATION) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_record_resize - resize an attribute record + * @m: mft record containing attribute record + * @a: attribute record to resize + * @new_size: new size in bytes to which to resize the attribute record @a + * + * Resize the attribute record @a, i.e. the resident part of the attribute, in + * the mft record @m to @new_size bytes. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +{ + ntfs_debug("Entering for new_size %u.", new_size); + /* Align to 8 bytes if it is not already done. 
*/ + if (new_size & 7) + new_size = (new_size + 7) & ~7; + /* If the actual attribute length has changed, move things around. */ + if (new_size != le32_to_cpu(a->length)) { + u32 new_muse = le32_to_cpu(m->bytes_in_use) - + le32_to_cpu(a->length) + new_size; + /* Not enough space in this mft record. */ + if (new_muse > le32_to_cpu(m->bytes_allocated)) + return -ENOSPC; + /* Move attributes following @a to their new location. */ + memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8*)a - + (u8*)m) - le32_to_cpu(a->length)); + /* Adjust @m to reflect the change in used space. */ + m->bytes_in_use = cpu_to_le32(new_muse); + /* Adjust @a to reflect the new size. */ + if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + a->length = cpu_to_le32(new_size); + } + return 0; +} + +/** + * ntfs_resident_attr_value_resize - resize the value of a resident attribute + * @m: mft record containing attribute record + * @a: attribute record whose value to resize + * @new_size: new size in bytes to which to resize the attribute value of @a + * + * Resize the value of the attribute @a in the mft record @m to @new_size bytes. + * If the value is made bigger, the newly allocated space is cleared. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size) +{ + u32 old_size; + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + new_size)) + return -ENOSPC; + /* + * The resize succeeded! If we made the attribute value bigger, clear + * the area between the old size and @new_size. + */ + old_size = le32_to_cpu(a->data.resident.value_length); + if (new_size > old_size) + memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + old_size, 0, new_size - old_size); + /* Finally update the length of the attribute value. */ + a->data.resident.value_length = cpu_to_le32(new_size); + return 0; +} + +/** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * @data_size: size of the resident data to copy to the non-resident attribute + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * @data_size must be equal to the attribute value size. This is needed since + * we need to know the size before we can map the mft record and our callers + * always know it. The reason we cannot simply read the size from the vfs + * inode i_size is that this is not necessarily uptodate. This happens when + * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. 
This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... (AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_mutex on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. */ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE, true); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. 
+ */ + if (NInoSparse(ni) || NInoCompressed(ni)) + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + + 7) & ~7; + else + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != data_size); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. */ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + a->data.non_resident.compression_unit = 0; + if (NInoCompressed(ni) || vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = + a->data.non_resident.allocated_size; + } else + a->data.non_resident.compression_unit = 0; + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = ni->allocated_size; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << (a->data. 
+ non_resident.compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = ni->allocated_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->read_folio + * and ->writepage can run concurrently with us as they are not + * serialized on i_mutex. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + put_page(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. */ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. + */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr); + } + /* Setup the allocated size in the ntfs inode in case it changed. 
*/
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = arec_size - mp_ofs;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ni->runlist.rl = NULL;
+ up_write(&ni->runlist.lock);
+rl_err_out:
+ if (rl) {
+ if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
+ ntfs_error(vol->sb, "Failed to release allocated "
+ "cluster(s) in error code path. Run "
+ "chkdsk to recover the lost "
+ "cluster(s).");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl);
+page_err_out:
+ unlock_page(page);
+ put_page(page);
+ }
+ if (err == -EINVAL)
+ err = -EIO;
+ return err;
+}
+
+/**
+ * ntfs_attr_extend_allocation - extend the allocated space of an attribute
+ * @ni: ntfs inode of the attribute whose allocation to extend
+ * @new_alloc_size: new size in bytes to which to extend the allocation
+ * @new_data_size: new size in bytes to which to extend the data
+ * @data_start: beginning of region which is required to be non-sparse
+ *
+ * Extend the allocated space of an attribute described by the ntfs inode @ni
+ * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
+ * implemented as a hole in the file (as long as both the volume and the ntfs
+ * inode @ni have sparse support enabled). If @data_start is >= 0, then the
+ * region between the old allocated size and @data_start - 1 may be made sparse
+ * but the region between @data_start and @new_alloc_size must be backed by
+ * actual clusters.
+ *
+ * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
+ * of the attribute is extended to @new_data_size. Note that the i_size of the
+ * vfs inode is not updated. Only the data size in the base attribute record
+ * is updated. The caller has to update i_size separately if this is required.
+ * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
+ * size as well as for @new_data_size to be greater than @new_alloc_size.
+ *
+ * For resident attributes this involves resizing the attribute record and if
+ * necessary moving it and/or other attributes into extent mft records and/or
+ * converting the attribute to a non-resident attribute which in turn involves
+ * extending the allocation of a non-resident attribute as described below.
+ *
+ * For non-resident attributes this involves allocating clusters in the data
+ * zone on the volume (except for regions that are being made sparse) and
+ * extending the run list to describe the allocated clusters as well as
+ * updating the mapping pairs array of the attribute. This in turn involves
+ * resizing the attribute record and if necessary moving it and/or other
+ * attributes into extent mft records and/or splitting the attribute record
+ * into multiple extent attribute records.
+ *
+ * Also, the attribute list attribute is updated if present and in some of the
+ * above cases (the ones where extent mft records/attributes come into play),
+ * an attribute list attribute is created if not already present.
+ *
+ * Return the new allocated size on success and -errno on error. In the case
+ * that an error is encountered but a partial extension at least up to
+ * @data_start (if present) is possible, the allocation is partially extended
+ * and this is returned. This means the caller must check the returned size to
+ * determine if the extension was partial.
If @data_start is -1 then partial + * allocations are not performed. + * + * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. + * + * Locking: This function takes the runlist lock of @ni for writing as well as + * locking the mft record of the base ntfs inode. These locks are maintained + * throughout execution of the function. These locks are required so that the + * attribute can be resized safely and so that it can for example be converted + * from resident to non-resident safely. + * + * TODO: At present attribute list attribute handling is not implemented. + * + * TODO: At present it is not safe to call this function for anything other + * than the $DATA attribute(s) of an uncompressed and unencrypted file. + */ +s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start) +{ + VCN vcn; + s64 ll, allocated_size, start = data_start; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + runlist_element *rl, *rl2; + unsigned long flags; + int err, mp_size; + u32 attr_len = 0; /* Silence stupid gcc warning. */ + bool mp_rebuilt; + +#ifdef DEBUG + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_allocated_size 0x%llx, " + "new_allocated_size 0x%llx, new_data_size 0x%llx, " + "data_start 0x%llx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)allocated_size, + (unsigned long long)new_alloc_size, + (unsigned long long)new_data_size, + (unsigned long long)start); +#endif +retry_extend: + /* + * For non-resident attributes, @start and @new_size need to be aligned + * to cluster boundaries for allocation purposes. + */ + if (NInoNonResident(ni)) { + if (start > 0) + start &= ~(s64)vol->cluster_size_mask; + new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + } + BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); + /* Check if new size is allowed in $AttrDef. */ + err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); + if (unlikely(err)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the new " + "allocation would exceed the " + "maximum allowed size for " + "this attribute type.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } else { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because this " + "attribute type is not " + "defined on the NTFS volume. " + "Possible corruption! You " + "should run chkdsk!", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + } + /* Translate error code to be POSIX conformant for write(2). */ + if (err == -ERANGE) + err = -EFBIG; + else + err = -EIO; + return err; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* + * We will be modifying both the runlist (if non-resident) and the mft + * record so lock them both down. 
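+	 * Lock ordering sketch (this mirrors the calls just below): the
+	 * runlist lock is taken first and the mft record is mapped second,
+	 *
+	 *	down_write(&ni->runlist.lock);
+	 *	m = map_mft_record(base_ni);
+	 *
+	 * and every exit path below releases them in the reverse order.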
+ */ + down_write(&ni->runlist.lock); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If non-resident, seek to the last extent. If resident, there is + * only one extent, so seek to that. + */ + vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : + 0; + /* + * Abort if someone did the work whilst we waited for the locks. If we + * just converted the attribute from resident to non-resident it is + * likely that exactly this has happened already. We cannot quite + * abort if we need to update the data size. + */ + if (unlikely(new_alloc_size <= allocated_size)) { + ntfs_debug("Allocated size already exceeds requested size."); + new_alloc_size = allocated_size; + if (new_data_size < 0) + goto done; + /* + * We want the first attribute extent so that we can update the + * data size. + */ + vcn = 0; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + /* Use goto to reduce indentation. */ + if (a->non_resident) + goto do_non_resident_extend; + BUG_ON(NInoNonResident(ni)); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + /* + * Extend the attribute record to be able to store the new attribute + * size. ntfs_attr_record_resize() will not do anything if the size is + * not changing. + */ + if (new_alloc_size < vol->mft_record_size && + !ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + + new_alloc_size)) { + /* The resize succeeded! */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + write_unlock_irqrestore(&ni->size_lock, flags); + if (new_data_size >= 0) { + BUG_ON(new_data_size < attr_len); + a->data.resident.value_length = + cpu_to_le32((u32)new_data_size); + } + goto flush_done; + } + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the extension process. + */ + err = ntfs_attr_make_non_resident(ni, attr_len); + if (likely(!err)) + goto retry_extend; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + /* Only emit errors when the write will fail completely. 
*/ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the conversion from resident " + "to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft " + "record/on disk for the non-resident " + "attribute value. This case is not " + "implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not " + "implemented yet."); + } + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_extend: + BUG_ON(!NInoNonResident(ni)); + if (new_alloc_size == allocated_size) { + BUG_ON(vcn); + goto alloc_done; + } + /* + * If the data starts after the end of the old allocation, this is a + * $DATA attribute and sparse attributes are enabled on the volume and + * for this inode, then create a sparse region between the old + * allocated size and the start of the data. Otherwise simply proceed + * with filling the whole space between the old allocated size and the + * new allocated size with clusters. + */ + if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || + !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) + goto skip_sparse; + // TODO: This is not implemented yet. We just fill in with real + // clusters for now... + ntfs_debug("Inserting holes is not-implemented yet. Falling back to " + "allocating real clusters instead."); +skip_sparse: + rl = ni->runlist.rl; + if (likely(rl)) { + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* If this attribute extent is not mapped, map it now. 
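+	 * (LCN_RL_NOT_MAPPED and LCN_ENOENT are negative sentinel values
+	 * stored in rl->lcn rather than real cluster numbers, hence the
+	 * direct comparisons below.)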
*/
+ if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
+ (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
+ (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
+ if (!rl && !allocated_size)
+ goto first_alloc;
+ rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because the "
+ "mapping of a runlist "
+ "fragment failed with error "
+ "code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ /* Seek to the end of the runlist. */
+ while (rl->length)
+ rl++;
+ }
+ /*
+ * We now know the runlist of the last extent is mapped and @rl is at
+ * the end of the runlist. We want to begin allocating clusters
+ * starting at the last allocated cluster to reduce fragmentation. If
+ * there are no valid LCNs in the attribute we let the cluster
+ * allocator choose the starting cluster.
+ */
+ /* If the last LCN is a hole or similar, seek back to the last real LCN. */
+ while (rl->lcn < 0 && rl > ni->runlist.rl)
+ rl--;
+first_alloc:
+ // FIXME: Need to implement partial allocations so at least part of the
+ // write can be performed when start >= 0. (Needed for POSIX write(2)
+ // conformance.)
+ rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
+ (new_alloc_size - allocated_size) >>
+ vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
+ rl->lcn + rl->length : -1, DATA_ZONE, true);
+ if (IS_ERR(rl2)) {
+ err = PTR_ERR(rl2);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the allocation of clusters "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM && err != -ENOSPC)
+ err = -EIO;
+ goto err_out;
+ }
+ rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the runlist merge failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to release allocated "
+ "cluster(s) in error code path. Run "
+ "chkdsk to recover the lost "
+ "cluster(s).");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
+ allocated_size) >> vol->cluster_size_bits);
+ /* Find the runlist element with which the attribute extent starts. */
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
+ BUG_ON(!rl2);
+ BUG_ON(!rl2->length);
+ BUG_ON(rl2->lcn < LCN_HOLE);
+ mp_rebuilt = false;
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+ if (unlikely(mp_size <= 0)) {
+ err = mp_size;
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because determining the size for the "
+ "mapping pairs failed with error code "
+ "%i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Extend the attribute record to fit the bigger mapping pairs array.
*/
+ attr_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(err)) {
+ BUG_ON(err != -ENOSPC);
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record,
+ // possibly by extending this extent partially and filling it
+ // and creating a new extent for the remainder, or by making
+ // other attributes non-resident and/or by moving other
+ // attributes out of this mft record.
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record for the extended attribute "
+ "record. This case is not "
+ "implemented yet.");
+ err = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ mp_rebuilt = true;
+ /* Generate the mapping pairs array directly into the attr record. */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, -1, NULL);
+ if (unlikely(err)) {
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because building the mapping pairs "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+ vol->cluster_size_bits) - 1);
+ /*
+ * We have now extended the allocated size of the attribute. Reflect
+ * this in the ntfs_inode structure and the attribute record.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto restore_undo_alloc;
+ /* @m is not used any more so no need to set it. */
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+ /*
+ * FIXME: This would fail if @ni is a directory, $MFT, or an index,
+ * since those can have sparse/compressed set. For example, a directory
+ * can be set compressed even though it is not compressed itself and in
+ * that case the bit means that files are to be created compressed in
+ * the directory... At present this is ok as this code is only called
+ * for regular files, and only for their $DATA attribute(s).
+ * FIXME: The calculation is wrong if we created a hole above. For now
+ * it does not matter as we never create holes.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size += new_alloc_size - allocated_size;
+ a->data.non_resident.compressed_size =
+ cpu_to_sle64(ni->itype.compressed.size);
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+alloc_done:
+ if (new_data_size >= 0) {
+ BUG_ON(new_data_size <
+ sle64_to_cpu(a->data.non_resident.data_size));
+ a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
+ }
+flush_done:
+ /* Ensure the changes make it to disk.
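+	 * (flush_dcache_mft_record_page() keeps the CPU data cache coherent
+	 * with the mapped mft record page on virtually indexed cache
+	 * architectures, and mark_mft_record_dirty() queues the record for
+	 * later write-back.)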
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + ntfs_debug("Done, new_allocated_size 0x%llx.", + (unsigned long long)new_alloc_size); + return new_alloc_size; +restore_undo_alloc: + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot complete extension of allocation " + "of inode 0x%lx, attribute type 0x%x, because " + "lookup of first attribute extent failed with " + "error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err == -ENOENT) + err = -EIO; + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + allocated_size >> vol->cluster_size_bits, NULL, 0, + ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "attribute in error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + /* + * FIXME: This would fail if @ni is a directory... See above. + * FIXME: The calculation is wrong if we created a hole above. + * For now it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - + allocated_size; + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * The only thing that is now wrong is the allocated size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return err; + } + ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( + (allocated_size >> vol->cluster_size_bits) - 1); +undo_alloc: + ll = allocated_size >> vol->cluster_size_bits; + if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to release allocated cluster(s) " + "in error code path. Run chkdsk to recover " + "the lost cluster(s)."); + NVolSetErrors(vol); + } + m = ctx->mrec; + a = ctx->attr; + /* + * If the runlist truncation fails and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to %s in error code path. Run " + "chkdsk to recover.", IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist"); + NVolSetErrors(vol); + } else if (mp_rebuilt) { + if (ntfs_attr_record_resize(m, a, attr_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident. + mapping_pairs_offset), attr_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), rl2, ll, -1, + NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +conv_err_out: + ntfs_debug("Failed. 
Returning error code %i.", err); + return err; +} + +/** + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note this function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. + * + * Return 0 on success and -errno on error. An error code of -ESPIPE means + * that @ofs + @cnt were outside the end of the attribute and no write was + * performed. + */ +int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +{ + ntfs_volume *vol = ni->vol; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + pgoff_t idx, end; + unsigned start_ofs, end_ofs, size; + + ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", + (long long)ofs, (long long)cnt, val); + BUG_ON(ofs < 0); + BUG_ON(cnt < 0); + if (!cnt) + goto done; + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + mapping = VFS_I(ni)->i_mapping; + /* Work out the starting index and page offset. */ + idx = ofs >> PAGE_SHIFT; + start_ofs = ofs & ~PAGE_MASK; + /* Work out the ending index and page offset. */ + end = ofs + cnt; + end_ofs = end & ~PAGE_MASK; + /* If the end is outside the inode size return -ESPIPE. */ + if (unlikely(end > i_size_read(VFS_I(ni)))) { + ntfs_error(vol->sb, "Request exceeds end of attribute."); + return -ESPIPE; + } + end >>= PAGE_SHIFT; + /* If there is a first partial page, need to do it the slow way. */ + if (start_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read first partial " + "page (error, index 0x%lx).", idx); + return PTR_ERR(page); + } + /* + * If the last page is the same as the first page, need to + * limit the write to the end offset. + */ + size = PAGE_SIZE; + if (idx == end) + size = end_ofs; + kaddr = kmap_atomic(page); + memset(kaddr + start_ofs, val, size - start_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + if (idx == end) + goto done; + idx++; + } + /* Do the whole pages the fast way. */ + for (; idx < end; idx++) { + /* Find or create the current page. (The page is locked.) */ + page = grab_cache_page(mapping, idx); + if (unlikely(!page)) { + ntfs_error(vol->sb, "Insufficient memory to grab " + "page (index 0x%lx).", idx); + return -ENOMEM; + } + kaddr = kmap_atomic(page); + memset(kaddr, val, PAGE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr); + /* + * If the page has buffers, mark them uptodate since buffer + * state and not page state is definitive in 2.6 kernels. 
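+		 * (Leaving a clean buffer !uptodate could cause the block
+		 * layer to later read stale on-disk data back over the bytes
+		 * just memset() above, hence every buffer head is marked
+		 * uptodate here.)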
+ */ + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + set_buffer_uptodate(bh); + } while ((bh = bh->b_this_page) != head); + } + /* Now that buffers are uptodate, set the page uptodate, too. */ + SetPageUptodate(page); + /* + * Set the page and all its buffers dirty and mark the inode + * dirty, too. The VM will write the page later on. + */ + set_page_dirty(page); + /* Finally unlock and release the page. */ + unlock_page(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + /* If there is a last partial page, need to do it the slow way. */ + if (end_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read last partial page " + "(error, index 0x%lx).", idx); + return PTR_ERR(page); + } + kaddr = kmap_atomic(page); + memset(kaddr, val, end_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } +done: + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h new file mode 100644 index 000000000000..fe0890d3d072 --- /dev/null +++ b/fs/ntfs/attrib.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_ATTRIB_H +#define _LINUX_NTFS_ATTRIB_H + +#include "endian.h" +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +/** + * ntfs_attr_search_ctx - used in attribute search functions + * @mrec: buffer containing mft record to search + * @attr: attribute record in @mrec where to begin/continue search + * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after + * + * Structure must be initialized to zero before the first call to one of the + * attribute search functions. Initialize @mrec to point to the mft record to + * search, and @attr to point to the first attribute within @mrec (not necessary + * if calling the _first() functions), and set @is_first to 'true' (not necessary + * if calling the _first() functions). + * + * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', + * the search begins after @attr. This is so that, after the first call to one + * of the search attribute functions, we can call the function again, without + * any modification of the search context, to automagically get the next + * matching attribute. 
+ */
+typedef struct {
+ MFT_RECORD *mrec;
+ ATTR_RECORD *attr;
+ bool is_first;
+ ntfs_inode *ntfs_ino;
+ ATTR_LIST_ENTRY *al_entry;
+ ntfs_inode *base_ntfs_ino;
+ MFT_RECORD *base_mrec;
+ ATTR_RECORD *base_attr;
+} ntfs_attr_search_ctx;
+
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
+ ntfs_attr_search_ctx *ctx);
+extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
+
+extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+ const bool write_locked);
+
+extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
+ const VCN vcn, ntfs_attr_search_ctx *ctx);
+
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+ const u32 name_len, const IGNORE_CASE_BOOL ic,
+ const VCN lowest_vcn, const u8 *val, const u32 val_len,
+ ntfs_attr_search_ctx *ctx);
+
+extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
+ const s64 size, const s64 initialized_size);
+
+static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
+{
+ if (!a->non_resident)
+ return (s64)le32_to_cpu(a->data.resident.value_length);
+ return sle64_to_cpu(a->data.non_resident.data_size);
+}
+
+extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
+extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
+ MFT_RECORD *mrec);
+extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
+
+#ifdef NTFS_RW
+
+extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
+ const ATTR_TYPE type, const s64 size);
+extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
+ const ATTR_TYPE type);
+extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
+ const ATTR_TYPE type);
+
+extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+ const u32 new_size);
+
+extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
+
+extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+ const s64 new_data_size, const s64 data_start);
+
+extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
+ const u8 val);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
new file mode 100644
index 000000000000..0675b2400873
--- /dev/null
+++ b/fs/ntfs/bitmap.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/pagemap.h>
+
+#include "bitmap.h"
+#include "debug.h"
+#include "aops.h"
+#include "ntfs.h"
+
+/**
+ * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ * @value: value to set the bits to (i.e. 0 or 1)
+ * @is_rollback: if 'true' this is a rollback operation
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
+ * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead.
+ *
+ * Return 0 on success and -errno on error.
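+ *
+ * For orientation, the page/byte/bit decomposition used below is
+ *
+ *	page index = bit >> (3 + PAGE_SHIFT)
+ *	byte within page = (bit >> 3) & ~PAGE_MASK
+ *	bit within byte = bit & 7
+ *
+ * i.e. eight bits per byte and PAGE_SIZE bytes per page.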
+ */ +int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback) +{ + s64 cnt = count; + pgoff_t index, end_index; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + int pos, len; + u8 bit; + + BUG_ON(!vi); + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " + "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + (unsigned long long)cnt, (unsigned int)value, + is_rollback ? " (rollback)" : ""); + BUG_ON(start_bit < 0); + BUG_ON(cnt < 0); + BUG_ON(value > 1); + /* + * Calculate the indices for the pages containing the first and last + * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. + */ + index = start_bit >> (3 + PAGE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); + + /* Get the page containing the first bit (@start_bit). */ + mapping = vi->i_mapping; + page = ntfs_map_page(mapping, index); + if (IS_ERR(page)) { + if (!is_rollback) + ntfs_error(vi->i_sb, "Failed to map first page (error " + "%li), aborting.", PTR_ERR(page)); + return PTR_ERR(page); + } + kaddr = page_address(page); + + /* Set @pos to the position of the byte containing @start_bit. */ + pos = (start_bit >> 3) & ~PAGE_MASK; + + /* Calculate the position of @start_bit in the first byte. */ + bit = start_bit & 7; + + /* If the first byte is partial, modify the appropriate bits in it. */ + if (bit) { + u8 *byte = kaddr + pos; + while ((bit & 7) && cnt) { + cnt--; + if (value) + *byte |= 1 << bit++; + else + *byte &= ~(1 << bit++); + } + /* If we are done, unmap the page and return success. */ + if (!cnt) + goto done; + + /* Update @pos to the new position. */ + pos++; + } + /* + * Depending on @value, modify all remaining whole bytes in the page up + * to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); + memset(kaddr + pos, value ? 0xff : 0, len); + cnt -= len << 3; + + /* Update @len to point to the first not-done byte in the page. */ + if (cnt < 8) + len += pos; + + /* If we are not in the last page, deal with all subsequent pages. */ + while (index < end_index) { + BUG_ON(cnt <= 0); + + /* Update @index and get the next page. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, ++index); + if (IS_ERR(page)) + goto rollback; + kaddr = page_address(page); + /* + * Depending on @value, modify all remaining whole bytes in the + * page up to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_SIZE); + memset(kaddr, value ? 0xff : 0, len); + cnt -= len << 3; + } + /* + * The currently mapped page is the last one. If the last byte is + * partial, modify the appropriate bits in it. Note, @len is the + * position of the last byte inside the page. + */ + if (cnt) { + u8 *byte; + + BUG_ON(cnt > 7); + + bit = cnt; + byte = kaddr + len; + while (bit--) { + if (value) + *byte |= 1 << bit; + else + *byte &= ~(1 << bit); + } + } +done: + /* We are done. Unmap the page and return success. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +rollback: + /* + * Current state: + * - no pages are mapped + * - @count - @cnt is the number of bits that have been modified + */ + if (is_rollback) + return PTR_ERR(page); + if (count != cnt) + pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, + value ? 0 : 1, true); + else + pos = 0; + if (!pos) { + /* Rollback was successful. 
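+		 * The partial modifications have been undone; the original
+		 * page mapping failure is still reported and returned below.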
*/
+ ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+ "%li), aborting.", PTR_ERR(page));
+ } else {
+ /* Rollback failed. */
+ ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+ "%li) and rollback failed (error %i). "
+ "Aborting and leaving inconsistent metadata. "
+ "Unmount and run chkdsk.", PTR_ERR(page), pos);
+ NVolSetErrors(NTFS_SB(vi->i_sb));
+ }
+ return PTR_ERR(page);
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
new file mode 100644
index 000000000000..9dd2224ca9c4
--- /dev/null
+++ b/fs/ntfs/bitmap.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_BITMAP_H
+#define _LINUX_NTFS_BITMAP_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "types.h"
+
+extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+ const s64 count, const u8 value, const bool is_rollback);
+
+/**
+ * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ * @value: value to set the bits to (i.e. 0 or 1)
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
+ const s64 start_bit, const s64 count, const u8 value)
+{
+ return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
+ false);
+}
+
+/**
+ * ntfs_bitmap_set_run - set a run of bits in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
+ const s64 count)
+{
+ return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to clear
+ * @count: number of bits to clear
+ *
+ * Clear @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
+ const s64 count)
+{
+ return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
+}
+
+/**
+ * ntfs_bitmap_set_bit - set a bit in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @bit: bit to set
+ *
+ * Set bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
+{
+ return ntfs_bitmap_set_run(vi, bit, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_bit - clear a bit in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @bit: bit to clear
+ *
+ * Clear bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
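+ *
+ * Illustrative (hypothetical) example: clearing the in-use bit of mft record
+ * @mft_no in the mft bitmap would be
+ *
+ *	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
+ *
+ * with @vol the mounted ntfs_volume and the error left to the caller.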
+ */
+static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
+{
+ return ntfs_bitmap_clear_run(vi, bit, 1);
+}
+
+#endif /* NTFS_RW */
+
+#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
new file mode 100644
index 000000000000..3ab6ec96abfe
--- /dev/null
+++ b/fs/ntfs/collate.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#include "collate.h"
+#include "debug.h"
+#include "ntfs.h"
+
+static int ntfs_collate_binary(ntfs_volume *vol,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len)
+{
+ int rc;
+
+ ntfs_debug("Entering.");
+ rc = memcmp(data1, data2, min(data1_len, data2_len));
+ if (!rc && (data1_len != data2_len)) {
+ if (data1_len < data2_len)
+ rc = -1;
+ else
+ rc = 1;
+ }
+ ntfs_debug("Done, returning %i", rc);
+ return rc;
+}
+
+static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len)
+{
+ int rc;
+ u32 d1, d2;
+
+ ntfs_debug("Entering.");
+ // FIXME: We don't really want to bug here.
+ BUG_ON(data1_len != data2_len);
+ BUG_ON(data1_len != 4);
+ d1 = le32_to_cpup(data1);
+ d2 = le32_to_cpup(data2);
+ if (d1 < d2)
+ rc = -1;
+ else {
+ if (d1 == d2)
+ rc = 0;
+ else
+ rc = 1;
+ }
+ ntfs_debug("Done, returning %i", rc);
+ return rc;
+}
+
+typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
+ const void *, const int);
+
+static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
+ ntfs_collate_binary,
+ NULL/*ntfs_collate_file_name*/,
+ NULL/*ntfs_collate_unicode_string*/,
+};
+
+static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
+ ntfs_collate_ntofs_ulong,
+ NULL/*ntfs_collate_ntofs_sid*/,
+ NULL/*ntfs_collate_ntofs_security_hash*/,
+ NULL/*ntfs_collate_ntofs_ulongs*/,
+};
+
+/**
+ * ntfs_collate - collate two data items using a specified collation rule
+ * @vol: ntfs volume to which the data items belong
+ * @cr: collation rule to use when comparing the items
+ * @data1: first data item to collate
+ * @data1_len: length in bytes of @data1
+ * @data2: second data item to collate
+ * @data2_len: length in bytes of @data2
+ *
+ * Collate the two data items @data1 and @data2 using the collation rule @cr
+ * and return -1, 0, or 1 if @data1 is found, respectively, to collate before,
+ * to match, or to collate after @data2.
+ *
+ * For speed we use the collation rule @cr as an index into two tables of
+ * function pointers to call the appropriate collation function.
+ */
+int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len) {
+ int i;
+
+ ntfs_debug("Entering.");
+ /*
+ * FIXME: At the moment we only support COLLATION_BINARY and
+ * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
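+	 * For orientation: COLLATION_BINARY is 0x00 and COLLATION_NTOFS_ULONG
+	 * is 0x10, so after the 0x10 rebase below both supported rules select
+	 * slot 0 of their respective function tables.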
+ */
+ BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
+ i = le32_to_cpu(cr);
+ BUG_ON(i < 0);
+ if (i <= 0x02)
+ return ntfs_do_collate0x0[i](vol, data1, data1_len,
+ data2, data2_len);
+ BUG_ON(i < 0x10);
+ i -= 0x10;
+ if (likely(i <= 3))
+ return ntfs_do_collate0x1[i](vol, data1, data1_len,
+ data2, data2_len);
+ BUG();
+ return 0;
+}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
new file mode 100644
index 000000000000..f2255619b4f4
--- /dev/null
+++ b/fs/ntfs/collate.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * collate.h - Defines for NTFS kernel collation handling. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_COLLATE_H
+#define _LINUX_NTFS_COLLATE_H
+
+#include "types.h"
+#include "volume.h"
+
+static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+ int i;
+
+ /*
+ * FIXME: At the moment we only support COLLATION_BINARY and
+ * COLLATION_NTOFS_ULONG, so we return false for everything else for
+ * now.
+ */
+ if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
+ return false;
+ i = le32_to_cpu(cr);
+ if (likely(((i >= 0) && (i <= 0x02)) ||
+ ((i >= 0x10) && (i <= 0x13))))
+ return true;
+ return false;
+}
+
+extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len);
+
+#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
new file mode 100644
index 000000000000..761aaa0195d6
--- /dev/null
+++ b/fs/ntfs/compress.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * compress.c - NTFS kernel compressed attributes handling.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "attrib.h"
+#include "inode.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_compression_constants - enum of constants used in the compression code
+ */
+typedef enum {
+ /* Token types and access mask. */
+ NTFS_SYMBOL_TOKEN = 0,
+ NTFS_PHRASE_TOKEN = 1,
+ NTFS_TOKEN_MASK = 1,
+
+ /* Compression sub-block constants. */
+ NTFS_SB_SIZE_MASK = 0x0fff,
+ NTFS_SB_SIZE = 0x1000,
+ NTFS_SB_IS_COMPRESSED = 0x8000,
+
+ /*
+ * The maximum compression block size is by definition 16 * the cluster
+ * size, with the maximum supported cluster size being 4kiB. Thus the
+ * maximum compression buffer size is 64kiB, so we use this when
+ * initializing the compression buffer.
+ */
+ NTFS_MAX_CB_SIZE = 64 * 1024,
+} ntfs_compression_constants;
+
+/*
+ * ntfs_compression_buffer - one buffer for the decompression engine
+ */
+static u8 *ntfs_compression_buffer;
+
+/*
+ * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
+ */
+static DEFINE_SPINLOCK(ntfs_cb_lock);
+
+/**
+ * allocate_compression_buffers - allocate the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
+ *
+ * Return 0 on success or -ENOMEM if the allocations failed.
+ */
+int allocate_compression_buffers(void)
+{
+ BUG_ON(ntfs_compression_buffer);
+
+ ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
+ if (!ntfs_compression_buffer)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * free_compression_buffers - free the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
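+ *
+ * This is the counterpart of allocate_compression_buffers() and must only be
+ * called once no reader may still be touching ntfs_compression_buffer.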
+ */
+void free_compression_buffers(void)
+{
+ BUG_ON(!ntfs_compression_buffer);
+ vfree(ntfs_compression_buffer);
+ ntfs_compression_buffer = NULL;
+}
+
+/**
+ * zero_partial_compressed_page - zero out of bounds compressed page region
+ */
+static void zero_partial_compressed_page(struct page *page,
+ const s64 initialized_size)
+{
+ u8 *kp = page_address(page);
+ unsigned int kp_ofs;
+
+ ntfs_debug("Zeroing page region outside initialized size.");
+ if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
+ clear_page(kp);
+ return;
+ }
+ kp_ofs = initialized_size & ~PAGE_MASK;
+ memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
+ return;
+}
+
+/**
+ * handle_bounds_compressed_page - test for and handle out of bounds compressed page
+ */
+static inline void handle_bounds_compressed_page(struct page *page,
+ const loff_t i_size, const s64 initialized_size)
+{
+ if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
+ (initialized_size < i_size))
+ zero_partial_compressed_page(page, initialized_size);
+ return;
+}
+
+/**
+ * ntfs_decompress - decompress a compression block into an array of pages
+ * @dest_pages: destination array of pages
+ * @completed_pages: scratch space to track completed pages
+ * @dest_index: current index into @dest_pages (IN/OUT)
+ * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
+ * @dest_max_index: maximum index into @dest_pages (IN)
+ * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
+ * @xpage: the target page (-1 if none) (IN)
+ * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
+ * @cb_start: compression block to decompress (IN)
+ * @cb_size: size of compression block @cb_start in bytes (IN)
+ * @i_size: file size when we started the read (IN)
+ * @initialized_size: initialized file size when we started the read (IN)
+ *
+ * The caller must have disabled preemption. ntfs_decompress() reenables it when
+ * the critical section is finished.
+ *
+ * This decompresses the compression block @cb_start into the array of
+ * destination pages @dest_pages starting at index @dest_index into @dest_pages
+ * and at offset @dest_ofs into the page @dest_pages[@dest_index].
+ *
+ * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
+ * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
+ *
+ * @cb_start is a pointer to the compression block which needs decompressing
+ * and @cb_size is the size of @cb_start in bytes (8-64kiB).
+ *
+ * Return 0 on success or -EOVERFLOW on error in the compressed stream.
+ * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
+ * completed during the decompression of the compression block (@cb_start).
+ *
+ * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
+ * unpredictably! You have been warned!
+ *
+ * Note to hackers: This function may not sleep until it has finished accessing
+ * the compression block @cb_start as it is a per-CPU buffer.
+ */
+static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
+ int *dest_index, int *dest_ofs, const int dest_max_index,
+ const int dest_max_ofs, const int xpage, char *xpage_done,
+ u8 *const cb_start, const u32 cb_size, const loff_t i_size,
+ const s64 initialized_size)
+{
+ /*
+ * Pointers into the compressed data, i.e. the compression block (cb),
+ * and the therein contained sub-blocks (sb).
+ */
+ u8 *cb_end = cb_start + cb_size; /* End of cb. */
+ u8 *cb = cb_start; /* Current position in cb.
*/ + u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ + + /* Variables for uncompressed data / destination. */ + struct page *dp; /* Current destination page being worked on. */ + u8 *dp_addr; /* Current pointer into dp. */ + u8 *dp_sb_start; /* Start of current sub-block in dp. */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + + NTFS_SB_SIZE). */ + u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + + NTFS_SB_SIZE). */ + + /* Variables for tag and token parsing. */ + u8 tag; /* Current tag. */ + int token; /* Loop counter for the eight tokens in tag. */ + int nr_completed_pages = 0; + + /* Default error code. */ + int err = -EOVERFLOW; + + ntfs_debug("Entering, cb_size = 0x%x.", cb_size); +do_next_sb: + ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", + cb - cb_start); + /* + * Have we reached the end of the compression block or the end of the + * decompressed data? The latter can happen for example if the current + * position in the compression block is one byte before its end so the + * first two checks do not detect it. + */ + if (cb == cb_end || !le16_to_cpup((le16*)cb) || + (*dest_index == dest_max_index && + *dest_ofs == dest_max_ofs)) { + int i; + + ntfs_debug("Completed. Returning success (0)."); + err = 0; +return_error: + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize completed pages. */ + if (nr_completed_pages > 0) { + for (i = 0; i < nr_completed_pages; i++) { + int di = completed_pages[i]; + + dp = dest_pages[di]; + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(dp, i_size, + initialized_size); + flush_dcache_page(dp); + kunmap(dp); + SetPageUptodate(dp); + unlock_page(dp); + if (di == xpage) + *xpage_done = 1; + else + put_page(dp); + dest_pages[di] = NULL; + } + } + return err; + } + + /* Setup offsets for the current sub-block destination. */ + do_sb_start = *dest_ofs; + do_sb_end = do_sb_start + NTFS_SB_SIZE; + + /* Check that we are still within allowed boundaries. */ + if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) + goto return_overflow; + + /* Does the minimum size of a compressed sb overflow valid range? */ + if (cb + 6 > cb_end) + goto return_overflow; + + /* Setup the current sub-block source pointers and validate range. */ + cb_sb_start = cb; + cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + + 3; + if (cb_sb_end > cb_end) + goto return_overflow; + + /* Get the current destination page. */ + dp = dest_pages[*dest_index]; + if (!dp) { + /* No page present. Skip decompression of this sub-block. */ + cb = cb_sb_end; + + /* Advance destination position to next sub-block. */ + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; + if (!*dest_ofs && (++*dest_index > dest_max_index)) + goto return_overflow; + goto do_next_sb; + } + + /* We have a valid destination page. Setup the destination pointers. */ + dp_addr = (u8*)page_address(dp) + do_sb_start; + + /* Now, we are ready to process the current sub-block (sb). */ + if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + ntfs_debug("Found uncompressed sub-block."); + /* This sb is not compressed, just copy it into destination. */ + + /* Advance source position to first data byte. */ + cb += 2; + + /* An uncompressed sb must be full size. 
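+		 * That is: the low 12 bits of the sub-block header hold the
+		 * total sub-block length minus 3 (header included), so a full
+		 * uncompressed sb stores 0xfff, i.e. 2 header bytes followed
+		 * by exactly NTFS_SB_SIZE (0x1000) data bytes.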
*/ + if (cb_sb_end - cb != NTFS_SB_SIZE) + goto return_overflow; + + /* Copy the block and advance the source position. */ + memcpy(dp_addr, cb, NTFS_SB_SIZE); + cb += NTFS_SB_SIZE; + + /* Advance destination position to next sub-block. */ + *dest_ofs += NTFS_SB_SIZE; + if (!(*dest_ofs &= ~PAGE_MASK)) { +finalize_page: + /* + * First stage: add current page index to array of + * completed pages. + */ + completed_pages[nr_completed_pages++] = *dest_index; + if (++*dest_index > dest_max_index) + goto return_overflow; + } + goto do_next_sb; + } + ntfs_debug("Found compressed sub-block."); + /* This sb is compressed, decompress it into destination. */ + + /* Setup destination pointers. */ + dp_sb_start = dp_addr; + dp_sb_end = dp_sb_start + NTFS_SB_SIZE; + + /* Forward to the first tag in the sub-block. */ + cb += 2; +do_next_tag: + if (cb == cb_sb_end) { + /* Check if the decompressed sub-block was not full-length. */ + if (dp_addr < dp_sb_end) { + int nr_bytes = do_sb_end - *dest_ofs; + + ntfs_debug("Filling incomplete sub-block with " + "zeroes."); + /* Zero remainder and update destination position. */ + memset(dp_addr, 0, nr_bytes); + *dest_ofs += nr_bytes; + } + /* We have finished the current sub-block. */ + if (!(*dest_ofs &= ~PAGE_MASK)) + goto finalize_page; + goto do_next_sb; + } + + /* Check we are still in range. */ + if (cb > cb_sb_end || dp_addr > dp_sb_end) + goto return_overflow; + + /* Get the next tag and advance to first token. */ + tag = *cb++; + + /* Parse the eight tokens described by the tag. */ + for (token = 0; token < 8; token++, tag >>= 1) { + u16 lg, pt, length, max_non_overlap; + register u16 i; + u8 *dp_back_addr; + + /* Check if we are done / still in range. */ + if (cb >= cb_sb_end || dp_addr > dp_sb_end) + break; + + /* Determine token type and parse appropriately.*/ + if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { + /* + * We have a symbol token, copy the symbol across, and + * advance the source and destination positions. + */ + *dp_addr++ = *cb++; + ++*dest_ofs; + + /* Continue with the next token. */ + continue; + } + + /* + * We have a phrase token. Make sure it is not the first tag in + * the sb as this is illegal and would confuse the code below. + */ + if (dp_addr == dp_sb_start) + goto return_overflow; + + /* + * Determine the number of bytes to go back (p) and the number + * of bytes to copy (l). We use an optimized algorithm in which + * we first calculate log2(current destination position in sb), + * which allows determination of l and p in O(1) rather than + * O(n). We just need an arch-optimized log2() function now. + */ + lg = 0; + for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) + lg++; + + /* Get the phrase token into i. */ + pt = le16_to_cpup((le16*)cb); + + /* + * Calculate starting position of the byte sequence in + * the destination using the fact that p = (pt >> (12 - lg)) + 1 + * and make sure we don't go too far back. + */ + dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; + if (dp_back_addr < dp_sb_start) + goto return_overflow; + + /* Now calculate the length of the byte sequence. */ + length = (pt & (0xfff >> lg)) + 3; + + /* Advance destination position and verify it is in range. */ + *dest_ofs += length; + if (*dest_ofs > do_sb_end) + goto return_overflow; + + /* The number of non-overlapping bytes. */ + max_non_overlap = dp_addr - dp_back_addr; + + if (length <= max_non_overlap) { + /* The byte sequence doesn't overlap, just copy it. */ + memcpy(dp_addr, dp_back_addr, length); + + /* Advance destination pointer. 
*/ + dp_addr += length; + } else { + /* + * The byte sequence does overlap, copy non-overlapping + * part and then do a slow byte by byte copy for the + * overlapping part. Also, advance the destination + * pointer. + */ + memcpy(dp_addr, dp_back_addr, max_non_overlap); + dp_addr += max_non_overlap; + dp_back_addr += max_non_overlap; + length -= max_non_overlap; + while (length--) + *dp_addr++ = *dp_back_addr++; + } + + /* Advance source position and continue with the next token. */ + cb += 2; + } + + /* No tokens left in the current tag. Continue with the next tag. */ + goto do_next_tag; + +return_overflow: + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); + goto return_error; +} + +/** + * ntfs_read_compressed_block - read a compressed block into the page cache + * @page: locked page in the compression block(s) we need to read + * + * When we are called the page has already been verified to be locked and the + * attribute is known to be non-resident, not encrypted, but compressed. + * + * 1. Determine which compression block(s) @page is in. + * 2. Get hold of all pages corresponding to this/these compression block(s). + * 3. Read the (first) compression block. + * 4. Decompress it into the corresponding pages. + * 5. Throw the compressed data away and proceed to 3. for the next compression + * block or return success if no more compression blocks left. + * + * Warning: We have to be careful what we do about existing pages. They might + * have been written to so that we would lose data if we were to just overwrite + * them with the out-of-date uncompressed data. + * + * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at + * the end of the file I think. We need to detect this case and zero the out + * of bounds remainder of the page in question and mark it as handled. At the + * moment we would just return -EIO on such a page. This bug will only become + * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte + * clusters so is probably not going to be seen by anyone. Still this should + * be fixed. (AIA) + * + * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in + * handling sparse and compressed cbs. (AIA) + * + * FIXME: At the moment we don't do any zeroing out in the case that + * initialized_size is less than data_size. This should be safe because of the + * nature of the compression algorithm used. Just in case we check and output + * an error message in read inode if the two sizes are not equal for a + * compressed file. (AIA) + */ +int ntfs_read_compressed_block(struct page *page) +{ + loff_t i_size; + s64 initialized_size; + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags, block_size = sb->s_blocksize; + unsigned char block_size_bits = sb->s_blocksize_bits; + u8 *cb, *cb_pos, *cb_end; + struct buffer_head **bhs; + unsigned long offset, index = page->index; + u32 cb_size = ni->itype.compressed.block_size; + u64 cb_size_mask = cb_size - 1UL; + VCN vcn; + LCN lcn; + /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> + vol->cluster_size_bits; + /* + * The first vcn after the last wanted vcn (minimum alignment is again + * PAGE_SIZE. 
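+	 * Adding cb_size - 1 before masking rounds end_vcn up so that it
+	 * covers every cb overlapping @page.)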
+ */ + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) + & ~cb_size_mask) >> vol->cluster_size_bits; + /* Number of compression blocks (cbs) in the wanted vcn range. */ + unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits + >> ni->itype.compressed.block_size_bits; + /* + * Number of pages required to store the uncompressed data from all + * compression blocks (cbs) overlapping @page. Due to alignment + * guarantees of start_vcn and end_vcn, no need to round up here. + */ + unsigned int nr_pages = (end_vcn - start_vcn) << + vol->cluster_size_bits >> PAGE_SHIFT; + unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int cb_clusters, cb_max_ofs; + int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + struct page **pages; + int *completed_pages; + unsigned char xpage_done = 0; + + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " + "%i.", index, cb_size, nr_pages); + /* + * Bad things happen if we get here for anything that is not an + * unnamed $DATA attribute. + */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + + pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); + + /* Allocate memory to store the buffer heads we need. */ + bhs_size = cb_size / block_size * sizeof(struct buffer_head *); + bhs = kmalloc(bhs_size, GFP_NOFS); + + if (unlikely(!pages || !bhs || !completed_pages)) { + kfree(bhs); + kfree(pages); + kfree(completed_pages); + unlock_page(page); + ntfs_error(vol->sb, "Failed to allocate internal buffers."); + return -ENOMEM; + } + + /* + * We have already been given one page, this is the one we must do. + * Once again, the alignment guarantees keep it simple. + */ + offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; + xpage = index - offset; + pages[xpage] = page; + /* + * The remaining pages need to be allocated and inserted into the page + * cache, alignment guarantees keep all the below much simpler. (-8 + */ + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - + offset; + /* Is the page fully outside i_size? (truncate in progress) */ + if (xpage >= max_page) { + kfree(bhs); + kfree(pages); + kfree(completed_pages); + zero_user(page, 0, PAGE_SIZE); + ntfs_debug("Compressed read outside i_size - truncated?"); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + if (nr_pages < max_page) + max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { + if (i != xpage) + pages[i] = grab_cache_page_nowait(mapping, offset); + page = pages[i]; + if (page) { + /* + * We only (re)read the page if it isn't already read + * in and/or dirty or we would be losing data or at + * least wasting our time. + */ + if (!PageDirty(page) && (!PageUptodate(page) || + PageError(page))) { + ClearPageError(page); + kmap(page); + continue; + } + unlock_page(page); + put_page(page); + pages[i] = NULL; + } + } + + /* + * We have the runlist, and all the destination pages we need to fill. + * Now read the first compression block. + */ + cur_page = 0; + cur_ofs = 0; + cb_clusters = ni->itype.compressed.block_clusters; +do_next_cb: + nr_cbs--; + nr_bhs = 0; + + /* Read all cb buffer heads one cluster at a time. 
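+	   For each vcn we look up the corresponding lcn in the runlist and
+	   collect one buffer head per device block; a sparse cluster
+	   (LCN_HOLE) ends the cb early.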
*/ + rl = NULL; + for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; + vcn++) { + bool is_retry = false; + + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)vcn, + (unsigned long long)lcn); + if (lcn < 0) { + /* + * When we reach the first sparse cluster we have + * finished with the cb. + */ + if (lcn == LCN_HOLE) + break; + if (is_retry || lcn != LCN_RL_NOT_MAPPED) + goto rl_err; + is_retry = true; + /* + * Attempt to map runlist, dropping lock for the + * duration. + */ + up_read(&ni->runlist.lock); + if (!ntfs_map_runlist(ni, vcn)) + goto lock_retry_remap; + goto map_rl_err; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the lcn from device in chunks of block_size bytes. */ + max_block = block + (vol->cluster_size >> block_size_bits); + do { + ntfs_debug("block = 0x%x.", block); + if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) + goto getblk_err; + nr_bhs++; + } while (++block < max_block); + } + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Setup and initiate io on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (!trylock_buffer(tbh)) + continue; + if (unlikely(buffer_uptodate(tbh))) { + unlock_buffer(tbh); + continue; + } + get_bh(tbh); + tbh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ, tbh); + } + + /* Wait for io completion on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (buffer_uptodate(tbh)) + continue; + wait_on_buffer(tbh); + /* + * We need an optimization barrier here, otherwise we start + * hitting the below fixup code when accessing a loopback + * mounted ntfs partition. This indicates either there is a + * race condition in the loop driver or, more likely, gcc + * overoptimises the code without the barrier and it doesn't + * do the Right Thing(TM). + */ + barrier(); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_warning(vol->sb, "Buffer is unlocked but not " + "uptodate! Unplugging the disk queue " + "and rescheduling."); + get_bh(tbh); + io_schedule(); + put_bh(tbh); + if (unlikely(!buffer_uptodate(tbh))) + goto read_err; + ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); + } + } + + /* + * Get the compression buffer. We must not sleep any more + * until we are finished with it. + */ + spin_lock(&ntfs_cb_lock); + cb = ntfs_compression_buffer; + + BUG_ON(!cb); + + cb_pos = cb; + cb_end = cb + cb_size; + + /* Copy the buffer heads into the contiguous buffer. */ + for (i = 0; i < nr_bhs; i++) { + memcpy(cb_pos, bhs[i]->b_data, block_size); + cb_pos += block_size; + } + + /* Just a precaution. */ + if (cb_pos + 2 <= cb + cb_size) + *(u16*)cb_pos = 0; + + /* Reset cb_pos back to the beginning. */ + cb_pos = cb; + + /* We now have both source (if present) and destination. */ + ntfs_debug("Successfully read the compression block."); + + /* The last page and maximum offset within it for the current cb. */ + cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_MASK; + cb_max_page >>= PAGE_SHIFT; + + /* Catch end of file inside a compression block. 
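+	   Pages at or beyond max_page lie entirely outside i_size and must
+	   not be touched.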
*/ + if (cb_max_page > max_page) + cb_max_page = max_page; + + if (vcn == start_vcn - cb_clusters) { + /* Sparse cb, zero out page range overlapping the cb. */ + ntfs_debug("Found sparse compression block."); + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + if (cb_max_ofs) + cb_max_page--; + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + if (likely(!cur_ofs)) + clear_page(page_address(page)); + else + memset(page_address(page) + cur_ofs, 0, + PAGE_SIZE - + cur_ofs); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur_page == xpage) + xpage_done = 1; + else + put_page(page); + pages[cur_page] = NULL; + } + cb_pos += PAGE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memset(page_address(page) + cur_ofs, 0, + cb_max_ofs - cur_ofs); + /* + * No need to update cb_pos at this stage: + * cb_pos += cb_max_ofs - cur_ofs; + */ + cur_ofs = cb_max_ofs; + } + } else if (vcn == start_vcn) { + /* We can't sleep so we need two stages. */ + unsigned int cur2_page = cur_page; + unsigned int cur_ofs2 = cur_ofs; + u8 *cb_pos2 = cb_pos; + + ntfs_debug("Found uncompressed compression block."); + /* Uncompressed cb, copy it to the destination pages. */ + /* + * TODO: As a big optimization, we could detect this case + * before we read all the pages and use block_read_full_folio() + * on all full pages instead (we still have to treat partial + * pages especially but at least we are getting rid of the + * synchronous io for the majority of pages. + * Or if we choose not to do the read-ahead/-behind stuff, we + * could just return block_read_full_folio(pages[xpage]) as long + * as PAGE_SIZE <= cb_size. + */ + if (cb_max_ofs) + cb_max_page--; + /* First stage: copy data into destination pages. */ + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + PAGE_SIZE - cur_ofs); + cb_pos += PAGE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + cb_max_ofs - cur_ofs); + cb_pos += cb_max_ofs - cur_ofs; + cur_ofs = cb_max_ofs; + } + /* We can sleep from now on, so drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize pages. */ + for (; cur2_page < cb_max_page; cur2_page++) { + page = pages[cur2_page]; + if (page) { + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(page, i_size, + initialized_size); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur2_page == xpage) + xpage_done = 1; + else + put_page(page); + pages[cur2_page] = NULL; + } + cb_pos2 += PAGE_SIZE - cur_ofs2; + cur_ofs2 = 0; + if (cb_pos2 >= cb_end) + break; + } + } else { + /* Compressed cb, decompress it into the destination page(s). 
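+		   ntfs_decompress() drops ntfs_cb_lock itself once it has
+		   finished with the compression buffer.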
*/ + unsigned int prev_cur_page = cur_page; + + ntfs_debug("Found compressed compression block."); + err = ntfs_decompress(pages, completed_pages, &cur_page, + &cur_ofs, cb_max_page, cb_max_ofs, xpage, + &xpage_done, cb_pos, cb_size - (cb_pos - cb), + i_size, initialized_size); + /* + * We can sleep from now on, lock already dropped by + * ntfs_decompress(). + */ + if (err) { + ntfs_error(vol->sb, "ntfs_decompress() failed in inode " + "0x%lx with error code %i. Skipping " + "this compression block.", + ni->mft_no, -err); + /* Release the unfinished pages. */ + for (; prev_cur_page < cur_page; prev_cur_page++) { + page = pages[prev_cur_page]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (prev_cur_page != xpage) + put_page(page); + pages[prev_cur_page] = NULL; + } + } + } + } + + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + /* Do we have more work to do? */ + if (nr_cbs) + goto do_next_cb; + + /* We no longer need the list of buffer heads. */ + kfree(bhs); + + /* Clean up if we have any pages left. Should never happen. */ + for (cur_page = 0; cur_page < max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + ntfs_error(vol->sb, "Still have pages left! " + "Terminating them with extreme " + "prejudice. Inode 0x%lx, page index " + "0x%lx.", ni->mft_no, page->index); + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (cur_page != xpage) + put_page(page); + pages[cur_page] = NULL; + } + } + + /* We no longer need the list of pages. */ + kfree(pages); + kfree(completed_pages); + + /* If we have completed the requested page, we return success. */ + if (likely(xpage_done)) + return 0; + + ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? + "EOVERFLOW" : (!err ? "EIO" : "unknown error")); + return err < 0 ? err : -EIO; + +read_err: + ntfs_error(vol->sb, "IO error while reading compressed data."); + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + goto err_out; + +map_rl_err: + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " + "compression block."); + goto err_out; + +rl_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " + "compression block."); + goto err_out; + +getblk_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + +err_out: + kfree(bhs); + for (i = cur_page; i < max_page; i++) { + page = pages[i]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (i != xpage) + put_page(page); + } + } + kfree(pages); + kfree(completed_pages); + return -EIO; +} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c new file mode 100644 index 000000000000..a3c1c5656f8f --- /dev/null +++ b/fs/ntfs/debug.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include "debug.h" + +/** + * __ntfs_warning - output a warning to the syslog + * @function: name of function outputting the warning + * @sb: super block of mounted ntfs filesystem + * @fmt: warning string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs a warning to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... 
are the printf style format string for
+ * the warning message and its format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_warning is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_warning(const char *function, const struct super_block *sb,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+#ifndef DEBUG
+	if (!printk_ratelimit())
+		return;
+#endif
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (sb)
+		pr_warn("(device %s): %s(): %pV\n",
+			sb->s_id, flen ? function : "", &vaf);
+	else
+		pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
+}
+
+/**
+ * __ntfs_error - output an error to the syslog
+ * @function: name of function outputting the error
+ * @sb: super block of mounted ntfs filesystem
+ * @fmt: error string containing format specifications
+ * @...: a variable number of arguments specified in @fmt
+ *
+ * Outputs an error to the syslog for the mounted ntfs filesystem described
+ * by @sb.
+ *
+ * @fmt and the corresponding @... are the printf style format string for
+ * the error message and its format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_error is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_error(const char *function, const struct super_block *sb,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+#ifndef DEBUG
+	if (!printk_ratelimit())
+		return;
+#endif
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (sb)
+		pr_err("(device %s): %s(): %pV\n",
+			sb->s_id, flen ? function : "", &vaf);
+	else
+		pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
+}
+
+#ifdef DEBUG
+
+/* If 1, output debug messages, and if 0, don't. */
+int debug_msgs = 0;
+
+void __ntfs_debug(const char *file, int line, const char *function,
+		const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int flen = 0;
+
+	if (!debug_msgs)
+		return;
+	if (function)
+		flen = strlen(function);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
+	va_end(args);
+}
+
+/* Dump a runlist. Caller has to provide synchronisation for @rl. */
+void ntfs_debug_dump_runlist(const runlist_element *rl)
+{
+	int i;
+	const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED",
+			"LCN_ENOENT ", "LCN_unknown " };
+
+	if (!debug_msgs)
+		return;
+	pr_debug("Dumping runlist (values in hex):\n");
+	if (!rl) {
+		pr_debug("Run list not present.\n");
+		return;
+	}
+	pr_debug("VCN LCN Run length\n");
+	for (i = 0; ; i++) {
+		LCN lcn = (rl + i)->lcn;
+
+		if (lcn < (LCN)0) {
+			int index = -lcn - 1;
+
+			if (index > -LCN_ENOENT - 1)
+				index = 3;
+			pr_debug("%-16Lx %s %-16Lx%s\n",
+					(long long)(rl + i)->vcn, lcn_str[index],
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+					" (runlist end)");
+		} else
+			pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
+					(long long)(rl + i)->vcn,
+					(long long)(rl + i)->lcn,
+					(long long)(rl + i)->length,
+					(rl + i)->length ?
"" : + " (runlist end)"); + if (!(rl + i)->length) + break; + } +} + +#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h new file mode 100644 index 000000000000..6fdef388f129 --- /dev/null +++ b/fs/ntfs/debug.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_DEBUG_H +#define _LINUX_NTFS_DEBUG_H + +#include + +#include "runlist.h" + +#ifdef DEBUG + +extern int debug_msgs; + +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); +/** + * ntfs_debug - write a debug level message to syslog + * @f: a printf format string containing the message + * @...: the variables to substitute into @f + * + * ntfs_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. + */ +#define ntfs_debug(f, a...) \ + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) + +extern void ntfs_debug_dump_runlist(const runlist_element *rl); + +#else /* !DEBUG */ + +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + +#define ntfs_debug_dump_runlist(rl) do {} while (0) + +#endif /* !DEBUG */ + +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) + +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) + +#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c new file mode 100644 index 000000000000..629723a8d712 --- /dev/null +++ b/fs/ntfs/dir.c @@ -0,0 +1,1540 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#include +#include +#include + +#include "dir.h" +#include "aops.h" +#include "attrib.h" +#include "mft.h" +#include "debug.h" +#include "ntfs.h" + +/* + * The little endian Unicode string $I30 as a global constant. + */ +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * @res: return the found file name if necessary (see below) + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). 
+ * + * Note, @uname_len does not include the (optional) terminating NULL character. + * + * Note, we look for a case sensitive match first but we also look for a case + * insensitive match at the same time. If we find a case insensitive match, we + * save that for the case that we don't find an exact match, where we return + * the case insensitive match and setup @res (which we allocate!) with the mft + * reference, the file name type, length and with a copy of the little endian + * Unicode file name itself. If we match a file name which is in the DOS name + * space, we only return the mft reference and file name type in @res. + * ntfs_lookup() then uses this to find the long file name in the inode itself. + * This is to avoid polluting the dcache with short file names. We want them to + * work but we don't care for how quickly one can access them. This also fixes + * the dcache aliasing issues. + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len, ntfs_name **res) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + ntfs_name *name = NULL; + + BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); + BUG_ON(NInoAttr(dir_ni)); + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. 
+ * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 1. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. 
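+		 * Two names that differ only in case compare equal under
+		 * IGNORE_CASE, so only the CASE_SENSITIVE pass can order them
+		 * deterministically within the B+tree.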
+ */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present return -ENOENT, unless + * we have got a matching name cached in name in which case return the + * mft reference associated with it. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + if (name) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). 
Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it2: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). 
+ */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 2. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + unlock_page(page); + ntfs_unmap_page(page); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* + * No child node present, return -ENOENT, unless we have got a matching + * name cached in name in which case return the mft reference + * associated with it. 
+ */ + if (name) { + unlock_page(page); + ntfs_unmap_page(page); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + if (name) { + kfree(name); + *res = NULL; + } + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#if 0 + +// TODO: (AIA) +// The algorithm embedded in this code will be required for the time when we +// want to support adding of entries to directories, where we require correct +// collation of file names in order not to cause corruption of the filesystem. + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + */ +u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + IGNORE_CASE_BOOL ic; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. 
*/ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it: + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + /* No child node, return -ENOENT. */ + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). 
+ */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. 
+ */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive, otherwise it is not. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking). We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it2: + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* No child node, return -ENOENT. */ + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#endif + +/** + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer @ie resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @actor: what to feed the entries to + * + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. + * + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. + * + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. + */ +static inline int ntfs_filldir(ntfs_volume *vol, + ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, + u8 *name, struct dir_context *actor) +{ + unsigned long mref; + int name_len; + unsigned dt_type; + FILE_NAME_TYPE_FLAGS name_type; + + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; + } + name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else + dt_type = DT_REG; + mref = MREF_LE(ie->data.dir.indexed_file); + /* + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. + */ + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " + "0x%lx, DT_%s.", name, name_len, actor->pos, mref, + dt_type == DT_DIR ? "DIR" : "REG"); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; + /* Relock the page but not if we are aborting ->readdir. */ + if (ia_page) + lock_page(ia_page); + return 0; +} + +/* + * We use the same basic approach as the old NTFS driver, i.e. we parse the + * index root entries and then the index allocation entries that are marked + * as in use in the index bitmap. + * + * While this will return the names in random order, this doesn't matter for + * ->readdir; OTOH it results in a faster ->readdir. + * + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS + * parts (e.g. ->f_pos and ->i_size) and it also protects against directory + * modifications. + * + * Locking: - Caller must hold i_mutex on the directory.
+ * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +static int ntfs_readdir(struct file *file, struct dir_context *actor) +{ + s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; + loff_t i_size; + struct inode *bmp_vi, *vdir = file_inode(file); + struct super_block *sb = vdir->i_sb; + ntfs_inode *ndir = NTFS_I(vdir); + ntfs_volume *vol = NTFS_SB(sb); + MFT_RECORD *m; + INDEX_ROOT *ir = NULL; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *name = NULL; + int rc, err, ir_pos, cur_bmp_pos; + struct address_space *ia_mapping, *bmp_mapping; + struct page *bmp_page = NULL, *ia_page = NULL; + u8 *kaddr, *bmp, *index_end; + ntfs_attr_search_ctx *ctx; + + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, actor->pos); + rc = err = 0; + /* Are we at end of dir yet? */ + i_size = i_size_read(vdir); + if (actor->pos >= i_size + vol->mft_record_size) + return 0; + /* Emulate . and .. for all directories. */ + if (!dir_emit_dots(file, actor)) + return 0; + m = NULL; + ctx = NULL; + /* + * Allocate a buffer to store the current name being processed + * converted to the format determined by the current NLS. + */ + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) { + err = -ENOMEM; + goto err_out; + } + /* Are we jumping straight into the index allocation attribute? */ + if (actor->pos >= vol->mft_record_size) + goto skip_index_root; + /* Get hold of the mft record for the directory. */ + m = map_mft_record(ndir); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ndir, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Get the offset into the index root attribute. */ + ir_pos = (s64)actor->pos; + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_error(sb, "Index root attribute missing in directory " + "inode 0x%lx.", vdir->i_ino); + goto err_out; + } + /* + * Copy the index root attribute value to a buffer so that we can put + * the search context and unmap the mft record before calling the + * filldir() callback. We need to do this because of NFSd which calls + * ->lookup() from its filldir() callback, and this causes NTFS to + * deadlock as ntfs_lookup() maps the mft record of the directory and + * we have got it mapped here already. The only solution is for us to + * unmap the mft record here so that a call to ntfs_lookup() is able to + * map the mft record without deadlocking. + */ + rc = le32_to_cpu(ctx->attr->data.resident.value_length); + ir = kmalloc(rc, GFP_NOFS); + if (unlikely(!ir)) { + err = -ENOMEM; + goto err_out; + } + /* Copy the index root value (it has been verified in read_inode). */ + memcpy(ir, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), rc); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ndir); + ctx = NULL; + m = NULL; + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry.
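+ * (A note on the f_pos encoding used throughout this function: positions 0 and 1 are the emulated . and .. entries, positions below vol->mft_record_size are byte offsets into the copied index root, and positions from vol->mft_record_size upwards are byte offsets into the index allocation attribute plus vol->mft_record_size.)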
*/ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index root entry if continuing previous readdir. */ + if (ir_pos > (u8*)ie - (u8*)ir) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ir; + /* Submit the name to the filldir callback. */ + rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); + if (rc) { + kfree(ir); + goto abort; + } + } + /* We are done with the index root and can free the buffer. */ + kfree(ir); + ir = NULL; + /* If there is no index allocation attribute we are finished. */ + if (!NInoIndexAllocPresent(ndir)) + goto EOD; + /* Advance fpos to the beginning of the index allocation. */ + actor->pos = vol->mft_record_size; +skip_index_root: + kaddr = NULL; + prev_ia_pos = -1LL; + /* Get the offset into the index allocation attribute. */ + ia_pos = (s64)actor->pos - vol->mft_record_size; + ia_mapping = vdir->i_mapping; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; + } + bmp_mapping = bmp_vi->i_mapping; + /* Get the starting bitmap bit position and sanity check it. */ + bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { + ntfs_error(sb, "Current index allocation position exceeds " + "index bitmap size."); + goto iput_err_out; + } + /* Get the starting bit position in the current bitmap page. */ + cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); +get_next_bmp_page: + ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", + (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), + (unsigned long long)bmp_pos & + (unsigned long long)((PAGE_SIZE * 8) - 1)); + bmp_page = ntfs_map_page(bmp_mapping, + bmp_pos >> (3 + PAGE_SHIFT)); + if (IS_ERR(bmp_page)) { + ntfs_error(sb, "Reading index bitmap failed."); + err = PTR_ERR(bmp_page); + bmp_page = NULL; + goto iput_err_out; + } + bmp = (u8*)page_address(bmp_page); + /* Find next index block in use. */ + while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { +find_next_index_buffer: + cur_bmp_pos++; + /* + * If we have reached the end of the bitmap page, get the next + * page, and put away the old one. + */ + if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { + ntfs_unmap_page(bmp_page); + bmp_pos += PAGE_SIZE * 8; + cur_bmp_pos = 0; + goto get_next_bmp_page; + } + /* If we have reached the end of the bitmap, we are done. */ + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) + goto unm_EOD; + ia_pos = (bmp_pos + cur_bmp_pos) << + ndir->itype.index.block_size_bits; + } + ntfs_debug("Handling index buffer 0x%llx.", + (unsigned long long)bmp_pos + cur_bmp_pos); + /* If the current index buffer is in the same page we reuse the page. 
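+ * (For example, with 4096-byte index blocks and 4096-byte pages each page caches exactly one index block, so the comparison below only ever saves a remap when the index block size is smaller than PAGE_SIZE.)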
*/ + if ((prev_ia_pos & (s64)PAGE_MASK) != + (ia_pos & (s64)PAGE_MASK)) { + prev_ia_pos = ia_pos; + if (likely(ia_page != NULL)) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + /* + * Map the page cache page containing the current ia_pos, + * reading it from disk if necessary. + */ + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); + if (IS_ERR(ia_page)) { + ntfs_error(sb, "Reading index allocation data failed."); + err = PTR_ERR(ia_page); + ia_page = NULL; + goto err_out; + } + lock_page(ia_page); + kaddr = (u8*)page_address(ia_page); + } + /* Get the current index buffer. */ + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); + /* Bounds checks. */ + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", vdir->i_ino); + goto err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & + ~(s64)(ndir->itype.index.block_size - 1)) >> + ndir->itype.index.vcn_size_bits)) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug. ", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != + ndir->itype.index.block_size)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino, + le32_to_cpu(ia->index.allocated_size) + 0x18, + ndir->itype.index.block_size); + goto err_out; + } + index_end = (u8*)ia + ndir->itype.index.block_size; + if (unlikely(index_end > kaddr + PAGE_SIZE)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + /* The first index entry in this index buffer. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index allocation, offset 0x%llx.", + (unsigned long long)ia_start + + (unsigned long long)((u8*)ie - (u8*)ia)); + /* Bounds checks. 
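+ * (Roughly: the entry must start inside the index block and neither its fixed-size header nor its key may overrun index_end; anything else means the block is corrupt.)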
*/ + if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index block entry if continuing previous readdir. */ + if (ia_pos - ia_start > (u8*)ie - (u8*)ia) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ia + + (sle64_to_cpu(ia->index_block_vcn) << + ndir->itype.index.vcn_size_bits) + + vol->mft_record_size; + /* + * Submit the name to the @filldir callback. Note, + * ntfs_filldir() drops the lock on @ia_page but it retakes it + * before returning, unless a non-zero value is returned in + * which case the page is left unlocked. + */ + rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); + if (rc) { + /* @ia_page is already unlocked in this case. */ + ntfs_unmap_page(ia_page); + ntfs_unmap_page(bmp_page); + iput(bmp_vi); + goto abort; + } + } + goto find_next_index_buffer; +unm_EOD: + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + ntfs_unmap_page(bmp_page); + iput(bmp_vi); +EOD: + /* We are finished, set fpos to EOD. */ + actor->pos = i_size + vol->mft_record_size; +abort: + kfree(name); + return 0; +err_out: + if (bmp_page) { + ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + kfree(ir); + kfree(name); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ndir); + if (!err) + err = -EIO; + ntfs_debug("Failed. Returning error code %i.", -err); + return err; +} + +/** + * ntfs_dir_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit directory size to the page cache limit on architectures where unsigned + * long is 32-bits. This is the most we can do for now without overflowing the + * page cache page index. Doing it this way means we don't run into problems + * because of existing too large directories. It would be better to allow the + * user to read the accessible part of the directory but I doubt very much + * anyone is going to hit this check on a 32-bit architecture, so there is no + * point in adding the extra complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + */ +static int ntfs_dir_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EFBIG; + } + return 0; +} + +#ifdef NTFS_RW + +/** + * ntfs_dir_fsync - sync a directory to disk + * @filp: directory to be synced + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and + * msync system calls. This function is based on file.c::ntfs_file_fsync(). + * + * Write the mft record and all associated extent mft records as well as the + * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. + * + * If @datasync is true, we do not wait on the inode(s) to be written out + * but we always wait on the page cache pages to be written out. + * + * Note: In the past @filp could be NULL so we ignore it as we don't need it + * anyway. 
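+ * + * (Roughly how we get here: an fsync(2) or fdatasync(2) on an open directory descriptor reaches this method via vfs_fsync_range(), which is what supplies @start, @end and @datasync.)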
+ * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. We do write the $BITMAP attribute if it is present + * which is the important one for a directory so things are not too bad. + */ +static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bmp_vi, *vi = filp->f_mapping->host; + int err, ret; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = file_write_and_wait_range(filp, start, end); + if (err) + return err; + inode_lock(vi); + + BUG_ON(!S_ISDIR(vi->i_mode)); + /* If the bitmap attribute inode is in memory sync it, too. */ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + inode_unlock(vi); + return ret; +} + +#endif /* NTFS_RW */ + +WRAP_DIR_ITER(ntfs_readdir) // FIXME! +const struct file_operations ntfs_dir_ops = { + .llseek = generic_file_llseek, /* Seek inside directory. */ + .read = generic_read_dir, /* Return -EISDIR. */ + .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ +#ifdef NTFS_RW + .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .open = ntfs_dir_open, /* Open directory. */ +}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h new file mode 100644 index 000000000000..0e326753df40 --- /dev/null +++ b/fs/ntfs/dir.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2002-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_DIR_H +#define _LINUX_NTFS_DIR_H + +#include "layout.h" +#include "inode.h" +#include "types.h" + +/* + * ntfs_name is used to return the file name to the caller of + * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) + * to be able to deal with dcache aliasing issues. + */ +typedef struct { + MFT_REF mref; + FILE_NAME_TYPE_FLAGS type; + u8 len; + ntfschar name[0]; +} __attribute__ ((__packed__)) ntfs_name; + +/* The little endian Unicode string $I30 as a global constant. */ +extern ntfschar I30[5]; + +extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, + const ntfschar *uname, const int uname_len, ntfs_name **res); + +#endif /* _LINUX_NTFS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h new file mode 100644 index 000000000000..f30c139bf9ae --- /dev/null +++ b/fs/ntfs/endian.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * endian.h - Defines for endianness handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_ENDIAN_H +#define _LINUX_NTFS_ENDIAN_H + +#include <asm/byteorder.h> +#include "types.h" + +/* + * Signed endianness conversion functions.
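+ * (These mirror the regular le16/le32/le64 helpers for the sle16/sle32/sle64 on-disk types, e.g. the sle64_to_cpu(ia->index_block_vcn) calls in dir.c above; the __force casts merely keep sparse quiet about the bitwise types.)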
+ */ + +static inline s16 sle16_to_cpu(sle16 x) +{ + return le16_to_cpu((__force le16)x); +} + +static inline s32 sle32_to_cpu(sle32 x) +{ + return le32_to_cpu((__force le32)x); +} + +static inline s64 sle64_to_cpu(sle64 x) +{ + return le64_to_cpu((__force le64)x); +} + +static inline s16 sle16_to_cpup(sle16 *x) +{ + return le16_to_cpu(*(__force le16*)x); +} + +static inline s32 sle32_to_cpup(sle32 *x) +{ + return le32_to_cpu(*(__force le32*)x); +} + +static inline s64 sle64_to_cpup(sle64 *x) +{ + return le64_to_cpu(*(__force le64*)x); +} + +static inline sle16 cpu_to_sle16(s16 x) +{ + return (__force sle16)cpu_to_le16(x); +} + +static inline sle32 cpu_to_sle32(s32 x) +{ + return (__force sle32)cpu_to_le32(x); +} + +static inline sle64 cpu_to_sle64(s64 x) +{ + return (__force sle64)cpu_to_le64(x); +} + +static inline sle16 cpu_to_sle16p(s16 *x) +{ + return (__force sle16)cpu_to_le16(*x); +} + +static inline sle32 cpu_to_sle32p(s32 *x) +{ + return (__force sle32)cpu_to_le32(*x); +} + +static inline sle64 cpu_to_sle64p(s64 *x) +{ + return (__force sle64)cpu_to_le64(*x); +} + +#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c new file mode 100644 index 000000000000..297c0b9db621 --- /dev/null +++ b/fs/ntfs/file.c @@ -0,0 +1,1997 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. + */ + +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/sched/signal.h> +#include <linux/swap.h> +#include <linux/uio.h> +#include <linux/writeback.h> + +#include <asm/page.h> +#include <linux/uaccess.h> + +#include "attrib.h" +#include "bitmap.h" +#include "inode.h" +#include "debug.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_file_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit file size to the page cache limit on architectures where unsigned long + * is 32-bits. This is the most we can do for now without overflowing the page + * cache page index. Doing it this way means we don't run into problems because + * of existing too large files. It would be better to allow the user to read + * the beginning of the file but I doubt very much anyone is going to hit this + * check on a 32-bit architecture, so there is no point in adding the extra + * complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + * + * After the check passes, just call generic_file_open() to do its work. + */ +static int ntfs_file_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EOVERFLOW; + } + return generic_file_open(vi, filp); +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_extend_initialized - extend the initialized size of an attribute + * @ni: ntfs inode of the attribute to extend + * @new_init_size: requested new initialized size in bytes + * + * Extend the initialized size of an attribute described by the ntfs inode @ni + * to @new_init_size bytes. This involves zeroing any non-sparse space between + * the old initialized size and @new_init_size both in the page cache and on + * disk (if relevant complete pages are already uptodate in the page cache then + * these are simply marked dirty).
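+ * + * (For example, growing the initialized size of a non-resident attribute from 0x1000 to 0x3000 bytes reads in, or zeroes, the page cache pages covering bytes 0x1000 to 0x2fff, marks them dirty, and only then writes the new initialized size back into the attribute record.)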
+ * + * As a side-effect, the file size (vfs inode->i_size) may be incremented as, + * in the resident attribute case, it is tied to the initialized size and, in + * the non-resident attribute case, it may not fall below the initialized size. + * + * Note that if the attribute is resident, we do not need to touch the page + * cache at all. This is because if the page cache page is not uptodate we + * bring it uptodate later, when doing the write to the mft record since we + * then already have the page mapped. And if the page is uptodate, the + * non-initialized region will already have been zeroed when the page was + * brought uptodate and the region may in fact already have been overwritten + * with new data via mmap() based writes, so we cannot just zero it. And since + * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped + * is unspecified, we choose not to do zeroing and thus we do not need to touch + * the page at all. For a more detailed explanation see ntfs_truncate() in + * fs/ntfs/inode.c. + * + * Return 0 on success and -errno on error. In the case that an error is + * encountered it is possible that the initialized size will already have been + * incremented some way towards @new_init_size but it is guaranteed that if + * this is the case, the necessary zeroing will also have happened and that all + * metadata is self-consistent. + * + * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be + * held by the caller. + */ +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +{ + s64 old_init_size; + loff_t old_i_size; + pgoff_t index, end_index; + unsigned long flags; + struct inode *vi = VFS_I(ni); + ntfs_inode *base_ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *kattr; + int err; + u32 attr_len; + + read_lock_irqsave(&ni->size_lock, flags); + old_init_size = ni->initialized_size; + old_i_size = i_size_read(vi); + BUG_ON(new_init_size > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_initialized_size 0x%llx, " + "new_initialized_size 0x%llx, i_size 0x%llx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)old_init_size, + (unsigned long long)new_init_size, old_i_size); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Use goto to reduce indentation and we need the label below anyway. */ + if (NInoNonResident(ni)) + goto do_non_resident_extend; + BUG_ON(old_init_size != old_i_size); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + BUG_ON(old_i_size != (loff_t)attr_len); + /* + * Do the zeroing in the mft record and update the attribute size in + * the mft record.
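+ * (Resident sketch: with an old value length of 100 bytes and a new initialized size of 200, the memset() below zeroes value bytes 100..199 directly in the mapped mft record.)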
+ */ + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + memset(kattr + attr_len, 0, new_init_size - attr_len); + a->data.resident.value_length = cpu_to_le32((u32)new_init_size); + /* Finally, update the sizes in the vfs and ntfs inodes. */ + write_lock_irqsave(&ni->size_lock, flags); + i_size_write(vi, new_init_size); + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto done; +do_non_resident_extend: + /* + * If the new initialized size @new_init_size exceeds the current file + * size (vfs inode->i_size), we need to extend the file size to the + * new initialized size. + */ + if (new_init_size > old_i_size) { + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + BUG_ON(old_i_size != (loff_t) + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_init_size); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* Update the file size in the vfs inode. */ + i_size_write(vi, new_init_size); + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + unmap_mft_record(base_ni); + m = NULL; + } + mapping = vi->i_mapping; + index = old_init_size >> PAGE_SHIFT; + end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + do { + /* + * Read the page. If the page is not present, this will zero + * the uninitialized regions for us. + */ + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto init_err_out; + } + /* + * Update the initialized size in the ntfs inode. This is + * enough to make ntfs_writepage() work. + */ + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; + if (ni->initialized_size > new_init_size) + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Set the page dirty so it gets written out. */ + set_page_dirty(page); + put_page(page); + /* + * Play nice with the vm and the rest of the system. This is + * very much needed as we can potentially be modifying the + * initialised size from a very small value to a really huge + * value, e.g. + * f = open(somefile, O_TRUNC); + * truncate(f, 10GiB); + * seek(f, 10GiB); + * write(f, 1); + * And this would mean we would be marking dirty hundreds of + * thousands of pages or as in the above example more than + * two and a half million pages! + * + * TODO: For sparse pages could optimize this workload by using + * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This + * would be set in read_folio for sparse pages and here we would + * not need to mark dirty any pages which have this bit set. + * The only caveat is that we have to clear the bit everywhere + * where we allocate any clusters that lie in the page or that + * contain the page. + * + * TODO: An even greater optimization would be for us to only + * call read_folio() on pages which are not in sparse regions as + * determined from the runlist. This would greatly reduce the + * number of pages we read and make dirty in the case of sparse + * files. 
+ */ + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (++index < end_index); + read_lock_irqsave(&ni->size_lock, flags); + BUG_ON(ni->initialized_size != new_init_size); + read_unlock_irqrestore(&ni->size_lock, flags); + /* Now bring in sync the initialized_size in the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto init_err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto init_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto init_err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); +done: + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", + (unsigned long long)new_init_size, i_size_read(vi)); + return 0; +init_err_out: + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = old_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, + struct iov_iter *from) +{ + loff_t pos; + s64 end, ll; + ssize_t err; + unsigned long flags; + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%zx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)iocb->ki_pos, + iov_iter_count(from)); + err = generic_write_checks(iocb, from); + if (unlikely(err <= 0)) + goto out; + /* + * All checks have passed. Before we start doing any writing we want + * to abort any totally illegal writes. + */ + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->type != AT_DATA); + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* Only $DATA attributes can be encrypted. */ + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + err = -EACCES; + goto out; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not actually + * compressed. Only on the switch to non-resident does + * compression kick in. This is in contrast to encrypted files + * (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is not " + "implemented yet. Sorry."); + err = -EOPNOTSUPP; + goto out; + } + err = file_remove_privs(file); + if (unlikely(err)) + goto out; + /* + * Our ->update_time method always succeeds thus file_update_time() + * cannot fail either so there is no need to check the return code. + */ + file_update_time(file); + pos = iocb->ki_pos; + /* The first byte after the last cluster being written to. 
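+ * (E.g. with 4096-byte clusters, pos 0x1234 and a 0x100 byte write: + * end = (0x1234 + 0x100 + 0xfff) & ~0xfff = 0x2000.)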
*/ + end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & + ~(u64)vol->cluster_size_mask; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* + * Extend the allocation without changing the data size. + * + * Note we ensure the allocation is big enough to at least + * write some data but we do not require the allocation to be + * complete, i.e. it may be partial. + */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + iov_iter_truncate(from, ll - pos); + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error %d).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (int)-err); + iov_iter_truncate(from, ll - pos); + } else { + if (err != -ENOSPC) + ntfs_error(vi->i_sb, "Cannot perform " + "write to inode " + "0x%lx, attribute " + "type 0x%x, because " + "extending the " + "allocation failed " + "(error %ld).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (long)-err); + else + ntfs_debug("Cannot perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because there is no " + "space left.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + goto out; + } + } + } + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + /* + * Wait for ongoing direct i/o to complete before proceeding. + * New direct i/o cannot start as we hold i_mutex. + */ + inode_dio_wait(vi); + err = ntfs_attr_extend_initialized(ni, pos); + if (unlikely(err < 0)) + ntfs_error(vi->i_sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error %d).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (int)-err); + } +out: + return err; +} + +/** + * __ntfs_grab_cache_pages - obtain a number of locked pages + * @mapping: address space mapping from which to obtain page cache pages + * @index: starting index in @mapping at which to begin obtaining pages + * @nr_pages: number of page cache pages to obtain + * @pages: array of pages in which to return the obtained page cache pages + * @cached_page: allocated but as yet unused page + * + * Obtain @nr_pages locked page cache pages from the mapping @mapping and + * starting at index @index. + * + * If a page is newly created, add it to lru list + * + * Note, the page locks are obtained in ascending page index order.
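+ * + * (If add_to_page_cache_lru() fails with -EEXIST, another task instantiated the page first; the loop below simply retries the lookup for the same index instead of treating it as an error.)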
+ */ +static inline int __ntfs_grab_cache_pages(struct address_space *mapping, + pgoff_t index, const unsigned nr_pages, struct page **pages, + struct page **cached_page) +{ + int err, nr; + + BUG_ON(!nr_pages); + err = nr = 0; + do { + pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | + FGP_ACCESSED); + if (!pages[nr]) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (unlikely(!*cached_page)) { + err = -ENOMEM; + goto err_out; + } + } + err = add_to_page_cache_lru(*cached_page, mapping, + index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (unlikely(err)) { + if (err == -EEXIST) + continue; + goto err_out; + } + pages[nr] = *cached_page; + *cached_page = NULL; + } + index++; + nr++; + } while (nr < nr_pages); +out: + return err; +err_out: + while (nr > 0) { + unlock_page(pages[--nr]); + put_page(pages[nr]); + } + goto out; +} + +static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) +{ + lock_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ, bh); +} + +/** + * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called for non-resident attributes from ntfs_file_buffered_write() + * with i_mutex held on the inode (@pages[0]->mapping->host). There are + * @nr_pages pages in @pages which are locked but not kmap()ped. The source + * data has not yet been copied into the @pages. + * + * Need to fill any holes with actual clusters, allocate buffers if necessary, + * ensure all the buffers are mapped, and bring uptodate any buffers that are + * only partially being written to. + * + * If @nr_pages is greater than one, we are guaranteed that the cluster size is + * greater than PAGE_SIZE, that all pages in @pages are entirely inside + * the same cluster and that they are the entirety of that cluster, and that + * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. + * + * i_size is not to be modified yet. + * + * Return 0 on success or -errno on error. 
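+ * + * (Concrete example of the multi-page case: with 64kiB clusters and 4kiB pages, a write into a sparse cluster arrives here with nr_pages == 16 and @pages covering exactly that one cluster.)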
+ */ +static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, + unsigned nr_pages, s64 pos, size_t bytes) +{ + VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; + LCN lcn; + s64 bh_pos, vcn_len, end, initialized_size; + sector_t lcn_block; + struct folio *folio; + struct inode *vi; + ntfs_inode *ni, *base_ni = NULL; + ntfs_volume *vol; + runlist_element *rl, *rl2; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + ATTR_RECORD *a = NULL; + unsigned long flags; + u32 attr_rec_len = 0; + unsigned blocksize, u; + int err, mp_size; + bool rl_write_locked, was_hole, is_retry; + unsigned char blocksize_bits; + struct { + u8 runlist_merged:1; + u8 mft_attr_mapped:1; + u8 mp_rebuilt:1; + u8 attr_switched:1; + } status = { 0, 0, 0, 0 }; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + BUG_ON(!*pages); + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, pages[0]->index, nr_pages, + (long long)pos, bytes); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + rl_write_locked = false; + rl = NULL; + err = 0; + vcn = lcn = -1; + vcn_len = 0; + lcn_block = -1; + was_hole = false; + cpos = pos >> vol->cluster_size_bits; + end = pos + bytes; + cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; + /* + * Loop over each buffer in each folio. Use goto to + * reduce indentation. + */ + u = 0; +do_next_folio: + folio = page_folio(pages[u]); + bh_pos = folio_pos(folio); + head = folio_buffers(folio); + if (!head) + /* + * create_empty_buffers() will create uptodate/dirty + * buffers if the folio is uptodate/dirty. + */ + head = create_empty_buffers(folio, blocksize, 0); + bh = head; + do { + VCN cdelta; + s64 bh_end; + unsigned bh_cofs; + + /* Clear buffer_new on all buffers to reinitialise state. */ + if (buffer_new(bh)) + clear_buffer_new(bh); + bh_end = bh_pos + blocksize; + bh_cpos = bh_pos >> vol->cluster_size_bits; + bh_cofs = bh_pos & vol->cluster_size_mask; + if (buffer_mapped(bh)) { + /* + * The buffer is already mapped. If it is uptodate, + * ignore it. + */ + if (buffer_uptodate(bh)) + continue; + /* + * The buffer is not uptodate. If the folio is uptodate + * set the buffer uptodate and otherwise ignore it. + */ + if (folio_test_uptodate(folio)) { + set_buffer_uptodate(bh); + continue; + } + /* + * Neither the folio nor the buffer are uptodate. If + * the buffer is only partially being written to, we + * need to read it in before the write, i.e. now. + */ + if ((bh_pos < pos && bh_end > pos) || + (bh_pos < end && bh_end > end)) { + /* + * If the buffer is fully or partially within + * the initialized size, do an actual read. + * Otherwise, simply zero the buffer. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + folio_zero_range(folio, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* Unmapped buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + /* + * If the current buffer is in the same clusters as the map + * cache, there is no need to check the runlist again. 
The + * map cache is made up of @vcn, which is the first cached file + * cluster, @vcn_len which is the number of cached file + * clusters, @lcn is the device cluster corresponding to @vcn, + * and @lcn_block is the block number corresponding to @lcn. + */ + cdelta = bh_cpos - vcn; + if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { +map_buffer_cached: + BUG_ON(lcn < 0); + bh->b_blocknr = lcn_block + + (cdelta << (vol->cluster_size_bits - + blocksize_bits)) + + (bh_cofs >> blocksize_bits); + set_buffer_mapped(bh); + /* + * If the folio is uptodate so is the buffer. If the + * buffer is fully outside the write, we ignore it if + * it was already allocated and we mark it dirty so it + * gets written out if we allocated it. On the other + * hand, if we allocated the buffer but we are not + * marking it dirty we set buffer_new so we can do + * error recovery. + */ + if (folio_test_uptodate(folio)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (unlikely(was_hole)) { + /* We allocated the buffer. */ + clean_bdev_bh_alias(bh); + if (bh_end <= pos || bh_pos >= end) + mark_buffer_dirty(bh); + else + set_buffer_new(bh); + } + continue; + } + /* Page is _not_ uptodate. */ + if (likely(!was_hole)) { + /* + * Buffer was already allocated. If it is not + * uptodate and is only partially being written + * to, we need to read it in before the write, + * i.e. now. + */ + if (!buffer_uptodate(bh) && bh_pos < end && + bh_end > pos && + (bh_pos < pos || + bh_end > end)) { + /* + * If the buffer is fully or partially + * within the initialized size, do an + * actual read. Otherwise, simply zero + * the buffer. + */ + read_lock_irqsave(&ni->size_lock, + flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, + flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + folio_zero_range(folio, + bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* We allocated the buffer. */ + clean_bdev_bh_alias(bh); + /* + * If the buffer is fully outside the write, zero it, + * set it uptodate, and mark it dirty so it gets + * written out. If it is partially being written to, + * zero region surrounding the write but leave it to + * commit write to do anything else. Finally, if the + * buffer is fully being overwritten, do nothing. + */ + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) { + folio_zero_range(folio, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + mark_buffer_dirty(bh); + continue; + } + set_buffer_new(bh); + if (!buffer_uptodate(bh) && + (bh_pos < pos || bh_end > end)) { + u8 *kaddr; + unsigned pofs; + + kaddr = kmap_local_folio(folio, 0); + if (bh_pos < pos) { + pofs = bh_pos & ~PAGE_MASK; + memset(kaddr + pofs, 0, pos - bh_pos); + } + if (bh_end > end) { + pofs = end & ~PAGE_MASK; + memset(kaddr + pofs, 0, bh_end - end); + } + kunmap_local(kaddr); + flush_dcache_folio(folio); + } + continue; + } + /* + * Slow path: this is the first buffer in the cluster. If it + * is outside allocated size and is not uptodate, zero it and + * set it uptodate. 
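+ * (Note the local variable initialized_size is reused below to hold the + * allocated size for this one check.)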
+ */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos > initialized_size) { + if (folio_test_uptodate(folio)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + folio_zero_range(folio, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + is_retry = false; + if (!rl) { + down_read(&ni->runlist.lock); +retry_remap: + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target cluster. */ + while (rl->length && rl[1].vcn <= bh_cpos) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); + if (likely(lcn >= 0)) { + /* + * Successful remap, setup the map cache and + * use that to deal with the buffer. + */ + was_hole = false; + vcn = bh_cpos; + vcn_len = rl[1].vcn - vcn; + lcn_block = lcn << (vol->cluster_size_bits - + blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters touched + * by the write is smaller or equal to the + * number of cached clusters, unlock the + * runlist as the map cache will be used from + * now on. + */ + if (likely(vcn + vcn_len >= cend)) { + if (rl_write_locked) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else + up_read(&ni->runlist.lock); + rl = NULL; + } + goto map_buffer_cached; + } + } else + lcn = LCN_RL_NOT_MAPPED; + /* + * If it is not a hole and not out of bounds, the runlist is + * probably unmapped so try to map it now. + */ + if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { + if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { + /* Attempt to map runlist. */ + if (!rl_write_locked) { + /* + * We need the runlist locked for + * writing, so if it is locked for + * reading relock it now and retry in + * case it changed whilst we dropped + * the lock. + */ + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + err = ntfs_map_runlist_nolock(ni, bh_cpos, + NULL); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + /* + * If @vcn is out of bounds, pretend @lcn is + * LCN_ENOENT. As long as the buffer is out + * of bounds this will work fine. + */ + if (err == -ENOENT) { + lcn = LCN_ENOENT; + err = 0; + goto rl_not_mapped_enoent; + } + } else + err = -EIO; + /* Failed to map the buffer, even after retrying. */ + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "vcn offset 0x%x, because its " + "location on disk could not be " + "determined%s (error code %i).", + ni->mft_no, ni->type, + (unsigned long long)bh_cpos, + (unsigned)bh_pos & + vol->cluster_size_mask, + is_retry ? " even after retrying" : "", + err); + break; + } +rl_not_mapped_enoent: + /* + * The buffer is in a hole or out of bounds. We need to fill + * the hole, unless the buffer is in a cluster which is not + * touched by the write, in which case we just leave the buffer + * unmapped. This can only happen when the cluster size is + * less than the page cache size. + */ + if (unlikely(vol->cluster_size < PAGE_SIZE)) { + bh_cend = (bh_end + vol->cluster_size - 1) >> + vol->cluster_size_bits; + if ((bh_cend <= cpos || bh_cpos >= cend)) { + bh->b_blocknr = -1; + /* + * If the buffer is uptodate we skip it. If it + * is not but the folio is uptodate, we can set + * the buffer uptodate. If the folio is not + * uptodate, we can clear the buffer and set it + * uptodate. 
Whether this is worthwhile is + * debatable and this could be removed. + */ + if (folio_test_uptodate(folio)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + folio_zero_range(folio, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + } + /* + * Out of bounds buffer is invalid if it was not really out of + * bounds. + */ + BUG_ON(lcn != LCN_HOLE); + /* + * We need the runlist locked for writing, so if it is locked + * for reading relock it now and retry in case it changed + * whilst we dropped the lock. + */ + BUG_ON(!rl); + if (!rl_write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + /* Find the previous last allocated cluster. */ + BUG_ON(rl->lcn != LCN_HOLE); + lcn = -1; + rl2 = rl; + while (--rl2 >= ni->runlist.rl) { + if (rl2->lcn >= 0) { + lcn = rl2->lcn + rl2->length; + break; + } + } + rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, + false); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + ntfs_debug("Failed to allocate cluster, error code %i.", + err); + break; + } + lcn = rl2->lcn; + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + break; + } + ni->runlist.rl = rl; + status.runlist_merged = 1; + ntfs_debug("Allocated cluster, lcn 0x%llx.", + (unsigned long long)lcn); + /* Map and lock the mft record and get the attribute record. */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + break; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + unmap_mft_record(base_ni); + break; + } + status.mft_attr_mapped = 1; + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + break; + } + m = ctx->mrec; + a = ctx->attr; + /* + * Find the runlist element with which the attribute extent + * starts. Note, we cannot use the _attr_ version because we + * have mapped the mft record. That is ok because we know the + * runlist fragment must be mapped already to have ever gotten + * here, so we can just use the _rl_ version. + */ + vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + /* + * If @highest_vcn is zero, calculate the real highest_vcn + * (which can really be zero). + */ + if (!highest_vcn) + highest_vcn = (sle64_to_cpu( + a->data.non_resident.allocated_size) >> + vol->cluster_size_bits) - 1; + /* + * Determine the size of the mapping pairs array for the new + * extent, i.e. the old extent with the hole filled. + */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, + highest_vcn); + if (unlikely(mp_size <= 0)) { + if (!(err = mp_size)) + err = -EIO; + ntfs_debug("Failed to get size for mapping pairs " + "array, error code %i.", err); + break; + } + /* + * Resize the attribute record to fit the new mapping pairs + * array. 
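+ * (Mapping pairs sketch: each run is encoded as a header byte whose low + * nibble is the byte size of the run length field and whose high nibble + * is the byte size of the lcn delta that follows, e.g. 21 30 42 05 + * describes a run of 0x30 clusters at an lcn delta of 0x542; + * ntfs_get_size_for_mapping_pairs() above sized exactly such an encoding.)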
+ */ + attr_rec_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by using the current attribute + // and fill it with as much of the mapping pairs + // array as possible. Then loop over each attribute + // extent rewriting the mapping pairs arrays as we go + // along and if when we reach the end we have not + // enough space, try to resize the last attribute + // extent and if even that fails, add a new attribute + // extent. + // We could also try to resize at each step in the hope + // that we will not need to rewrite every single extent. + // Note, we may need to decompress some extents to fill + // the runlist as we are walking the extents... + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + break; + } + status.mp_rebuilt = 1; + /* + * Generate the mapping pairs array directly into the attribute + * record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, vcn, highest_vcn, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " + "attribute type 0x%x, because building " + "the mapping pairs failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + break; + } + /* Update the highest_vcn but only if it was not set. */ + if (unlikely(!a->data.non_resident.highest_vcn)) + a->data.non_resident.highest_vcn = + cpu_to_sle64(highest_vcn); + /* + * If the attribute is sparse/compressed, update the compressed + * size in the ntfs_inode structure and the attribute record. + */ + if (likely(NInoSparse(ni) || NInoCompressed(ni))) { + /* + * If we are not in the first attribute extent, switch + * to it, but first ensure the changes will make it to + * disk later. + */ + if (a->data.non_resident.lowest_vcn) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, + ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + status.attr_switched = 1; + break; + } + /* @m is not used any more so do not set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + /* Successfully filled the hole. */ + status.runlist_merged = 0; + status.mft_attr_mapped = 0; + status.mp_rebuilt = 0; + /* Setup the map cache and use that to deal with the buffer. */ + was_hole = true; + vcn = bh_cpos; + vcn_len = 1; + lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters in the @pages is smaller + * or equal to the number of cached clusters, unlock the + * runlist as the map cache will be used from now on.
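+ * (At this point @vcn, @vcn_len and @lcn_block describe just the single + * freshly allocated cluster, so the remaining buffers in that cluster + * take the map_buffer_cached fast path without touching the runlist.)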
+ */ + if (likely(vcn + vcn_len >= cend)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + rl = NULL; + } + goto map_buffer_cached; + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* If there are no errors, do the next page. */ + if (likely(!err && ++u < nr_pages)) + goto do_next_folio; + /* If there are no errors, release the runlist lock if we took it. */ + if (likely(!err)) { + if (unlikely(rl_write_locked)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else if (unlikely(rl)) + up_read(&ni->runlist.lock); + rl = NULL; + } + /* If we issued read requests, let them complete. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + while (wait_bh > wait) { + bh = *--wait_bh; + wait_on_buffer(bh); + if (likely(buffer_uptodate(bh))) { + folio = bh->b_folio; + bh_pos = folio_pos(folio) + bh_offset(bh); + /* + * If the buffer overflows the initialized size, need + * to zero the overflowing region. + */ + if (unlikely(bh_pos + blocksize > initialized_size)) { + int ofs = 0; + + if (likely(bh_pos < initialized_size)) + ofs = initialized_size - bh_pos; + folio_zero_segment(folio, bh_offset(bh) + ofs, + blocksize); + } + } else /* if (unlikely(!buffer_uptodate(bh))) */ + err = -EIO; + } + if (likely(!err)) { + /* Clear buffer_new on all buffers. */ + u = 0; + do { + bh = head = page_buffers(pages[u]); + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u < nr_pages); + ntfs_debug("Done."); + return err; + } + if (status.attr_switched) { + /* Get back to the attribute extent we modified. */ + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find required " + "attribute extent of attribute in " + "error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* + * The only thing that is now wrong is the compressed + * size of the base attribute extent which chkdsk + * should be able to fix. + */ + NVolSetErrors(vol); + } else { + m = ctx->mrec; + a = ctx->attr; + status.attr_switched = 0; + } + } + /* + * If the runlist has been modified, need to restore it by punching a + * hole into it and we then need to deallocate the on-disk cluster as + * well. Note, we only modify the runlist if we are able to generate a + * new mapping pairs array, i.e. only when the mapped attribute extent + * is not switched. + */ + if (status.runlist_merged && !status.attr_switched) { + BUG_ON(!rl_write_locked); + /* Make the file cluster we allocated sparse in the runlist. */ + if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { + ntfs_error(vol->sb, "Failed to punch hole into " + "attribute runlist in error code " + "path. Run chkdsk to recover the " + "lost cluster."); + NVolSetErrors(vol); + } else /* if (success) */ { + status.runlist_merged = 0; + /* + * Deallocate the on-disk cluster we allocated but only + * if we succeeded in punching its vcn out of the + * runlist. + */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. 
Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + } + } + /* + * Resize the attribute record to its old size and rebuild the mapping + * pairs array. Note, we only can do this if the runlist has been + * restored to its old state which also implies that the mapped + * attribute extent is not switched. + */ + if (status.mp_rebuilt && !status.runlist_merged) { + if (ntfs_attr_record_resize(m, a, attr_rec_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), attr_rec_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), ni->runlist.rl, + vcn, highest_vcn, NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } + /* Release the mft record and the attribute. */ + if (status.mft_attr_mapped) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } + /* Release the runlist lock. */ + if (rl_write_locked) + up_write(&ni->runlist.lock); + else if (rl) + up_read(&ni->runlist.lock); + /* + * Zero out any newly allocated blocks to avoid exposing stale data. + * If BH_New is set, we know that the block was newly allocated above + * and that it has not been fully zeroed and marked dirty yet. + */ + nr_pages = u; + u = 0; + end = bh_cpos << vol->cluster_size_bits; + do { + folio = page_folio(pages[u]); + bh = head = folio_buffers(folio); + do { + if (u == nr_pages && + folio_pos(folio) + bh_offset(bh) >= end) + break; + if (!buffer_new(bh)) + continue; + clear_buffer_new(bh); + if (!buffer_uptodate(bh)) { + if (folio_test_uptodate(folio)) + set_buffer_uptodate(bh); + else { + folio_zero_range(folio, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + mark_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u <= nr_pages); + ntfs_error(vol->sb, "Failed. Returning error code %i.", err); + return err; +} + +static inline void ntfs_flush_dcache_pages(struct page **pages, + unsigned nr_pages) +{ + BUG_ON(!nr_pages); + /* + * Warning: Do not do the decrement at the same time as the call to + * flush_dcache_page() because it is a NULL macro on i386 and hence the + * decrement never happens so the loop never terminates. + */ + do { + --nr_pages; + flush_dcache_page(pages[nr_pages]); + } while (nr_pages > 0); +} + +/** + * ntfs_commit_pages_after_non_resident_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * See description of ntfs_commit_pages_after_write(), below. 
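+ *
+ * As an illustrative sketch (not driver code; the helper name is made up),
+ * the per-buffer rule implemented below reduces to this overlap test, where
+ * [bh_pos, bh_pos + blocksize) is the buffer and [pos, end) the written
+ * byte range:
+ *
+ *	static bool bh_inside_write(s64 bh_pos, unsigned blocksize,
+ *			s64 pos, s64 end)
+ *	{
+ *		return bh_pos + blocksize > pos && bh_pos < end;
+ *	}
+ *
+ * Buffers for which this holds are marked uptodate and dirty; any other
+ * buffer that is not already uptodate prevents the page from being set
+ * uptodate.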
+ */
+static inline int ntfs_commit_pages_after_non_resident_write(
+		struct page **pages, const unsigned nr_pages,
+		s64 pos, size_t bytes)
+{
+	s64 end, initialized_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	struct buffer_head *bh, *head;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	unsigned long flags;
+	unsigned blocksize, u;
+	int err;
+
+	vi = pages[0]->mapping->host;
+	ni = NTFS_I(vi);
+	blocksize = vi->i_sb->s_blocksize;
+	end = pos + bytes;
+	u = 0;
+	do {
+		s64 bh_pos;
+		struct page *page;
+		bool partial;
+
+		page = pages[u];
+		bh_pos = (s64)page->index << PAGE_SHIFT;
+		bh = head = page_buffers(page);
+		partial = false;
+		do {
+			s64 bh_end;
+
+			bh_end = bh_pos + blocksize;
+			if (bh_end <= pos || bh_pos >= end) {
+				if (!buffer_uptodate(bh))
+					partial = true;
+			} else {
+				set_buffer_uptodate(bh);
+				mark_buffer_dirty(bh);
+			}
+		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+		/*
+		 * If all buffers are now uptodate but the page is not, set the
+		 * page uptodate.
+		 */
+		if (!partial && !PageUptodate(page))
+			SetPageUptodate(page);
+	} while (++u < nr_pages);
+	/*
+	 * Finally, if we do not need to update initialized_size or i_size we
+	 * are finished.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (end <= initialized_size) {
+		ntfs_debug("Done.");
+		return 0;
+	}
+	/*
+	 * Update initialized_size/i_size as appropriate, both in the inode and
+	 * the mft record.
+	 */
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	BUG_ON(!NInoNonResident(ni));
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	a = ctx->attr;
+	BUG_ON(!a->non_resident);
+	write_lock_irqsave(&ni->size_lock, flags);
+	BUG_ON(end > ni->allocated_size);
+	ni->initialized_size = end;
+	a->data.non_resident.initialized_size = cpu_to_sle64(end);
+	if (end > i_size_read(vi)) {
+		i_size_write(vi, end);
+		a->data.non_resident.data_size =
+				a->data.non_resident.initialized_size;
+	}
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	ntfs_debug("Done.");
+	return 0;
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
+			"code %i).", err);
+	if (err != -ENOMEM)
+		NVolSetErrors(ni->vol);
+	return err;
+}
+
+/**
+ * ntfs_commit_pages_after_write - commit the received data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * This is called from ntfs_perform_write() with i_mutex held on the inode
+ * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
+ * locked but not kmap()ped. The source data has already been copied into
+ * @pages.
ntfs_prepare_pages_for_non_resident_write() has been called before + * the data was copied (for non-resident attributes only) and it returned + * success. + * + * Need to set uptodate and mark dirty all buffers within the boundary of the + * write. If all buffers in a page are uptodate we set the page uptodate, too. + * + * Setting the buffers dirty ensures that they get written out later when + * ntfs_writepage() is invoked by the VM. + * + * Finally, we need to update i_size and initialized_size as appropriate both + * in the inode and the mft record. + * + * This is modelled after fs/buffer.c::generic_commit_write(), which marks + * buffers uptodate and dirty, sets the page uptodate if all buffers in the + * page are uptodate, and updates i_size if the end of io is beyond i_size. In + * that case, it also marks the inode dirty. + * + * If things have gone as outlined in + * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page + * content modifications here for non-resident attributes. For resident + * attributes we need to do the uptodate bringing here which we combine with + * the copying into the mft record which means we save one atomic kmap. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_commit_pages_after_write(struct page **pages, + const unsigned nr_pages, s64 pos, size_t bytes) +{ + s64 end, initialized_size; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct page *page; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + char *kattr, *kaddr; + unsigned long flags; + u32 attr_len; + int err; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + page = pages[0]; + BUG_ON(!page); + vi = page->mapping->host; + ni = NTFS_I(vi); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, page->index, nr_pages, + (long long)pos, bytes); + if (NInoNonResident(ni)) + return ntfs_commit_pages_after_non_resident_write(pages, + nr_pages, pos, bytes); + BUG_ON(nr_pages > 1); + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * sparse. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + BUG_ON(NInoNonResident(ni)); + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + i_size = i_size_read(vi); + BUG_ON(attr_len != i_size); + BUG_ON(pos > attr_len); + end = pos + bytes; + BUG_ON(end > le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset)); + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + kaddr = kmap_atomic(page); + /* Copy the received data from the page to the mft record. */ + memcpy(kattr + pos, kaddr + pos, bytes); + /* Update the attribute length if necessary. */ + if (end > attr_len) { + attr_len = end; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* + * If the page is not uptodate, bring the out of bounds area(s) + * uptodate by copying data from the mft record to the page. 
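+	 *
+	 * A worked example (values assumed for illustration): for a write
+	 * of [pos, end) into a resident value of attr_len bytes, [0, pos)
+	 * and [end, attr_len) are copied in from the mft record below and
+	 * [attr_len, PAGE_SIZE) is zeroed, leaving the whole page valid.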
+ */ + if (!PageUptodate(page)) { + if (pos > 0) + memcpy(kaddr, kattr, pos); + if (end < attr_len) + memcpy(kaddr + end, kattr + end, attr_len - end); + /* Zero the region outside the end of the attribute value. */ + memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); + flush_dcache_page(page); + SetPageUptodate(page); + } + kunmap_atomic(kaddr); + /* Update initialized_size/i_size if necessary. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + BUG_ON(end > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + BUG_ON(initialized_size != i_size); + if (end > initialized_size) { + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = end; + i_size_write(vi, end); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory required to " + "commit the write."); + if (PageUptodate(page)) { + ntfs_warning(vi->i_sb, "Page is uptodate, setting " + "dirty so the write will be retried " + "later on by the VM."); + /* + * Put the page on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + __set_page_dirty_nobuffers(page); + err = 0; + } else + ntfs_error(vi->i_sb, "Page is not uptodate. Written " + "data has been lost."); + } else { + ntfs_error(vi->i_sb, "Resident attribute commit write failed " + "with error %i.", err); + NVolSetErrors(ni->vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, + unsigned ofs, struct iov_iter *i, size_t bytes) +{ + struct page **last_page = pages + nr_pages; + size_t total = 0; + unsigned len, copied; + + do { + len = PAGE_SIZE - ofs; + if (len > bytes) + len = bytes; + copied = copy_page_from_iter_atomic(*pages, ofs, len, i); + total += copied; + bytes -= copied; + if (!bytes) + break; + if (copied < len) + goto err; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err: + /* Zero the rest of the target like __copy_from_user(). 
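+	 * A worked example (numbers assumed): if the fault hits a page
+	 * after only copied == 100 bytes were transferred, zero_user()
+	 * below clears [100, PAGE_SIZE) of that page, capped by the bytes
+	 * still outstanding, and then the whole of each remaining page, so
+	 * no stale page cache contents leak into the written range.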
*/ + len = PAGE_SIZE - copied; + do { + if (len > bytes) + len = bytes; + zero_user(*pages, copied, len); + bytes -= len; + copied = 0; + len = PAGE_SIZE; + } while (++pages < last_page); + goto out; +} + +/** + * ntfs_perform_write - perform buffered write to a file + * @file: file to write to + * @i: iov_iter with data to write + * @pos: byte offset in file at which to begin writing to + */ +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, + loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *vi = mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; + struct page *cached_page = NULL; + VCN last_vcn; + LCN lcn; + size_t bytes; + ssize_t status, written = 0; + unsigned nr_pages; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%lx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, + (unsigned long)iov_iter_count(i)); + /* + * If a previous ntfs_truncate() failed, repeat it and abort if it + * fails again. + */ + if (unlikely(NInoTruncateFailed(ni))) { + int err; + + inode_dio_wait(vi); + err = ntfs_truncate(vi); + if (err || NInoTruncateFailed(ni)) { + if (!err) + err = -EIO; + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "ntfs_truncate() failed (error code " + "%i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + return err; + } + } + /* + * Determine the number of pages per cluster for non-resident + * attributes. + */ + nr_pages = 1; + if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_SHIFT; + last_vcn = -1; + do { + VCN vcn; + pgoff_t start_idx; + unsigned ofs, do_pages, u; + size_t copied; + + start_idx = pos >> PAGE_SHIFT; + ofs = pos & ~PAGE_MASK; + bytes = PAGE_SIZE - ofs; + do_pages = 1; + if (nr_pages > 1) { + vcn = pos >> vol->cluster_size_bits; + if (vcn != last_vcn) { + last_vcn = vcn; + /* + * Get the lcn of the vcn the write is in. If + * it is a hole, need to lock down all pages in + * the cluster. + */ + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> + vol->cluster_size_bits, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + if (lcn == LCN_ENOMEM) + status = -ENOMEM; + else { + status = -EIO; + ntfs_error(vol->sb, "Cannot " + "perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because the attribute " + "is corrupt.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + break; + } + if (lcn == LCN_HOLE) { + start_idx = (pos & ~(s64) + vol->cluster_size_mask) + >> PAGE_SHIFT; + bytes = vol->cluster_size - (pos & + vol->cluster_size_mask); + do_pages = nr_pages; + } + } + } + if (bytes > iov_iter_count(i)) + bytes = iov_iter_count(i); +again: + /* + * Bring in the user page(s) that we will copy from _first_. + * Otherwise there is a nasty deadlock on copying from the same + * page(s) as we are writing to, without it/them being marked + * up-to-date. Note, at present there is nothing to stop the + * pages being swapped out between us bringing them into memory + * and doing the actual copying. + */ + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + status = -EFAULT; + break; + } + /* Get and lock @do_pages starting at index @start_idx. 
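+		 * As a worked example (sizes assumed): with 4 KiB pages and
+		 * 64 KiB clusters, a write landing in a hole (lcn ==
+		 * LCN_HOLE above) rounds start_idx down to the cluster
+		 * boundary and sets do_pages to nr_pages == 16, so all
+		 * sixteen pages of the cluster are grabbed and locked
+		 * together; otherwise a single page suffices.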
 */
+		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
+				pages, &cached_page);
+		if (unlikely(status))
+			break;
+		/*
+		 * For non-resident attributes, we need to fill any holes with
+		 * actual clusters and ensure all buffers are mapped. We also
+		 * need to bring uptodate any buffers that are only partially
+		 * being written to.
+		 */
+		if (NInoNonResident(ni)) {
+			status = ntfs_prepare_pages_for_non_resident_write(
+					pages, do_pages, pos, bytes);
+			if (unlikely(status)) {
+				do {
+					unlock_page(pages[--do_pages]);
+					put_page(pages[do_pages]);
+				} while (do_pages);
+				break;
+			}
+		}
+		u = (pos >> PAGE_SHIFT) - pages[0]->index;
+		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+				i, bytes);
+		ntfs_flush_dcache_pages(pages + u, do_pages - u);
+		status = 0;
+		if (likely(copied == bytes)) {
+			status = ntfs_commit_pages_after_write(pages, do_pages,
+					pos, bytes);
+		}
+		do {
+			unlock_page(pages[--do_pages]);
+			put_page(pages[do_pages]);
+		} while (do_pages);
+		if (unlikely(status < 0)) {
+			iov_iter_revert(i, copied);
+			break;
+		}
+		cond_resched();
+		if (unlikely(copied < bytes)) {
+			iov_iter_revert(i, copied);
+			if (copied)
+				bytes = copied;
+			else if (bytes > PAGE_SIZE - ofs)
+				bytes = PAGE_SIZE - ofs;
+			goto again;
+		}
+		pos += copied;
+		written += copied;
+		balance_dirty_pages_ratelimited(mapping);
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
+	} while (iov_iter_count(i));
+	if (cached_page)
+		put_page(cached_page);
+	ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
+			written ? "written" : "status", (unsigned long)written,
+			(long)status);
+	return written ? written : status;
+}
+
+/**
+ * ntfs_file_write_iter - write data to an open file
+ * @iocb:	IO state structure
+ * @from:	iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
+ */
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *vi = file_inode(file);
+	ssize_t written = 0;
+	ssize_t err;
+
+	inode_lock(vi);
+	err = ntfs_prepare_file_for_write(iocb, from);
+	if (iov_iter_count(from) && !err)
+		written = ntfs_perform_write(file, from, iocb->ki_pos);
+	inode_unlock(vi);
+	iocb->ki_pos += written;
+	if (likely(written > 0))
+		written = generic_write_sync(iocb, written);
+	return written ? written : err;
+}
+
+/**
+ * ntfs_file_fsync - sync a file to disk
+ * @filp:	file to be synced
+ * @datasync:	if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
+ * system calls. This function is inspired by fs/buffer.c::file_fsync().
+ *
+ * If @datasync is false, write the mft record and all associated extent mft
+ * records as well as the $DATA attribute and then sync the block device.
+ *
+ * If @datasync is true and the attribute is non-resident, we skip the writing
+ * of the mft record and all associated extent mft records (this might still
+ * happen due to the write_inode_now() call).
+ *
+ * Also, if @datasync is true, we do not wait on the inode to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Locking: Caller must hold i_mutex on the inode.
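+ *
+ * To illustrate the @datasync distinction (a restatement of the logic below,
+ * not additional behaviour):
+ *
+ *	fsync(fd);	- @datasync == 0: mft record(s) written via
+ *			  __ntfs_write_inode(), then the block device is synced
+ *	fdatasync(fd);	- @datasync == 1: the mft record write is skipped for
+ *			  non-resident attributes; data pages are still waited on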
+ * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. + */ +static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *vi = filp->f_mapping->host; + int err, ret = 0; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = file_write_and_wait_range(filp, start, end); + if (err) + return err; + inode_lock(vi); + + BUG_ON(S_ISDIR(vi->i_mode)); + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + /* + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). + */ + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + inode_unlock(vi); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_file_ops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, +#ifdef NTFS_RW + .write_iter = ntfs_file_write_iter, + .fsync = ntfs_file_fsync, +#endif /* NTFS_RW */ + .mmap = generic_file_mmap, + .open = ntfs_file_open, + .splice_read = filemap_splice_read, +}; + +const struct inode_operations ntfs_file_inode_ops = { +#ifdef NTFS_RW + .setattr = ntfs_setattr, +#endif /* NTFS_RW */ +}; + +const struct file_operations ntfs_empty_file_ops = {}; + +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c new file mode 100644 index 000000000000..d46c2c03a032 --- /dev/null +++ b/fs/ntfs/index.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#include + +#include "aops.h" +#include "collate.h" +#include "debug.h" +#include "index.h" +#include "ntfs.h" + +/** + * ntfs_index_ctx_get - allocate and initialize a new index context + * @idx_ni: ntfs index inode with which to initialize the context + * + * Allocate a new index context, initialize it with @idx_ni and return it. + * Return NULL if allocation failed. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +{ + ntfs_index_context *ictx; + + ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; + return ictx; +} + +/** + * ntfs_index_ctx_put - release an index context + * @ictx: index context to free + * + * Release the index context @ictx, releasing all associated resources. + * + * Locking: Caller must hold i_mutex on the index inode. 
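+ *
+ * A minimal usage sketch of the index context API (illustration only; the
+ * index inode @idx_ni, the key and process() are assumed to come from the
+ * caller):
+ *
+ *	ntfs_index_context *ictx = ntfs_index_ctx_get(idx_ni);
+ *	int err;
+ *
+ *	if (!ictx)
+ *		return -ENOMEM;
+ *	err = ntfs_index_lookup(key, key_len, ictx);
+ *	if (!err)
+ *		process(ictx->entry, ictx->data, ictx->data_len);
+ *	ntfs_index_ctx_put(ictx);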
+ */ +void ntfs_index_ctx_put(ntfs_index_context *ictx) +{ + if (ictx->entry) { + if (ictx->is_in_root) { + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + if (ictx->base_ni) + unmap_mft_record(ictx->base_ni); + } else { + struct page *page = ictx->page; + if (page) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + ntfs_unmap_page(page); + } + } + } + kmem_cache_free(ntfs_index_ctx_cache, ictx); + return; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: [IN] key for which to search in the index + * @key_len: [IN] length of @key in bytes + * @ictx: [IN/OUT] context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @ictx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @ictx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @ictx is setup to + * describe the index entry containing the matching @key. @ictx->entry is the + * index entry and @ictx->data and @ictx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and @ictx is + * setup to describe the index entry whose key collates immediately after the + * search @key, i.e. this is the position in the index at which an index entry + * with a key of @key would need to be inserted. + * + * If an error occurs return the negative error code and @ictx is left + * untouched. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + * + * Locking: - Caller must hold i_mutex on the index inode. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx) +{ + VCN vcn, old_vcn; + ntfs_inode *idx_ni = ictx->idx_ni; + ntfs_volume *vol = idx_ni->vol; + struct super_block *sb = vol->sb; + ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end, *kaddr; + ntfs_attr_search_ctx *actx; + struct address_space *ia_mapping; + struct page *page; + int rc, err = 0; + + ntfs_debug("Entering."); + BUG_ON(!NInoAttr(idx_ni)); + BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); + BUG_ON(idx_ni->nr_extents != -1); + BUG_ON(!base_ni); + BUG_ON(!key); + BUG_ON(key_len <= 0); + if (!ntfs_is_collation_rule_supported( + idx_ni->itype.index.collation_rule)) { + ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " + "Aborting lookup.", le32_to_cpu( + idx_ni->itype.index.collation_rule)); + return -EOPNOTSUPP; + } + /* Get hold of the mft record for the index inode. 
	 */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+				-PTR_ERR(m));
+		return PTR_ERR(m);
+	}
+	actx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!actx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/* Find the index root attribute in the mft record. */
+	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, actx);
+	if (unlikely(err)) {
+		if (err == -ENOENT) {
+			ntfs_error(sb, "Index root attribute missing in inode "
+					"0x%lx.", idx_ni->mft_no);
+			err = -EIO;
+		}
+		goto err_out;
+	}
+	/* Get to the index root value (it has been verified in read_inode). */
+	ir = (INDEX_ROOT*)((u8*)actx->attr +
+			le16_to_cpu(actx->attr->data.resident.value_offset));
+	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+	/* The first index entry. */
+	ie = (INDEX_ENTRY*)((u8*)&ir->index +
+			le32_to_cpu(ir->index.entries_offset));
+	/*
+	 * Loop until we exceed valid memory (corruption case) or until we
+	 * reach the last entry.
+	 */
+	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+		/* Bounds checks. */
+		if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->length) > index_end)
+			goto idx_err_out;
+		/*
+		 * The last entry cannot contain a key. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Further bounds checks. */
+		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+				le16_to_cpu(ie->key_length) >
+				le16_to_cpu(ie->data.vi.data_offset) ||
+				(u32)le16_to_cpu(ie->data.vi.data_offset) +
+				le16_to_cpu(ie->data.vi.data_length) >
+				le16_to_cpu(ie->length))
+			goto idx_err_out;
+		/* If the keys match perfectly, we set up @ictx and return 0. */
+		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+				&ie->key, key_len)) {
+ir_done:
+			ictx->is_in_root = true;
+			ictx->ir = ir;
+			ictx->actx = actx;
+			ictx->base_ni = base_ni;
+			ictx->ia = NULL;
+			ictx->page = NULL;
+done:
+			ictx->entry = ie;
+			ictx->data = (u8*)ie +
+					le16_to_cpu(ie->data.vi.data_offset);
+			ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
+			ntfs_debug("Done.");
+			return err;
+		}
+		/*
+		 * Not a perfect match, need to do full-blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+				key_len, &ie->key, le16_to_cpu(ie->key_length));
+		/*
+		 * If @key collates before the key of the current entry, there
+		 * is definitely no such key in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/*
+		 * A match should never happen as the memcmp() call should have
+		 * caught it, but we still treat it correctly.
+		 */
+		if (!rc)
+			goto ir_done;
+		/* The keys are not equal, continue the search. */
+	}
+	/*
+	 * We have finished with this index without success. Check for the
+	 * presence of a child node and if not present set up @ictx and return
+	 * -ENOENT.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		ntfs_debug("Entry not found.");
+		err = -ENOENT;
+		goto ir_done;
+	} /* Child node present, descend into it. */
+	/* Consistency check: Verify that an index allocation exists. */
+	if (!NInoIndexAllocPresent(idx_ni)) {
+		ntfs_error(sb, "No index allocation attribute but index entry "
+				"requires one. 
Inode 0x%lx is corrupt or " + "driver bug.", idx_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(idx_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(actx); + unmap_mft_record(base_ni); + m = NULL; + actx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt inode " + "0x%lx or driver bug.", idx_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " + "Corrupt inode 0x%lx. Run chkdsk.", + (long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). Inode " + "0x%lx is corrupt or driver bug.", + (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " + "a size (%u) differing from the index " + "specified size (%u). Inode is corrupt or " + "driver bug.", (unsigned long long)vcn, + idx_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + idx_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + idx_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " + "crosses page boundary. Impossible! Cannot " + "access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + idx_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " + "0x%lx exceeds maximum size.", + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. 
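+		 * Restated (an illustration of the checks below): the entry
+		 * must lie entirely inside the index block, i.e. ia <= ie and
+		 * both the fixed INDEX_ENTRY_HEADER and the full ie->length
+		 * bytes of the entry must end at or before index_end.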
		 */
+		if ((u8*)ie < (u8*)ia || (u8*)ie +
+				sizeof(INDEX_ENTRY_HEADER) > index_end ||
+				(u8*)ie + le16_to_cpu(ie->length) > index_end) {
+			ntfs_error(sb, "Index entry out of bounds in inode "
+					"0x%lx.", idx_ni->mft_no);
+			goto unm_err_out;
+		}
+		/*
+		 * The last entry cannot contain a key. It can however contain
+		 * a pointer to a child node in the B+tree so we just break out.
+		 */
+		if (ie->flags & INDEX_ENTRY_END)
+			break;
+		/* Further bounds checks. */
+		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+				le16_to_cpu(ie->key_length) >
+				le16_to_cpu(ie->data.vi.data_offset) ||
+				(u32)le16_to_cpu(ie->data.vi.data_offset) +
+				le16_to_cpu(ie->data.vi.data_length) >
+				le16_to_cpu(ie->length)) {
+			ntfs_error(sb, "Index entry out of bounds in inode "
+					"0x%lx.", idx_ni->mft_no);
+			goto unm_err_out;
+		}
+		/* If the keys match perfectly, we set up @ictx and return 0. */
+		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+				&ie->key, key_len)) {
+ia_done:
+			ictx->is_in_root = false;
+			ictx->actx = NULL;
+			ictx->base_ni = NULL;
+			ictx->ia = ia;
+			ictx->page = page;
+			goto done;
+		}
+		/*
+		 * Not a perfect match, need to do full-blown collation so we
+		 * know which way in the B+tree we have to go.
+		 */
+		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+				key_len, &ie->key, le16_to_cpu(ie->key_length));
+		/*
+		 * If @key collates before the key of the current entry, there
+		 * is definitely no such key in this index but we might need to
+		 * descend into the B+tree so we just break out of the loop.
+		 */
+		if (rc == -1)
+			break;
+		/*
+		 * A match should never happen as the memcmp() call should have
+		 * caught it, but we still treat it correctly.
+		 */
+		if (!rc)
+			goto ia_done;
+		/* The keys are not equal, continue the search. */
+	}
+	/*
+	 * We have finished with this index buffer without success. Check for
+	 * the presence of a child node and if not present return -ENOENT.
+	 */
+	if (!(ie->flags & INDEX_ENTRY_NODE)) {
+		ntfs_debug("Entry not found.");
+		err = -ENOENT;
+		goto ia_done;
+	}
+	if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+		ntfs_error(sb, "Index entry with child node found in a leaf "
+				"node in inode 0x%lx.", idx_ni->mft_no);
+		goto unm_err_out;
+	}
+	/* Child node present, descend into it. */
+	old_vcn = vcn;
+	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+	if (vcn >= 0) {
+		/*
+		 * If vcn is in the same page cache page as old_vcn we recycle
+		 * the mapped page.
+		 */
+		if (old_vcn << vol->cluster_size_bits >>
+				PAGE_SHIFT == vcn <<
+				vol->cluster_size_bits >>
+				PAGE_SHIFT)
+			goto fast_descend_into_child_node;
+		unlock_page(page);
+		ntfs_unmap_page(page);
+		goto descend_into_child_node;
+	}
+	ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
+			idx_ni->mft_no);
+unm_err_out:
+	unlock_page(page);
+	ntfs_unmap_page(page);
+err_out:
+	if (!err)
+		err = -EIO;
+	if (actx)
+		ntfs_attr_put_search_ctx(actx);
+	if (m)
+		unmap_mft_record(base_ni);
+	return err;
+idx_err_out:
+	ntfs_error(sb, "Corrupt index. Aborting lookup.");
+	goto err_out;
+}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
new file mode 100644
index 000000000000..bb3c3ae55138
--- /dev/null
+++ b/fs/ntfs/index.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS
+ * project.
+ * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_INDEX_H +#define _LINUX_NTFS_INDEX_H + +#include + +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "attrib.h" +#include "mft.h" +#include "aops.h" + +/** + * @idx_ni: index inode containing the @entry described by this context + * @entry: index entry (points into @ir or @ia) + * @data: index entry data (points into @entry) + * @data_len: length in bytes of @data + * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia + * @ir: index root if @is_in_root and NULL otherwise + * @actx: attribute search context if @is_in_root and NULL otherwise + * @base_ni: base inode if @is_in_root and NULL otherwise + * @ia: index block if @is_in_root is 'false' and NULL otherwise + * @page: page if @is_in_root is 'false' and NULL otherwise + * + * @idx_ni is the index inode this context belongs to. + * + * @entry is the index entry described by this context. @data and @data_len + * are the index entry data and its length in bytes, respectively. @data + * simply points into @entry. This is probably what the user is interested in. + * + * If @is_in_root is 'true', @entry is in the index root attribute @ir described + * by the attribute search context @actx and the base inode @base_ni. @ia and + * @page are NULL in this case. + * + * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia + * and @page point to the index allocation block and the mapped, locked page it + * is in, respectively. @ir, @actx and @base_ni are NULL in this case. + * + * To obtain a context call ntfs_index_ctx_get(). + * + * We use this context to allow ntfs_index_lookup() to return the found index + * @entry and its @data without having to allocate a buffer and copy the @entry + * and/or its @data into it. + * + * When finished with the @entry and its @data, call ntfs_index_ctx_put() to + * free the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + */ +typedef struct { + ntfs_inode *idx_ni; + INDEX_ENTRY *entry; + void *data; + u16 data_len; + bool is_in_root; + INDEX_ROOT *ir; + ntfs_attr_search_ctx *actx; + ntfs_inode *base_ni; + INDEX_ALLOCATION *ia; + struct page *page; +} ntfs_index_context; + +extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); +extern void ntfs_index_ctx_put(ntfs_index_context *ictx); + +extern int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx); + +#ifdef NTFS_RW + +/** + * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries + * @ictx: ntfs index context describing the index entry + * + * Call flush_dcache_page() for the page in which an index entry resides. + * + * This must be called every time an index entry is modified, just after the + * modification. + * + * If the index entry is in the index root attribute, simply flush the page + * containing the mft record containing the index root attribute. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, simply flush the page cache page containing the index block. 
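+ *
+ * Sketch of the documented modify-then-commit sequence (illustration only;
+ * new_data is an assumed caller-side buffer):
+ *
+ *	memcpy(ictx->data, new_data, ictx->data_len);
+ *	ntfs_index_entry_flush_dcache_page(ictx);
+ *	ntfs_index_entry_mark_dirty(ictx);
+ *	ntfs_index_ctx_put(ictx);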
+ */ +static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + flush_dcache_mft_record_page(ictx->actx->ntfs_ino); + else + flush_dcache_page(ictx->page); +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the mft + * record containing the index root attribute dirty. This ensures the mft + * record, and hence the index root attribute, will be written out to disk + * later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, mark the buffers belonging to the index record as well as the + * page cache page the index block is in dirty. This automatically marks the + * VFS inode of the ntfs index inode to which the index entry belongs dirty, + * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the + * dirty index block, will be written out to disk later. + */ +static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else + mark_ntfs_record_dirty(ictx->page, + (u8*)ictx->ia - (u8*)page_address(ictx->page)); +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c new file mode 100644 index 000000000000..aba1e22db4e9 --- /dev/null +++ b/fs/ntfs/inode.c @@ -0,0 +1,3102 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * inode.c - NTFS kernel inode handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aops.h" +#include "attrib.h" +#include "bitmap.h" +#include "dir.h" +#include "debug.h" +#include "inode.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "time.h" +#include "ntfs.h" + +/** + * ntfs_test_inode - compare two (possibly fake) inodes for equality + * @vi: vfs inode which to test + * @data: data which is being tested with + * + * Compare the ntfs attribute embedded in the ntfs specific part of the vfs + * inode @vi for equality with the ntfs attribute @data. + * + * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. + * @na->name and @na->name_len are then ignored. + * + * Return 1 if the attributes match and 0 if not. + * + * NOTE: This function runs with the inode_hash_lock spin lock held so it is not + * allowed to sleep. + */ +int ntfs_test_inode(struct inode *vi, void *data) +{ + ntfs_attr *na = (ntfs_attr *)data; + ntfs_inode *ni; + + if (vi->i_ino != na->mft_no) + return 0; + ni = NTFS_I(vi); + /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ + if (likely(!NInoAttr(ni))) { + /* If not looking for a normal inode this is a mismatch. */ + if (unlikely(na->type != AT_UNUSED)) + return 0; + } else { + /* A fake inode describing an attribute. */ + if (ni->type != na->type) + return 0; + if (ni->name_len != na->name_len) + return 0; + if (na->name_len && memcmp(ni->name, na->name, + na->name_len * sizeof(ntfschar))) + return 0; + } + /* Match! 
*/ + return 1; +} + +/** + * ntfs_init_locked_inode - initialize an inode + * @vi: vfs inode to initialize + * @data: data which to initialize @vi to + * + * Initialize the vfs inode @vi with the values from the ntfs attribute @data in + * order to enable ntfs_test_inode() to do its work. + * + * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. + * In that case, @na->name and @na->name_len should be set to NULL and 0, + * respectively. Although that is not strictly necessary as + * ntfs_read_locked_inode() will fill them in later. + * + * Return 0 on success and -errno on error. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. (Hence the GFP_ATOMIC allocation.) + */ +static int ntfs_init_locked_inode(struct inode *vi, void *data) +{ + ntfs_attr *na = (ntfs_attr *)data; + ntfs_inode *ni = NTFS_I(vi); + + vi->i_ino = na->mft_no; + + ni->type = na->type; + if (na->type == AT_INDEX_ALLOCATION) + NInoSetMstProtected(ni); + + ni->name = na->name; + ni->name_len = na->name_len; + + /* If initializing a normal inode, we are done. */ + if (likely(na->type == AT_UNUSED)) { + BUG_ON(na->name); + BUG_ON(na->name_len); + return 0; + } + + /* It is a fake inode. */ + NInoSetAttr(ni); + + /* + * We have I30 global constant as an optimization as it is the name + * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC + * allocation but that is ok. And most attributes are unnamed anyway, + * thus the fraction of named attributes with name != I30 is actually + * absolutely tiny. + */ + if (na->name_len && na->name != I30) { + unsigned int i; + + BUG_ON(!na->name); + i = na->name_len * sizeof(ntfschar); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + if (!ni->name) + return -ENOMEM; + memcpy(ni->name, na->name, i); + ni->name[na->name_len] = 0; + } + return 0; +} + +static int ntfs_read_locked_inode(struct inode *vi); +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); +static int ntfs_read_locked_index_inode(struct inode *base_vi, + struct inode *vi); + +/** + * ntfs_iget - obtain a struct inode corresponding to a specific normal inode + * @sb: super block of mounted volume + * @mft_no: mft record number / inode number to obtain + * + * Obtain the struct inode corresponding to a specific normal inode (i.e. a + * file or directory). + * + * If the inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and initialized, + * and finally ntfs_read_locked_inode() is called to read in the inode and + * fill in the remainder of the inode structure. + * + * Return the struct inode on success. Check the return value with IS_ERR() and + * if true, the function failed and the error code is obtained from PTR_ERR(). + */ +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = mft_no; + na.type = AT_UNUSED; + na.name = NULL; + na.name_len = 0; + + vi = iget5_locked(sb, mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_inode(vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad inodes around if the failure was + * due to ENOMEM. We want to be able to retry again later. 
+ */ + if (unlikely(err == -ENOMEM)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_attr_iget - obtain a struct inode corresponding to an attribute + * @base_vi: vfs base inode containing the attribute + * @type: attribute type + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * + * Obtain the (fake) struct inode corresponding to the attribute specified by + * @type, @name, and @name_len, which is present in the base mft record + * specified by the vfs inode @base_vi. + * + * If the attribute inode is in the cache, it is just returned with an + * increased reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_attr_inode() is called to read the + * attribute and fill in the inode structure. + * + * Note, for index allocation attributes, you need to use ntfs_index_iget() + * instead of ntfs_attr_iget() as working with indices is a lot more complex. + * + * Return the struct inode of the attribute inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + /* Make sure no one calls ntfs_attr_iget() for indices. */ + BUG_ON(type == AT_INDEX_ALLOCATION); + + na.mft_no = base_vi->i_ino; + na.type = type; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_attr_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad attribute inodes around. This also + * simplifies things in that we never need to check for bad attribute + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_index_iget - obtain a struct inode corresponding to an index + * @base_vi: vfs base inode containing the index related attributes + * @name: Unicode name of the index + * @name_len: length of @name in Unicode characters + * + * Obtain the (fake) struct inode corresponding to the index specified by @name + * and @name_len, which is present in the base mft record specified by the vfs + * inode @base_vi. + * + * If the index inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_index_inode() is called to read + * the index related attributes and fill in the inode structure. + * + * Return the struct inode of the index inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = base_vi->i_ino; + na.type = AT_INDEX_ALLOCATION; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. 
	 */
+	if (vi->i_state & I_NEW) {
+		err = ntfs_read_locked_index_inode(base_vi, vi);
+		unlock_new_inode(vi);
+	}
+	/*
+	 * There is no point in keeping bad index inodes around. This also
+	 * simplifies things in that we never need to check for bad index
+	 * inodes elsewhere.
+	 */
+	if (unlikely(err)) {
+		iput(vi);
+		vi = ERR_PTR(err);
+	}
+	return vi;
+}
+
+struct inode *ntfs_alloc_big_inode(struct super_block *sb)
+{
+	ntfs_inode *ni;
+
+	ntfs_debug("Entering.");
+	ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
+	if (likely(ni != NULL)) {
+		ni->state = 0;
+		return VFS_I(ni);
+	}
+	ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
+	return NULL;
+}
+
+void ntfs_free_big_inode(struct inode *inode)
+{
+	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
+
+static inline ntfs_inode *ntfs_alloc_extent_inode(void)
+{
+	ntfs_inode *ni;
+
+	ntfs_debug("Entering.");
+	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
+	if (likely(ni != NULL)) {
+		ni->state = 0;
+		return ni;
+	}
+	ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
+	return NULL;
+}
+
+static void ntfs_destroy_extent_inode(ntfs_inode *ni)
+{
+	ntfs_debug("Entering.");
+	BUG_ON(ni->page);
+	if (!atomic_dec_and_test(&ni->count))
+		BUG();
+	kmem_cache_free(ntfs_inode_cache, ni);
+}
+
+/*
+ * The attribute runlist lock has separate locking rules from the
+ * normal runlist lock, so split the two lock-classes:
+ */
+static struct lock_class_key attr_list_rl_lock_class;
+
+/**
+ * __ntfs_init_inode - initialize ntfs specific part of an inode
+ * @sb:		super block of mounted volume
+ * @ni:		freshly allocated ntfs inode which to initialize
+ *
+ * Initialize an ntfs inode to defaults.
+ *
+ * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
+ * untouched. Make sure to initialize them elsewhere.
+ */
+void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
+{
+	ntfs_debug("Entering.");
+	rwlock_init(&ni->size_lock);
+	ni->initialized_size = ni->allocated_size = 0;
+	ni->seq_no = 0;
+	atomic_set(&ni->count, 1);
+	ni->vol = NTFS_SB(sb);
+	ntfs_init_runlist(&ni->runlist);
+	mutex_init(&ni->mrec_lock);
+	ni->page = NULL;
+	ni->page_ofs = 0;
+	ni->attr_list_size = 0;
+	ni->attr_list = NULL;
+	ntfs_init_runlist(&ni->attr_list_rl);
+	lockdep_set_class(&ni->attr_list_rl.lock,
+				&attr_list_rl_lock_class);
+	ni->itype.index.block_size = 0;
+	ni->itype.index.vcn_size = 0;
+	ni->itype.index.collation_rule = 0;
+	ni->itype.index.block_size_bits = 0;
+	ni->itype.index.vcn_size_bits = 0;
+	mutex_init(&ni->extent_lock);
+	ni->nr_extents = 0;
+	ni->ext.base_ntfs_ino = NULL;
+}
+
+/*
+ * Extent inodes get MFT-mapped in a nested way, while the base inode
+ * is still mapped.
Teach this nesting to the lock validator by creating + * a separate class for nested inode's mrec_lock's: + */ +static struct lock_class_key extent_inode_mrec_lock_key; + +inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no) +{ + ntfs_inode *ni = ntfs_alloc_extent_inode(); + + ntfs_debug("Entering."); + if (likely(ni != NULL)) { + __ntfs_init_inode(sb, ni); + lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); + ni->mft_no = mft_no; + ni->type = AT_UNUSED; + ni->name = NULL; + ni->name_len = 0; + } + return ni; +} + +/** + * ntfs_is_extended_system_file - check if a file is in the $Extend directory + * @ctx: initialized attribute search context + * + * Search all file name attributes in the inode described by the attribute + * search context @ctx and check if any of the names are in the $Extend system + * directory. + * + * Return values: + * 1: file is in $Extend directory + * 0: file is not in $Extend directory + * -errno: failed to determine if the file is in the $Extend directory + */ +static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) +{ + int nr_links, err; + + /* Restart search. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Get number of hard links. */ + nr_links = le16_to_cpu(ctx->mrec->link_count); + + /* Loop through all hard links. */ + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, + ctx))) { + FILE_NAME_ATTR *file_name_attr; + ATTR_RECORD *attr = ctx->attr; + u8 *p, *p2; + + nr_links--; + /* + * Maximum sanity checking as we are called on an inode that + * we suspect might be corrupt. + */ + p = (u8*)attr + le32_to_cpu(attr->length); + if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_in_use)) { +err_corrupt_attr: + ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " + "attribute. You should run chkdsk."); + return -EIO; + } + if (attr->non_resident) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " + "name. You should run chkdsk."); + return -EIO; + } + if (attr->flags) { + ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " + "invalid flags. You should run " + "chkdsk."); + return -EIO; + } + if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " + "name. You should run chkdsk."); + return -EIO; + } + file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + + le16_to_cpu(attr->data.resident.value_offset)); + p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); + if (p2 < (u8*)attr || p2 > p) + goto err_corrupt_attr; + /* This attribute is ok, but is it in the $Extend directory? */ + if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) + return 1; /* YES, it's an extended system file. */ + } + if (unlikely(err != -ENOENT)) + return err; + if (unlikely(nr_links)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " + "doesn't match number of name attributes. You " + "should run chkdsk."); + return -EIO; + } + return 0; /* NO, it is not an extended system file. */ +} + +/** + * ntfs_read_locked_inode - read an inode from its device + * @vi: inode to read + * + * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode + * described by @vi into memory from the device. + * + * The only fields in @vi that we need to/can look at when the function is + * called are i_sb, pointing to the mounted device's super block, and i_ino, + * the number of the inode to load. 
+ * + * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino + * for reading and sets up the necessary @vi fields as well as initializing + * the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * i_flags is set to 0 and we have no business touching it. Only an ioctl() + * is allowed to write to them. We should of course be honouring them but + * we need to do that using the IS_* macros defined in include/linux/fs.h. + * In any case ntfs_read_locked_inode() has nothing to do with i_flags. + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_inode(struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + STANDARD_INFORMATION *si; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + /* Setup the generic vfs inode parts now. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + vi->i_mode = 0; + + /* + * Initialize the ntfs specific part of @vi special casing + * FILE_MFT which we need to do at mount time. + */ + if (vi->i_ino != FILE_MFT) + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vi->i_sb, "Inode is not in use!"); + goto unm_err_out; + } + if (m->base_mft_record) { + ntfs_error(vi->i_sb, "Inode is an extent inode!"); + goto unm_err_out; + } + + /* Transfer information from mft record into vfs and ntfs inodes. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* + * FIXME: Keep in mind that link_count is two for files which have both + * a long file name and a short file name as separate entries, so if + * we are hiding short file names this will be too high. Either we need + * to account for the short file names by subtracting them or we need + * to make sure we delete files even though i_nlink is not zero which + * might be tricky due to vfs interactions. Need to think about this + * some more when implementing the unlink command. + */ + set_nlink(vi, le16_to_cpu(m->link_count)); + /* + * FIXME: Reparse points can have the directory bit set even though + * they would be S_IFLNK. Need to deal with this further below when we + * implement reparse points / symbolic links but it will do for now. + * Also if not a directory, it could be something else, rather than + * a regular file. But again, will do for now. + */ + /* Everyone gets all permissions. */ + vi->i_mode |= S_IRWXUGO; + /* If read-only, no one gets write permissions. */ + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + if (m->flags & MFT_RECORD_IS_DIRECTORY) { + vi->i_mode |= S_IFDIR; + /* + * Apply the directory permissions mask set in the mount + * options. + */ + vi->i_mode &= ~vol->dmask; + /* Things break without this kludge! */ + if (vi->i_nlink > 1) + set_nlink(vi, 1); + } else { + vi->i_mode |= S_IFREG; + /* Apply the file permissions mask set in the mount options. */ + vi->i_mode &= ~vol->fmask; + } + /* + * Find the standard information attribute in the mft record. 
At this + * stage we haven't setup the attribute list stuff yet, so this could + * in fact fail if the standard information is in an extent record, but + * I don't think this actually ever happens. + */ + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + /* + * TODO: We should be performing a hot fix here (if the + * recover mount option is set) by creating a new + * attribute. + */ + ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Get the standard information attribute value. */ + if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu(a->data.resident.value_length) > + (u8 *)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + + /* Transfer information from the standard information into vi. */ + /* + * Note: The i_?times do not quite map perfectly onto the NTFS times, + * but they are close enough, and in the end it doesn't really matter + * that much... + */ + /* + * mtime is the last change of the data within the file. Not changed + * when only metadata is changed, e.g. a rename doesn't affect mtime. + */ + inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); + /* + * ctime is the last change of the metadata of the file. This obviously + * always changes, when mtime is changed. ctime can be changed on its + * own, mtime is then not changed, e.g. when a file is renamed. + */ + inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); + /* + * Last access to the data within the file. Not changed during a rename + * for example but changed whenever the file is written to. + */ + inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); + + /* Find the attribute list attribute if present. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(vi->i_sb, "Failed to lookup attribute list " + "attribute."); + goto unm_err_out; + } + } else /* if (!err) */ { + if (vi->i_ino == FILE_MFT) + goto skip_attr_list_load; + ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Attribute list attribute is " + "compressed."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } + /* Now allocate memory for the attribute list. 
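+	 * Note ntfs_attr_size() hands back the value size for both the
+	 * resident and the non-resident case, so the buffer is sized
+	 * correctly either way.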
*/ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(vi->i_sb, "Not enough memory to allocate " + "buffer for attribute list."); + err = -ENOMEM; + goto unm_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "Attribute list has non " + "zero lowest_vcn."); + goto unm_err_out; + } + /* + * Setup the runlist. No need for locking as we have + * exclusive access to the inode at this time. + */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(vi->i_sb, "Mapping pairs " + "decompression failed."); + goto unm_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data.non_resident. + initialized_size)))) { + ntfs_error(vi->i_sb, "Failed to load " + "attribute list attribute."); + goto unm_err_out; + } + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt attribute list " + "in inode."); + goto unm_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + } +skip_attr_list_load: + /* + * If an attribute list is present we now have the attribute list value + * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. + */ + if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; + ntfs_inode *bni; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + + /* It is a directory, find index root attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + // FIXME: File is corrupt! Hot-fix with empty + // index root attribute if recovery option is + // set. + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " + "resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " + "placed after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted index root just means that the newly + * created files in that directory should be created compressed/ + * encrypted. However index root cannot be both compressed and + * encrypted. 
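+		 * (The flags are only inherited as creation defaults: the
+		 * index root value itself is resident in the mft record and
+		 * is never itself compressed or encrypted on disk.)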
+ */ + if (a->flags & ATTR_COMPRESSION_MASK) + NInoSetCompressed(ni); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Directory index is corrupt."); + goto unm_err_out; + } + if (ir->type != AT_FILE_NAME) { + ntfs_error(vi->i_sb, "Indexed attribute is not " + "$FILE_NAME."); + goto unm_err_out; + } + if (ir->collation_rule != COLLATION_FILE_NAME) { + ntfs_error(vi->i_sb, "Index collation rule is not " + "COLLATION_FILE_NAME."); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (ni->itype.index.block_size & + (ni->itype.index.block_size - 1)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a " + "power of two.", + ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > " + "PAGE_SIZE (%ld) is not " + "supported. Sorry.", + ni->itype.index.block_size, + PAGE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < " + "NTFS_BLOCK_SIZE (%i) is not " + "supported. Sorry.", + ni->itype.index.block_size, + NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = + ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the directory index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + + /* Setup the index allocation attribute, even if not present. */ + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + goto skip_large_dir_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " + "attribute is not present but " + "$INDEX_ROOT indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION " + "attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs + * array. 
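+		 * (If the name were placed inside or after the mapping pairs
+		 * array, decompressing the runlist would read garbage, hence
+		 * such a layout is treated as corruption.)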
+ */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " + "is placed after the mapping pairs " + "array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of " + "$INDEX_ALLOCATION attribute has non " + "zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " + "and/or encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> + ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " + "for index allocation (0x%llx).", + bvi_size << 3, vi->i_size); + goto iput_unm_err_out; + } + /* No longer need the bitmap attribute inode. */ + iput(bvi); +skip_large_dir_stuff: + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_dir_inode_ops; + vi->i_fop = &ntfs_dir_ops; + vi->i_mapping->a_ops = &ntfs_mst_aops; + } else { + /* It is a file. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Setup the data attribute, even if not present. */ + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + + /* Find first extent of the unnamed data attribute. */ + err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); + if (unlikely(err)) { + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + if (err != -ENOENT) { + ntfs_error(vi->i_sb, "Failed to lookup $DATA " + "attribute."); + goto unm_err_out; + } + /* + * FILE_Secure does not have an unnamed $DATA + * attribute, so we special case it here. + */ + if (vi->i_ino == FILE_Secure) + goto no_data_attr_special_case; + /* + * Most if not all the system files in the $Extend + * system directory do not have unnamed data + * attributes so we need to check if the parent + * directory of the file is FILE_Extend and if it is + * ignore this error. To do this we need to get the + * name of this inode from the mft record as the name + * contains the back reference to the parent directory. + */ + if (ntfs_is_extended_system_file(ctx) > 0) + goto no_data_attr_special_case; + // FIXME: File is corrupt! Hot-fix with empty data + // attribute if recovery option is set. 
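+			/*
+			 * Neither $Secure nor a file in the $Extend
+			 * directory, so a regular file without an unnamed
+			 * $DATA attribute really is corrupt.
+			 */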
+ ntfs_error(vi->i_sb, "$DATA attribute is missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Setup the state. */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method " + "or corrupt file."); + goto unm_err_out; + } + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->non_resident) { + NInoSetNonResident(ni); + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found " + "non-standard " + "compression unit (%u " + "instead of 4). " + "Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype. + compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = + 1U << a->data. + non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = + 0; + ni->itype.compressed.block_clusters = + 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } else { /* Resident attribute. */ + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } + } +no_data_attr_special_case: + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_file_inode_ops; + vi->i_fop = &ntfs_file_ops; + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; + } + /* + * The number of 512-byte blocks used on disk (for stat). This is in so + * far inaccurate as it doesn't account for any named streams or other + * special non-resident attributes, but that is how Windows works, too, + * so we are at least consistent with Windows, if not entirely + * consistent with the Linux Way. 
Doing it the Linux Way would cause a
+ * significant slowdown as it would involve iterating over all
+ * attributes in the mft record and adding the allocated/compressed
+ * sizes of all non-resident attributes present to give us the Linux
+ * correct size that should go into i_blocks (after division by 512).
+ */
+	if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
+	ntfs_debug("Done.");
+	return 0;
+iput_unm_err_out:
+	iput(bvi);
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(ni);
+err_out:
+	ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt "
+			"inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino);
+	make_bad_inode(vi);
+	if (err != -EOPNOTSUPP && err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/**
+ * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
+ * @base_vi:	base inode
+ * @vi:		attribute inode to read
+ *
+ * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
+ * attribute inode described by @vi into memory from the base mft record
+ * described by @base_vi.
+ *
+ * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
+ * reading and looks up the attribute described by @vi before setting up the
+ * necessary fields in @vi as well as initializing the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error. In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ *
+ * Note this cannot be called for AT_INDEX_ALLOCATION.
+ */
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
+{
+	ntfs_volume *vol = NTFS_SB(vi->i_sb);
+	ntfs_inode *ni, *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	int err = 0;
+
+	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+
+	ntfs_init_big_inode(vi);
+
+	ni = NTFS_I(vi);
+	base_ni = NTFS_I(base_vi);
+
+	/* Just mirror the values from the base inode. */
+	vi->i_uid = base_vi->i_uid;
+	vi->i_gid = base_vi->i_gid;
+	set_nlink(vi, base_vi->i_nlink);
+	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
+	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
+	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
+	vi->i_generation = ni->seq_no = base_ni->seq_no;
+
+	/* Set inode type to zero but preserve permissions. */
+	vi->i_mode = base_vi->i_mode & ~S_IFMT;
+
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto unm_err_out;
+	}
+	/* Find the attribute. */
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err))
+		goto unm_err_out;
+	a = ctx->attr;
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
+			NInoSetCompressed(ni);
+			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+					ni->name_len)) {
+				ntfs_error(vi->i_sb, "Found compressed "
+						"non-data or named data "
+						"attribute. Please report "
+						"you saw this message to "
+						"linux-ntfs-dev@lists."
+ "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but compression is " + "disabled due to cluster size " + "(%i) > 4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } + /* + * The compressed/sparse flag set in an index root just means + * to compress all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is %s. Please " + "report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net", + NInoCompressed(ni) ? "compressed" : + "sparse"); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and compressed " + "data."); + goto unm_err_out; + } + /* + * The encryption flag set in an index root just means to + * encrypt all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is encrypted. " + "Please report you saw this message " + "to linux-ntfs-dev@lists.sourceforge." + "net"); + goto unm_err_out; + } + if (ni->type != AT_DATA) { + ntfs_error(vi->i_sb, "Found encrypted non-data " + "attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (!a->non_resident) { + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the attribute value."); + goto unm_err_out; + } + if (NInoMstProtected(ni)) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is resident. " + "Please report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net"); + goto unm_err_out; + } + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } + } else { + NInoSetNonResident(ni); + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the mapping pairs array."); + goto unm_err_out; + } + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found non-standard " + "compression unit (%u instead " + "of 4). Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. 
+					compression_unit;
+			} else {
+				ni->itype.compressed.block_size = 0;
+				ni->itype.compressed.block_size_bits = 0;
+				ni->itype.compressed.block_clusters = 0;
+			}
+			ni->itype.compressed.size = sle64_to_cpu(
+					a->data.non_resident.compressed_size);
+		}
+		if (a->data.non_resident.lowest_vcn) {
+			ntfs_error(vi->i_sb, "First extent of attribute has "
+					"non-zero lowest_vcn.");
+			goto unm_err_out;
+		}
+		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+		ni->initialized_size = sle64_to_cpu(
+				a->data.non_resident.initialized_size);
+		ni->allocated_size = sle64_to_cpu(
+				a->data.non_resident.allocated_size);
+	}
+	vi->i_mapping->a_ops = &ntfs_normal_aops;
+	if (NInoMstProtected(ni))
+		vi->i_mapping->a_ops = &ntfs_mst_aops;
+	else if (NInoCompressed(ni))
+		vi->i_mapping->a_ops = &ntfs_compressed_aops;
+	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
+	/*
+	 * Make sure the base inode does not go away and attach it to the
+	 * attribute inode.
+	 */
+	igrab(base_vi);
+	ni->ext.base_ntfs_ino = base_ni;
+	ni->nr_extents = -1;
+
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+
+	ntfs_debug("Done.");
+	return 0;
+
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+err_out:
+	ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
+			"inode (mft_no 0x%lx, type 0x%x, name_len %i). "
+			"Marking corrupt inode and base inode 0x%lx as bad. "
+			"Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
+			base_vi->i_ino);
+	make_bad_inode(vi);
+	if (err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/**
+ * ntfs_read_locked_index_inode - read an index inode from its base inode
+ * @base_vi:	base inode
+ * @vi:		index inode to read
+ *
+ * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
+ * index inode described by @vi into memory from the base mft record described
+ * by @base_vi.
+ *
+ * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
+ * reading and looks up the attributes relating to the index described by @vi
+ * before setting up the necessary fields in @vi as well as initializing the
+ * ntfs inode.
+ *
+ * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
+ * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they
+ * are set up like directory inodes since directories are a special case of
+ * indices, so they need to be treated in much the same way. Most importantly,
+ * for small indices the index allocation attribute might not actually exist.
+ * However, the index root attribute always exists but does not need to have
+ * an inode associated with it; this is why we define the new inode type
+ * index. Also, like for directories, we need to have an attribute inode for
+ * the bitmap attribute corresponding to the index allocation attribute and we
+ * can store this in the appropriate field of the inode, just like we do for
+ * normal directory inodes.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ *    i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error. In the error case, the inode will
+ * have had make_bad_inode() executed on it.
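+ *
+ * (As a sketch of the call path: ntfs_index_iget(base_vi, name, name_len)
+ * allocates or finds the index inode and, if it is new, ends up calling
+ * this function to fill it in.)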
+ */ +static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) +{ + loff_t bvi_size; + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni, *bni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + /* Just mirror the values from the base inode. */ + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + set_nlink(vi, base_vi->i_nlink); + inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); + inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); + vi->i_generation = ni->seq_no = base_ni->seq_no; + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + /* Map the mft record for the base inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the index root attribute. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " + "after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted/sparse index root is not allowed, except for + * directories of course but those are not dealt with here. + */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + ATTR_IS_SPARSE)) { + ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " + "root attribute."); + goto unm_err_out; + } + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Index is corrupt."); + goto unm_err_out; + } + if (ir->type) { + ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", + le32_to_cpu(ir->type)); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ntfs_debug("Index collation rule is 0x%x.", + le32_to_cpu(ir->collation_rule)); + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (!is_power_of_2(ni->itype.index.block_size)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " + "two.", ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " + "(%ld) is not supported. 
Sorry.", + ni->itype.index.block_size, PAGE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " + "(%i) is not supported. Sorry.", + ni->itype.index.block_size, NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + /* Check for presence of index allocation attribute. */ + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + goto skip_large_index_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "not present but $INDEX_ROOT " + "indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " + "placed after the mapping pairs array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " + "attribute has non zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. 
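+	 * Every index allocation attribute has a corresponding $BITMAP
+	 * attribute with one bit per index block recording which blocks are
+	 * in use; the size consistency of the two is checked below.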
 */
+	bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
+	if (IS_ERR(bvi)) {
+		ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+		err = PTR_ERR(bvi);
+		goto unm_err_out;
+	}
+	bni = NTFS_I(bvi);
+	if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+			NInoSparse(bni)) {
+		ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
+				"encrypted and/or sparse.");
+		goto iput_unm_err_out;
+	}
+	/* Consistency check bitmap size vs. index allocation size. */
+	bvi_size = i_size_read(bvi);
+	if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
+		ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
+				"index allocation (0x%llx).", bvi_size << 3,
+				vi->i_size);
+		goto iput_unm_err_out;
+	}
+	iput(bvi);
+skip_large_index_stuff:
+	/* Setup the operations for this index inode. */
+	vi->i_mapping->a_ops = &ntfs_mst_aops;
+	vi->i_blocks = ni->allocated_size >> 9;
+	/*
+	 * Make sure the base inode doesn't go away and attach it to the
+	 * index inode.
+	 */
+	igrab(base_vi);
+	ni->ext.base_ntfs_ino = base_ni;
+	ni->nr_extents = -1;
+
+	ntfs_debug("Done.");
+	return 0;
+iput_unm_err_out:
+	iput(bvi);
+unm_err_out:
+	if (!err)
+		err = -EIO;
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+err_out:
+	ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
+			"inode (mft_no 0x%lx, name_len %i).", err, vi->i_ino,
+			ni->name_len);
+	make_bad_inode(vi);
+	if (err != -EOPNOTSUPP && err != -ENOMEM)
+		NVolSetErrors(vol);
+	return err;
+}
+
+/*
+ * The MFT inode has special locking, so teach the lock validator
+ * about this by splitting off the locking rules of the MFT from
+ * the locking rules of other inodes. The MFT inode can never be
+ * accessed from the VFS side (or even internally), only by the
+ * map_mft functions.
+ */
+static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
+
+/**
+ * ntfs_read_inode_mount - special read_inode for mount time use only
+ * @vi:	inode to read
+ *
+ * Read inode FILE_MFT at mount time, only called with super_block lock
+ * held from within the read_super() code path.
+ *
+ * This function exists because when it is called the page cache for $MFT/$DATA
+ * is not initialized and hence we cannot get at the contents of mft records
+ * by calling map_mft_record*().
+ *
+ * Further, it needs to cope with the circular references problem, i.e. cannot
+ * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
+ * we do not know where the other extent mft records are yet and again, because
+ * we cannot call map_mft_record*() yet. Obviously this applies only when an
+ * attribute list is actually present in the $MFT inode.
+ *
+ * We solve these problems by starting with the $DATA attribute before anything
+ * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
+ * extent is found, we ntfs_mapping_pairs_decompress() including the implied
+ * ntfs_runlists_merge(). Each step of the iteration necessarily provides
+ * sufficient information for the next step to complete.
+ *
+ * This should work, but there are two possible pitfalls (see inline comments
+ * below); only time will tell if they are real pits or just smoke...
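+ *
+ * (In outline the bootstrap is: read mft record 0 straight off the device
+ * with sb_bread(), apply the mst fixups, build the $DATA runlist from the
+ * mapping pairs, and only then let the normal inode reading code loose on
+ * the volume.)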
+ */ +int ntfs_read_inode_mount(struct inode *vi) +{ + VCN next_vcn, last_vcn, highest_vcn; + s64 block; + struct super_block *sb = vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + struct buffer_head *bh; + ntfs_inode *ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + unsigned int i, nr_blocks; + int err; + + ntfs_debug("Entering."); + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + + /* Setup the data attribute. It is special as it is mst protected. */ + NInoSetNonResident(ni); + NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + /* + * This sets up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + ni->itype.index.block_size = vol->mft_record_size; + ni->itype.index.block_size_bits = vol->mft_record_size_bits; + + /* Very important! Needed to be able to call map_mft_record*(). */ + vol->mft_ino = vi; + + /* Allocate enough memory to read the first mft record. */ + if (vol->mft_record_size > 64 * 1024) { + ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", + vol->mft_record_size); + goto err_out; + } + i = vol->mft_record_size; + if (i < sb->s_blocksize) + i = sb->s_blocksize; + m = (MFT_RECORD*)ntfs_malloc_nofs(i); + if (!m) { + ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); + goto err_out; + } + + /* Determine the first block of the $MFT/$DATA attribute. */ + block = vol->mft_lcn << vol->cluster_size_bits >> + sb->s_blocksize_bits; + nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; + if (!nr_blocks) + nr_blocks = 1; + + /* Load $MFT/$DATA's first mft record. */ + for (i = 0; i < nr_blocks; i++) { + bh = sb_bread(sb, block++); + if (!bh) { + ntfs_error(sb, "Device read failed."); + goto err_out; + } + memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, + sb->s_blocksize); + brelse(bh); + } + + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + + /* Apply the mst fixups. */ + if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { + /* FIXME: Try to use the $MFTMirr now. */ + ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); + goto err_out; + } + + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + + /* Need this to sanity check attribute list references to $MFT. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* Provides read_folio() for map_mft_record(). */ + vi->i_mapping->a_ops = &ntfs_mst_aops; + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + /* Find the attribute list attribute if present. */ + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(sb, "Failed to lookup attribute list " + "attribute. You should run chkdsk."); + goto put_err_out; + } + } else /* if (!err) */ { + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. 
" + "You should run chkdsk."; + + ntfs_debug("Attribute list attribute found in $MFT."); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(sb, "Attribute list attribute is " + "compressed.%s", es); + goto put_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(sb, "Not enough memory to allocate buffer " + "for attribute list."); + goto put_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "Attribute list has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Setup the runlist. */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(sb, "Mapping pairs decompression " + "failed with error code %i.", + -err); + goto put_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data. + non_resident.initialized_size)))) { + ntfs_error(sb, "Failed to load attribute list " + "attribute with error code %i.", + -err); + goto put_err_out; + } + } else /* if (!ctx.attr->non_resident) */ { + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(sb, "Corrupt attribute list " + "attribute."); + goto put_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + /* The attribute list is now setup in memory. */ + /* + * FIXME: I don't know if this case is actually possible. + * According to logic it is not possible but I have seen too + * many weird things in MS software to rely on logic... Thus we + * perform a manual search and make sure the first $MFT/$DATA + * extent is in the base inode. If it is not we abort with an + * error and if we ever see a report of this error we will need + * to do some magic in order to have the necessary mft record + * loaded and in the right place in the page cache. But + * hopefully logic will prevail and this never happens... + */ + al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; + al_end = (u8*)al_entry + ni->attr_list_size; + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < ni->attr_list || + (u8*)al_entry > al_end) + goto em_put_err_out; + /* Catch the end of the attribute list. 
*/ + if ((u8*)al_entry == al_end) + goto em_put_err_out; + if (!al_entry->length) + goto em_put_err_out; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + goto em_put_err_out; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) + goto em_put_err_out; + if (AT_DATA != al_entry->type) + continue; + /* We want an unnamed attribute. */ + if (al_entry->name_length) + goto em_put_err_out; + /* Want the first entry, i.e. lowest_vcn == 0. */ + if (al_entry->lowest_vcn) + goto em_put_err_out; + /* First entry has to be in the base mft record. */ + if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { + /* MFT references do not match, logic fails. */ + ntfs_error(sb, "BUG: The first $DATA extent " + "of $MFT is not in the base " + "mft record. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto put_err_out; + } else { + /* Sequence numbers must match. */ + if (MSEQNO_LE(al_entry->mft_reference) != + ni->seq_no) + goto em_put_err_out; + /* Got it. All is ok. We can stop now. */ + break; + } + } + } + + ntfs_attr_reinit_search_ctx(ctx); + + /* Now load all attribute extents. */ + a = NULL; + next_vcn = last_vcn = highest_vcn = 0; + while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, + ctx))) { + runlist_element *nrl; + + /* Cache the current attribute. */ + a = ctx->attr; + /* $MFT must be non-resident. */ + if (!a->non_resident) { + ntfs_error(sb, "$MFT must be non-resident but a " + "resident extent was found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + /* $MFT must be uncompressed and unencrypted. */ + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + ntfs_error(sb, "$MFT must be uncompressed, " + "non-sparse, and unencrypted but a " + "compressed/sparse/encrypted extent " + "was found. $MFT is corrupt. Run " + "chkdsk."); + goto put_err_out; + } + /* + * Decompress the mapping pairs array of this extent and merge + * the result into the existing runlist. No need for locking + * as we have exclusive access to the inode at this time and we + * are a mount in progress task, too. + */ + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(nrl)) { + ntfs_error(sb, "ntfs_mapping_pairs_decompress() " + "failed with error code %ld. $MFT is " + "corrupt.", PTR_ERR(nrl)); + goto put_err_out; + } + ni->runlist.rl = nrl; + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Get the last vcn in the $DATA attribute. */ + last_vcn = sle64_to_cpu( + a->data.non_resident.allocated_size) + >> vol->cluster_size_bits; + /* Fill in the inode size. */ + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * Verify the number of mft records does not exceed + * 2^32 - 1. + */ + if ((vi->i_size >> vol->mft_record_size_bits) >= + (1ULL << 32)) { + ntfs_error(sb, "$MFT is too big! 
Aborting.");
+				goto put_err_out;
+			}
+			/*
+			 * We have got the first extent of the runlist for
+			 * $MFT which means it is now relatively safe to call
+			 * the normal ntfs_read_inode() function.
+			 * Complete reading the inode; this will actually
+			 * re-read the mft record for $MFT, this time entering
+			 * it into the page cache with which we complete the
+			 * kick start of the volume. It should be safe to do
+			 * this now as the first extent of $MFT/$DATA is
+			 * already known and we would hope that we don't need
+			 * further extents in order to find the other
+			 * attributes belonging to $MFT. Only time will tell if
+			 * this is really the case. If not we will have to play
+			 * magic at that point, possibly duplicating a lot of
+			 * ntfs_read_inode(). We will need to ensure we do
+			 * enough of its work to be able to call
+			 * ntfs_read_inode() on extents of $MFT/$DATA. But
+			 * let's hope this never happens...
+			 */
+			ntfs_read_locked_inode(vi);
+			if (is_bad_inode(vi)) {
+				ntfs_error(sb, "ntfs_read_inode() of $MFT "
+						"failed. BUG or corrupt $MFT. "
+						"Run chkdsk and if no errors "
+						"are found, please report you "
+						"saw this message to "
+						"linux-ntfs-dev@lists."
+						"sourceforge.net");
+				ntfs_attr_put_search_ctx(ctx);
+				/* Revert to the safe super operations. */
+				ntfs_free(m);
+				return -1;
+			}
+			/*
+			 * Re-initialize some specifics about $MFT's inode as
+			 * ntfs_read_inode() will have set up the default ones.
+			 */
+			/* Set uid and gid to root. */
+			vi->i_uid = GLOBAL_ROOT_UID;
+			vi->i_gid = GLOBAL_ROOT_GID;
+			/* Regular file. No access for anyone. */
+			vi->i_mode = S_IFREG;
+			/* No VFS initiated operations allowed for $MFT. */
+			vi->i_op = &ntfs_empty_inode_ops;
+			vi->i_fop = &ntfs_empty_file_ops;
+		}
+
+		/* Get the lowest vcn for the next extent. */
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		next_vcn = highest_vcn + 1;
+
+		/* Only one extent or error, which we catch below. */
+		if (next_vcn <= 0)
+			break;
+
+		/* Avoid endless loops due to corruption. */
+		if (next_vcn < sle64_to_cpu(
+				a->data.non_resident.lowest_vcn)) {
+			ntfs_error(sb, "$MFT has corrupt attribute list "
+					"attribute. Run chkdsk.");
+			goto put_err_out;
+		}
+	}
+	if (err != -ENOENT) {
+		ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
+				"$MFT is corrupt. Run chkdsk.");
+		goto put_err_out;
+	}
+	if (!a) {
+		ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
+				"corrupt. Run chkdsk.");
+		goto put_err_out;
+	}
+	if (highest_vcn && highest_vcn != last_vcn - 1) {
+		ntfs_error(sb, "Failed to load the complete runlist for "
+				"$MFT/$DATA. Driver bug or corrupt $MFT. "
+				"Run chkdsk.");
+		ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
+				(unsigned long long)highest_vcn,
+				(unsigned long long)last_vcn - 1);
+		goto put_err_out;
+	}
+	ntfs_attr_put_search_ctx(ctx);
+	ntfs_debug("Done.");
+	ntfs_free(m);
+
+	/*
+	 * Split the locking rules of the MFT inode from the
+	 * locking rules of other inodes:
+	 */
+	lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
+	lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
+
+	return 0;
+
+em_put_err_out:
+	ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
+			"attribute list. $MFT is corrupt. Run chkdsk.");
+put_err_out:
+	ntfs_attr_put_search_ctx(ctx);
+err_out:
+	ntfs_error(sb, "Failed. Marking inode as bad.");
+	make_bad_inode(vi);
+	ntfs_free(m);
+	return -1;
+}
+
+static void __ntfs_clear_inode(ntfs_inode *ni)
+{
+	/* Free all allocated memory.
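+	 * This helper runs both for extent inodes (via
+	 * ntfs_clear_extent_inode()) and for base and attribute inodes (via
+	 * ntfs_evict_big_inode()), so it only touches fields that are valid
+	 * in all of them.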
 */
+	down_write(&ni->runlist.lock);
+	if (ni->runlist.rl) {
+		ntfs_free(ni->runlist.rl);
+		ni->runlist.rl = NULL;
+	}
+	up_write(&ni->runlist.lock);
+
+	if (ni->attr_list) {
+		ntfs_free(ni->attr_list);
+		ni->attr_list = NULL;
+	}
+
+	down_write(&ni->attr_list_rl.lock);
+	if (ni->attr_list_rl.rl) {
+		ntfs_free(ni->attr_list_rl.rl);
+		ni->attr_list_rl.rl = NULL;
+	}
+	up_write(&ni->attr_list_rl.lock);
+
+	if (ni->name_len && ni->name != I30) {
+		/* Catch bugs... */
+		BUG_ON(!ni->name);
+		kfree(ni->name);
+	}
+}
+
+void ntfs_clear_extent_inode(ntfs_inode *ni)
+{
+	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+
+	BUG_ON(NInoAttr(ni));
+	BUG_ON(ni->nr_extents != -1);
+
+#ifdef NTFS_RW
+	if (NInoDirty(ni)) {
+		if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
+			ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
+					"Losing data! This is a BUG!!!");
+		// FIXME: Do something!!!
+	}
+#endif /* NTFS_RW */
+
+	__ntfs_clear_inode(ni);
+
+	/* Bye, bye... */
+	ntfs_destroy_extent_inode(ni);
+}
+
+/**
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
+ * @vi:	vfs inode pending annihilation
+ *
+ * When the VFS is going to remove an inode from memory, ntfs_evict_big_inode()
+ * is called, which deallocates all memory belonging to the NTFS specific part
+ * of the inode and returns.
+ *
+ * If the MFT record is dirty, we commit it before doing anything else.
+ */
+void ntfs_evict_big_inode(struct inode *vi)
+{
+	ntfs_inode *ni = NTFS_I(vi);
+
+	truncate_inode_pages_final(&vi->i_data);
+	clear_inode(vi);
+
+#ifdef NTFS_RW
+	if (NInoDirty(ni)) {
+		bool was_bad = (is_bad_inode(vi));
+
+		/* Committing the inode also commits all extent inodes. */
+		ntfs_commit_inode(vi);
+
+		if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
+			ntfs_error(vi->i_sb, "Failed to commit dirty inode "
+					"0x%lx. Losing data!", vi->i_ino);
+			// FIXME: Do something!!!
+		}
+	}
+#endif /* NTFS_RW */
+
+	/* No need to lock at this stage as no one else has a reference. */
+	if (ni->nr_extents > 0) {
+		int i;
+
+		for (i = 0; i < ni->nr_extents; i++)
+			ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
+		kfree(ni->ext.extent_ntfs_inos);
+	}
+
+	__ntfs_clear_inode(ni);
+
+	if (NInoAttr(ni)) {
+		/* Release the base inode if we are holding it. */
+		if (ni->nr_extents == -1) {
+			iput(VFS_I(ni->ext.base_ntfs_ino));
+			ni->nr_extents = 0;
+			ni->ext.base_ntfs_ino = NULL;
+		}
+	}
+	BUG_ON(ni->page);
+	if (!atomic_dec_and_test(&ni->count))
+		BUG();
+	return;
+}
+
+/**
+ * ntfs_show_options - show mount options in /proc/mounts
+ * @sf:	seq_file in which to write our mount options
+ * @root:	root of the mounted tree whose mount options to display
+ *
+ * Called by the VFS once for each mounted ntfs volume when someone reads
+ * /proc/mounts in order to display the NTFS specific mount options of each
+ * mount. The mount options of the fs specified by @root are written to the
+ * seq file @sf and success is returned.
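+ *
+ * (Note the small asymmetry below: when fmask and dmask are identical they
+ * are collapsed into a single umask= option, matching the way a umask=
+ * mount option sets both masks at parse time.)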
+ */ +int ntfs_show_options(struct seq_file *sf, struct dentry *root) +{ + ntfs_volume *vol = NTFS_SB(root->d_sb); + int i; + + seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); + seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); + if (vol->fmask == vol->dmask) + seq_printf(sf, ",umask=0%o", vol->fmask); + else { + seq_printf(sf, ",fmask=0%o", vol->fmask); + seq_printf(sf, ",dmask=0%o", vol->dmask); + } + seq_printf(sf, ",nls=%s", vol->nls_map->charset); + if (NVolCaseSensitive(vol)) + seq_printf(sf, ",case_sensitive"); + if (NVolShowSystemFiles(vol)) + seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); + for (i = 0; on_errors_arr[i].val; i++) { + if (on_errors_arr[i].val & vol->on_errors) + seq_printf(sf, ",errors=%s", on_errors_arr[i].str); + } + seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); + return 0; +} + +#ifdef NTFS_RW + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_truncate - called when the i_size of an ntfs inode is changed + * @vi: inode for which the i_size was changed + * + * We only support i_size changes for normal files at present, i.e. not + * compressed and not encrypted. This is enforced in ntfs_setattr(), see + * below. + * + * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and + * that the change is allowed. + * + * This implies for us that @vi is a file inode rather than a directory, index, + * or attribute inode as well as that @vi is a base inode. + * + * Returns 0 on success or -errno on error. + * + * Called with ->i_mutex held. + */ +int ntfs_truncate(struct inode *vi) +{ + s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; + VCN highest_vcn; + unsigned long flags; + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + const char *te = " Leaving file length out of sync with i_size."; + int err, mp_size, size_change, alloc_change; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(NInoAttr(ni)); + BUG_ON(S_ISDIR(vi->i_mode)); + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->nr_extents < 0); +retry_truncate: + /* + * Lock the runlist for writing and map the mft record to ensure it is + * safe to mess with the attribute runlist and sizes. + */ + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " + "(error code %d).%s", vi->i_ino, err, te); + ctx = NULL; + m = NULL; + goto old_bad_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vi->i_sb, "Failed to allocate a search context for " + "inode 0x%lx (not enough memory).%s", + vi->i_ino, te); + err = -ENOMEM; + goto old_bad_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(vi->i_sb, "Open attribute is missing from " + "mft record. Inode 0x%lx is corrupt. " + "Run chkdsk.%s", vi->i_ino, te); + err = -EIO; + } else + ntfs_error(vi->i_sb, "Failed to lookup attribute in " + "inode 0x%lx (error code %d).%s", + vi->i_ino, err, te); + goto old_bad_out; + } + m = ctx->mrec; + a = ctx->attr; + /* + * The i_size of the vfs inode is the new size for the attribute value. 
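+	 * (Worked example, assuming 4kiB clusters: a new_size of 5000 bytes
+	 * gives a non-resident new_alloc_size of 8192 below, while a
+	 * resident attribute is merely rounded up to the next multiple of
+	 * eight bytes, i.e. 5001 -> 5008.)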
+ */ + new_size = i_size_read(vi); + /* The current size of the attribute value is the old size. */ + old_size = ntfs_attr_size(a); + /* Calculate the new allocated size. */ + if (NInoNonResident(ni)) + new_alloc_size = (new_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + else + new_alloc_size = (new_size + 7) & ~7; + /* The current allocated size is the old allocated size. */ + read_lock_irqsave(&ni->size_lock, flags); + old_alloc_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * The change in the file size. This will be 0 if no change, >0 if the + * size is growing, and <0 if the size is shrinking. + */ + size_change = -1; + if (new_size - old_size >= 0) { + size_change = 1; + if (new_size == old_size) + size_change = 0; + } + /* As above for the allocated size. */ + alloc_change = -1; + if (new_alloc_size - old_alloc_size >= 0) { + alloc_change = 1; + if (new_alloc_size == old_alloc_size) + alloc_change = 0; + } + /* + * If neither the size nor the allocation are being changed there is + * nothing to do. + */ + if (!size_change && !alloc_change) + goto unm_done; + /* If the size is changing, check if new size is allowed in $AttrDef. */ + if (size_change) { + err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); + if (unlikely(err)) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Truncate would cause the " + "inode 0x%lx to %simum size " + "for its attribute type " + "(0x%x). Aborting truncate.", + vi->i_ino, + new_size > old_size ? "exceed " + "the max" : "go under the min", + le32_to_cpu(ni->type)); + err = -EFBIG; + } else { + ntfs_error(vol->sb, "Inode 0x%lx has unknown " + "attribute type 0x%x. " + "Aborting truncate.", + vi->i_ino, + le32_to_cpu(ni->type)); + err = -EIO; + } + /* Reset the vfs inode size to the old size. */ + i_size_write(vi, old_size); + goto err_out; + } + } + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size are not " + "supported yet for %s files, ignoring.", + NInoCompressed(ni) ? "compressed" : + "encrypted"); + err = -EOPNOTSUPP; + goto bad_out; + } + if (a->non_resident) + goto do_non_resident_truncate; + BUG_ON(NInoNonResident(ni)); + /* Resize the attribute record to best fit the new attribute size. */ + if (new_size < vol->mft_record_size && + !ntfs_resident_attr_value_resize(m, a, new_size)) { + /* The resize succeeded! */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + write_lock_irqsave(&ni->size_lock, flags); + /* Update the sizes in the ntfs inode and all is done. */ + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + /* + * Note ntfs_resident_attr_value_resize() has already done any + * necessary data clearing in the attribute record. When the + * file is being shrunk vmtruncate() will already have cleared + * the top part of the last partial page, i.e. since this is + * the resident case this is the page with index 0. However, + * when the file is being expanded, the page cache page data + * between the old data_size, i.e. old_size, and the new_size + * has not been zeroed. Fortunately, we do not need to zero it + * either since on one hand it will either already be zero due + * to both read_folio and writepage clearing partial page data + * beyond i_size in which case there is nothing to do or in the + * case of the file being mmap()ped at the same time, POSIX + * specifies that the behaviour is unspecified thus we do not + * have to do anything. 
This means that in our implementation + * in the rare case that the file is mmap()ped and a write + * occurred into the mmap()ped region just beyond the file size + * and writepage has not yet been called to write out the page + * (which would clear the area beyond the file size) and we now + * extend the file size to incorporate this dirty region + * outside the file size, a write of the page would result in + * this data being written to disk instead of being cleared. + * Given both POSIX and the Linux mmap(2) man page specify that + * this corner case is undefined, we choose to leave it like + * that as this is much simpler for us as we cannot lock the + * relevant page now since we are holding too many ntfs locks + * which would result in a lock reversal deadlock. + */ + ni->initialized_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto unm_done; + } + /* If the above resize failed, this must be an attribute extension. */ + BUG_ON(size_change < 0); + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path as it only ever can happen + * once for any given file. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the truncation process. + */ + err = ntfs_attr_make_non_resident(ni, old_size); + if (likely(!err)) + goto retry_truncate; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " + "type 0x%x, because the conversion from " + "resident to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft record/on " + "disk for the non-resident attribute value. " + "This case is not implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not implemented " + "yet."); + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. 
+ err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_truncate: + BUG_ON(!NInoNonResident(ni)); + if (alloc_change < 0) { + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (highest_vcn > 0 && + old_alloc_size >> vol->cluster_size_bits > + highest_vcn + 1) { + /* + * This attribute has multiple extents. Not yet + * supported. + */ + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " + "attribute type 0x%x, because the " + "attribute is highly fragmented (it " + "consists of multiple extents) and " + "this case is not implemented yet.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type)); + err = -EOPNOTSUPP; + goto bad_out; + } + } + /* + * If the size is shrinking, need to reduce the initialized_size and + * the data_size before reducing the allocation. + */ + if (size_change < 0) { + /* + * Make the valid size smaller (i_size is already up-to-date). + */ + write_lock_irqsave(&ni->size_lock, flags); + if (new_size < ni->initialized_size) { + ni->initialized_size = new_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(new_size); + } + a->data.non_resident.data_size = cpu_to_sle64(new_size); + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* If the allocated size is not changing, we are done. */ + if (!alloc_change) + goto unm_done; + /* + * If the size is shrinking it makes no sense for the + * allocation to be growing. + */ + BUG_ON(alloc_change > 0); + } else /* if (size_change >= 0) */ { + /* + * The file size is growing or staying the same but the + * allocation can be shrinking, growing or staying the same. + */ + if (alloc_change > 0) { + /* + * We need to extend the allocation and possibly update + * the data size. If we are updating the data size, + * since we are not touching the initialized_size we do + * not need to worry about the actual data on disk. + * And as far as the page cache is concerned, there + * will be no pages beyond the old data size and any + * partial region in the last page between the old and + * new data size (or the end of the page if the new + * data size is outside the page) does not need to be + * modified as explained above for the resident + * attribute truncate case. To do this, we simply drop + * the locks we hold and leave all the work to our + * friendly helper ntfs_attr_extend_allocation(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + err = ntfs_attr_extend_allocation(ni, new_size, + size_change > 0 ? new_size : -1, -1); + /* + * ntfs_attr_extend_allocation() will have done error + * output already. + */ + goto done; + } + if (!alloc_change) + goto alloc_done; + } + /* alloc_change < 0 */ + /* Free the clusters. */ + nr_freed = ntfs_cluster_free(ni, new_alloc_size >> + vol->cluster_size_bits, -1, ctx); + m = ctx->mrec; + a = ctx->attr; + if (unlikely(nr_freed < 0)) { + ntfs_error(vol->sb, "Failed to release cluster(s) (error code " + "%lli). Unmount and run chkdsk to recover " + "the lost cluster(s).", (long long)nr_freed); + NVolSetErrors(vol); + nr_freed = 0; + } + /* Truncate the runlist. 
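+ * (Continuing the illustrative example above: shrinking an 8-cluster
+ * allocation to a single 4096-byte cluster frees the clusters backing
+ * VCNs 1..7 via ntfs_cluster_free() above, and the call below then
+ * cuts the runlist at VCN 1 = new_alloc_size >> cluster_size_bits.)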
*/ + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, + new_alloc_size >> vol->cluster_size_bits); + /* + * If the runlist truncation failed and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (unlikely(err || IS_ERR(m))) { + ntfs_error(vol->sb, "Failed to %s (error code %li).%s", + IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist", + IS_ERR(m) ? PTR_ERR(m) : err, es); + err = -EIO; + goto bad_out; + } + /* Get the size for the shrunk mapping pairs array for the runlist. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because determining the " + "size for the mapping pairs failed with error " + "code %i.%s", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), mp_size, es); + err = -EIO; + goto bad_out; + } + /* + * Shrink the attribute record for the new mapping pairs array. Note, + * this cannot fail since we are making the attribute smaller thus by + * definition there is enough space to do so. + */ + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + BUG_ON(err); + /* + * Generate the mapping pairs array directly into the attribute record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, ni->runlist.rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because building the " + "mapping pairs failed with error code %i.%s", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + err, es); + err = -EIO; + goto bad_out; + } + /* Update the allocated/compressed size as well as the highest vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed) { + ni->itype.compressed.size -= nr_freed << + vol->cluster_size_bits; + BUG_ON(ni->itype.compressed.size < 0); + a->data.non_resident.compressed_size = cpu_to_sle64( + ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * We have shrunk the allocation. If this is a shrinking truncate we + * have already dealt with the initialized_size and the data_size above + * and we are done. If the truncate is only changing the allocation + * and not the data_size, we are also done. If this is an extending + * truncate, need to extend the data_size now which is ensured by the + * fact that @size_change is positive. + */ +alloc_done: + /* + * If the size is growing, need to update it now. If it is shrinking, + * we have already updated it above (before the allocation change). + */ + if (size_change > 0) + a->data.non_resident.data_size = cpu_to_sle64(new_size); + /* Ensure the modified mft record is written out. 
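+ * (As elsewhere in this function, flush_dcache_mft_record_page() makes
+ * the in-memory update visible and mark_mft_record_dirty() queues the
+ * mft record for write-back.)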
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +unm_done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +done: + /* Update the mtime and ctime on the base inode. */ + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec64 now = current_time(VFS_I(base_ni)); + struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); + struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); + int sync_it = 0; + + if (!timespec64_equal(&mtime, &now) || + !timespec64_equal(&ctime, &now)) + sync_it = 1; + inode_set_ctime_to_ts(VFS_I(base_ni), now); + inode_set_mtime_to_ts(VFS_I(base_ni), now); + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + + if (likely(!err)) { + NInoClearTruncateFailed(ni); + ntfs_debug("Done."); + } + return err; +old_bad_out: + old_size = -1; +bad_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else if (old_size >= 0) + i_size_write(vi, old_size); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +conv_err_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else + i_size_write(vi, old_size); + goto out; +} + +/** + * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value + * @vi: inode for which the i_size was changed + * + * Wrapper for ntfs_truncate() that has no return value. + * + * See ntfs_truncate() description above for details. + */ +#ifdef NTFS_RW +void ntfs_truncate_vfs(struct inode *vi) { + ntfs_truncate(vi); +} +#endif + +/** + * ntfs_setattr - called from notify_change() when an attribute is being changed + * @idmap: idmap of the mount the inode was found from + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes + * + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. + * + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. + * + * Called with ->i_mutex held. + */ +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *vi = d_inode(dentry); + int err; + unsigned int ia_valid = attr->ia_valid; + + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + if (err) + goto out; + /* We do not support NTFS ACLs yet. */ + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " + "supported yet, ignoring."); + err = -EOPNOTSUPP; + goto out; + } + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(vi)) { + ntfs_inode *ni = NTFS_I(vi); + /* + * FIXME: For now we do not support resizing of + * compressed or encrypted files yet. + */ + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size " + "are not supported yet for " + "%s files, ignoring.", + NInoCompressed(ni) ? 
+ "compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else { + truncate_setsize(vi, attr->ia_size); + ntfs_truncate_vfs(vi); + } + if (err || ia_valid == ATTR_SIZE) + goto out; + } else { + /* + * We skipped the truncate but must still update + * timestamps. + */ + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + } + if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(vi, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(vi, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(vi, attr->ia_ctime); + mark_inode_dirty(vi); +out: + return err; +} + +/** + * __ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_write_inode(struct inode *vi, int sync) +{ + sle64 nt; + ntfs_inode *ni = NTFS_I(vi); + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + STANDARD_INFORMATION *si; + int err = 0; + bool modified = false; + + ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", + vi->i_ino); + /* + * Dirty attribute inodes are written via their real inodes so just + * clean them here. Access time updates are taken care off when the + * real inode is written. + */ + if (NInoAttr(ni)) { + NInoClearDirty(ni); + ntfs_debug("Done."); + return 0; + } + /* Map, pin, and lock the mft record belonging to the inode. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + /* Update the access times in the standard information attribute. */ + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Update the access times if they have changed. 
*/ + nt = utc2ntfs(inode_get_mtime(vi)); + if (si->last_data_change_time != nt) { + ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_data_change_time), + (long long)sle64_to_cpu(nt)); + si->last_data_change_time = nt; + modified = true; + } + nt = utc2ntfs(inode_get_ctime(vi)); + if (si->last_mft_change_time != nt) { + ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_mft_change_time), + (long long)sle64_to_cpu(nt)); + si->last_mft_change_time = nt; + modified = true; + } + nt = utc2ntfs(inode_get_atime(vi)); + if (si->last_access_time != nt) { + ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); + si->last_access_time = nt; + modified = true; + } + /* + * If we just modified the standard information attribute we need to + * mark the mft record it is in dirty. We do this manually so that + * mark_inode_dirty() is not called which would redirty the inode and + * hence result in an infinite loop of trying to write the inode. + * There is no need to mark the base inode nor the base mft record + * dirty, since we are going to write this mft record below in any case + * and the base mft record may actually not have been modified so it + * might not need to be written out. + * NOTE: It is not a problem when the inode for $MFT itself is being + * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES + * on the $MFT inode and hence __ntfs_write_inode() will not be + * re-invoked because of it which in turn is ok since the dirtied mft + * record will be cleaned and written out to disk below, i.e. before + * this function returns. + */ + if (modified) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + if (!NInoTestSetDirty(ctx->ntfs_ino)) + mark_ntfs_record_dirty(ctx->ntfs_ino->page, + ctx->ntfs_ino->page_ofs); + } + ntfs_attr_put_search_ctx(ctx); + /* Now the access times are updated, write the base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); + /* Write all attached extent mft records. */ + mutex_lock(&ni->extent_lock); + if (ni->nr_extents > 0) { + ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); + for (i = 0; i < ni->nr_extents; i++) { + ntfs_inode *tni = extent_nis[i]; + + if (NInoDirty(tni)) { + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (IS_ERR(tm)) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } + } + } + } + mutex_unlock(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return 0; +unm_err_out: + unmap_mft_record(ni); +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. 
" + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); + NVolSetErrors(ni->vol); + } + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h new file mode 100644 index 000000000000..147ef4ddb691 --- /dev/null +++ b/fs/ntfs/inode.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_INODE_H +#define _LINUX_NTFS_INODE_H + +#include + +#include +#include +#include +#include +#include + +#include "layout.h" +#include "volume.h" +#include "types.h" +#include "runlist.h" +#include "debug.h" + +typedef struct _ntfs_inode ntfs_inode; + +/* + * The NTFS in-memory inode structure. It is just used as an extension to the + * fields already provided in the VFS inode. + */ +struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + unsigned long state; /* NTFS specific flags describing this inode. + See ntfs_inode_state_bits below. */ + unsigned long mft_no; /* Number of the mft record / inode. */ + u16 seq_no; /* Sequence number of the mft record. */ + atomic_t count; /* Inode reference count for book keeping. */ + ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* + * If NInoAttr() is true, the below fields describe the attribute which + * this fake inode belongs to. The actual inode of this attribute is + * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see + * below). For real inodes, we also set the type (AT_DATA for files and + * AT_INDEX_ALLOCATION for directories), with the name = NULL and + * name_len = 0 for files and name = I30 (global constant) and + * name_len = 4 for directories. + */ + ATTR_TYPE type; /* Attribute type of this fake inode. */ + ntfschar *name; /* Attribute name of this fake inode. */ + u32 name_len; /* Attribute name length of this fake inode. */ + runlist runlist; /* If state has the NI_NonResident bit set, + the runlist of the unnamed data attribute + (if a file) or of the index allocation + attribute (directory) or of the attribute + described by the fake inode (if NInoAttr()). + If runlist.rl is NULL, the runlist has not + been read in yet or has been unmapped. If + NI_NonResident is clear, the attribute is + resident (file and fake inode) or there is + no $I30 index allocation attribute + (small directory). In the latter case + runlist.rl is always NULL.*/ + /* + * The following fields are only valid for real inodes and extent + * inodes. + */ + struct mutex mrec_lock; /* Lock for serializing access to the + mft record belonging to this inode. */ + struct page *page; /* The page containing the mft record of the + inode. This should only be touched by the + (un)map_mft_record*() functions. */ + int page_ofs; /* Offset into the page at which the mft record + begins. This should only be touched by the + (un)map_mft_record*() functions. */ + /* + * Attribute list support (only for use by the attribute lookup + * functions). Setup during read_inode for all inodes with attribute + * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is + * further only valid if NI_AttrListNonResident is set. 
+ */ + u32 attr_list_size; /* Length of attribute list value in bytes. */ + u8 *attr_list; /* Attribute list value itself. */ + runlist attr_list_rl; /* Run list for the attribute list value. */ + union { + struct { /* It is a directory, $MFT, or an index inode. */ + u32 block_size; /* Size of an index block. */ + u32 vcn_size; /* Size of a vcn in this + index. */ + COLLATION_RULE collation_rule; /* The collation rule + for the index. */ + u8 block_size_bits; /* Log2 of the above. */ + u8 vcn_size_bits; /* Log2 of the above. */ + } index; + struct { /* It is a compressed/sparse file/attribute inode. */ + s64 size; /* Copy of compressed_size from + $DATA. */ + u32 block_size; /* Size of a compression block + (cb). */ + u8 block_size_bits; /* Log2 of the size of a cb. */ + u8 block_clusters; /* Number of clusters per cb. */ + } compressed; + } itype; + struct mutex extent_lock; /* Lock for accessing/modifying the + below . */ + s32 nr_extents; /* For a base mft record, the number of attached extent + inodes (0 if none), for extent records and for fake + inodes describing an attribute this is -1. */ + union { /* This union is only used if nr_extents != 0. */ + ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of + the ntfs inodes of the extent + mft records belonging to + this base inode which have + been loaded. */ + ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the + ntfs inode of the base mft + record. For fake inodes, the + real (base) inode to which + the attribute belongs. */ + } ext; +}; + +/* + * Defined bits for the state field in the ntfs_inode structure. + * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only + */ +typedef enum { + NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrList, /* 1: Mft record contains an attribute list. */ + NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies + NI_AttrList is set. */ + + NI_Attr, /* 1: Fake inode for attribute i/o. + 0: Real inode or extent inode. */ + + NI_MstProtected, /* 1: Attribute is protected by MST fixups. + 0: Attribute is not protected by fixups. */ + NI_NonResident, /* 1: Unnamed data attr is non-resident (f). + 1: Attribute is non-resident (a). */ + NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is + present (d). */ + NI_Compressed, /* 1: Unnamed data attr is compressed (f). + 1: Create compressed files by default (d). + 1: Attribute is compressed (a). */ + NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). + 1: Create encrypted files by default (d). + 1: Attribute is encrypted (a). */ + NI_Sparse, /* 1: Unnamed data attr is sparse (f). + 1: Create sparse files by default (d). + 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ + NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ +} ntfs_inode_state_bits; + +/* + * NOTE: We should be adding dirty mft records to a list somewhere and they + * should be independent of the (ntfs/vfs) inode structure so that an inode can + * be removed but the record can be left dirty for syncing later. + */ + +/* + * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() + * functions. 
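+ * For example, NINO_FNS(Dirty) below generates NInoDirty(),
+ * NInoSetDirty() and NInoClearDirty(), the first of which expands to:
+ *
+ *	static inline int NInoDirty(ntfs_inode *ni)
+ *	{
+ *		return test_bit(NI_Dirty, &(ni)->state);
+ *	}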
+ */ +#define NINO_FNS(flag) \ +static inline int NIno##flag(ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ +} + +/* + * As above for NInoTestSetFoo() and NInoTestClearFoo(). + */ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ +} + +/* Emit the ntfs inode bitops functions. */ +NINO_FNS(Dirty) +TAS_NINO_FNS(Dirty) +NINO_FNS(AttrList) +NINO_FNS(AttrListNonResident) +NINO_FNS(Attr) +NINO_FNS(MstProtected) +NINO_FNS(NonResident) +NINO_FNS(IndexAllocPresent) +NINO_FNS(Compressed) +NINO_FNS(Encrypted) +NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) +NINO_FNS(TruncateFailed) + +/* + * The full structure containing a ntfs_inode and a vfs struct inode. Used for + * all real and fake inodes but not for extent inodes which lack the vfs struct + * inode. + */ +typedef struct { + ntfs_inode ntfs_inode; + struct inode vfs_inode; /* The vfs inode structure. */ +} big_ntfs_inode; + +/** + * NTFS_I - return the ntfs inode given a vfs inode + * @inode: VFS inode + * + * NTFS_I() returns the ntfs inode associated with the VFS @inode. + */ +static inline ntfs_inode *NTFS_I(struct inode *inode) +{ + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); +} + +static inline struct inode *VFS_I(ntfs_inode *ni) +{ + return &((big_ntfs_inode *)ni)->vfs_inode; +} + +/** + * ntfs_attr - ntfs in memory attribute structure + * @mft_no: mft record number of the base mft record of this attribute + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * @type: attribute type (see layout.h) + * + * This structure exists only to provide a small structure for the + * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. + * + * NOTE: Elements are ordered by size to make the structure as compact as + * possible on all architectures. 
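+ * (Illustratively, on a 64-bit architecture this ordering packs the
+ * structure into 24 bytes: two 8-byte members followed by two 4-byte
+ * ones, leaving no padding holes.)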
+ */
+typedef struct {
+	unsigned long mft_no;
+	ntfschar *name;
+	u32 name_len;
+	ATTR_TYPE type;
+} ntfs_attr;
+
+extern int ntfs_test_inode(struct inode *vi, void *data);
+
+extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
+extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+		ntfschar *name, u32 name_len);
+extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+		u32 name_len);
+
+extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
+extern void ntfs_free_big_inode(struct inode *inode);
+extern void ntfs_evict_big_inode(struct inode *vi);
+
+extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
+
+static inline void ntfs_init_big_inode(struct inode *vi)
+{
+	ntfs_inode *ni = NTFS_I(vi);
+
+	ntfs_debug("Entering.");
+	__ntfs_init_inode(vi->i_sb, ni);
+	ni->mft_no = vi->i_ino;
+}
+
+extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+		unsigned long mft_no);
+extern void ntfs_clear_extent_inode(ntfs_inode *ni);
+
+extern int ntfs_read_inode_mount(struct inode *vi);
+
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
+
+#ifdef NTFS_RW
+
+extern int ntfs_truncate(struct inode *vi);
+extern void ntfs_truncate_vfs(struct inode *vi);
+
+extern int ntfs_setattr(struct mnt_idmap *idmap,
+		struct dentry *dentry, struct iattr *attr);
+
+extern int __ntfs_write_inode(struct inode *vi, int sync);
+
+static inline void ntfs_commit_inode(struct inode *vi)
+{
+	if (!is_bad_inode(vi))
+		__ntfs_write_inode(vi, 1);
+	return;
+}
+
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
new file mode 100644
index 000000000000..5d4bf7a3259f
--- /dev/null
+++ b/fs/ntfs/layout.h
@@ -0,0 +1,2421 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_LAYOUT_H
+#define _LINUX_NTFS_LAYOUT_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+
+#include "types.h"
+
+/* The NTFS oem_id "NTFS    " */
+#define magicNTFS cpu_to_le64(0x202020205346544eULL)
+
+/*
+ * Location of bootsector on partition:
+ *	The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
+ *	On NT4 and above there is one backup copy of the boot sector to
+ *	be found on the last sector of the partition (not normally accessible
+ *	from within Windows as the bootsector contained number of sectors
+ *	value is one less than the actual value!).
+ *	On versions of NT 3.51 and earlier, the backup copy was located at
+ *	number of sectors/2 (integer divide), i.e. in the middle of the volume.
+ */
+
+/*
+ * BIOS parameter block (bpb) structure.
+ */
+typedef struct {
+	le16 bytes_per_sector;		/* Size of a sector in bytes. */
+	u8 sectors_per_cluster;		/* Size of a cluster in sectors. */
+	le16 reserved_sectors;		/* zero */
+	u8 fats;			/* zero */
+	le16 root_entries;		/* zero */
+	le16 sectors;			/* zero */
+	u8 media_type;			/* 0xf8 = hard disk */
+	le16 sectors_per_fat;		/* zero */
+	le16 sectors_per_track;		/* irrelevant */
+	le16 heads;			/* irrelevant */
+	le32 hidden_sectors;		/* zero */
+	le32 large_sectors;		/* zero */
+} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
+
+/*
+ * NTFS boot sector structure.
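+ * (The structure below must total exactly 512 bytes and end in the
+ * classic 0xaa55 boot signature; the boot sector validation in super.c
+ * additionally checks the "NTFS    " oem_id defined above.)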
+ */ +typedef struct { + u8 jump[3]; /* Irrelevant (jump to boot up code).*/ + le64 oem_id; /* Magic "NTFS ". */ + BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* zero, NTFS diskedit.exe states that + this is actually: + __u8 physical_drive; // 0x80 + __u8 current_head; // zero + __u8 extended_boot_signature; + // 0x80 + __u8 unused; // zero + */ +/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives + maximum volume size of 2^63 sectors. + Assuming standard sector size of 512 + bytes, the maximum byte size is + approx. 4.7x10^21 bytes. (-; */ + sle64 mft_lcn; /* Cluster location of mft data. */ + sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + s8 clusters_per_mft_record; /* Mft record size in clusters. */ + u8 reserved0[3]; /* zero */ + s8 clusters_per_index_record; /* Index block size in clusters. */ + u8 reserved1[3]; /* zero */ + le64 volume_serial_number; /* Irrelevant (serial number). */ + le32 checksum; /* Boot sector checksum. */ +/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ + le16 end_of_sector_marker; /* End of bootsector magic. Always is + 0xaa55 in little endian. */ +/* sizeof() = 512 (0x200) bytes */ +} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + +/* + * Magic identifiers present at the beginning of all ntfs record containing + * records (like mft records for example). + */ +enum { + /* Found in $MFT/$DATA. */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + + /* Found in $LogFile/$DATA. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ + + /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + + /* Found in all ntfs record containing records. */ + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector + transfer was detected. */ + /* + * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * thus not initialized. Page must be initialized before using it. + */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ +}; + +typedef le32 NTFS_RECORD_TYPE; + +/* + * Generic magic comparison macros. Finally found a use for the ## preprocessor + * operator! (-8 + */ + +static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +{ + return (x == r); +} +#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) + +static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +{ + return (*p == r); +} +#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) + +/* + * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. 
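+ * For example, ntfs_is_mft_record(m->magic) below is true iff the
+ * record begins with the ASCII magic "FILE", while
+ * ntfs_is_baad_recordp(&m->magic) detects a record whose multi sector
+ * transfer failed.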
+ */ +#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) +#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) +#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) +#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) +#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) +#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) +#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) +#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) + +#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) +#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) +#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) +#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) + +#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) +#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) + +#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) +#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) + +#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) +#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) + +/* + * The Update Sequence Array (usa) is an array of the le16 values which belong + * to the end of each sector protected by the update sequence record in which + * this array is contained. Note that the first entry is the Update Sequence + * Number (usn), a cyclic counter of how many times the protected record has + * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All + * last le16's of each sector have to be equal to the usn (during reading) or + * are set to it (during writing). If they are not, an incomplete multi sector + * transfer has occurred when the data was written. + * The maximum size for the update sequence array is fixed to: + * maximum size = usa_ofs + (usa_count * 2) = 510 bytes + * The 510 bytes comes from the fact that the last le16 in the array has to + * (obviously) finish before the last le16 of the first 512-byte sector. + * This formula can be used as a consistency check in that usa_ofs + + * (usa_count * 2) has to be less than or equal to 510. + */ +typedef struct { + NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record + type and/or status. */ + le16 usa_ofs; /* Offset to the Update Sequence Array (usa) + from the start of the ntfs record. */ + le16 usa_count; /* Number of le16 sized entries in the usa + including the Update Sequence Number (usn), + thus the number of fixups is the usa_count + minus 1. */ +} __attribute__ ((__packed__)) NTFS_RECORD; + +/* + * System files mft record numbers. All these files are always marked as used + * in the bitmap attribute of the mft; presumably in order to avoid accidental + * allocation for random other mft records. Also, the sequence number for each + * of the system files is always equal to their mft record number and it is + * never modified. + */ +typedef enum { + FILE_MFT = 0, /* Master file table (mft). Data attribute + contains the entries and bitmap attribute + records which ones are in use (bit==1). */ + FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records + in data attribute. If cluster size > 4kiB, + copy of first N mft records, with + N = cluster_size / mft_record_size. */ + FILE_LogFile = 2, /* Journalling log in data attribute. */ + FILE_Volume = 3, /* Volume name attribute and volume information + attribute (flags and ntfs version). Windows + refers to this file as volume DASD (Direct + Access Storage Device). 
*/ + FILE_AttrDef = 4, /* Array of attribute definitions in data + attribute. */ + FILE_root = 5, /* Root directory. */ + FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in + data attribute. */ + FILE_Boot = 7, /* Boot sector (always at cluster 0) in data + attribute. */ + FILE_BadClus = 8, /* Contains all bad clusters in the non-resident + data attribute. */ + FILE_Secure = 9, /* Shared security descriptors in data attribute + and two indexes into the descriptors. + Appeared in Windows 2000. Before that, this + file was named $Quota but was unused. */ + FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode + characters in data attribute. */ + FILE_Extend = 11, /* Directory containing other system files (eg. + $ObjId, $Quota, $Reparse and $UsnJrnl). This + is new to NTFS3.0. */ + FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ + FILE_reserved13 = 13, + FILE_reserved14 = 14, + FILE_reserved15 = 15, + FILE_first_user = 16, /* First user file, used as test limit for + whether to allow opening a file or not. */ +} NTFS_SYSTEM_FILES; + +/* + * These are the so far known MFT_RECORD_* flags (16-bit) which contain + * information about the mft record in which they are present. + */ +enum { + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), +} __attribute__ ((__packed__)); + +typedef le16 MFT_RECORD_FLAGS; + +/* + * mft references (aka file references or file record segment references) are + * used whenever a structure needs to refer to a record in the mft. + * + * A reference consists of a 48-bit index into the mft and a 16-bit sequence + * number used to detect stale references. + * + * For error reporting purposes we treat the 48-bit index as a signed quantity. + * + * The sequence number is a circular counter (skipping 0) describing how many + * times the referenced mft record has been (re)used. This has to match the + * sequence number of the mft record being referenced, otherwise the reference + * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * + * If the sequence number is zero it is assumed that no sequence number + * consistency checking should be performed. + * + * FIXME: Since inodes are 32-bit as of now, the driver needs to always check + * for high_part being 0 and if not either BUG(), cause a panic() or handle + * the situation in some other way. This shouldn't be a problem as a volume has + * to become HUGE in order to need more than 32-bits worth of mft records. + * Assuming the standard mft record size of 1kb only the records (never mind + * the non-resident attributes, etc.) would require 4Tb of space on their own + * for the first 32 bits worth of records. This is only if some strange person + * doesn't decide to foul play and make the mft sparse which would be a really + * horrible thing to do as it would trash our current driver implementation. )-: + * Do I hear screams "we want 64-bit inodes!" ?!? (-; + * + * FIXME: The mft zone is defined as the first 12% of the volume. This space is + * reserved so that the mft can grow contiguously and hence doesn't become + * fragmented. Volume free space includes the empty part of the mft zone and + * when the volume's free 88% are used up, the mft zone is shrunk by a factor + * of 2, thus making more space available for more files/data. This process is + * repeated every time there is no more free space except for the mft zone until + * there really is no more free space. 
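+ *
+ * As a worked example of the packing performed by the macros defined
+ * below: MK_MREF(0x10, 2) yields 0x0002000000000010, from which MREF()
+ * recovers the mft record number 0x10 and MSEQNO() the sequence
+ * number 2.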
+ */ + +/* + * Typedef the MFT_REF as a 64-bit value for easier handling. + * Also define two unpacking macros to get to the reference (MREF) and + * sequence number (MSEQNO) respectively. + * The _LE versions are to be applied on little endian MFT_REFs. + * Note: The _LE versions will return a CPU endian formatted value! + */ +#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) + +typedef u64 MFT_REF; +typedef le64 leMFT_REF; + +#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ + ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) + +#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) +#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) +#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) + +#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) +#define ERR_MREF(x) ((u64)((s64)(x))) +#define MREF_ERR(x) ((int)((s64)(x))) + +/* + * The mft record header present at the beginning of every record in the mft. + * This is followed by a sequence of variable length attribute records which + * is terminated by an attribute of type AT_END which is a truncated attribute + * in that it only consists of the attribute type code AT_END and none of the + * other members of the attribute structure are present. + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ +/* 42*/ le16 reserved; /* Reserved/alignment. */ +/* 44*/ le32 mft_record_number; /* Number of this mft record. */ +/* sizeof() = 48 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD; + +/* This is the version without the NTFS 3.1+ specific fields. */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. 
+ When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* sizeof() = 42 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD_OLD; + +/* + * System defined attributes (32-bit). Each attribute type has a corresponding + * attribute name (Unicode string of maximum 64 character length) as described + * by the attribute definitions present in the data attribute of the $AttrDef + * system file. On NTFS 3.0 volumes the names are just as the types are named + * in the below defines exchanging AT_ for the dollar sign ($). If that is not + * a revealing choice of symbol I do not know what is... (-; + */ +enum { + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) +}; + +typedef le32 ATTR_TYPE; + +/* + * The collation rules for sorting views/indexes/etc (32-bit). + * + * COLLATION_BINARY - Collate by binary compare where the first byte is most + * significant. + * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary + * Unicode values, except that when a character can be uppercased, the + * upper case value collates before the lower case one. + * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation + * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea + * what the difference is. Perhaps the difference is that file names + * would treat some special characters in an odd way (see + * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] + * for what I mean but COLLATION_UNICODE_STRING would not give any special + * treatment to any characters at all, but this is speculation. 
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * values. E.g. used for $SII index in FILE_Secure, which sorts by + * security_id (le32). + * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. + * E.g. used for $O index in FILE_Extend/$Quota. + * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash + * values and second by ascending security_id values. E.g. used for $SDH + * index in FILE_Secure. + * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending + * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * sorts by object_id (16-byte), by splitting up the object_id in four + * le32 values and using them as individual keys. E.g. take the following + * two security_ids, stored as follows on disk: + * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 + * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 + * To compare them, they are split into four le32 values each, like so: + * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 + * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 + * Now, it is apparent why the 2nd object_id collates after the 1st: the + * first le32 value of the 1st object_id is less than the first le32 of + * the 2nd object_id. If the first le32 values of both object_ids were + * equal then the second le32 values would be compared, etc. + */ +enum { + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), +}; + +typedef le32 COLLATION_RULE; + +/* + * The flags (32-bit) describing attribute properties in the attribute + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. + */ +enum { + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ +}; + +typedef le32 ATTR_DEF_FLAGS; + +/* + * The data attribute of FILE_AttrDef contains a sequence of attribute + * definitions for the NTFS volume. With this, it is supposed to be safe for an + * older NTFS driver to mount a volume containing a newer NTFS version without + * damaging it (that's the theory. In practice it's: not damaging it too much). + * Entries are sorted by attribute type. 
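(These are the min_size/max_size bounds that ntfs_attr_size_bounds_check(), called from ntfs_truncate() above, consults before permitting a size change.)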
The flags describe whether the + * attribute can be resident/non-resident and possibly other things, but the + * actual bits are unknown. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero + terminated. */ +/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ +/* 84*/ le32 display_rule; /* Default display rule. + FIXME: What does it mean? (AIA) */ +/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ +/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ +/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ +/* 98*/ sle64 max_size; /* Maximum size of attribute. */ +/* sizeof() = 0xa0 or 160 bytes */ +} __attribute__ ((__packed__)) ATTR_DEF; + +/* + * Attribute flags (16-bit). + */ +enum { + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method + mask. Also, first + illegal value. */ + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), +} __attribute__ ((__packed__)); + +typedef le16 ATTR_FLAGS; + +/* + * Attribute compression. + * + * Only the data attribute is ever compressed in the current ntfs driver in + * Windows. Further, compression is only applied when the data attribute is + * non-resident. Finally, to use compression, the maximum allowed cluster size + * on a volume is 4kib. + * + * The compression method is based on independently compressing blocks of X + * clusters, where X is determined from the compression_unit value found in the + * non-resident attribute record header (more precisely: X = 2^compression_unit + * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). + * + * There are three different cases of how a compression block of X clusters + * can be stored: + * + * 1) The data in the block is all zero (a sparse block): + * This is stored as a sparse block in the runlist, i.e. the runlist + * entry has length = X and lcn = -1. The mapping pairs array actually + * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at + * all, which is then interpreted by the driver as lcn = -1. + * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then + * the same principles apply as above, except that the length is not + * restricted to being any particular value. + * + * 2) The data in the block is not compressed: + * This happens when compression doesn't reduce the size of the block + * in clusters. I.e. if compression has a small effect so that the + * compressed data still occupies X clusters, then the uncompressed data + * is stored in the block. + * This case is recognised by the fact that the runlist entry has + * length = X and lcn >= 0. The mapping pairs array stores this as + * normal with a run length of X and some specific delta_lcn, i.e. + * delta_lcn has to be present. + * + * 3) The data in the block is compressed: + * The common case. This case is recognised by the fact that the run + * list entry has length L < X and lcn >= 0. The mapping pairs array + * stores this as normal with a run length of X and some specific + * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is + * immediately followed by a sparse entry with length = X - L and + * lcn = -1. The latter entry is to make up the vcn counting to the + * full compression block size X. + * + * In fact, life is more complicated because adjacent entries of the same type + * can be coalesced. 
This means that one has to keep track of the number of + * clusters handled and work on a basis of X clusters at a time being one + * block. An example: if length L > X this means that this particular runlist + * entry contains a block of length X and part of one or more blocks of length + * L - X. Another example: if length L < X, this does not necessarily mean that + * the block is compressed as it might be that the lcn changes inside the block + * and hence the following runlist entry describes the continuation of the + * potentially compressed block. The block would be compressed if the + * following runlist entry describes at least X - L sparse clusters, thus + * making up the compression block length as described in point 3 above. (Of + * course, there can be several runlist entries with small lengths so that the + * sparse entry does not follow the first data containing entry with + * length < X.) + * + * NOTE: At the end of the compressed attribute value, there most likely is not + * just the right amount of data to make up a compression block, thus this data + * is not even attempted to be compressed. It is just stored as is, unless + * the number of clusters it occupies is reduced when compressed in which case + * it is stored as a compressed compression block, complete with sparse + * clusters at the end. + */ + +/* + * Flags of resident attributes (8-bit). + */ +enum { + RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index + (has implications for deleting and + modifying the attribute). */ +} __attribute__ ((__packed__)); + +typedef u8 RESIDENT_ATTR_FLAGS; + +/* + * Attribute record header. Always aligned to 8-byte boundary. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ +/* 4*/ le32 length; /* Byte size of the resident part of the + attribute (aligned to 8-byte boundary). + Used to get to the next attribute. */ +/* 8*/ u8 non_resident; /* If 0, attribute is resident. + If 1, attribute is non-resident. */ +/* 9*/ u8 name_length; /* Unicode character size of name of attribute. + 0 if unnamed. */ +/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the + beginning of the name from the attribute + record. Note that the name is stored as a + Unicode string. When creating, place offset + just at the end of the record header. Then, + follow with attribute value or mapping pairs + array, resident and non-resident attributes + respectively, aligning to an 8-byte + boundary. */ +/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ +/* 14*/ le16 instance; /* The instance of this attribute record. This + number is unique within this mft record (see + MFT_RECORD/next_attribute_instance notes in + mft.h for more details). */ +/* 16*/ union { + /* Resident attributes. */ + struct { +/* 16 */ le32 value_length;/* Byte size of attribute value. */ +/* 20 */ le16 value_offset;/* Byte offset of the attribute + value from the start of the + attribute record. When creating, + align to 8-byte boundary if we + have a name present as this might + not have a length of a multiple + of 8-bytes. */ +/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ +/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) resident; + /* Non-resident attributes. */ + struct { +/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number + for this portion of the attribute value or + 0 if this is the only extent (usually the + case). 
- Only when an attribute list is used + does lowest_vcn != 0 ever occur. */ +/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of + the attribute value. - Usually there is only one + portion, so this usually equals the attribute + value size in clusters minus 1. Can be -1 for + zero length files. Can be 0 for "single extent" + attributes. */ +/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the + beginning of the structure to the mapping pairs + array which contains the mappings between the + vcns and the logical cluster numbers (lcns). + When creating, place this at the end of this + record header aligned to 8-byte boundary. */ +/* 34*/ u8 compression_unit; /* The compression unit expressed + as the log to the base 2 of the number of + clusters in a compression unit. 0 means not + compressed. (This effectively limits the + compression unit size to be a power of two + clusters.) WinNT4 only uses a value of 4. + Sparse files have this set to 0 on XPSP2. */ +/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ +/* The sizes below are only used when lowest_vcn is zero, as otherwise it would + be difficult to keep them up-to-date.*/ +/* 40*/ sle64 allocated_size; /* Byte size of disk space + allocated to hold the attribute value. Always + is a multiple of the cluster size. When a file + is compressed, this field is a multiple of the + compression block size (2^compression_unit) and + it represents the logically allocated space + rather than the actual on disk usage. For this + use the compressed_size (see below). */ +/* 48*/ sle64 data_size; /* Byte size of the attribute + value. Can be larger than allocated_size if + attribute value is compressed or sparse. */ +/* 56*/ sle64 initialized_size; /* Byte size of initialized + portion of the attribute value. Usually equals + data_size. */ +/* sizeof(uncompressed attr) = 64*/ +/* 64*/ sle64 compressed_size; /* Byte size of the attribute + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ +/* sizeof(compressed attr) = 72*/ + } __attribute__ ((__packed__)) non_resident; + } __attribute__ ((__packed__)) data; +} __attribute__ ((__packed__)) ATTR_RECORD; + +typedef ATTR_RECORD ATTR_REC; + +/* + * File attribute flags (32-bit) appearing in the file_attributes fields of the + * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR + * attributes of MFT_RECORDs and directory index entries. + * + * All of the below flags appear in the directory index entries but only some + * appear in the STANDARD_INFORMATION attribute whilst only some others appear + * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the + * flags appear in all of the above. + */ +enum { + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ + + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), + /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + reserved for the DOS SUBDIRECTORY flag. 
*/ + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), + + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), + + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), + + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), + /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + FILE_ATTR_DEVICE and preserves everything else. This mask is used + to obtain all flags that are valid for reading. */ + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), + /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + is used to obtain all flags that are valid for setting. */ + /* + * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all + * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION + * attribute of an mft record. + */ + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this is a directory or not, i.e. whether it has + an index root attribute or not. */ + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this file has a view index present (eg. object id + index, quota index, one of the security indexes or the encrypting + filesystem related indexes). */ +}; + +typedef le32 FILE_ATTR_FLAGS; + +/* + * NOTE on times in NTFS: All times are in MS standard time format, i.e. they + * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 + * universal coordinated time (UTC). (In Linux time starts 1st January 1970, + * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) + */ + +/* + * Attribute: Standard information (0x10). + * + * NOTE: Always resident. + * NOTE: Present in all base file records on a volume. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*Ofs*/ +/* 0*/ sle64 creation_time; /* Time file was created. Updated when + a filename is changed(?). */ +/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 24*/ sle64 last_access_time; /* Approximate time when the file was + last accessed (obviously this is not + updated on read-only volumes). In + Windows this is only updated when + accessed if some time delta has + passed since the last update. Also, + last access time updates can be + disabled altogether for speed. */ +/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 36*/ union { + /* NTFS 1.2 */ + struct { + /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte + boundary. 
*/ + } __attribute__ ((__packed__)) v1; + /* sizeof() = 48 bytes */ + /* NTFS 3.x */ + struct { +/* + * If a volume has been upgraded from a previous NTFS version, then these + * fields are present only if the file has been accessed since the upgrade. + * Recognize the difference by comparing the length of the resident attribute + * value. If it is 48, then the following fields are missing. If it is 72 then + * the fields are present. Maybe just check like this: + * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * Assume NTFS 1.2- format. + * If (volume version is 3.x) + * Upgrade attribute to NTFS 3.x format. + * else + * Use NTFS 1.2- format for access. + * } else + * Use NTFS 3.x format for access. + * Only problem is that it might be legal to set the length of the value to + * arbitrarily large values thus spoiling this check. - But chkdsk probably + * views that as a corruption, assuming that it behaves like this for all + * attributes. + */ + /* 36*/ le32 maximum_versions; /* Maximum allowed versions for + file. Zero if version numbering is disabled. */ + /* 40*/ le32 version_number; /* This file's version (if any). + Set to zero if maximum_versions is zero. */ + /* 44*/ le32 class_id; /* Class id from bidirectional + class id index (?). */ + /* 48*/ le32 owner_id; /* Owner_id of the user owning + the file. Translate via $Q index in FILE_Extend + /$Quota to the quota control entry for the user + owning the file. Zero if quotas are disabled. */ + /* 52*/ le32 security_id; /* Security_id for the file. + Translate via $SII index and $SDS data stream + in FILE_Secure to the security descriptor. */ + /* 56*/ le64 quota_charged; /* Byte size of the charge to + the quota for all streams of the file. Note: Is + zero if quotas are disabled. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ + } __attribute__ ((__packed__)) v3; + /* sizeof() = 72 bytes (NTFS 3.x) */ + } __attribute__ ((__packed__)) ver; +} __attribute__ ((__packed__)) STANDARD_INFORMATION; + +/* + * Attribute: Attribute list (0x20). + * + * - Can be either resident or non-resident. + * - Value consists of a sequence of variable length, 8-byte aligned, + * ATTR_LIST_ENTRY records. + * - The list is not terminated by anything at all! The only way to know when + * the end is reached is to keep track of the current offset and compare it to + * the attribute value size (a walking sketch follows below). + * - The attribute list attribute contains one entry for each attribute of + * the file in which the list is located, except for the list attribute + * itself. The list is sorted: first by attribute type, second by attribute + * name (if present), third by instance number. The extents of one + * non-resident attribute (if present) immediately follow after the initial + * extent. They are ordered by lowest_vcn and have their instance set to zero. + * It is not allowed to have two attributes with all sorting keys equal. + * - Further restrictions: + * - If not resident, the vcn to lcn mapping array has to fit inside the + * base mft record. + * - The attribute list attribute value has a maximum size of 256kb. This + * is imposed by the Windows cache manager.
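+ *
+ * The walking sketch referred to above (a hand-rolled loop for illustration
+ * only, not a helper this driver provides), with value pointing at the
+ * attribute list value and value_len being its byte size:
+ *
+ *	u8 *p = value;
+ *
+ *	while (p < value + value_len) {
+ *		ATTR_LIST_ENTRY *e = (ATTR_LIST_ENTRY*)p;
+ *
+ *		... process entry e ...
+ *		p += le16_to_cpu(e->length);
+ *	}
+ *
+ * Each entry length is 8-byte aligned, so p stays correctly aligned, and the
+ * loop ends exactly when the accumulated offset reaches the value size.
+ *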
+ * - Attribute lists are only used when the attributes of an mft record do not + * fit inside the mft record despite all attributes (that can be made + * non-resident) having been made non-resident. This can happen e.g. when: + * - File has a large number of hard links (lots of file name + * attributes present). + * - The mapping pairs array of some non-resident attribute becomes so + * large due to fragmentation that it overflows the mft record. + * - The security descriptor is very complex (not applicable to + * NTFS 3.0 volumes). + * - There are many named streams. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ +/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ +/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the + attribute or 0 if unnamed. */ +/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name + (always set this to where the name would + start even if unnamed). */ +/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion + of the attribute value. This is usually 0. It + is non-zero for the case where one attribute + does not fit into one mft record and thus + several mft records are allocated to hold + this attribute. In the latter case, each mft + record holds one extent of the attribute and + there is one attribute list entry for each + extent. NOTE: This is DEFINITELY a signed + value! The windows driver uses cmp, followed + by jg when comparing this, thus it treats it + as signed. */ +/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding + the ATTR_RECORD for this portion of the + attribute value. */ +/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the + attribute being referenced; otherwise 0. */ +/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use + name_offset to determine the location of the + name. */ +/* sizeof() = 26 + (attribute_name_length * 2) bytes */ +} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; + +/* + * The maximum allowed length for a file name. + */ +#define MAXIMUM_FILE_NAME_LENGTH 255 + +/* + * Possible namespaces for filenames in ntfs (8-bit). + */ +enum { + FILE_NAME_POSIX = 0x00, + /* This is the largest namespace. It is case sensitive and allows all + Unicode characters except for: '\0' and '/'. Beware that in + WinNT/2k/2003 by default files which e.g. have the same name except + for their case will not be distinguished by the standard utilities + and thus a "del filename" will delete both "filename" and "fileName" + without warning. However if for example Services For Unix (SFU) are + installed and the case sensitive option was enabled at installation + time, then you can create/access/delete such files. + Note that even SFU places restrictions on the filenames beyond the + '\0' and '/' and in particular the following set of characters is + not allowed: '"', '/', '<', '>', '\'. All other characters, + including the ones not allowed in the WIN32 namespace, are allowed. + Tested with SFU 3.5 (this is now free) running on Windows XP. */ + FILE_NAME_WIN32 = 0x01, + /* The standard WinNT/2k NTFS long filenames. Case insensitive. All + Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + and '|'. Further, names cannot end with a '.' or a space. */ + FILE_NAME_DOS = 0x02, + /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + characters greater than space, except: '"', '*', '+', ',', '/', ':', ';', + '<', '=', '>', '?', and '\'.
*/ + FILE_NAME_WIN32_AND_DOS = 0x03, + /* 3 means that both the Win32 and the DOS filenames are identical and + hence have been saved in this single filename record. */ +} __attribute__ ((__packed__)); + +typedef u8 FILE_NAME_TYPE_FLAGS; + +/* + * Attribute: Filename (0x30). + * + * NOTE: Always resident. + * NOTE: All fields, except the parent_directory, are only updated when the + * filename is changed. Until then, they just become out of sync with + * reality and the more up to date values are present in the standard + * information attribute. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ leMFT_REF parent_directory; /* Directory this filename is + referenced from. */ +/* 8*/ sle64 creation_time; /* Time file was created. */ +/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 20*/ sle64 last_access_time; /* Time this mft record was last + accessed. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the unnamed data attribute. So + for normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. NOTE: + This is a multiple of the cluster + size. */ +/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed + data attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. */ +/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 3c*/ union { + /* 3c*/ struct { + /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to + pack the extended attributes + (EAs), if such are present.*/ + /* 3e*/ le16 reserved; /* Reserved for alignment. */ + } __attribute__ ((__packed__)) ea; + /* 3c*/ struct { + /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, + present only in reparse + points and only if there are + no EAs. */ + } __attribute__ ((__packed__)) rp; + } __attribute__ ((__packed__)) type; +/* 40*/ u8 file_name_length; /* Length of file name in + (Unicode) characters. */ +/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ +/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ +} __attribute__ ((__packed__)) FILE_NAME_ATTR; + +/* + * GUID structures store globally unique identifiers (GUID). A GUID is a + * 128-bit value consisting of one group of eight hexadecimal digits, followed + * by three groups of four hexadecimal digits each, followed by one group of + * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the + * distributed computing environment (DCE) universally unique identifier (UUID). + * Example of a GUID: + * 1F010768-5A73-BC91-0010A52216A7 + */ +typedef struct { + le32 data1; /* The first eight hexadecimal digits of the GUID. */ + le16 data2; /* The first group of four hexadecimal digits. */ + le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* The first two bytes are the third group of four + hexadecimal digits. The remaining six bytes are the + final 12 hexadecimal digits. 
*/ +} __attribute__ ((__packed__)) GUID; + +/* + * FILE_Extend/$ObjId contains an index named $O. This index contains all + * object_ids present on the volume as the index keys and the corresponding + * mft_record numbers as the index entry data parts. The data part (defined + * below) also contains three other object_ids: + * birth_volume_id - object_id of FILE_Volume on which the file was first + * created. Optional (i.e. can be zero). + * birth_object_id - object_id of file when it was first created. Usually + * equals the object_id. Optional (i.e. can be zero). + * domain_id - Reserved (always zero). + */ +typedef struct { + leMFT_REF mft_reference;/* Mft record containing the object_id in + the index entry key. */ + union { + struct { + GUID birth_volume_id; + GUID birth_object_id; + GUID domain_id; + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; + +/* + * Attribute: Object id (NTFS 3.0+) (0x40). + * + * NOTE: Always resident. + */ +typedef struct { + GUID object_id; /* Unique id assigned to the + file.*/ + /* The following fields are optional. The attribute value size is 16 + bytes, i.e. sizeof(GUID), if these are not present at all. Note, + the entries can be present but one or more (or all) can be zero + meaning that that particular value(s) is(are) not defined. */ + union { + struct { + GUID birth_volume_id; /* Unique id of volume on which + the file was first created.*/ + GUID birth_object_id; /* Unique id of file when it was + first created. */ + GUID domain_id; /* Reserved, zero. */ + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJECT_ID_ATTR; + +/* + * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in + * the SID structure (see below). + */ +//typedef enum { /* SID string prefix. */ +// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ +// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ +// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ +// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ +// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ +// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ +//} IDENTIFIER_AUTHORITIES; + +/* + * These relative identifiers (RIDs) are used with the above identifier + * authorities to make up universal well-known SIDs. + * + * Note: The relative identifier (RID) refers to the portion of a SID, which + * identifies a user or group in relation to the authority that issued the SID. + * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is + * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and + * the relative identifier SECURITY_CREATOR_OWNER_RID (0). + */ +typedef enum { /* Identifier authority. 
*/ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ + + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID=9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, + + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, + + SECURITY_LOCAL_SYSTEM_RID = 0x12, + + SECURITY_NT_NON_UNIQUE = 0x15, + + SECURITY_BUILTIN_DOMAIN_RID = 0x20, + + /* + * Well-known domain relative sub-authority values (RIDs). + */ + + /* Users. */ + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, + + /* Groups. */ + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + + /* Aliases. */ + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +} RELATIVE_IDENTIFIERS; + +/* + * The universal well-known SIDs: + * + * NULL_SID S-1-0-0 + * WORLD_SID S-1-1-0 + * LOCAL_SID S-1-2-0 + * CREATOR_OWNER_SID S-1-3-0 + * CREATOR_GROUP_SID S-1-3-1 + * CREATOR_OWNER_SERVER_SID S-1-3-2 + * CREATOR_GROUP_SERVER_SID S-1-3-3 + * + * (Non-unique IDs) S-1-4 + * + * NT well-known SIDs: + * + * NT_AUTHORITY_SID S-1-5 + * DIALUP_SID S-1-5-1 + * + * NETWORK_SID S-1-5-2 + * BATCH_SID S-1-5-3 + * INTERACTIVE_SID S-1-5-4 + * SERVICE_SID S-1-5-6 + * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) + * PROXY_SID S-1-5-8 + * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) + * SELF_SID S-1-5-10 (self RID) + * AUTHENTICATED_USER_SID S-1-5-11 + * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) + * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) + * + * (Logon IDs) S-1-5-5-X-Y + * + * (NT non-unique IDs) S-1-5-0x15-... + * + * (Built-in domain) S-1-5-0x20 + */ + +/* + * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. + * + * NOTE: This is stored as a big endian number, hence the high_part comes + * before the low_part. + */ +typedef union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __attribute__ ((__packed__)) parts; + u8 value[6]; /* Value as individual bytes. */ +} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; + +/* + * The SID structure is a variable-length structure used to uniquely identify + * users or groups. SID stands for security identifier. + * + * The standard textual representation of the SID is of the form: + * S-R-I-S-S...
+ * Where: + * - The first "S" is the literal character 'S' identifying the following + * digits as a SID. + * - R is the revision level of the SID expressed as a sequence of digits + * either in decimal or hexadecimal (if the latter, prefixed by "0x"). + * - I is the 48-bit identifier_authority, expressed as digits as R above. + * - S... is one or more sub_authority values, expressed as digits as above. + * + * Example SID; the domain-relative SID of the local Administrators group on + * Windows NT/2k: + * S-1-5-32-544 + * This translates to a SID with: + * revision = 1, + * sub_authority_count = 2, + * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY + * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID + * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS + */ +typedef struct { + u8 revision; + u8 sub_authority_count; + SID_IDENTIFIER_AUTHORITY identifier_authority; + le32 sub_authority[1]; /* At least one sub_authority. */ +} __attribute__ ((__packed__)) SID; + +/* + * Current constants for SIDs. + */ +typedef enum { + SID_REVISION = 1, /* Current revision level. */ + SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ + SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in + a future revision. */ +} SID_CONSTANTS; + +/* + * The predefined ACE types (8-bit, see below). + */ +enum { + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, + + ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, + + /* The following are Win2k only. */ + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + + ACCESS_MAX_MS_V4_ACE_TYPE = 8, + + /* This one is for WinNT/2k. */ + ACCESS_MAX_MS_ACE_TYPE = 8, +} __attribute__ ((__packed__)); + +typedef u8 ACE_TYPES; + +/* + * The ACE flags (8-bit) for audit and inheritance (see below). + * + * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE + * types to indicate that a message is generated (in Windows!) for successful + * accesses. + * + * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types + * to indicate that a message is generated (in Windows!) for failed accesses. + */ +enum { + /* The inheritance flags. */ + OBJECT_INHERIT_ACE = 0x01, + CONTAINER_INHERIT_ACE = 0x02, + NO_PROPAGATE_INHERIT_ACE = 0x04, + INHERIT_ONLY_ACE = 0x08, + INHERITED_ACE = 0x10, /* Win2k only. */ + VALID_INHERIT_FLAGS = 0x1f, + + /* The audit flags. */ + SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, + FAILED_ACCESS_ACE_FLAG = 0x80, +} __attribute__ ((__packed__)); + +typedef u8 ACE_FLAGS; + +/* + * An ACE is an access-control entry in an access-control list (ACL). + * An ACE defines access to an object for a specific user or group or defines + * the types of access that generate system-administration messages or alarms + * for a specific user or group. The user or group is identified by a security + * identifier (SID). + * + * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), + * which specifies the type and size of the ACE. The format of the subsequent + * data depends on the ACE type. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ACE_TYPES type; /* Type of the ACE. */ +/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE.
*/ +/* 2*/ le16 size; /* Size in bytes of the ACE. */ +} __attribute__ ((__packed__)) ACE_HEADER; + +/* + * The access mask (32-bit). Defines the access rights. + * + * The specific rights (bits 0 to 15). These depend on the type of the object + * being secured by the ACE. + */ +enum { + /* Specific rights for files and directories are as follows: */ + + /* Right to read data from the file. (FILE) */ + FILE_READ_DATA = cpu_to_le32(0x00000001), + /* Right to list contents of a directory. (DIRECTORY) */ + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), + + /* Right to write data to the file. (FILE) */ + FILE_WRITE_DATA = cpu_to_le32(0x00000002), + /* Right to create a file in the directory. (DIRECTORY) */ + FILE_ADD_FILE = cpu_to_le32(0x00000002), + + /* Right to append data to the file. (FILE) */ + FILE_APPEND_DATA = cpu_to_le32(0x00000004), + /* Right to create a subdirectory. (DIRECTORY) */ + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), + + /* Right to read extended attributes. (FILE/DIRECTORY) */ + FILE_READ_EA = cpu_to_le32(0x00000008), + + /* Right to write extended attributes. (FILE/DIRECTORY) */ + FILE_WRITE_EA = cpu_to_le32(0x00000010), + + /* Right to execute a file. (FILE) */ + FILE_EXECUTE = cpu_to_le32(0x00000020), + /* Right to traverse the directory. (DIRECTORY) */ + FILE_TRAVERSE = cpu_to_le32(0x00000020), + + /* + * Right to delete a directory and all the files it contains (its + * children), even if the files are read-only. (DIRECTORY) + */ + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), + + /* Right to read file attributes. (FILE/DIRECTORY) */ + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), + + /* Right to change file attributes. (FILE/DIRECTORY) */ + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), + + /* + * The standard rights (bits 16 to 23). These are independent of the + * type of object being secured. + */ + + /* Right to delete the object. */ + DELETE = cpu_to_le32(0x00010000), + + /* + * Right to read the information in the object's security descriptor, + * not including the information in the SACL, i.e. right to read the + * security descriptor and owner. + */ + READ_CONTROL = cpu_to_le32(0x00020000), + + /* Right to modify the DACL in the object's security descriptor. */ + WRITE_DAC = cpu_to_le32(0x00040000), + + /* Right to change the owner in the object's security descriptor. */ + WRITE_OWNER = cpu_to_le32(0x00080000), + + /* + * Right to use the object for synchronization. Enables a process to + * wait until the object is in the signalled state. Some object types + * do not support this access right. + */ + SYNCHRONIZE = cpu_to_le32(0x00100000), + + /* + * The following STANDARD_RIGHTS_* are combinations of the above for + * convenience and are defined by the Win32 API. + */ + + /* These are currently defined to READ_CONTROL. */ + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), + + /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), + + /* + * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and + * SYNCHRONIZE access. + */ + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), + + /* + * The access system ACL and maximum allowed access types (bits 24 to + * 25, bits 26 to 27 are reserved). + */ + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), + + /* + * The generic rights (bits 28 to 31). 
These map onto the standard and + * specific rights. + */ + + /* Read, write, and execute access. */ + GENERIC_ALL = cpu_to_le32(0x10000000), + + /* Execute access. */ + GENERIC_EXECUTE = cpu_to_le32(0x20000000), + + /* + * Write access. For files, this maps onto: + * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | + * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_WRITE = cpu_to_le32(0x40000000), + + /* + * Read access. For files, this maps onto: + * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | + * STANDARD_RIGHTS_READ | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_READ = cpu_to_le32(0x80000000), +}; + +typedef le32 ACCESS_MASK; + +/* + * The generic mapping array. Used to denote the mapping of each generic + * access right to a specific access mask. + * + * FIXME: What exactly is this and what is it for? (AIA) + */ +typedef struct { + ACCESS_MASK generic_read; + ACCESS_MASK generic_write; + ACCESS_MASK generic_execute; + ACCESS_MASK generic_all; +} __attribute__ ((__packed__)) GENERIC_MAPPING; + +/* + * The predefined ACE type structures are as defined below. + */ + +/* + * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE + */ +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, + SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; + +/* + * The object ACE flags (32-bit). + */ +enum { + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), +}; + +typedef le32 OBJECT_ACE_FLAGS; + +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ +/* 12*/ GUID object_type; +/* 28*/ GUID inherited_object_type; + +/* 44*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, + ACCESS_DENIED_OBJECT_ACE, + SYSTEM_AUDIT_OBJECT_ACE, + SYSTEM_ALARM_OBJECT_ACE; + +/* + * An ACL is an access-control list (ACL). + * An ACL starts with an ACL header structure, which specifies the size of + * the ACL and the number of ACEs it contains. The ACL header is followed by + * zero or more access control entries (ACEs). The ACL as well as each ACE + * are aligned on 4-byte boundaries. + */ +typedef struct { + u8 revision; /* Revision of this ACL. */ + u8 alignment1; + le16 size; /* Allocated space in bytes for ACL. Includes this + header, the ACEs and the remaining free space. */ + le16 ace_count; /* Number of ACEs in the ACL. */ + le16 alignment2; +/* sizeof() = 8 bytes */ +} __attribute__ ((__packed__)) ACL; + +/* + * Current constants for ACLs. + */ +typedef enum { + /* Current revision. */ + ACL_REVISION = 2, + ACL_REVISION_DS = 4, + + /* History of revisions.
*/ + ACL_REVISION1 = 1, + MIN_ACL_REVISION = 2, + ACL_REVISION2 = 2, + ACL_REVISION3 = 3, + ACL_REVISION4 = 4, + MAX_ACL_REVISION = 4, +} ACL_CONSTANTS; + +/* + * The security descriptor control flags (16-bit). + * + * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID + * pointed to by the Owner field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the SID with + * respect to inheritance of an owner. + * + * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in + * the Group field was provided by a defaulting mechanism rather than + * explicitly provided by the original provider of the security + * descriptor. This may affect the treatment of the SID with respect to + * inheritance of a primary group. + * + * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a discretionary ACL. If this flag is set and the + * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is + * explicitly being specified. + * + * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Dacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * DaclPresent flag is not set. + * + * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a system ACL pointed to by the Sacl field. If this + * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then + * an empty (but present) ACL is being specified. + * + * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Sacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * SaclPresent flag is not set. + * + * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security + * descriptor is in self-relative form. In this form, all fields of the + * security descriptor are contiguous in memory and all pointer fields are + * expressed as offsets from the beginning of the security descriptor. + */ +enum { + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) +} __attribute__ ((__packed__)); + +typedef le16 SECURITY_DESCRIPTOR_CONTROL; + +/* + * Self-relative security descriptor. Contains the owner and group SIDs as well + * as the sacl and dacl ACLs inside the security descriptor itself. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. 
*/ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + le32 owner; /* Byte offset to a SID representing an object's + owner. If this is NULL, no owner SID is present in + the descriptor. */ + le32 group; /* Byte offset to a SID representing an object's + primary group. If this is NULL, no primary group + SID is present in the descriptor. */ + le32 sacl; /* Byte offset to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +/* sizeof() = 0x14 bytes */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; + +/* + * Absolute security descriptor. Does not contain the owner and group SIDs, nor + * the sacl and dacl ACLs inside the security descriptor. Instead, it contains + * pointers to these structures in memory. Obviously, absolute security + * descriptors are only useful for in memory representations of security + * descriptors. On disk, a self-relative security descriptor is used. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + SID *owner; /* Points to a SID representing an object's owner. If + this is NULL, no owner SID is present in the + descriptor. */ + SID *group; /* Points to a SID representing an object's primary + group. If this is NULL, no primary group SID is + present in the descriptor. */ + ACL *sacl; /* Points to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + ACL *dacl; /* Points to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; + +/* + * Current constants for security descriptors. + */ +typedef enum { + /* Current revision. */ + SECURITY_DESCRIPTOR_REVISION = 1, + SECURITY_DESCRIPTOR_REVISION1 = 1, + + /* The sizes of the absolute and relative security descriptors are the + same as long as pointers are 32-bit, as on the ia32 architecture. */ + SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), +} SECURITY_DESCRIPTOR_CONSTANTS; + +/* + * Attribute: Security descriptor (0x50). A standard self-relative security + * descriptor. + * + * NOTE: Can be resident or non-resident. + * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally + * in FILE_Secure and the correct descriptor is found using the security_id + * from the standard information attribute. + */ +typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + +/* + * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one + * referenced instance of each unique security descriptor is stored. + * + * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It + * does, however, contain two indexes ($SDH and $SII) as well as a named data + * stream ($SDS).
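+ *
+ * In code terms, the allocation flow described below could be sketched like
+ * this (pure pseudo-code; the helper names are invented for illustration):
+ *
+ *	hash = sd_hash(sd, sd_len);
+ *	for each $SDH entry e whose key hash equals hash:
+ *		read the descriptor at e.offset in the $SDS stream;
+ *		if it equals sd byte for byte:
+ *			return e.security_id;
+ *	allocate a new security_id, append sd to $SDS, and
+ *	insert matching entries into the $SDH and $SII indexes;
+ *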
+ * + * Every unique security descriptor is assigned a unique security identifier + * (security_id, not to be confused with a SID). The security_id is unique for + * the NTFS volume and is used as an index into the $SII index, which maps + * security_ids to the security descriptor's storage location within the $SDS + * data attribute. The $SII index is sorted by ascending security_id. + * + * A simple hash is computed from each security descriptor. This hash is used + * as an index into the $SDH index, which maps security descriptor hashes to + * the security descriptor's storage location within the $SDS data attribute. + * The $SDH index is sorted by security descriptor hash and is stored in a B+ + * tree. When searching $SDH (with the intent of determining whether or not a + * new security descriptor is already present in the $SDS data stream), if a + * matching hash is found, but the security descriptors do not match, the + * search in the $SDH index is continued, searching for a next matching hash. + * + * When a precise match is found, the security_id corresponding to the security + * descriptor in the $SDS attribute is read from the found $SDH index entry and + * is stored in the $STANDARD_INFORMATION attribute of the file/directory to + * which the security descriptor is being applied. The $STANDARD_INFORMATION + * attribute is present in all base mft records (i.e. in all files and + * directories). + * + * If a match is not found, the security descriptor is assigned a new unique + * security_id and is added to the $SDS data attribute. Then, entries + * referencing this security descriptor in the $SDS data attribute are + * added to the $SDH and $SII indexes. + * + * Note: Entries are never deleted from FILE_Secure, even if nothing + * references an entry any more. + */ + +/* + * This header precedes each security descriptor in the $SDS data stream. + * This is also the index entry data part of both the $SII and $SDH indexes. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; + +/* + * The $SDS data stream contains the security descriptors, aligned on 16-byte + * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot + * cross 256kib boundaries (this restriction is imposed by the Windows cache + * manager). Each security descriptor is contained in a SDS_ENTRY structure. + * Also, each security descriptor is stored twice in the $SDS stream with a + * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) + * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the + * first copy of the security descriptor will be at offset 0x51d0 in the + * $SDS data stream and the second copy will be at offset 0x451d0. + */ +typedef struct { +/*Ofs*/ +/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like + unnamed structs. */ + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security + descriptor.
*/ +} __attribute__ ((__packed__)) SDS_ENTRY; + +/* + * The index entry key used in the $SII index. The collation type is + * COLLATION_NTOFS_ULONG. + */ +typedef struct { + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SII_INDEX_KEY; + +/* + * The index entry key used in the $SDH index. The keys are sorted first by + * hash and then by security_id. The collation rule is + * COLLATION_NTOFS_SECURITY_HASH. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SDH_INDEX_KEY; + +/* + * Attribute: Volume name (0x60). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + */ +typedef struct { + ntfschar name[0]; /* The name of the volume in Unicode. */ +} __attribute__ ((__packed__)) VOLUME_NAME; + +/* + * Possible flags for the volume (16-bit). + */ +enum { + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), + + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), + + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), + + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), + + /* To make our life easier when checking if we must mount read-only. */ + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), +} __attribute__ ((__packed__)); + +typedef le16 VOLUME_FLAGS; + +/* + * Attribute: Volume information (0x70). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses + * NTFS 1.2. I haven't personally seen other values yet. + */ +typedef struct { + le64 reserved; /* Not used (yet?). */ + u8 major_ver; /* Major version of the ntfs format. */ + u8 minor_ver; /* Minor version of the ntfs format. */ + VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ +} __attribute__ ((__packed__)) VOLUME_INFORMATION; + +/* + * Attribute: Data attribute (0x80). + * + * NOTE: Can be resident or non-resident. + * + * Data contents of a file (i.e. the unnamed stream) or of a named stream. + */ +typedef struct { + u8 data[0]; /* The file's data contents. */ +} __attribute__ ((__packed__)) DATA_ATTR; + +/* + * Index header flags (8-bit). + */ +enum { + /* + * When index header is in an index root attribute: + */ + SMALL_INDEX = 0, /* The index is small enough to fit inside the index + root attribute and there is no index allocation + attribute present. */ + LARGE_INDEX = 1, /* The index is too large to fit in the index root + attribute and/or an index allocation attribute is + present. */ + /* + * When index header is in an index block, i.e. is part of index + * allocation attribute: + */ + LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes + branching off it. */ + INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf + node. */ + NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ +} __attribute__ ((__packed__)); + +typedef u8 INDEX_HEADER_FLAGS; + +/* + * This is the header for indexes, describing the INDEX_ENTRY records, which + * follow the INDEX_HEADER. Together the index header and the index entries + * make up a complete index. 
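+ *
+ * For example (a sketch only, using the le32_to_cpu() helper and the
+ * INDEX_ENTRY type defined further below), given an INDEX_HEADER *ih, the
+ * first index entry is located relative to the header itself, per the
+ * important note that follows:
+ *
+ *	INDEX_ENTRY *ie = (INDEX_ENTRY*)((u8*)ih +
+ *			le32_to_cpu(ih->entries_offset));
+ *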
+ * + * IMPORTANT NOTE: The offset, length and size structure members are counted + * relative to the start of the index header structure and not relative to the + * start of the index root or index allocation structures themselves. + */ +typedef struct { + le32 entries_offset; /* Byte offset to first INDEX_ENTRY + aligned to 8-byte boundary. */ + le32 index_length; /* Data size of the index in bytes, + i.e. bytes used from allocated + size, aligned to 8-byte boundary. */ + le32 allocated_size; /* Byte size of this index (block), + multiple of 8 bytes. */ + /* NOTE: For the index root attribute, the above two numbers are always + equal, as the attribute is resident and it is resized as needed. In + the case of the index allocation attribute the attribute is not + resident and hence the allocated_size is a fixed value and must + equal the index_block_size specified by the INDEX_ROOT attribute + corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + belongs to. */ + INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ +} __attribute__ ((__packed__)) INDEX_HEADER; + +/* + * Attribute: Index root (0x90). + * + * NOTE: Always resident. + * + * This is followed by a sequence of index entries (INDEX_ENTRY structures) + * as described by the index header. + * + * When a directory is small enough to fit inside the index root then this + * is the only attribute describing the directory. When the directory is too + * large to fit in the index root, on the other hand, two additional attributes + * are present: an index allocation attribute, containing sub-nodes of the B+ + * directory tree (see below), and a bitmap attribute, describing which virtual + * cluster numbers (vcns) in the index allocation attribute are in use by an + * index block. + * + * NOTE: The root directory (FILE_root) contains an entry for itself. Other + * directories do not contain entries for themselves, though. + */ +typedef struct { + ATTR_TYPE type; /* Type of the indexed attribute. Is + $FILE_NAME for directories, zero + for view indexes. No other values + allowed. */ + COLLATION_RULE collation_rule; /* Collation rule used to sort the + index entries. If type is $FILE_NAME, + this must be COLLATION_FILE_NAME. */ + le32 index_block_size; /* Size of each index block in bytes (in + the index allocation attribute). */ + u8 clusters_per_index_block; /* Cluster size of each index block (in + the index allocation attribute), when + an index block is >= than a cluster, + otherwise this will be the log of + the size (like how the encoding of + the mft record size and the index + record size found in the boot sector + work). Has to be a power of 2. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ + INDEX_HEADER index; /* Index header describing the + following index entries. */ +} __attribute__ ((__packed__)) INDEX_ROOT; + +/* + * Attribute: Index allocation (0xa0). + * + * NOTE: Always non-resident (doesn't make sense to be resident anyway!). + * + * This is an array of index blocks. Each index block starts with an + * INDEX_BLOCK structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. + */ +typedef struct { +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ + le16 usa_ofs; /* See NTFS_RECORD definition. */ + le16 usa_count; /* See NTFS_RECORD definition. 
*/ + +/* 8*/ sle64 lsn; /* $LogFile sequence number of the last + modification of this index block. */ +/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. + If the cluster_size on the volume is <= the + index_block_size of the directory, + index_block_vcn counts in units of clusters, + and in units of sectors otherwise. */ +/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ +/* sizeof()= 40 (0x28) bytes */ +/* + * When creating the index block, we place the update sequence array at this + * offset, i.e. before we start with the index entries. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) INDEX_BLOCK; + +typedef INDEX_BLOCK INDEX_ALLOCATION; + +/* + * The system file FILE_Extend/$Reparse contains an index named $R listing + * all reparse points on the volume. The index entry keys are as defined + * below. Note, that there is no index data associated with the index entries. + * + * The index entries are sorted by the index key file_id. The collation rule is + * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the + * primary key / is not a key at all. (AIA) + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + leMFT_REF file_id; /* Mft record of the file containing the + reparse point attribute. */ +} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; + +/* + * Quota flags (32-bit). + * + * The user quota flags. Names explain meaning. + */ +enum { + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), + + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), + /* This is a bit mask for the user quota flags. */ + + /* + * These flags are only present in the quota defaults index entry, i.e. + * in the entry where owner_id = QUOTA_DEFAULTS_ID. + */ + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), +}; + +typedef le32 QUOTA_FLAGS; + +/* + * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas + * are on a per volume and per user basis. + * + * The $Q index contains one entry for each existing user_id on the volume. The + * index key is the user_id of the user/group owning this quota control entry, + * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the + * owner_id, is found in the standard information attribute. The collation rule + * for $Q is COLLATION_NTOFS_ULONG. + * + * The $O index contains one entry for each user/group who has been assigned + * a quota on that volume. The index key holds the SID of the user_id the + * entry belongs to, i.e. the owner_id. The collation rule for $O is + * COLLATION_NTOFS_SID. + * + * The $O index entry data is the user_id of the user corresponding to the SID. 
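+ *
+ * Sketched as pseudo-code, resolving the quota control entry for a given SID
+ * is therefore a two-step lookup (the lookup helper is imaginary):
+ *
+ *	owner_id = lookup($O index, sid);	(collation COLLATION_NTOFS_SID)
+ *	qce = lookup($Q index, owner_id);	(collation COLLATION_NTOFS_ULONG)
+ *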
+ * This user_id is used as an index into $Q to find the quota control entry + * associated with the SID. + * + * The $Q index entry data is the quota control entry and is defined below. + */ +typedef struct { + le32 version; /* Currently equals 2. */ + QUOTA_FLAGS flags; /* Flags describing this quota entry. */ + le64 bytes_used; /* How many bytes of the quota are in use. */ + sle64 change_time; /* Last time this quota entry was changed. */ + sle64 threshold; /* Soft quota (-1 if not limited). */ + sle64 limit; /* Hard quota (-1 if not limited). */ + sle64 exceeded_time; /* How long the soft quota has been exceeded. */ + SID sid; /* The SID of the user/object associated with + this quota entry. Equals zero for the quota + defaults entry (and in fact on a WinXP + volume, it is not present at all). */ +} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; + +/* + * Predefined owner_id values (32-bit). + */ +enum { + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), +}; + +/* + * Current constants for quota control entries. + */ +typedef enum { + /* Current version. */ + QUOTA_VERSION = 2, +} QUOTA_CONTROL_ENTRY_CONSTANTS; + +/* + * Index entry flags (16-bit). + */ +enum { + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a + sub-node, i.e. a reference to an index block in form of + a virtual cluster number (see below). */ + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last + entry in an index block. The index entry does not + represent a file but it can point to a sub-node. */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force + enum bit width to 16-bit. */ +} __attribute__ ((__packed__)); + +typedef le16 INDEX_ENTRY_FLAGS; + +/* + * This the index entry header (see below). + */ +typedef struct { +/* 0*/ union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; +/* 8*/ le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ +/* 10*/ le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ +/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ +/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ +/* sizeof() = 16 bytes */ +} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; + +/* + * This is an index entry. A sequence of such entries follows each INDEX_HEADER + * structure. Together they make up a complete index. The index follows either + * an index root attribute or an index allocation attribute. + * + * NOTE: Before NTFS 3.0 only filename attributes were indexed. + */ +typedef struct { +/*Ofs*/ +/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ + union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. 
*/ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; + le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ + le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ + INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ + le16 reserved; /* Reserved/align to 8-byte boundary. */ + +/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present + if INDEX_ENTRY_END bit in flags is not set. NOTE: On + NTFS versions before 3.0 the only valid key is the + FILE_NAME_ATTR. On NTFS 3.0+ the following + additional index keys are defined: */ + FILE_NAME_ATTR file_name;/* $I30 index in directories. */ + SII_INDEX_KEY sii; /* $SII index in $Secure. */ + SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ + GUID object_id; /* $O index in FILE_Extend/$ObjId: The + object_id of the mft record found in + the data part of the index. */ + REPARSE_INDEX_KEY reparse; /* $R index in + FILE_Extend/$Reparse. */ + SID sid; /* $O index in FILE_Extend/$Quota: + SID of the owner of the user_id. */ + le32 owner_id; /* $Q index in FILE_Extend/$Quota: + user_id of the owner of the quota + control entry in the data part of + the index. */ + } __attribute__ ((__packed__)) key; + /* The (optional) index data is inserted here when creating. */ + // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last + // eight bytes of this index entry contain the virtual + // cluster number of the index block that holds the + // entries immediately preceding the current entry (the + // vcn references the corresponding cluster in the data + // of the non-resident index allocation attribute). If + // the key_length is zero, then the vcn immediately + // follows the INDEX_ENTRY_HEADER. Regardless of + // key_length, the address of the 8-byte boundary + // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), + // where sizeof(VCN) can be hardcoded as 8 if wanted. */ +} __attribute__ ((__packed__)) INDEX_ENTRY; + +/* + * Attribute: Bitmap (0xb0). + * + * Contains an array of bits (aka a bitfield). + * + * When used in conjunction with the index allocation attribute, each bit + * corresponds to one index block within the index allocation attribute. Thus + * the number of bits in the bitmap * index block size / cluster size is the + * number of clusters in the index allocation attribute. + */ +typedef struct { + u8 bitmap[0]; /* Array of bits. */ +} __attribute__ ((__packed__)) BITMAP_ATTR; + +/* + * The reparse point tag defines the type of the reparse point. It also + * includes several flags, which further describe the reparse point. + * + * The reparse point tag is an unsigned 32-bit value divided in three parts: + * + * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * the reparse point. + * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. + * 3. The most significant three bits are flags describing the reparse point. + * They are defined as follows: + * bit 29: Name surrogate bit. If set, the filename is an alias for + * another object in the system. + * bit 30: High-latency bit. 
If set, accessing the first byte of data will + * be slow. (E.g. the data is stored on a tape drive.) + * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User + * defined tags have to use zero here. + * + * These are the predefined reparse point tags: + */ +enum { + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), + + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), + + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), + + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), + + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), + + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), +}; + +/* + * Attribute: Reparse point (0xc0). + * + * NOTE: Can be resident or non-resident. + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + le16 reparse_data_length; /* Byte size of reparse data. */ + le16 reserved; /* Align to 8-byte boundary. */ + u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ +} __attribute__ ((__packed__)) REPARSE_POINT; + +/* + * Attribute: Extended attribute (EA) information (0xd0). + * + * NOTE: Always resident. (Is this true???) + */ +typedef struct { + le16 ea_length; /* Byte size of the packed extended + attributes. */ + le16 need_ea_count; /* The number of extended attributes which have + the NEED_EA bit set. */ + le32 ea_query_length; /* Byte size of the buffer required to query + the extended attributes when calling + ZwQueryEaFile() in Windows NT/2k. I.e. the + byte size of the unpacked extended + attributes. */ +} __attribute__ ((__packed__)) EA_INFORMATION; + +/* + * Extended attribute flags (8-bit). + */ +enum { + NEED_EA = 0x80 /* If set the file to which the EA belongs + cannot be interpreted without understanding + the associates extended attributes. */ +} __attribute__ ((__packed__)); + +typedef u8 EA_FLAGS; + +/* + * Attribute: Extended attribute (EA) (0xe0). + * + * NOTE: Can be resident or non-resident. + * + * Like the attribute list and the index buffer list, the EA attribute value is + * a sequence of EA_ATTR variable length records. + */ +typedef struct { + le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + EA_FLAGS flags; /* Flags describing the EA. */ + u8 ea_name_length; /* Length of the name of the EA in bytes + excluding the '\0' byte terminator. */ + le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not + Unicode and it is zero terminated. */ + u8 ea_value[0]; /* The value of the EA. Immediately follows + the name. */ +} __attribute__ ((__packed__)) EA_ATTR; + +/* + * Attribute: Property set (0xf0). + * + * Intended to support Native Structure Storage (NSS) - a feature removed from + * NTFS 3.0 during beta testing. + */ +typedef struct { + /* Irrelevant as feature unused. */ +} __attribute__ ((__packed__)) PROPERTY_SET; + +/* + * Attribute: Logged utility stream (0x100). + * + * NOTE: Can be resident or non-resident. + * + * Operations on this attribute are logged to the journal ($LogFile) like + * normal metadata changes. + * + * Used by the Encrypting File System (EFS). 
All encrypted files have this + * attribute with the name $EFS. + */ +typedef struct { + /* Can be anything the creator chooses. */ + /* EFS uses it as follows: */ + // FIXME: Type this info, verifying it along the way. (AIA) +} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; + +#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c new file mode 100644 index 000000000000..eda9972e6159 --- /dev/null +++ b/fs/ntfs/lcnalloc.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include + +#include "lcnalloc.h" +#include "debug.h" +#include "bitmap.h" +#include "inode.h" +#include "volume.h" +#include "attrib.h" +#include "malloc.h" +#include "aops.h" +#include "ntfs.h" + +/** + * ntfs_cluster_free_from_rl_nolock - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - The volume lcn bitmap must be locked for writing on entry and is + * left locked on return. + */ +int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl) +{ + struct inode *lcnbmp_vi = vol->lcnbmp_ino; + int ret = 0; + + ntfs_debug("Entering."); + if (!rl) + return 0; + for (; rl->length; rl++) { + int err; + + if (rl->lcn < 0) + continue; + err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) + ret = err; + } + ntfs_debug("Done."); + return ret; +} + +/** + * ntfs_cluster_alloc - allocate clusters on an ntfs volume + * @vol: mounted ntfs volume on which to allocate the clusters + * @start_vcn: vcn to use for the first allocated cluster + * @count: number of clusters to allocate + * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) + * @zone: zone from which to allocate the clusters + * @is_extension: if 'true', this is an attribute extension + * + * Allocate @count clusters preferably starting at cluster @start_lcn or at the + * current allocator position if @start_lcn is -1, on the mounted ntfs volume + * @vol. @zone is either DATA_ZONE for allocation of normal clusters or + * MFT_ZONE for allocation of clusters for the master file table, i.e. the + * $MFT/$DATA attribute. + * + * @start_vcn specifies the vcn of the first allocated cluster. This makes + * merging the resulting runlist with the old runlist easier. + * + * If @is_extension is 'true', the caller is allocating clusters to extend an + * attribute and if it is 'false', the caller is allocating clusters to fill a + * hole in an attribute. Practically the difference is that if @is_extension + * is 'true' the returned runlist will be terminated with LCN_ENOENT and if + * @is_extension is 'false' the runlist will be terminated with + * LCN_RL_NOT_MAPPED. + * + * You need to check the return value with IS_ERR(). If this is false, the + * function was successful and the return value is a runlist describing the + * allocated cluster(s). If IS_ERR() is true, the function failed and + * PTR_ERR() gives you the error code. 
+ * + * Notes on the allocation algorithm + * ================================= + * + * There are two data zones. First is the area between the end of the mft zone + * and the end of the volume, and second is the area between the start of the + * volume and the start of the mft zone. On unmodified/standard NTFS 1.x + * volumes, the second data zone does not exist due to the mft zone being + * expanded to cover the start of the volume in order to reserve space for the + * mft bitmap attribute. + * + * This is not the prettiest function but the complexity stems from the need of + * implementing the mft vs data zoned approach and from the fact that we have + * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we + * need to cope with crossing over boundaries of two buffers. Further, the + * fact that the allocator allows for caller supplied hints as to the location + * of where allocation should begin and the fact that the allocator keeps track + * of where in the data zones the next natural allocation should occur, + * contribute to the complexity of the function. But it should all be + * worthwhile, because this allocator should: 1) be a full implementation of + * the MFT zone approach used by Windows NT, 2) cause reduction in + * fragmentation, and 3) be speedy in allocations (the code is not optimized + * for speed, but the algorithm is, so further speed improvements are probably + * possible). + * + * FIXME: We should be monitoring cluster allocation and increment the MFT zone + * size dynamically but this is something for the future. We will just cause + * heavier fragmentation by not doing it and I am not even sure Windows would + * grow the MFT zone dynamically, so it might even be correct not to do this. + * The overhead in doing dynamic MFT zone expansion would be very large and + * unlikely worth the effort. (AIA) + * + * TODO: I have added in double the required zone position pointer wrap around + * logic which can be optimized to having only one of the two logic sets. + * However, having the double logic will work fine, but if we have only one of + * the sets and we get it wrong somewhere, then we get into trouble, so + * removing the duplicate logic requires _very_ careful consideration of _all_ + * possible code paths. So at least for now, I am leaving the double logic - + * better safe than sorry... (AIA) + * + * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + */ +runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, + const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension) +{ + LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters; + loff_t i_size; + struct inode *lcnbmp_vi; + runlist_element *rl = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *buf, *byte; + int err = 0, rlpos, rlsize, buf_size; + u8 pass, done_zones, search_zone, need_writeback = 0, bit; + + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " + "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, + (unsigned long long)count, + (unsigned long long)start_lcn, + zone == MFT_ZONE ? 
"MFT" : "DATA"); + BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < 0); + BUG_ON(start_lcn < -1); + BUG_ON(zone < FIRST_ZONE); + BUG_ON(zone > LAST_ZONE); + + /* Return NULL if @count is zero. */ + if (!count) + return NULL; + /* Take the lcnbmp lock for writing. */ + down_write(&vol->lcnbmp_lock); + /* + * If no specific @start_lcn was requested, use the current data zone + * position, otherwise use the requested @start_lcn but make sure it + * lies outside the mft zone. Also set done_zones to 0 (no zones done) + * and pass depending on whether we are starting inside a zone (1) or + * at the beginning of a zone (2). If requesting from the MFT_ZONE, + * we either start at the current position within the mft zone or at + * the specified position. If the latter is out of bounds then we start + * at the beginning of the MFT_ZONE. + */ + done_zones = 0; + pass = 1; + /* + * zone_start and zone_end are the current search range. search_zone + * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of + * volume) and 4 for data zone 2 (start of volume till start of mft + * zone). + */ + zone_start = start_lcn; + if (zone_start < 0) { + if (zone == DATA_ZONE) + zone_start = vol->data1_zone_pos; + else + zone_start = vol->mft_zone_pos; + if (!zone_start) { + /* + * Zone starts at beginning of volume which means a + * single pass is sufficient. + */ + pass = 2; + } + } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && + zone_start < vol->mft_zone_end) { + zone_start = vol->mft_zone_end; + /* + * Starting at beginning of data1_zone which means a single + * pass in this zone is sufficient. + */ + pass = 2; + } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || + zone_start >= vol->mft_zone_end)) { + zone_start = vol->mft_lcn; + if (!vol->mft_zone_end) + zone_start = 0; + /* + * Starting at beginning of volume which means a single pass + * is sufficient. + */ + pass = 2; + } + if (zone == MFT_ZONE) { + zone_end = vol->mft_zone_end; + search_zone = 1; + } else /* if (zone == DATA_ZONE) */ { + /* Skip searching the mft zone. */ + done_zones |= 1; + if (zone_start >= vol->mft_zone_end) { + zone_end = vol->nr_clusters; + search_zone = 2; + } else { + zone_end = vol->mft_zone_start; + search_zone = 4; + } + } + /* + * bmp_pos is the current bit position inside the bitmap. We use + * bmp_initial_pos to determine whether or not to do a zone switch. + */ + bmp_pos = bmp_initial_pos = zone_start; + + /* Loop until all clusters are allocated, i.e. clusters == 0. */ + clusters = count; + rlpos = rlsize = 0; + mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); + while (1) { + ntfs_debug("Start of outer while loop: done_zones 0x%x, " + "search_zone %i, pass %i, zone_start 0x%llx, " + "zone_end 0x%llx, bmp_initial_pos 0x%llx, " + "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + done_zones, search_zone, pass, + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_initial_pos, + (unsigned long long)bmp_pos, rlpos, rlsize); + /* Loop until we run out of free clusters. */ + last_read_pos = bmp_pos >> 3; + ntfs_debug("last_read_pos 0x%llx.", + (unsigned long long)last_read_pos); + if (last_read_pos > i_size) { + ntfs_debug("End of attribute reached. 
" + "Skipping to zone_pass_done."); + goto zone_pass_done; + } + if (likely(page)) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + page = ntfs_map_page(mapping, last_read_pos >> + PAGE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + buf_size = last_read_pos & ~PAGE_MASK; + buf = page_address(page) + buf_size; + buf_size = PAGE_SIZE - buf_size; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; + buf_size <<= 3; + lcn = bmp_pos & 7; + bmp_pos &= ~(LCN)7; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " + "bmp_pos 0x%llx, need_writeback %i.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + while (lcn < buf_size && lcn + bmp_pos < zone_end) { + byte = buf + (lcn >> 3); + ntfs_debug("In inner while loop: buf_size %i, " + "lcn 0x%llx, bmp_pos 0x%llx, " + "need_writeback %i, byte ofs 0x%x, " + "*byte 0x%x.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + need_writeback, + (unsigned int)(lcn >> 3), + (unsigned int)*byte); + /* Skip full bytes. */ + if (*byte == 0xff) { + lcn = (lcn + 8) & ~(LCN)7; + ntfs_debug("Continuing while loop 1."); + continue; + } + bit = 1 << (lcn & 7); + ntfs_debug("bit 0x%x.", bit); + /* If the bit is already set, go onto the next one. */ + if (*byte & bit) { + lcn++; + ntfs_debug("Continuing while loop 2."); + continue; + } + /* + * Allocate more memory if needed, including space for + * the terminator element. + * ntfs_malloc_nofs() operates on whole pages only. + */ + if ((rlpos + 2) * sizeof(*rl) > rlsize) { + runlist_element *rl2; + + ntfs_debug("Reallocating memory."); + if (!rl) + ntfs_debug("First free bit is at LCN " + "0x%llx.", + (unsigned long long) + (lcn + bmp_pos)); + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + err = -ENOMEM; + ntfs_error(vol->sb, "Failed to " + "allocate memory."); + goto out; + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + ntfs_debug("Reallocated memory, rlsize 0x%x.", + rlsize); + } + /* Allocate the bitmap bit. */ + *byte |= bit; + /* We need to write this bitmap page to disk. */ + need_writeback = 1; + ntfs_debug("*byte 0x%x, need_writeback is set.", + (unsigned int)*byte); + /* + * Coalesce with previous run if adjacent LCNs. + * Otherwise, append a new run. 
+ */ + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " + "prev_lcn 0x%llx, lcn 0x%llx, " + "bmp_pos 0x%llx, prev_run_len 0x%llx, " + "rlpos %i.", + (unsigned long long)(lcn + bmp_pos), + 1ULL, (unsigned long long)prev_lcn, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + (unsigned long long)prev_run_len, + rlpos); + if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { + ntfs_debug("Coalescing to run (lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos - 1].length = ++prev_run_len; + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " + "prev_run_len 0x%llx.", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length, + (unsigned long long) + prev_run_len); + } else { + if (likely(rlpos)) { + ntfs_debug("Adding new run, (previous " + "run lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos].vcn = rl[rlpos - 1].vcn + + prev_run_len; + } else { + ntfs_debug("Adding new run, is first " + "run."); + rl[rlpos].vcn = start_vcn; + } + rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; + rl[rlpos].length = prev_run_len = 1; + rlpos++; + } + /* Done? */ + if (!--clusters) { + LCN tc; + /* + * Update the current zone position. Positions + * of already scanned zones have been updated + * during the respective zone switches. + */ + tc = lcn + bmp_pos + 1; + ntfs_debug("Done. Updating current zone " + "position, tc 0x%llx, " + "search_zone %i.", + (unsigned long long)tc, + search_zone); + switch (search_zone) { + case 1: + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + break; + case 2: + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + break; + case 4: + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + break; + default: + BUG(); + } + ntfs_debug("Finished. 
Going to out."); + goto out; + } + lcn++; + } + bmp_pos += buf_size; + ntfs_debug("After inner while loop: buf_size 0x%x, lcn " + "0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + if (bmp_pos < zone_end) { + ntfs_debug("Continuing outer while loop, " + "bmp_pos 0x%llx, zone_end 0x%llx.", + (unsigned long long)bmp_pos, + (unsigned long long)zone_end); + continue; + } +zone_pass_done: /* Finished with the current zone pass. */ + ntfs_debug("At zone_pass_done, pass %i.", pass); + if (pass == 1) { + /* + * Now do pass 2, scanning the first part of the zone + * we omitted in pass 1. + */ + pass = 2; + zone_end = zone_start; + switch (search_zone) { + case 1: /* mft_zone */ + zone_start = vol->mft_zone_start; + break; + case 2: /* data1_zone */ + zone_start = vol->mft_zone_end; + break; + case 4: /* data2_zone */ + zone_start = 0; + break; + default: + BUG(); + } + /* Sanity check. */ + if (zone_end < zone_start) + zone_end = zone_start; + bmp_pos = zone_start; + ntfs_debug("Continuing outer while loop, pass 2, " + "zone_start 0x%llx, zone_end 0x%llx, " + "bmp_pos 0x%llx.", + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_pos); + continue; + } /* pass == 2 */ +done_zones_check: + ntfs_debug("At done_zones_check, search_zone %i, done_zones " + "before 0x%x, done_zones after 0x%x.", + search_zone, done_zones, + done_zones | search_zone); + done_zones |= search_zone; + if (done_zones < 7) { + ntfs_debug("Switching zone."); + /* Now switch to the next zone we haven't done yet. */ + pass = 1; + switch (search_zone) { + case 1: + ntfs_debug("Switching from mft zone to data1 " + "zone."); + /* Update mft zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + } + /* Switch from mft zone to data1 zone. */ +switch_to_data1_zone: search_zone = 2; + zone_start = bmp_initial_pos = + vol->data1_zone_pos; + zone_end = vol->nr_clusters; + if (zone_start == vol->mft_zone_end) + pass = 2; + if (zone_start >= zone_end) { + vol->data1_zone_pos = zone_start = + vol->mft_zone_end; + pass = 2; + } + break; + case 2: + ntfs_debug("Switching from data1 zone to " + "data2 zone."); + /* Update data1 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + } + /* Switch from data1 zone to data2 zone. 
*/ + search_zone = 4; + zone_start = bmp_initial_pos = + vol->data2_zone_pos; + zone_end = vol->mft_zone_start; + if (!zone_start) + pass = 2; + if (zone_start >= zone_end) { + vol->data2_zone_pos = zone_start = + bmp_initial_pos = 0; + pass = 2; + } + break; + case 4: + ntfs_debug("Switching from data2 zone to " + "data1 zone."); + /* Update data2 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + } + /* Switch from data2 zone to data1 zone. */ + goto switch_to_data1_zone; + default: + BUG(); + } + ntfs_debug("After zone switch, search_zone %i, " + "pass %i, bmp_initial_pos 0x%llx, " + "zone_start 0x%llx, zone_end 0x%llx.", + search_zone, pass, + (unsigned long long)bmp_initial_pos, + (unsigned long long)zone_start, + (unsigned long long)zone_end); + bmp_pos = zone_start; + if (zone_start == zone_end) { + ntfs_debug("Empty zone, going to " + "done_zones_check."); + /* Empty zone. Don't bother searching it. */ + goto done_zones_check; + } + ntfs_debug("Continuing outer while loop."); + continue; + } /* done_zones == 7 */ + ntfs_debug("All zones are finished."); + /* + * All zones are finished! If DATA_ZONE, shrink mft zone. If + * MFT_ZONE, we have really run out of space. + */ + mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " + "0x%llx, mft_zone_size 0x%llx.", + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)mft_zone_size); + if (zone == MFT_ZONE || mft_zone_size <= 0) { + ntfs_debug("No free clusters left, going to out."); + /* Really no more space left on device. */ + err = -ENOSPC; + goto out; + } /* zone == DATA_ZONE && mft_zone_size > 0 */ + ntfs_debug("Shrinking mft zone."); + zone_end = vol->mft_zone_end; + mft_zone_size >>= 1; + if (mft_zone_size > 0) + vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; + else /* mft zone and data2 zone no longer exist. */ + vol->data2_zone_pos = vol->mft_zone_start = + vol->mft_zone_end = 0; + if (vol->mft_zone_pos >= vol->mft_zone_end) { + vol->mft_zone_pos = vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } + bmp_pos = zone_start = bmp_initial_pos = + vol->data1_zone_pos = vol->mft_zone_end; + search_zone = 2; + pass = 2; + done_zones &= ~2; + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " + "vol->mft_zone_start 0x%llx, " + "vol->mft_zone_end 0x%llx, " + "vol->mft_zone_pos 0x%llx, search_zone 2, " + "pass 2, dones_zones 0x%x, zone_start 0x%llx, " + "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " + "continuing outer while loop.", + (unsigned long long)mft_zone_size, + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)vol->mft_zone_pos, + done_zones, (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)vol->data1_zone_pos); + } + ntfs_debug("After outer while loop."); +out: + ntfs_debug("At out."); + /* Add runlist terminator element. */ + if (likely(rl)) { + rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; + rl[rlpos].lcn = is_extension ? 
LCN_ENOENT : LCN_RL_NOT_MAPPED; + rl[rlpos].length = 0; + } + if (likely(page && !IS_ERR(page))) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + if (likely(!err)) { + up_write(&vol->lcnbmp_lock); + ntfs_debug("Done."); + return rl; + } + ntfs_error(vol->sb, "Failed to allocate clusters, aborting " + "(error %i).", err); + if (rl) { + int err2; + + if (err == -ENOSPC) + ntfs_debug("Not enough space to complete allocation, " + "err -ENOSPC, first free lcn 0x%llx, " + "could allocate up to 0x%llx " + "clusters.", + (unsigned long long)rl[0].lcn, + (unsigned long long)(count - clusters)); + /* Deallocate all allocated clusters. */ + ntfs_debug("Attempting rollback..."); + err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); + if (err2) { + ntfs_error(vol->sb, "Failed to rollback (error %i). " + "Leaving inconsistent metadata! " + "Unmount and run chkdsk.", err2); + NVolSetErrors(vol); + } + /* Free the runlist. */ + ntfs_free(rl); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); + up_write(&vol->lcnbmp_lock); + return ERR_PTR(err); +} + +/** + * __ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * @is_rollback: true if this is a rollback operation + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the vfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when __ntfs_cluster_free() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will + * perform the necessary mapping and unmapping. + * + * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_cluster_free() instead. + * + * Note, __ntfs_cluster_free() does not modify the runlist, so you have to + * remove from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. 
you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, + ntfs_attr_search_ctx *ctx, const bool is_rollback) +{ + s64 delta, to_free, total_freed, real_freed; + ntfs_volume *vol; + struct inode *lcnbmp_vi; + runlist_element *rl; + int err; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " + "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, + (unsigned long long)count, + is_rollback ? " (rollback)" : ""); + vol = ni->vol; + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < -1); + /* + * Lock the lcn bitmap for writing but only if not rolling back. We + * must hold the lock all the way including through rollback otherwise + * rollback is not possible because once we have cleared a bit and + * dropped the lock, anyone could have set the bit again, thus + * allocating the cluster for another use. + */ + if (likely(!is_rollback)) + down_write(&vol->lcnbmp_lock); + + total_freed = real_freed = 0; + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to find first runlist " + "element (error %li), aborting.", + PTR_ERR(rl)); + err = PTR_ERR(rl); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "First runlist element has " + "invalid lcn, aborting."); + err = -EIO; + goto err_out; + } + /* Find the starting cluster inside the run that needs freeing. */ + delta = start_vcn - rl->vcn; + + /* The number of clusters in this run that need freeing. */ + to_free = rl->length - delta; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in this run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear first run " + "(error %i), aborting.", err); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed = to_free; + }; + /* Go to the next run and adjust the number of clusters left to free. */ + ++rl; + if (count >= 0) + count -= to_free; + + /* Keep track of the total "freed" clusters, including sparse ones. */ + total_freed = to_free; + /* + * Loop over the remaining runs, using @count as a capping value, and + * free them. + */ + for (; rl->length && count != 0; ++rl) { + if (unlikely(rl->lcn < LCN_HOLE)) { + VCN vcn; + + /* Attempt to map runlist. 
*/ + vcn = rl->vcn; + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (!is_rollback) + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " + "subsequent runlist " + "element."); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "Runlist element " + "has invalid lcn " + "(0x%llx).", + (unsigned long long) + rl->lcn); + err = -EIO; + goto err_out; + } + } + /* The number of clusters in this run that need freeing. */ + to_free = rl->length; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in the run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear " + "subsequent run."); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed += to_free; + } + /* Adjust the number of clusters left to free. */ + if (count >= 0) + count -= to_free; + + /* Update the total done clusters. */ + total_freed += to_free; + } + if (likely(!is_rollback)) + up_write(&vol->lcnbmp_lock); + + BUG_ON(count > 0); + + /* We are done. Return the number of actually freed clusters. */ + ntfs_debug("Done."); + return real_freed; +err_out: + if (is_rollback) + return err; + /* If no real clusters were freed, no need to rollback. */ + if (!real_freed) { + up_write(&vol->lcnbmp_lock); + return err; + } + /* + * Attempt to rollback and if that succeeds just return the error code. + * If rollback fails, set the volume errors flag, emit an error + * message, and return the error code. + */ + delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); + if (delta < 0) { + ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " + "inconsistent metadata! Unmount and run " + "chkdsk.", (int)delta); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + ntfs_error(vol->sb, "Aborting (error %i).", err); + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h new file mode 100644 index 000000000000..1589a6d8434b --- /dev/null +++ b/fs/ntfs/lcnalloc.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_LCNALLOC_H +#define _LINUX_NTFS_LCNALLOC_H + +#ifdef NTFS_RW + +#include + +#include "attrib.h" +#include "types.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +typedef enum { + FIRST_ZONE = 0, /* For sanity checking. */ + MFT_ZONE = 0, /* Allocate from $MFT zone. */ + DATA_ZONE = 1, /* Allocate from $DATA zone. */ + LAST_ZONE = 1, /* For sanity checking. 
*/ +} NTFS_CLUSTER_ALLOCATION_ZONES; + +extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, + const VCN start_vcn, const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension); + +extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); + +/** + * ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the ntfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_cluster_free() encounters unmapped runlist + * fragments and allows their mapping. If you do not have the mft record + * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform + * the necessary mapping and unmapping. + * + * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove + * from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. 
+ */ +static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx) +{ + return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); +} + +extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl); + +/** + * ntfs_cluster_free_from_rl - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - The caller must have locked the runlist @rl for reading or + * writing. + */ +static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, + const runlist_element *rl) +{ + int ret; + + down_write(&vol->lcnbmp_lock); + ret = ntfs_cluster_free_from_rl_nolock(vol, rl); + up_write(&vol->lcnbmp_lock); + return ret; +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c new file mode 100644 index 000000000000..6ce60ffc6ac0 --- /dev/null +++ b/fs/ntfs/logfile.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2002-2007 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include +#include +#include +#include +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "debug.h" +#include "logfile.h" +#include "malloc.h" +#include "volume.h" +#include "ntfs.h" + +/** + * ntfs_check_restart_page_header - check the page header for consistency + * @vi: $LogFile inode to which the restart page header belongs + * @rp: restart page header to check + * @pos: position in @vi at which the restart page header resides + * + * Check the restart page header @rp for consistency and return 'true' if it is + * consistent and 'false' otherwise. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_page_header(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos) +{ + u32 logfile_system_page_size, logfile_log_page_size; + u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; + bool have_usa = true; + + ntfs_debug("Entering."); + /* + * If the system or log page sizes are smaller than the ntfs block size + * or either is not a power of 2 we cannot handle this log file. + */ + logfile_system_page_size = le32_to_cpu(rp->system_page_size); + logfile_log_page_size = le32_to_cpu(rp->log_page_size); + if (logfile_system_page_size < NTFS_BLOCK_SIZE || + logfile_log_page_size < NTFS_BLOCK_SIZE || + logfile_system_page_size & + (logfile_system_page_size - 1) || + !is_power_of_2(logfile_log_page_size)) { + ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + return false; + } + /* + * We must be either at !pos (1st restart page) or at pos = system page + * size (2nd restart page). + */ + if (pos && pos != logfile_system_page_size) { + ntfs_error(vi->i_sb, "Found restart area in incorrect " + "position in $LogFile."); + return false; + } + /* We only know how to handle version 1.1. */ + if (sle16_to_cpu(rp->major_ver) != 1 || + sle16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " + "supported. 
(This driver supports version " + "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), + (int)sle16_to_cpu(rp->minor_ver)); + return false; + } + /* + * If chkdsk has been run the restart page may not be protected by an + * update sequence array. + */ + if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { + have_usa = false; + goto skip_usa_checks; + } + /* Verify the size of the update sequence array. */ + usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); + if (usa_count != le16_to_cpu(rp->usa_count)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array count."); + return false; + } + /* Verify the position of the update sequence array. */ + usa_ofs = le16_to_cpu(rp->usa_ofs); + usa_end = usa_ofs + usa_count * sizeof(u16); + if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array offset."); + return false; + } +skip_usa_checks: + /* + * Verify the position of the restart area. It must be: + * - aligned to 8-byte boundary, + * - after the update sequence array, and + * - within the system page size. + */ + ra_ofs = le16_to_cpu(rp->restart_area_offset); + if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : + ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs > logfile_system_page_size) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent restart area offset."); + return false; + } + /* + * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn + * set. + */ + if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, "$LogFile restart page is not modified " + "by chkdsk but a chkdsk LSN is specified."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_restart_area - check the restart area for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose restart area to check + * + * Check the restart area of the restart page @rp for consistency and return + * 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header has already been + * consistency checked. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +{ + u64 file_size; + RESTART_AREA *ra; + u16 ra_ofs, ra_len, ca_ofs; + u8 fs_bits; + + ntfs_debug("Entering."); + ra_ofs = le16_to_cpu(rp->restart_area_offset); + ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + /* + * Everything before ra->file_size must be before the first word + * protected by an update sequence number. This ensures that it is + * safe to access ra->client_array_offset. + */ + if (ra_ofs + offsetof(RESTART_AREA, file_size) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent file offset."); + return false; + } + /* + * Now that we can access ra->client_array_offset, make sure everything + * up to the log client array is before the first word protected by an + * update sequence number. This ensures we can access all of the + * restart area elements safely. Also, the client array offset must be + * aligned to an 8-byte boundary. 
+ */ + ca_ofs = le16_to_cpu(ra->client_array_offset); + if (((ca_ofs + 7) & ~7) != ca_ofs || + ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent client array offset."); + return false; + } + /* + * The restart area must end within the system page size both when + * calculated manually and as specified by ra->restart_area_length. + * Also, the calculated length must not exceed the specified length. + */ + ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * + sizeof(LOG_CLIENT_RECORD); + if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || + ra_ofs + le16_to_cpu(ra->restart_area_length) > + le32_to_cpu(rp->system_page_size) || + ra_len > le16_to_cpu(ra->restart_area_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " + "of the system page size specified by the " + "restart page header and/or the specified " + "restart area length is inconsistent."); + return false; + } + /* + * The ra->client_free_list and ra->client_in_use_list must be either + * LOGFILE_NO_CLIENT or less than ra->log_clients or they are + * overflowing the client array. + */ + if ((ra->client_free_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_free_list) >= + le16_to_cpu(ra->log_clients)) || + (ra->client_in_use_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_in_use_list) >= + le16_to_cpu(ra->log_clients))) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "overflowing client free and/or in use lists."); + return false; + } + /* + * Check ra->seq_number_bits against ra->file_size for consistency. + * We cannot just use ffs() because the file size is not a power of 2. + */ + file_size = (u64)sle64_to_cpu(ra->file_size); + fs_bits = 0; + while (file_size) { + file_size >>= 1; + fs_bits++; + } + if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent sequence number bits."); + return false; + } + /* The log record header length must be a multiple of 8. */ + if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != + le16_to_cpu(ra->log_record_header_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log record header length."); + return false; + } + /* Dito for the log page data offset. */ + if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != + le16_to_cpu(ra->log_page_data_offset)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log page data offset."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_log_client_array - check the log client array for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose log client array to check + * + * Check the log client array of the restart page @rp for consistency and + * return 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header and the restart area have + * already been consistency checked. + * + * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this + * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full + * restart page and the page must be multi sector transfer deprotected. 
+ */ +static bool ntfs_check_log_client_array(struct inode *vi, + RESTART_PAGE_HEADER *rp) +{ + RESTART_AREA *ra; + LOG_CLIENT_RECORD *ca, *cr; + u16 nr_clients, idx; + bool in_free_list, idx_is_first; + + ntfs_debug("Entering."); + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (LOG_CLIENT_RECORD*)((u8*)ra + + le16_to_cpu(ra->client_array_offset)); + /* + * Check the ra->client_free_list first and then check the + * ra->client_in_use_list. Check each of the log client records in + * each of the lists and check that the array does not overflow the + * ra->log_clients value. Also keep track of the number of records + * visited as there cannot be more than ra->log_clients records and + * that way we detect eventual loops in within a list. + */ + nr_clients = le16_to_cpu(ra->log_clients); + idx = le16_to_cpu(ra->client_free_list); + in_free_list = true; +check_list: + for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, + idx = le16_to_cpu(cr->next_client)) { + if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) + goto err_out; + /* Set @cr to the current log client record. */ + cr = ca + idx; + /* The first log client record must not have a prev_client. */ + if (idx_is_first) { + if (cr->prev_client != LOGFILE_NO_CLIENT) + goto err_out; + idx_is_first = false; + } + } + /* Switch to and check the in use list if we just did the free list. */ + if (in_free_list) { + in_free_list = false; + idx = le16_to_cpu(ra->client_in_use_list); + goto check_list; + } + ntfs_debug("Done."); + return true; +err_out: + ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + return false; +} + +/** + * ntfs_check_and_load_restart_page - check the restart page for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page to check + * @pos: position in @vi at which the restart page resides + * @wrp: [OUT] copy of the multi sector transfer deprotected restart page + * @lsn: [OUT] set to the current logfile lsn on success + * + * Check the restart page @rp for consistency and return 0 if it is consistent + * and -errno otherwise. The restart page may have been modified by chkdsk in + * which case its magic is CHKD instead of RSTR. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + * + * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a + * copy of the complete multi sector transfer deprotected page. On failure, + * *@wrp is undefined. + * + * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current + * logfile lsn according to this restart page. On failure, *@lsn is undefined. + * + * The following error codes are defined: + * -EINVAL - The restart page is inconsistent. + * -ENOMEM - Not enough memory to load the restart page. + * -EIO - Failed to reading from $LogFile. + */ +static int ntfs_check_and_load_restart_page(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, + LSN *lsn) +{ + RESTART_AREA *ra; + RESTART_PAGE_HEADER *trp; + int size, err; + + ntfs_debug("Entering."); + /* Check the restart page header for consistency. */ + if (!ntfs_check_restart_page_header(vi, rp, pos)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + /* Check the restart area for consistency. */ + if (!ntfs_check_restart_area(vi, rp)) { + /* Error output already done inside the function. 
*/ + return -EINVAL; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * Allocate a buffer to store the whole restart page so we can multi + * sector transfer deprotect it. + */ + trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); + if (!trp) { + ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " + "restart page buffer."); + return -ENOMEM; + } + /* + * Read the whole of the restart page into the buffer. If it fits + * completely inside @rp, just copy it from there. Otherwise map all + * the required pages and copy the data from them. + */ + size = PAGE_SIZE - (pos & ~PAGE_MASK); + if (size >= le32_to_cpu(rp->system_page_size)) { + memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); + } else { + pgoff_t idx; + struct page *page; + int have_read, to_read; + + /* First copy what we already have in @rp. */ + memcpy(trp, rp, size); + /* Copy the remaining data one page at a time. */ + have_read = size; + to_read = le32_to_cpu(rp->system_page_size) - size; + idx = (pos + size) >> PAGE_SHIFT; + BUG_ON((pos + size) & ~PAGE_MASK); + do { + page = ntfs_map_page(vi->i_mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vi->i_sb, "Error mapping $LogFile " + "page (index %lu).", idx); + err = PTR_ERR(page); + if (err != -EIO && err != -ENOMEM) + err = -EIO; + goto err_out; + } + size = min_t(int, to_read, PAGE_SIZE); + memcpy((u8*)trp + have_read, page_address(page), size); + ntfs_unmap_page(page); + have_read += size; + to_read -= size; + idx++; + } while (to_read > 0); + } + /* + * Perform the multi sector transfer deprotection on the buffer if the + * restart page is protected. + */ + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) + && post_read_mst_fixup((NTFS_RECORD*)trp, + le32_to_cpu(rp->system_page_size))) { + /* + * A multi sector tranfer error was detected. We only need to + * abort if the restart page contents exceed the multi sector + * transfer fixup of the first sector. + */ + if (le16_to_cpu(rp->restart_area_offset) + + le16_to_cpu(ra->restart_area_length) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "Multi sector transfer error " + "detected in $LogFile restart page."); + err = -EINVAL; + goto err_out; + } + } + /* + * If the restart page is modified by chkdsk or there are no active + * logfile clients, the logfile is consistent. Otherwise, need to + * check the log client records for consistency, too. + */ + err = 0; + if (ntfs_is_rstr_record(rp->magic) && + ra->client_in_use_list != LOGFILE_NO_CLIENT) { + if (!ntfs_check_log_client_array(vi, trp)) { + err = -EINVAL; + goto err_out; + } + } + if (lsn) { + if (ntfs_is_rstr_record(rp->magic)) + *lsn = sle64_to_cpu(ra->current_lsn); + else /* if (ntfs_is_chkd_record(rp->magic)) */ + *lsn = sle64_to_cpu(rp->chkdsk_lsn); + } + ntfs_debug("Done."); + if (wrp) + *wrp = trp; + else { +err_out: + ntfs_free(trp); + } + return err; +} + +/** + * ntfs_check_logfile - check the journal for consistency + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: [OUT] on success this is a copy of the current restart page + * + * Check the $LogFile journal for consistency and return 'true' if it is + * consistent and 'false' if not. On success, the current restart page is + * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. + * + * At present we only check the two restart pages and ignore the log record + * pages. 
+ * + * Note that the MstProtected flag is not set on the $LogFile inode and hence + * when reading pages they are not deprotected. This is because we do not know + * if the $LogFile was created on a system with a different page size to ours + * yet and mst deprotection would fail if our page size is smaller. + */ +bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +{ + s64 size, pos; + LSN rstr1_lsn, rstr2_lsn; + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + struct address_space *mapping = log_vi->i_mapping; + struct page *page = NULL; + u8 *kaddr = NULL; + RESTART_PAGE_HEADER *rstr1_ph = NULL; + RESTART_PAGE_HEADER *rstr2_ph = NULL; + int log_page_size, err; + bool logfile_is_empty = true; + u8 log_page_bits; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) + goto is_empty; + size = i_size_read(log_vi); + /* Make sure the file doesn't exceed the maximum allowed size. */ + if (size > MaxLogFileSize) + size = MaxLogFileSize; + /* + * Truncate size to a multiple of the page cache size or the default + * log page size if the page cache size is between the default log page + * log page size if the page cache size is between the default log page + * size and twice that. + */ + if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= + DefaultLogPageSize * 2) + log_page_size = DefaultLogPageSize; + else + log_page_size = PAGE_SIZE; + /* + * Use ntfs_ffs() instead of ffs() to enable the compiler to + * optimize log_page_size and log_page_bits into constants. + */ + log_page_bits = ntfs_ffs(log_page_size) - 1; + size &= ~(s64)(log_page_size - 1); + /* + * Ensure the log file is big enough to store at least the two restart + * pages and the minimum number of log record pages. + */ + if (size < log_page_size * 2 || (size - log_page_size * 2) >> + log_page_bits < MinLogRecordPages) { + ntfs_error(vol->sb, "$LogFile is too small."); + return false; + } + /* + * Read through the file looking for a restart page. Since the restart + * page header is at the beginning of a page we only need to search at + * what could be the beginning of a page (for each page size) rather + * than scanning the whole file byte by byte. If all potential places + * contain empty and uninitialzed records, the log file can be assumed + * to be empty. + */ + for (pos = 0; pos < size; pos <<= 1) { + pgoff_t idx = pos >> PAGE_SHIFT; + if (!page || page->index != idx) { + if (page) + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Error mapping $LogFile " + "page (index %lu).", idx); + goto err_out; + } + } + kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); + /* + * A non-empty block means the logfile is not empty while an + * empty block after a non-empty block has been encountered + * means we are done. + */ + if (!ntfs_is_empty_recordp((le32*)kaddr)) + logfile_is_empty = false; + else if (!logfile_is_empty) + break; + /* + * A log record page means there cannot be a restart page after + * this so no need to continue searching. + */ + if (ntfs_is_rcrd_recordp((le32*)kaddr)) + break; + /* If not a (modified by chkdsk) restart page, continue. */ + if (!ntfs_is_rstr_recordp((le32*)kaddr) && + !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * Check the (modified by chkdsk) restart page for consistency + * and get a copy of the complete multi sector transfer + * deprotected restart page. 
+ */ + err = ntfs_check_and_load_restart_page(log_vi, + (RESTART_PAGE_HEADER*)kaddr, pos, + !rstr1_ph ? &rstr1_ph : &rstr2_ph, + !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); + if (!err) { + /* + * If we have now found the first (modified by chkdsk) + * restart page, continue looking for the second one. + */ + if (!pos) { + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * We have now found the second (modified by chkdsk) + * restart page, so we can stop looking. + */ + break; + } + /* + * Error output already done inside the function. Note, we do + * not abort if the restart page was invalid as we might still + * find a valid one further in the file. + */ + if (err != -EINVAL) { + ntfs_unmap_page(page); + goto err_out; + } + /* Continue looking. */ + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + } + if (page) + ntfs_unmap_page(page); + if (logfile_is_empty) { + NVolSetLogFileEmpty(vol); +is_empty: + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + if (!rstr1_ph) { + BUG_ON(rstr2_ph); + ntfs_error(vol->sb, "Did not find any restart pages in " + "$LogFile and it was not empty."); + return false; + } + /* If both restart pages were found, use the more recent one. */ + if (rstr2_ph) { + /* + * If the second restart area is more recent, switch to it. + * Otherwise just throw it away. + */ + if (rstr2_lsn > rstr1_lsn) { + ntfs_debug("Using second restart page as it is more " + "recent."); + ntfs_free(rstr1_ph); + rstr1_ph = rstr2_ph; + /* rstr1_lsn = rstr2_lsn; */ + } else { + ntfs_debug("Using first restart page as it is more " + "recent."); + ntfs_free(rstr2_ph); + } + rstr2_ph = NULL; + } + /* All consistency checks passed. */ + if (rp) + *rp = rstr1_ph; + else + ntfs_free(rstr1_ph); + ntfs_debug("Done."); + return true; +err_out: + if (rstr1_ph) + ntfs_free(rstr1_ph); + return false; +} + +/** + * ntfs_is_logfile_clean - check in the journal if the volume is clean + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: copy of the current restart page + * + * Analyze the $LogFile journal and return 'true' if it indicates the volume was + * shutdown cleanly and 'false' if not. + * + * At present we only look at the two restart pages and ignore the log record + * pages. This is a little bit crude in that there will be a very small number + * of cases where we think that a volume is dirty when in fact it is clean. + * This should only affect volumes that have not been shutdown cleanly but did + * not have any pending, non-check-pointed i/o, i.e. they were completely idle + * at least for the five seconds preceding the unclean shutdown. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and in particular if the $LogFile + * is empty this function requires that NVolLogFileEmpty() is true otherwise an + * empty volume will be reported as dirty. + */ +bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) +{ + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + RESTART_AREA *ra; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + BUG_ON(!rp); + if (!ntfs_is_rstr_record(rp->magic) && + !ntfs_is_chkd_record(rp->magic)) { + ntfs_error(vol->sb, "Restart page buffer is invalid. 
This is " + "probably a bug in that the $LogFile should " + "have been consistency checked before calling " + "this function."); + return false; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * If the $LogFile has active clients, i.e. it is open, and we do not + * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, + * we assume there was an unclean shutdown. + */ + if (ra->client_in_use_list != LOGFILE_NO_CLIENT && + !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { + ntfs_debug("Done. $LogFile indicates a dirty shutdown."); + return false; + } + /* $LogFile indicates a clean shutdown. */ + ntfs_debug("Done. $LogFile indicates a clean shutdown."); + return true; +} + +/** + * ntfs_empty_logfile - empty the contents of the $LogFile journal + * @log_vi: struct inode of loaded journal $LogFile to empty + * + * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * success and 'false' on error. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() + * has been used to ensure that the $LogFile is clean. + */ +bool ntfs_empty_logfile(struct inode *log_vi) +{ + VCN vcn, end_vcn; + ntfs_inode *log_ni = NTFS_I(log_vi); + ntfs_volume *vol = log_ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags; + unsigned block_size, block_size_bits; + int err; + bool should_wait = true; + + ntfs_debug("Entering."); + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done."); + return true; + } + /* + * We cannot use ntfs_attr_set() because we may be still in the middle + * of a mount operation. Thus we do the emptying by hand by first + * zapping the page cache pages for the $LogFile/$DATA attribute and + * then emptying each of the buffers in each of the clusters specified + * by the runlist by hand. + */ + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + vcn = 0; + read_lock_irqsave(&log_ni->size_lock, flags); + end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> + vol->cluster_size_bits; + read_unlock_irqrestore(&log_ni->size_lock, flags); + truncate_inode_pages(log_vi->i_mapping, 0); + down_write(&log_ni->runlist.lock); + rl = log_ni->runlist.rl; + if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { +map_vcn: + err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); + if (err) { + ntfs_error(sb, "Failed to map runlist fragment (error " + "%d).", -err); + goto err; + } + rl = log_ni->runlist.rl; + BUG_ON(!rl || vcn < rl->vcn || !rl->length); + } + /* Seek to the runlist element containing @vcn. */ + while (rl->length && vcn >= rl[1].vcn) + rl++; + do { + LCN lcn; + sector_t block, end_block; + s64 len; + + /* + * If this run is not mapped map it now and start again as the + * runlist will have been updated. + */ + lcn = rl->lcn; + if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { + vcn = rl->vcn; + goto map_vcn; + } + /* If this run is not valid abort with an error. */ + if (unlikely(!rl->length || lcn < LCN_HOLE)) + goto rl_err; + /* Skip holes. */ + if (lcn == LCN_HOLE) + continue; + block = lcn << vol->cluster_size_bits >> block_size_bits; + len = rl->length; + if (rl[1].vcn > end_vcn) + len = end_vcn - rl->vcn; + end_block = (lcn + len) << vol->cluster_size_bits >> + block_size_bits; + /* Iterate over the blocks in the run and empty them. */ + do { + struct buffer_head *bh; + + /* Obtain the buffer, possibly not uptodate. 
*/ + bh = sb_getblk(sb, block); + BUG_ON(!bh); + /* Setup buffer i/o submission. */ + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* Set the entire contents of the buffer to 0xff. */ + memset(bh->b_data, -1, block_size); + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_dirty(bh)) + clear_buffer_dirty(bh); + /* + * Submit the buffer and wait for i/o to complete but + * only for the first buffer so we do not miss really + * serious i/o errors. Once the first buffer has + * completed ignore errors afterwards as we can assume + * that if one buffer worked all of them will work. + */ + submit_bh(REQ_OP_WRITE, bh); + if (should_wait) { + should_wait = false; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + goto io_err; + } + brelse(bh); + } while (++block < end_block); + } while ((++rl)->vcn < end_vcn); + up_write(&log_ni->runlist.lock); + /* + * Zap the pages again just in case any got instantiated whilst we were + * emptying the blocks by hand. FIXME: We may not have completed + * writing to all the buffer heads yet so this may happen too early. + * We really should use a kernel thread to do the emptying + * asynchronously and then we can also set the volume dirty and output + * an error message if emptying should fail. + */ + truncate_inode_pages(log_vi->i_mapping, 0); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetLogFileEmpty(vol); + ntfs_debug("Done."); + return true; +io_err: + ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); + goto dirty_err; +rl_err: + ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); +dirty_err: + NVolSetErrors(vol); + err = -EIO; +err: + up_write(&log_ni->runlist.lock); + ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + -err); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h new file mode 100644 index 000000000000..429d4909cc72 --- /dev/null +++ b/fs/ntfs/logfile.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2000-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_LOGFILE_H +#define _LINUX_NTFS_LOGFILE_H + +#ifdef NTFS_RW + +#include + +#include "types.h" +#include "endian.h" +#include "layout.h" + +/* + * Journal ($LogFile) organization: + * + * Two restart areas present in the first two pages (restart pages, one restart + * area in each page). When the volume is dismounted they should be identical, + * except for the update sequence array which usually has a different update + * sequence number. + * + * These are followed by log records organized in pages headed by a log record + * header going up to log file size. Not all pages contain log records when a + * volume is first formatted, but as the volume ages, all records will be used. + * When the log file fills up, the records at the beginning are purged (by + * modifying the oldest_lsn to a higher value presumably) and writing begins + * at the beginning of the file. Effectively, the log file is viewed as a + * circular entity. + * + * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept + * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We + * probably only want to support 1.1 as this seems to be the current version + * and we don't know how that differs from the older versions. 
The only + * exception is if the journal is clean as marked by the two restart pages + * then it doesn't matter whether we are on an earlier version. We can just + * reinitialize the logfile and start again with version 1.1. + */ + +/* Some $LogFile related constants. */ +#define MaxLogFileSize 0x100000000ULL +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 48 + +/* + * Log file restart page header (begins the restart area). + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ +/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ +/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. + When creating, set this to be immediately + after this header structure (without any + alignment). */ +/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ + +/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by + chkdsk. Only used when the magic is changed + to "CHKD". Otherwise this is zero. */ +/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file + was created, has to be >= 512 and a power of + 2. Use this to calculate the required size + of the usa (usa_count) and add it to usa_ofs. + Then verify that the result is less than the + value of the restart_area_offset. */ +/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= + 512 and a power of 2. The default is 4096 + and is used when the system page size is + between 4096 and 8192. Otherwise this is + set to the system page size instead. */ +/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to + the RESTART_AREA. Value has to be aligned + to 8-byte boundary. When creating, set this + to be after the usa. */ +/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major + version is 1. */ +/* 28*/ sle16 major_ver; /* Log file major version. We only support + version 1.1. */ +/* sizeof() = 30 (0x1e) bytes */ +} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; + +/* + * Constant for the log client indices meaning that there are no client records + * in this particular client array. Also inside the client records themselves, + * this means that there are no client records preceding or following this one. + */ +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT_CPU 0xffff + +/* + * These are the so far known RESTART_AREA_* flags (16-bit) which contain + * information about the log file in which they are present. + */ +enum { + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ +} __attribute__ ((__packed__)); + +typedef le16 RESTART_AREA_FLAGS; + +/* + * Log file restart area record. The offset of this record is found by adding + * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found + * in it. See notes at restart_area_offset above. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log + when the restart area was last written. + This happens often but what is the interval? + Is it just fixed time or is it every time a + check point is written or somethine else? + On create set to 0. */ +/* 8*/ le16 log_clients; /* Number of log client records in the array of + log client records which follows this + restart area. Must be 1. */ +/* 10*/ le16 client_free_list; /* The index of the first free log client record + in the array of log client records. 
+ LOGFILE_NO_CLIENT means that there are no + free log client records in the array. + If != LOGFILE_NO_CLIENT, check that + log_clients > client_free_list. On Win2k + and presumably earlier, on a clean volume + this is != LOGFILE_NO_CLIENT, and it should + be 0, i.e. the first (and only) client + record is free and thus the logfile is + closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be LOGFILE_NO_CLIENT. On WinXP + and presumably later, the logfile is always + open, even on clean shutdown so this should + always be LOGFILE_NO_CLIENT. */ +/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client + record in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + in-use log client records in the array. If + != LOGFILE_NO_CLIENT check that log_clients + > client_in_use_list. On Win2k and + presumably earlier, on a clean volume this + is LOGFILE_NO_CLIENT, i.e. there are no + client records in use and thus the logfile + is closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be != LOGFILE_NO_CLIENT, and it + should be 0, i.e. the first (and only) + client record is in use. On WinXP and + presumably later, the logfile is always + open, even on clean shutdown so this should + always be 0. */ +/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k + and presumably earlier this is always 0. On + WinXP and presumably later, if the logfile + was shutdown cleanly, the second bit, + RESTART_VOLUME_IS_CLEAN, is set. This bit + is cleared when the volume is mounted by + WinXP and set when the volume is dismounted, + thus if the logfile is dirty, this bit is + clear. Thus we don't need to check the + Windows version to determine if the logfile + is clean. Instead if the logfile is closed, + we know it must be clean. If it is open and + this bit is set, we also know it must be + clean. If on the other hand the logfile is + open and this bit is clear, we can be almost + certain that the logfile is dirty. */ +/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence + number. This is calculated as 67 - the + number of bits required to store the logfile + size in bytes and this can be used in with + the specified file_size as a consistency + check. */ +/* 20*/ le16 restart_area_length;/* Length of the restart area including the + client array. Following checks required if + version matches. Otherwise, skip them. + restart_area_offset + restart_area_length + has to be <= system_page_size. Also, + restart_area_length has to be >= + client_array_offset + (log_clients * + sizeof(log client record)). */ +/* 22*/ le16 client_array_offset;/* Offset from the start of this record to + the first log client record if versions are + matched. When creating, set this to be + after this restart area structure, aligned + to 8-bytes boundary. If the versions do not + match, this is ignored and the offset is + assumed to be (sizeof(RESTART_AREA) + 7) & + ~7, i.e. rounded up to first 8-byte + boundary. Either way, client_array_offset + has to be aligned to an 8-byte boundary. + Also, restart_area_offset + + client_array_offset has to be <= 510. + Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + system_page_size. On Win2k and presumably + earlier, this is 0x30, i.e. immediately + following this record. On WinXP and + presumably later, this is 0x40, i.e. 
there + are 16 extra bytes between this record and + the client array. This probably means that + the RESTART_AREA record is actually bigger + in WinXP and later. */ +/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the + restart_area_offset + the offset of the + file_size are > 510 then corruption has + occurred. This is the very first check when + starting with the restart_area as if it + fails it means that some of the above values + will be corrupted by the multi sector + transfer protection. The file_size has to + be rounded down to be a multiple of the + log_page_size in the RESTART_PAGE_HEADER and + then it has to be at least big enough to + store the two restart pages and 48 (0x30) + log record pages. */ +/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including + the log record header. On create set to + 0. */ +/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. + If the version matches then check that the + value of log_record_header_length is a + multiple of 8, i.e. + (log_record_header_length + 7) & ~7 == + log_record_header_length. When creating set + it to sizeof(LOG_RECORD_HEADER), aligned to + 8 bytes. */ +/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record + page. Must be a multiple of 8. On create + set it to immediately after the update + sequence array of the log record page. */ +/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every + time the logfile is restarted which happens + at mount time when the logfile is opened. + When creating set to a random value. Win2k + sets it to the low 32 bits of the current + system time in NTFS format (see time.h). */ +/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +/* sizeof() = 48 (0x30) bytes */ +} __attribute__ ((__packed__)) RESTART_AREA; + +/* + * Log client record. The offset of this record is found by adding the offset + * of the RESTART_AREA to the client_array_offset value found in it. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create + set to 0. */ +/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart + the volume, i.e. the current position within + the log file. At present, if clean this + should = current_lsn in restart area but it + probably also = current_lsn when dirty most + of the time. At create set to 0. */ +/* 16*/ le16 prev_client; /* The offset to the previous log client record + in the array of log client records. + LOGFILE_NO_CLIENT means there is no previous + client record, i.e. this is the first one. + This is always LOGFILE_NO_CLIENT. */ +/* 18*/ le16 next_client; /* The offset to the next log client record in + the array of log client records. + LOGFILE_NO_CLIENT means there are no next + client records, i.e. this is the last one. + This is always LOGFILE_NO_CLIENT. */ +/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set + to zero every time the logfile is restarted + and it is incremented when the logfile is + closed at dismount time. Thus it is 0 when + dirty and 1 when clean. On WinXP and + presumably later, this is always 0. */ +/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ +/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should + always be 8. */ +/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should + always be "NTFS" with the remaining bytes + set to 0. 
*/ +/* sizeof() = 160 (0xa0) bytes */ +} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; + +extern bool ntfs_check_logfile(struct inode *log_vi, + RESTART_PAGE_HEADER **rp); + +extern bool ntfs_is_logfile_clean(struct inode *log_vi, + const RESTART_PAGE_HEADER *rp); + +extern bool ntfs_empty_logfile(struct inode *log_vi); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h new file mode 100644 index 000000000000..7068425735f1 --- /dev/null +++ b/fs/ntfs/malloc.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_MALLOC_H +#define _LINUX_NTFS_MALLOC_H + +#include +#include +#include + +/** + * __ntfs_malloc - allocate memory in multiples of pages + * @size: number of bytes to allocate + * @gfp_mask: extra flags for the allocator + * + * Internal function. You probably want ntfs_malloc_nofs()... + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + * Depending on @gfp_mask the allocation may be guaranteed to succeed. + */ +static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) +{ + if (likely(size <= PAGE_SIZE)) { + BUG_ON(!size); + /* kmalloc() has per-CPU caches so is faster for now. */ + return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + /* return (void *)__get_free_page(gfp_mask); */ + } + if (likely((size >> PAGE_SHIFT) < totalram_pages())) + return __vmalloc(size, gfp_mask); + return NULL; +} + +/** + * ntfs_malloc_nofs - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); +} + +/** + * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs_nofail(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); +} + +static inline void ntfs_free(void *addr) +{ + kvfree(addr); +} + +#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c new file mode 100644 index 000000000000..6fd1dc4b08c8 --- /dev/null +++ b/fs/ntfs/mft.c @@ -0,0 +1,2907 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. 
+ * Copyright (c) 2002 Richard Russon + */ + +#include +#include +#include +#include + +#include "attrib.h" +#include "aops.h" +#include "bitmap.h" +#include "debug.h" +#include "dir.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) + +/** + * map_mft_record_page - map the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to map + * + * This maps the page in which the mft record of the ntfs inode @ni is situated + * and returns a pointer to the mft record within the mapped page. + * + * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() + * contains the negative error code returned. + */ +static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) +{ + loff_t i_size; + ntfs_volume *vol = ni->vol; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + unsigned long index, end_index; + unsigned ofs; + + BUG_ON(ni->page); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. FIXME: We need to check for + * overflowing the unsigned long, but I don't think we would ever get + * here if the volume was that big... + */ + index = (u64)ni->mft_no << vol->mft_record_size_bits >> + PAGE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + + i_size = i_size_read(mft_vi); + /* The maximum valid index into the page cache for $MFT's data. */ + end_index = i_size >> PAGE_SHIFT; + + /* If the wanted index is out of bounds the mft record doesn't exist. */ + if (unlikely(index >= end_index)) { + if (index > end_index || (i_size & ~PAGE_MASK) < ofs + + vol->mft_record_size) { + page = ERR_PTR(-ENOENT); + ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " + "which is beyond the end of the mft. " + "This is probably a bug in the ntfs " + "driver.", ni->mft_no); + goto err_out; + } + } + /* Read, map, and pin the page. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (!IS_ERR(page)) { + /* Catch multi sector transfer fixup errors. */ + if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + + ofs)))) { + ni->page = page; + ni->page_ofs = ofs; + return page_address(page) + ofs; + } + ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " + "Run chkdsk.", ni->mft_no); + ntfs_unmap_page(page); + page = ERR_PTR(-EIO); + NVolSetErrors(vol); + } +err_out: + ni->page = NULL; + ni->page_ofs = 0; + return (void*)page; +} + +/** + * map_mft_record - map, pin and lock an mft record + * @ni: ntfs inode whose MFT record to map + * + * First, take the mrec_lock mutex. We might now be sleeping, while waiting + * for the mutex if it was already locked by someone else. + * + * The page of the record is mapped using map_mft_record_page() before being + * returned to the caller. + * + * This in turn uses ntfs_map_page() to get the page containing the wanted mft + * record (it in turn calls read_cache_page() which reads it in from disk if + * necessary, increments the use count on the page so that it cannot disappear + * under us and returns a reference to the page cache page). + * + * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it + * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed + * and the post-read mst fixups on each mft record in the page have been + * performed, the page gets PG_uptodate set and PG_locked cleared (this is done + * in our asynchronous I/O completion handler end_buffer_read_mft_async()). 
+ * ntfs_map_page() waits for PG_locked to become clear and checks if + * PG_uptodate is set and returns an error code if not. This provides + * sufficient protection against races when reading/using the page. + * + * However there is the write mapping to think about. Doing the above described + * checking here will be fine, because when initiating the write we will set + * PG_locked and clear PG_uptodate making sure nobody is touching the page + * contents. Doing the locking this way means that the commit to disk code in + * the page cache code paths is automatically sufficiently locked with us as + * we will not touch a page that has been locked or is not uptodate. The only + * locking problem then is them locking the page while we are accessing it. + * + * So that code will end up having to own the mrec_lock of all mft + * records/inodes present in the page before I/O can proceed. In that case we + * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be + * accessing anything without owning the mrec_lock mutex. But we do need to + * use them because of the read_cache_page() invocation and the code becomes so + * much simpler this way that it is well worth it. + * + * The mft record is now ours and we return a pointer to it. You need to check + * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return + * the error code. + * + * NOTE: Caller is responsible for setting the mft record dirty before calling + * unmap_mft_record(). This is obviously only necessary if the caller really + * modified the mft record... + * Q: Do we want to recycle one of the VFS inode state bits instead? + * A: No, the inode ones mean we want to change the mft record, not we want to + * write it out. + */ +MFT_RECORD *map_mft_record(ntfs_inode *ni) +{ + MFT_RECORD *m; + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + + /* Serialize access to this mft record. */ + mutex_lock(&ni->mrec_lock); + + m = map_mft_record_page(ni); + if (!IS_ERR(m)) + return m; + + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); + return m; +} + +/** + * unmap_mft_record_page - unmap the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to unmap + * + * This unmaps the page in which the mft record of the ntfs inode @ni is + * situated and returns. This is a NOOP if highmem is not configured. + * + * The unmap happens via ntfs_unmap_page() which in turn decrements the use + * count on the page thus releasing it from the pinned state. + * + * We do not actually unmap the page from memory of course, as that will be + * done by the page cache code itself when memory pressure increases or + * whatever. + */ +static inline void unmap_mft_record_page(ntfs_inode *ni) +{ + BUG_ON(!ni->page); + + // TODO: If dirty, blah... + ntfs_unmap_page(ni->page); + ni->page = NULL; + ni->page_ofs = 0; + return; +} + +/** + * unmap_mft_record - release a mapped mft record + * @ni: ntfs inode whose MFT record to unmap + * + * We release the page mapping and the mrec_lock mutex which unmaps the mft + * record and releases it for others to get hold of. We also release the ntfs + * inode by decrementing the ntfs inode reference count. + * + * NOTE: If caller has modified the mft record, it is imperative to set the mft + * record dirty BEFORE calling unmap_mft_record(). 
+ */ +void unmap_mft_record(ntfs_inode *ni) +{ + struct page *page = ni->page; + + BUG_ON(!page); + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + unmap_mft_record_page(ni); + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + /* + * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to + * ntfs_clear_extent_inode() in the extent inode case, and to the + * caller in the non-extent, yet pure ntfs inode case, to do the actual + * tear down of all structures and freeing of all allocated memory. + */ + return; +} + +/** + * map_extent_mft_record - load an extent inode and attach it to its base + * @base_ni: base ntfs inode + * @mref: mft reference of the extent inode to load + * @ntfs_ino: on successful return, pointer to the ntfs_inode structure + * + * Load the extent mft record @mref and attach it to its base inode @base_ni. + * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise + * PTR_ERR(result) gives the negative error code. + * + * On successful return, @ntfs_ino contains a pointer to the ntfs_inode + * structure of the mapped extent inode. + */ +MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino) +{ + MFT_RECORD *m; + ntfs_inode *ni = NULL; + ntfs_inode **extent_nis = NULL; + int i; + unsigned long mft_no = MREF(mref); + u16 seq_no = MSEQNO(mref); + bool destroy_ni = false; + + ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", + mft_no, base_ni->mft_no); + /* Make sure the base ntfs inode doesn't go away. */ + atomic_inc(&base_ni->count); + /* + * Check if this extent inode has already been added to the base inode, + * in which case just return it. If not found, add it to the base + * inode before returning it. + */ + mutex_lock(&base_ni->extent_lock); + if (base_ni->nr_extents > 0) { + extent_nis = base_ni->ext.extent_ntfs_inos; + for (i = 0; i < base_ni->nr_extents; i++) { + if (mft_no != extent_nis[i]->mft_no) + continue; + ni = extent_nis[i]; + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + break; + } + } + if (likely(ni != NULL)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* We found the record; just have to map and return it. */ + m = map_mft_record(ni); + /* map_mft_record() has incremented this on success. */ + atomic_dec(&ni->count); + if (!IS_ERR(m)) { + /* Verify the sequence number. */ + if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { + ntfs_debug("Done 1."); + *ntfs_ino = ni; + return m; + } + unmap_mft_record(ni); + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. " + "Run chkdsk."); + return ERR_PTR(-EIO); + } +map_err_out: + ntfs_error(base_ni->vol->sb, "Failed to map extent " + "mft record, error code %ld.", -PTR_ERR(m)); + return m; + } + /* Record wasn't there. Get a new ntfs inode and initialize it. */ + ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); + if (unlikely(!ni)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + return ERR_PTR(-ENOMEM); + } + ni->vol = base_ni->vol; + ni->seq_no = seq_no; + ni->nr_extents = -1; + ni->ext.base_ntfs_ino = base_ni; + /* Now map the record. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_clear_extent_inode(ni); + goto map_err_out; + } + /* Verify the sequence number if it is present. 
*/ + if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. Run chkdsk."); + destroy_ni = true; + m = ERR_PTR(-EIO); + goto unm_err_out; + } + /* Attach extent inode to base inode, reallocating memory if needed. */ + if (!(base_ni->nr_extents & 3)) { + ntfs_inode **tmp; + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); + + tmp = kmalloc(new_size, GFP_NOFS); + if (unlikely(!tmp)) { + ntfs_error(base_ni->vol->sb, "Failed to allocate " + "internal buffer."); + destroy_ni = true; + m = ERR_PTR(-ENOMEM); + goto unm_err_out; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - + 4 * sizeof(ntfs_inode *)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = tmp; + } + base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_debug("Done 2."); + *ntfs_ino = ni; + return m; +unm_err_out: + unmap_mft_record(ni); + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* + * If the extent inode was not attached to the base inode we need to + * release it or we will leak memory. + */ + if (destroy_ni) + ntfs_clear_extent_inode(ni); + return m; +} + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + mark_ntfs_record_dirty(ni->page, ni->page_ofs); + /* Determine the base vfs inode and mark it dirty, too. */ + mutex_lock(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. 
Thank you."; + +/** + * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, + * bypassing the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. + * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, + const unsigned long mft_no, MFT_RECORD *m) +{ + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs volume @vol. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync) +{ + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[MAX_BHS]; + struct buffer_head *bh, *head; + u8 *kmirr; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end, page_ofs; + int i_bhs, nr_bhs, err = 0; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + BUG_ON(!max_bhs); + if (WARN_ON(max_bhs > MAX_BHS)) + return -EINVAL; + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... */ + err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. */ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> + (PAGE_SHIFT - vol->mft_record_size_bits)); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + /* Offset of the mft mirror record inside the page. */ + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + /* The address in the page of the mirror copy of the mft record @m. 
*/ + kmirr = page_address(page) + page_ofs; + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Create uptodate buffers if not present. */ + if (unlikely(!page_has_buffers(page))) { + struct buffer_head *tail; + + bh = head = alloc_page_buffers(page, blocksize, true); + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_private(page, head); + } + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mftmirr_ino)-> + runlist.lock); + rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; + /* + * $MFTMirr always has the whole of its runlist + * in memory. + */ + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFTMirr, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft mirror " + "record 0x%lx because its " + "location on disk could not " + "be determined (error code " + "%lli).", mft_no, + (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and + * buffer states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. 
*/ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + if (likely(!err)) { + ntfs_debug("Done."); + } else { + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx!", mft_no); +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " + "code %i). Volume will be left marked dirty " + "on umount. Run ntfsfix on the partition " + "after umounting to correct this.", -err); + NVolSetErrors(vol); + } + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * We only write the mft record if the ntfs inode @ni is dirty and the first + * buffer belonging to its mft record is dirty, too. We ignore the dirty state + * of subsequent buffers because we could have raced with + * fs/ntfs/aops.c::mark_ntfs_record_dirty(). + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[MAX_BHS]; + struct buffer_head *bh, *head; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!PageLocked(page)); + if (WARN_ON(max_bhs > MAX_BHS)) { + err = -EINVAL; + goto err_out; + } + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races since the caller is holding the locks + * for the mft record @m and the page it is in. + */ + if (!NInoTestClearDirty(ni)) + goto done; + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. 
*/ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* + * If this block is not the first one in the record, we ignore + * the buffer's dirty state because we could have raced with a + * parallel mark_ntfs_record_dirty(). + */ + if (block_start == m_start) { + /* This block is the first one in the record. */ + if (!buffer_dirty(bh)) { + BUG_ON(nr_bhs); + /* Clean records are not written out. */ + break; + } + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); + rl = NTFS_I(vol->mft_ino)->runlist.rl; + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFT, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft record " + "0x%lx because its location " + "on disk could not be " + "determined (error code %lli).", + ni->mft_no, (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); + if (!nr_bhs) + goto done; + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. 
You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } else + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_may_write_mft_record - check if an mft record may be written out + * @vol: [IN] ntfs volume on which the mft record to check resides + * @mft_no: [IN] mft record number of the mft record to check + * @m: [IN] mapped mft record to check + * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned + * + * Check if the mapped (base or extent) mft record @m with mft record number + * @mft_no belonging to the ntfs volume @vol may be written out. If necessary + * and possible the ntfs inode of the mft record is locked and the base vfs + * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The + * caller is responsible for unlocking the ntfs inode and unpinning the base + * vfs inode. + * + * Return 'true' if the mft record may be written out and 'false' if not. + * + * The caller has locked the page and cleared the uptodate flag on it which + * means that we can safely write out any dirty mft records that do not have + * their inodes in icache as determined by ilookup5() as anyone + * opening/creating such an inode would block when attempting to map the mft + * record in read_cache_page() until we are finished with the write out. + * + * Here is a description of the tests we perform: + * + * If the inode is found in icache we know the mft record must be a base mft + * record. If it is dirty, we do not write it and return 'false' as the vfs + * inode write paths will result in the access times being updated which would + * cause the base mft record to be redirtied and written out again. (We know + * the access time update will modify the base mft record because Windows + * chkdsk complains if the standard information attribute is not in the base + * mft record.) + * + * If the inode is in icache and not dirty, we attempt to lock the mft record + * and if we find the lock was already taken, it is not safe to write the mft + * record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the mft record, + * which also allows us safe writeout of the mft record. We then set + * @locked_ni to the locked ntfs inode and return 'true'. + * + * Note we cannot just lock the mft record and sleep while waiting for the lock + * because this would deadlock due to lock reversal (normally the mft record is + * locked before the page is locked but we already have the page locked here + * when we try to lock the mft record). + * + * If the inode is not in icache we need to perform further checks. + * + * If the mft record is not a FILE record or it is a base mft record, we can + * safely write it and return 'true'. + * + * We now know the mft record is an extent mft record. We check if the inode + * corresponding to its base mft record is in icache and obtain a reference to + * it if it is. 
If it is not, we can safely write it and return 'true'. + * + * We now have the base inode for the extent mft record. We check if it has an + * ntfs inode for the extent mft record attached and if not it is safe to write + * the extent mft record and we return 'true'. + * + * The ntfs inode for the extent mft record is attached to the base inode so we + * attempt to lock the extent mft record and if we find the lock was already + * taken, it is not safe to write the extent mft record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the extent mft + * record, which also allows us safe writeout of the extent mft record. We + * set the ntfs inode of the extent mft record clean and then set @locked_ni to + * the now locked ntfs inode and return 'true'. + * + * Note, the reason for actually writing dirty mft records here and not just + * relying on the vfs inode dirty code paths is that we can have mft records + * modified without them ever having actual inodes in memory. Also we can have + * dirty mft records with clean ntfs inodes in memory. None of the described + * cases would result in the dirty mft records being written out if we only + * relied on the vfs inode dirty code paths. And these cases can really occur + * during allocation of new mft records and in particular when the + * initialized_size of the $MFT/$DATA attribute is extended and the new space + * is initialized using ntfs_mft_record_format(). The clean inode can then + * appear if the mft record is reused for a new inode before it got written + * out. + */ +bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, + const MFT_RECORD *m, ntfs_inode **locked_ni) +{ + struct super_block *sb = vol->sb; + struct inode *mft_vi = vol->mft_ino; + struct inode *vi; + ntfs_inode *ni, *eni, **extent_nis; + int i; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + /* + * Normally we do not return a locked inode so set @locked_ni to NULL. + */ + BUG_ON(!locked_ni); + *locked_ni = NULL; + /* + * Check if the inode corresponding to this mft record is in the VFS + * inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths races with the + * page dirty code path while writing $MFT. + */ + vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); + } + if (vi) { + ntfs_debug("Base inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. */ + ni = NTFS_I(vi); + /* Take a reference to the ntfs inode. */ + atomic_inc(&ni->count); + /* If the inode is dirty, do not write this record. */ + if (NInoDirty(ni)) { + ntfs_debug("Inode 0x%lx is dirty, do not write it.", + mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Inode 0x%lx is not dirty.", mft_no); + /* The inode is not dirty, try to take the mft record lock. 
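+ * (A blocking mutex_lock() here could deadlock: the page lock is + * already held, while the normal ordering takes the mft record lock + * before the page lock, as explained in the function description + * above; hence the trylock.)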
*/ + if (unlikely(!mutex_trylock(&ni->mrec_lock))) { + ntfs_debug("Mft record 0x%lx is already locked, do " + "not write it.", mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Managed to lock mft record 0x%lx, write it.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so + * return the locked ntfs inode. + */ + *locked_ni = ni; + return true; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Write the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_record(m->magic)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", + mft_no); + return true; + } + /* Write the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, write it.", + mft_no); + return true; + } + /* + * This is an extent mft record. Check if the inode corresponding to + * its base mft record is in icache and obtain a reference to it if it + * is. + */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " + "inode 0x%lx in icache.", mft_no, na.mft_no); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache, write this extent mft + * record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, write the " + "extent record.", na.mft_no); + return true; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent inode + * corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + mutex_lock(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes, write this + * extent mft record. + */ + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Base inode 0x%lx has no attached extent inodes, " + "write the extent record.", na.mft_no); + return true; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { + if (mft_no == extent_nis[i]->mft_no) { + /* + * Found the extent inode corresponding to this extent + * mft record. + */ + eni = extent_nis[i]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, write this + * extent mft record. + */ + if (!eni) { + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Extent inode 0x%lx is not attached to its base " + "inode 0x%lx, write the extent record.", + mft_no, na.mft_no); + return true; + } + ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", + mft_no, na.mft_no); + /* Take a reference to the extent ntfs inode. */ + atomic_inc(&eni->count); + mutex_unlock(&ni->extent_lock); + /* + * Found the extent inode corresponding to this extent mft record. + * Try to take the mft record lock. + */ + if (unlikely(!mutex_trylock(&eni->mrec_lock))) { + atomic_dec(&eni->count); + iput(vi); + ntfs_debug("Extent mft record 0x%lx is already locked, do " + "not write it.", mft_no); + return false; + } + ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", + mft_no); + if (NInoTestClearDirty(eni)) + ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so return + * the locked extent ntfs inode. 
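+ * (Per the function description above, the caller must unlock + * eni->mrec_lock and unpin the base vfs inode once it is done with + * the record.)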
+ */ + *locked_ni = eni; + return true; +} + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name + * @vol: volume on which to search for a free mft record + * @base_ni: open base inode if allocating an extent mft record or NULL + * + * Search for a free mft record in the mft bitmap attribute on the ntfs volume + * @vol. + * + * If @base_ni is NULL start the search at the default allocator position. + * + * If @base_ni is not NULL start the search at the mft record after the base + * mft record @base_ni. + * + * Return the free mft record on success and -errno on error. An error code of + * -ENOSPC means that there are no free mft records in the currently + * initialized mft bitmap. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, + ntfs_inode *base_ni) +{ + s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; + struct address_space *mftbmp_mapping; + u8 *buf, *byte; + struct page *page; + unsigned int page_ofs, size; + u8 pass, b; + + ntfs_debug("Searching for free mft record in the currently " + "initialized mft bitmap."); + mftbmp_mapping = vol->mftbmp_ino->i_mapping; + /* + * Set the end of the pass making sure we do not overflow the mft + * bitmap. + */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); + pass_end = NTFS_I(vol->mft_ino)->allocated_size >> + vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + if (pass_end > ll) + pass_end = ll; + pass = 1; + if (!base_ni) + data_pos = vol->mft_data_pos; + else + data_pos = base_ni->mft_no + 1; + if (data_pos < 24) + data_pos = 24; + if (data_pos >= pass_end) { + data_pos = 24; + pass = 2; + /* This happens on a freshly formatted volume. */ + if (data_pos >= pass_end) + return -ENOSPC; + } + pass_start = data_pos; + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " + "pass_end 0x%llx, data_pos 0x%llx.", pass, + (long long)pass_start, (long long)pass_end, + (long long)data_pos); + /* Loop until a free mft record is found. */ + for (; pass <= 2;) { + /* Cap size to pass_end. */ + ofs = data_pos >> 3; + page_ofs = ofs & ~PAGE_MASK; + size = PAGE_SIZE - page_ofs; + ll = ((pass_end + 7) >> 3) - ofs; + if (size > ll) + size = ll; + size <<= 3; + /* + * If we are still within the active pass, search the next page + * for a zero bit. + */ + if (size) { + page = ntfs_map_page(mftbmp_mapping, + ofs >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read mft " + "bitmap, aborting."); + return PTR_ERR(page); + } + buf = (u8*)page_address(page) + page_ofs; + bit = data_pos & 7; + data_pos &= ~7ull; + ntfs_debug("Before inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + for (; bit < size && data_pos + bit < pass_end; + bit &= ~7ull, bit += 8) { + byte = buf + (bit >> 3); + if (*byte == 0xff) + continue; + b = ffz((unsigned long)*byte); + if (b < 8 && b >= (bit & 7)) { + ll = data_pos + (bit & ~7ull) + b; + if (unlikely(ll > (1ll << 32))) { + ntfs_unmap_page(page); + return -ENOSPC; + } + *byte |= 1 << b; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done. 
(Found and " + "allocated mft record " + "0x%llx.)", + (long long)ll); + return ll; + } + } + ntfs_debug("After inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + data_pos += size; + ntfs_unmap_page(page); + /* + * If the end of the pass has not been reached yet, + * continue searching the mft bitmap for a zero bit. + */ + if (data_pos < pass_end) + continue; + } + /* Do the next pass. */ + if (++pass == 2) { + /* + * Starting the second pass, in which we scan the first + * part of the zone which we omitted earlier. + */ + pass_end = pass_start; + data_pos = pass_start = 24; + ntfs_debug("pass %i, pass_start 0x%llx, pass_end " + "0x%llx.", pass, (long long)pass_start, + (long long)pass_end); + if (data_pos >= pass_end) + break; + } + } + /* No free mft records in currently initialized mft bitmap. */ + ntfs_debug("Done. (No free mft records left in currently initialized " + "mft bitmap.)"); + return -ENOSPC; +} + +/** + * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for + * writing and releases it before returning. + * - This function takes vol->lcnbmp_lock for writing and releases it + * before returning. + */ +static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + s64 ll; + unsigned long flags; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni; + runlist_element *rl, *rl2 = NULL; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + u8 *b, tb; + struct { + u8 added_cluster:1; + u8 added_run:1; + u8 mp_rebuilt:1; + } status = { 0, 0, 0 }; + + ntfs_debug("Extending mft bitmap allocation."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + /* + * Determine the last lcn of the mft bitmap. The allocated size of the + * mft bitmap cannot be zero so we are ok to do this. + */ + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft bitmap attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", + (long long)lcn); + /* + * Attempt to get the cluster following the last allocated cluster by + * hand as it may be in the MFT zone so the allocator would not give it + * to us. 
+ */ + ll = lcn >> 3; + page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, + ll >> PAGE_SHIFT); + if (IS_ERR(page)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to read from lcn bitmap."); + return PTR_ERR(page); + } + b = (u8*)page_address(page) + (ll & ~PAGE_MASK); + tb = 1 << (lcn & 7ull); + down_write(&vol->lcnbmp_lock); + if (*b != 0xff && !(*b & tb)) { + /* Next cluster is free, allocate it. */ + *b |= tb; + flush_dcache_page(page); + set_page_dirty(page); + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Update the mft bitmap runlist. */ + rl->length++; + rl[1].vcn++; + status.added_cluster = 1; + ntfs_debug("Appending one cluster to mft bitmap."); + } else { + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Allocate a cluster from the DATA_ZONE. */ + rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, + true); + if (IS_ERR(rl2)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to allocate a cluster for " + "the mft bitmap."); + return PTR_ERR(rl2); + } + rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft " + "bitmap."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate " + "allocated cluster.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mftbmp_ni->runlist.rl = rl; + status.added_run = 1; + ntfs_debug("Adding one run to mft bitmap."); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + } + /* + * Update the attribute record as well. Note: @rl is the last + * (non-terminator) runlist element of mft bitmap. + */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft bitmap attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft bitmap attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. 
+ // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft bitmap attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + status.mp_rebuilt = 1; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array for " + "mft bitmap attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft bitmap allocated_size by one cluster. + * Reflect this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft bitmap attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + a->data.non_resident.allocated_size = + cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + a = ctx->attr; + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); +undo_alloc: + if (status.added_cluster) { + /* Truncate the last run in the runlist by one cluster. */ + rl->length--; + rl[1].vcn--; + } else if (status.added_run) { + lcn = rl->lcn; + /* Remove the last run from the runlist. */ + rl->lcn = rl[1].lcn; + rl->length = 0; + } + /* Deallocate the cluster. 
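+ * (In both undo cases @lcn already identifies the cluster to release: + * either the cluster appended to the last run or the lcn of the run + * that was just removed.)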
*/ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + if (status.mp_rebuilt) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the initialized portion of the mft bitmap attribute on the ntfs + * volume @vol by 8 bytes. + * + * Note: Only changes initialized_size and data_size, i.e. requires that + * allocated_size is big enough to fit the new initialized_size. + * + * Return 0 on success and -errno on error. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) +{ + s64 old_data_size, old_initialized_size; + unsigned long flags; + struct inode *mftbmp_vi; + ntfs_inode *mft_ni, *mftbmp_ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + ATTR_RECORD *a; + int ret; + + ntfs_debug("Extending mft bitmap initialized (and data) size."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_vi = vol->mftbmp_ino; + mftbmp_ni = NTFS_I(mftbmp_vi); + /* Get the attribute record. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + return PTR_ERR(mrec); + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto unm_err_out; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto put_err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); + old_initialized_size = mftbmp_ni->initialized_size; + /* + * We can simply update the initialized_size before filling the space + * with zeroes because the caller is holding the mft bitmap lock for + * writing which ensures that no one else is trying to access the data. + */ + mftbmp_ni->initialized_size += 8; + a->data.non_resident.initialized_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + /* Initialize the mft bitmap attribute value with zeroes. 
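+ * (The ntfs_attr_set() call below zeroes the newly initialized byte + * range [old_initialized_size, old_initialized_size + 8) of the + * bitmap.)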
*/ + ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); + if (likely(!ret)) { + ntfs_debug("Done. (Wrote eight initialized bytes to mft " + "bitmap.)"); + return 0; + } + ntfs_error(vol->sb, "Failed to write to mft bitmap."); + /* Try to recover from the error. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record.%s", es); + NVolSetErrors(vol); + return ret; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context.%s", es); + NVolSetErrors(vol); + goto unm_err_out; + } + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute.%s", es); + NVolSetErrors(vol); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(mft_ni); + goto err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->initialized_size = old_initialized_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(old_initialized_size); + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); + a->data.non_resident.data_size = cpu_to_sle64(old_data_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(mftbmp_vi), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ +err_out: + return ret; +} + +/** + * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute + * @vol: volume on which to extend the mft data attribute + * + * Extend the mft data attribute on the ntfs volume @vol by 16 mft records + * worth of clusters or if not enough space for this by one mft record worth + * of clusters. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for + * writing and releases it before returning. + * - This function calls functions which take vol->lcnbmp_lock for + * writing and release it before returning. + */ +static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + VCN old_last_vcn; + s64 min_nr, nr, ll; + unsigned long flags; + ntfs_inode *mft_ni; + runlist_element *rl, *rl2; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + bool mp_rebuilt = false; + + ntfs_debug("Extending mft data allocation."); + mft_ni = NTFS_I(vol->mft_ino); + /* + * Determine the preferred allocation location, i.e. the last lcn of + * the mft data attribute. The allocated size of the mft data + * attribute cannot be zero so we are ok to do this. 
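+ * (It follows that (ll - 1) >> cluster_size_bits below is always a + * valid vcn to look up in the runlist.)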
+ */ + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft data attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); + /* Minimum allocation is one mft record worth of clusters. */ + min_nr = vol->mft_record_size >> vol->cluster_size_bits; + if (!min_nr) + min_nr = 1; + /* Want to allocate 16 mft records worth of clusters. */ + nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; + if (!nr) + nr = min_nr; + /* Ensure we do not go above 2^32-1 mft records. */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + nr = min_nr; + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + ntfs_warning(vol->sb, "Cannot allocate mft record " + "because the maximum number of inodes " + "(2^32) has already been reached."); + up_write(&mft_ni->runlist.lock); + return -ENOSPC; + } + } + ntfs_debug("Trying mft data allocation with %s cluster count %lli.", + nr > min_nr ? "default" : "minimal", (long long)nr); + old_last_vcn = rl[1].vcn; + do { + rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, + true); + if (!IS_ERR(rl2)) + break; + if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { + ntfs_error(vol->sb, "Failed to allocate the minimal " + "number of clusters (%lli) for the " + "mft data attribute.", (long long)nr); + up_write(&mft_ni->runlist.lock); + return PTR_ERR(rl2); + } + /* + * There is not enough space to do the allocation, but there + * might be enough space to do a minimal allocation so try that + * before failing. + */ + nr = min_nr; + ntfs_debug("Retrying mft data allocation with minimal cluster " + "count %lli.", (long long)nr); + } while (1); + rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft data " + "attribute."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate clusters " + "from the mft data attribute.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mft_ni->runlist.rl = rl; + ntfs_debug("Allocated %lli clusters.", (long long)nr); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + /* Update the attribute record as well. 
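+ * (The in-memory runlist now holds the new clusters; the mapping pairs + * array stored in the on-disk attribute record is rebuilt below to + * match it.)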
*/ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft data. */ + for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft data attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft data attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: Use the special reserved mft records and ensure that + // this extent is not required to find the mft record in + // question. If no free special records are left we would need + // to move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft data attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array of " + "mft data attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft data allocated_size by nr clusters. + * Reflect this in the ntfs_inode structure and the attribute record. + * @rl is the last (non-terminator) runlist element of mft data + * attribute. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. 
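+ * (The search context is about to be reused, so the extent just + * modified is flushed and marked dirty before the first extent is + * looked up.)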
+ */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, + mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, + ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft data attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + a->data.non_resident.allocated_size = + cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + ctx->attr->data.non_resident.highest_vcn = + cpu_to_sle64(old_last_vcn - 1); +undo_alloc: + if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to free clusters from mft data " + "attribute.%s", es); + NVolSetErrors(vol); + } + + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { + ntfs_error(vol->sb, "Failed to truncate mft data attribute " + "runlist.%s", es); + NVolSetErrors(vol); + } + if (ctx) { + a = ctx->attr; + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); + NVolSetErrors(vol); + } + ntfs_attr_put_search_ctx(ctx); + } + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_record_layout - layout an mft record into a memory buffer + * @vol: volume to which the mft record will belong + * @mft_no: mft reference specifying the mft record number + * @m: destination buffer of size >= @vol->mft_record_size bytes + * + * Layout an empty, unused mft record with the mft record number @mft_no into + * the buffer @m. The volume @vol is needed because the mft record structure + * was modified in NTFS 3.1 so we need to know which volume version this mft + * record will be used on. + * + * Return 0 on success and -errno on error. 
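+ * + * (Worked example, assuming the usual NTFS_BLOCK_SIZE of 512 bytes: for + * a 1024-byte mft record on an NTFS 3.1+ volume usa_count is set to + * 1024 / 512 + 1 = 3 and the attributes are placed at the first 8-byte + * aligned offset after the three two-byte update sequence entries.)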
+ */ +static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, + MFT_RECORD *m) +{ + ATTR_RECORD *a; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + if (mft_no >= (1ll << 32)) { + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " + "maximum of 2^32.", (long long)mft_no); + return -ERANGE; + } + /* Start by clearing the whole mft record to give us a clean slate. */ + memset(m, 0, vol->mft_record_size); + /* Aligned to 2-byte boundary. */ + if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); + else { + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); + /* + * Set the NTFS 3.1+ specific fields while we know that the + * volume version is 3.1+. + */ + m->reserved = 0; + m->mft_record_number = cpu_to_le32((u32)mft_no); + } + m->magic = magic_FILE; + if (vol->mft_record_size >= NTFS_BLOCK_SIZE) + m->usa_count = cpu_to_le16(vol->mft_record_size / + NTFS_BLOCK_SIZE + 1); + else { + m->usa_count = cpu_to_le16(1); + ntfs_warning(vol->sb, "Sector size is bigger than mft record " + "size. Setting usa_count to 1. If chkdsk " + "reports this as corruption, please email " + "linux-ntfs-dev@lists.sourceforge.net stating " + "that you saw this message and that the " + "modified filesystem created was corrupt. " + "Thank you."); + } + /* Set the update sequence number to 1. */ + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); + m->lsn = 0; + m->sequence_number = cpu_to_le16(1); + m->link_count = 0; + /* + * Place the attributes straight after the update sequence array, + * aligned to 8-byte boundary. + */ + m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + + (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); + m->flags = 0; + /* + * Using attrs_offset plus eight bytes (for the termination attribute). + * attrs_offset is already aligned to 8-byte boundary, so no need to + * align again. + */ + m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); + m->bytes_allocated = cpu_to_le32(vol->mft_record_size); + m->base_mft_record = 0; + m->next_attr_instance = 0; + /* Add the termination attribute. */ + a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); + a->type = AT_END; + a->length = 0; + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_format - format an mft record on an ntfs volume + * @vol: volume on which to format the mft record + * @mft_no: mft record number to format + * + * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused + * mft record into the appropriate place of the mft data attribute. This is + * used when extending the mft data attribute. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) +{ + loff_t i_size; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + MFT_RECORD *m; + pgoff_t index, end_index; + unsigned int ofs; + int err; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. + */ + index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + /* The maximum valid index into the page cache for $MFT's data. 
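+ * (A record index at or beyond end_index is only valid if the record + * still ends within the valid bytes of the last, partial page, which + * is what the check below enforces.)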
*/ + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_SHIFT; + if (unlikely(index >= end_index)) { + if (unlikely(index > end_index || ofs + vol->mft_record_size >= + (i_size & ~PAGE_MASK))) { + ntfs_error(vol->sb, "Tried to format non-existing mft " + "record 0x%llx.", (long long)mft_no); + return -ENOENT; + } + } + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing mft record " + "to format 0x%llx.", (long long)mft_no); + return PTR_ERR(page); + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + err = ntfs_mft_record_layout(vol, mft_no, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", + (long long)mft_no); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + return err; + } + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + /* + * Make sure the mft record is written out to disk. We could use + * ilookup5() to check if an inode is in icache and so on but this is + * unnecessary as ntfs_writepage() will write the dirty record anyway. + */ + mark_ntfs_record_dirty(page, ofs); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume + * @vol: [IN] volume on which to allocate the mft record + * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 + * @base_ni: [IN] open base inode if allocating an extent mft record or NULL + * @mrec: [OUT] on successful return this is the mapped mft record + * + * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. + * + * If @base_ni is NULL make the mft record a base mft record, i.e. a file or + * directory inode, and allocate it at the default allocator position. In + * this case @mode is the file mode as given to us by the caller. We in + * particular use @mode to distinguish whether a file or a directory is being + * created (S_ISDIR(mode) and S_ISREG(mode), respectively). + * + * If @base_ni is not NULL make the allocated mft record an extent record, + * allocate it starting at the mft record after the base mft record and attach + * the allocated and opened ntfs inode to the base inode @base_ni. In this + * case @mode must be 0 as it is meaningless for extent inodes. + * + * You need to check the return value with IS_ERR(). If false, the function + * was successful and the return value is the now opened ntfs inode of the + * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, + * and locked mft record. If IS_ERR() is true, the function failed and the + * error code is obtained from PTR_ERR(return value). *@mrec is undefined in + * this case. + * + * Allocation strategy: + * + * To find a free mft record, we scan the mft bitmap for a zero bit. To + * optimize this we start scanning at the place specified by @base_ni or if + * @base_ni is NULL we start where we last stopped and we perform wrap around + * when we reach the end. Note, we do not try to allocate mft records below + * number 24 because numbers 0 to 15 are the defined system files anyway and 16 + * to 24 are special in that they are used for storing extension mft records + * for the $DATA attribute of $MFT. This is required to avoid the possibility + * of creating a runlist with a circular dependency which once written to disk + * can never be read in again. 
Windows will only use records 16 to 24 for + * normal files if the volume is completely out of space. We never use them + * which means that when the volume is really out of space we cannot create any + * more files while Windows can still create up to 8 small files. We can start + * doing this at some later time, it does not matter much for now. + * + * When scanning the mft bitmap, we only search up to the last allocated mft + * record. If there are no free records left in the range 24 to number of + * allocated mft records, then we extend the $MFT/$DATA attribute in order to + * create free mft records. We extend the allocated size of $MFT/$DATA by 16 + * records at a time or one cluster, if cluster size is above 16kiB. If there + * is not sufficient space to do this, we try to extend by a single mft record + * or one cluster, if cluster size is above the mft record size. + * + * No matter how many mft records we allocate, we initialize only the first + * allocated mft record, incrementing mft data size and initialized size + * accordingly, open an ntfs_inode for it and return it to the caller, unless + * there are fewer than 24 mft records, in which case we allocate and initialize + * mft records until we reach record 24 which we consider as the first free mft + * record for use by normal files. + * + * If during any stage we overflow the initialized data in the mft bitmap, we + * extend the initialized size (and data size) by 8 bytes, allocating another + * cluster if required. The bitmap data size has to be at least equal to the + * number of mft records in the mft, but it can be bigger, in which case the + * superfluous bits are padded with zeroes. + * + * Thus, when we return successfully (IS_ERR() is false), we will have: + * - initialized / extended the mft bitmap if necessary, + * - initialized / extended the mft data if necessary, + * - set the bit corresponding to the mft record being allocated in the + * mft bitmap, + * - opened an ntfs_inode for the allocated mft record, and we will have + * - returned the ntfs_inode as well as the allocated mapped, pinned, and + * locked mft record. + * + * On error, the volume will be left in a consistent state and no record will + * be allocated. If rolling back a partial operation fails, we may leave some + * inconsistent metadata in which case we call NVolSetErrors() so the volume is + * left dirty when unmounted. + * + * Note, this function cannot make use of most of the normal functions, like + * for example for attribute resizing, etc, because when the run list overflows + * the base mft record and an attribute list is used, it is very important that + * the extension mft records used to store the $DATA attribute of $MFT can be + * reached without having to read the information contained inside them, as + * this would make it impossible to find them in the first place after the + * volume is unmounted. $MFT/$BITMAP probably does not need to follow this + * rule because the bitmap is not essential for finding the mft records, but on + * the other hand, handling the bitmap in this special way would make life + * easier because otherwise there might be circular invocations of functions + * when reading the bitmap. 
+ */ +ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec) +{ + s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; + struct inode *vi; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni, *ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + pgoff_t index; + unsigned int ofs; + int err; + le16 seq_no, usn; + bool record_formatted = false; + + if (base_ni) { + ntfs_debug("Entering (allocating an extent mft record for " + "base mft record 0x%llx).", + (long long)base_ni->mft_no); + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(mode); + } else + ntfs_debug("Entering (allocating a base mft record)."); + if (mode) { + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(base_ni); + /* We only support creation of normal files and directories. */ + if (!S_ISREG(mode) && !S_ISDIR(mode)) + return ERR_PTR(-EOPNOTSUPP); + } + BUG_ON(!mrec); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + down_write(&vol->mftbmp_lock); + bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); + if (bit >= 0) { + ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", + (long long)bit); + goto have_alloc_rec; + } + if (bit != -ENOSPC) { + up_write(&vol->mftbmp_lock); + return ERR_PTR(bit); + } + /* + * No free mft records left. If the mft bitmap already covers more + * than the currently used mft records, the next records are all free, + * so we can simply allocate the first unused mft record. + * Note: We also have to make sure that the mft bitmap at least covers + * the first 24 mft records as they are special and whilst they may not + * be in use, we do not allocate from them. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->initialized_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { + bit = ll; + if (bit < 24) + bit = 24; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + ntfs_debug("Found free record (#2), bit 0x%llx.", + (long long)bit); + goto found_free_rec; + } + /* + * The mft bitmap needs to be expanded until it covers the first unused + * mft record that we can allocate. + * Note: The smallest mft record we allocate is mft record 24. + */ + bit = old_data_initialized << 3; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { + /* Need to extend bitmap by one more cluster. 
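+ * (The bitmap's initialized data always grows in 8-byte steps, so the + * allocation must first be able to hold initialized_size + 8 bytes.)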
*/ + ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + } + /* + * We now have sufficient allocated space, extend the initialized_size + * as well as the data_size if necessary and fill the new space with + * zeroes. + */ + err = ntfs_mft_bitmap_extend_initialized_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after initialized extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); +found_free_rec: + /* @bit is the found free mft record, allocate it in the mft bitmap. */ + ntfs_debug("At found_free_rec."); + err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); + up_write(&vol->mftbmp_lock); + goto err_out; + } + ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); +have_alloc_rec: + /* + * The mft bitmap is now uptodate. Deal with mft data attribute now. + * Note, we keep hold of the mft bitmap lock for writing until all + * modifications to the mft data attribute are complete, too, as they + * will impact decisions for mft bitmap and mft record allocation done + * by a parallel allocation and if the lock is not maintained a + * parallel allocation could allocate the same mft record as this one. + */ + ll = (bit + 1) << vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { + ntfs_debug("Allocated mft record already initialized."); + goto mft_rec_already_initialized; + } + ntfs_debug("Initializing allocated mft record."); + /* + * The mft record is outside the initialized data. Extend the mft data + * attribute until it covers the allocated record. The loop is only + * actually traversed more than once when a freshly formatted volume is + * first written to so it optimizes away nicely in the common case. 
+ */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data before extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + err = ntfs_mft_data_extend_allocation_nolock(vol); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to extend mft data " + "allocation."); + goto undo_mftbmp_alloc_nolock; + } + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + } + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* + * Extend mft data initialized size (and data size of course) to reach + * the allocated mft record, formatting the mft records along the way. + * Note: We only modify the ntfs_inode structure as that is all that is + * needed by ntfs_mft_record_format(). We will update the attribute + * record itself in one fell swoop later on. + */ + write_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + old_data_size = vol->mft_ino->i_size; + while (ll > mft_ni->initialized_size) { + s64 new_initialized_size, mft_no; + + new_initialized_size = mft_ni->initialized_size + + vol->mft_record_size; + mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_debug("Initializing mft record 0x%llx.", + (long long)mft_no); + err = ntfs_mft_record_format(vol, mft_no); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to format mft record."); + goto undo_data_init; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = new_initialized_size; + } + write_unlock_irqrestore(&mft_ni->size_lock, flags); + record_formatted = true; + /* Update the mft data attribute record to reflect the new sizes. */ + m = map_mft_record(mft_ni); + if (IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to map mft record."); + err = PTR_ERR(m); + goto undo_data_init; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + err = -ENOMEM; + unmap_mft_record(mft_ni); + goto undo_data_init; + } + err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft data attribute."); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + goto undo_data_init; + } + a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); + a->data.non_resident.initialized_size = + cpu_to_sle64(mft_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. 
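+ * (As after every record update in this file: flush the page holding + * the mapped mft record and mark the record dirty so the change is + * picked up by writeback.)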
*/ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after mft record initialization: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); +mft_rec_already_initialized: + /* + * We can finally drop the mft bitmap lock as the mft data attribute + * has been fully updated. The only disparity left is that the + * allocated mft record still needs to be marked as in use to match the + * set bit in the mft bitmap but this is actually not a problem since + * this mft record is not referenced from anywhere yet and the fact + * that it is allocated in the mft bitmap means that no-one will try to + * allocate it either. + */ + up_write(&vol->mftbmp_lock); + /* + * We now have allocated and initialized the mft record. Calculate the + * index of and the offset within the page cache page the record is in. + */ + index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(vol->mft_ino->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing allocated " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(page); + goto undo_mftbmp_alloc; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + /* If we just formatted the mft record no need to do it again. */ + if (!record_formatted) { + /* Sanity check that the mft record is really not in use. */ + if (ntfs_is_file_record(m->magic) && + (m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vol->sb, "Mft record 0x%llx was marked " + "free in mft bitmap but is marked " + "used itself. Corrupt filesystem. " + "Unmount and run chkdsk.", + (long long)bit); + err = -EIO; + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + NVolSetErrors(vol); + goto undo_mftbmp_alloc; + } + /* + * We need to (re-)format the mft record, preserving the + * sequence number if it is not zero as well as the update + * sequence number if it is not zero or -1 (0xffff). This + * means we do not need to care whether or not something went + * wrong with the previous mft record. + */ + seq_no = m->sequence_number; + usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); + err = ntfs_mft_record_layout(vol, bit, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout allocated mft " + "record 0x%llx.", (long long)bit); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + if (seq_no) + m->sequence_number = seq_no; + if (usn && le16_to_cpu(usn) != 0xffff) + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; + } + /* Set the mft record itself in use. */ + m->flags |= MFT_RECORD_IN_USE; + if (S_ISDIR(mode)) + m->flags |= MFT_RECORD_IS_DIRECTORY; + flush_dcache_page(page); + SetPageUptodate(page); + if (base_ni) { + MFT_RECORD *m_tmp; + + /* + * Setup the base mft record in the extent mft record. 
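+ * (An mft reference packs the 48-bit mft record number and the 16-bit
+ * sequence number into a single 64-bit, little endian quantity.
+ * Conceptually, ignoring the endianness conversion that MK_LE_MREF()
+ * also performs:
+ *
+ *	mref = ((u64)seq_no << 48) | (mft_no & MFT_REF_MASK_CPU);
+ *	MREF(mref) == mft_no;  MSEQNO(mref) == seq_no;
+ *
+ * which is what makes stale references detectable by comparing
+ * MSEQNO() against the sequence number stored in the record itself.)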
This + * completes initialization of the allocated extent mft record + * and we can simply use it with map_extent_mft_record(). + */ + m->base_mft_record = MK_LE_MREF(base_ni->mft_no, + base_ni->seq_no); + /* + * Allocate an extent inode structure for the new mft record, + * attach it to the base inode @base_ni and map, pin, and lock + * its, i.e. the allocated, mft record. + */ + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { + ntfs_error(vol->sb, "Failed to map allocated extent " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(m_tmp); + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + BUG_ON(m != m_tmp); + /* + * Make sure the allocated mft record is written out to disk. + * No need to set the inode dirty because the caller is going + * to do that anyway after finishing with the new extent mft + * record (e.g. at a minimum a new attribute will be added to + * the mft record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + /* + * Need to unmap the page since map_extent_mft_record() mapped + * it as well so we have it mapped twice at the moment. + */ + ntfs_unmap_page(page); + } else { + /* + * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink + * is set to 1 but the mft record->link_count is 0. The caller + * needs to bear this in mind. + */ + vi = new_inode(vol->sb); + if (unlikely(!vi)) { + err = -ENOMEM; + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + vi->i_ino = bit; + + /* The owner and group come from the ntfs volume. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + /* + * Set the appropriate mode, attribute type, and name. For + * directories, also setup the index values to the defaults. + */ + if (S_ISDIR(mode)) { + vi->i_mode = S_IFDIR | S_IRWXUGO; + vi->i_mode &= ~vol->dmask; + + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + ni->itype.index.block_size = 4096; + ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; + ni->itype.index.collation_rule = COLLATION_FILE_NAME; + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = + vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = + vol->sector_size_bits; + } + } else { + vi->i_mode = S_IFREG | S_IRWXUGO; + vi->i_mode &= ~vol->fmask; + + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + } + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + + /* Set the inode times to the current time. */ + simple_inode_init_ts(vi); + /* + * Set the file size to 0, the ntfs inode sizes are set to 0 by + * the call to ntfs_init_big_inode() below. + */ + vi->i_size = 0; + vi->i_blocks = 0; + + /* Set the sequence number. 
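+ * (The sequence number doubles as the VFS/NFS generation set below: a
+ * file handle encoding (ino, generation) can later be validated the
+ * way ntfs_nfs_get_inode() in namei.c does, in essence:
+ *
+ *	if (inode->i_generation != generation)
+ *		return ERR_PTR(-ESTALE);
+ *
+ * so reusing a freed mft record bumps the sequence number and
+ * invalidates any old handles.)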
*/ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + /* + * Manually map, pin, and lock the mft record as we already + * have its page mapped and it is very easy to do. + */ + atomic_inc(&ni->count); + mutex_lock(&ni->mrec_lock); + ni->page = page; + ni->page_ofs = ofs; + /* + * Make sure the allocated mft record is written out to disk. + * NOTE: We do not set the ntfs inode dirty because this would + * fail in ntfs_write_inode() because the inode does not have a + * standard information attribute yet. Also, there is no need + * to set the inode dirty because the caller is going to do + * that anyway after finishing with the new mft record (e.g. at + * a minimum some new attributes will be added to the mft + * record). + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + + /* Add the inode to the inode hash for the superblock. */ + insert_inode_hash(vi); + + /* Update the default mft allocation position. */ + vol->mft_data_pos = bit + 1; + } + /* + * Return the opened, allocated inode of the allocated mft record as + * well as the mapped, pinned, and locked mft record. + */ + ntfs_debug("Returning opened, allocated %sinode 0x%llx.", + base_ni ? "extent " : "", (long long)bit); + *mrec = m; + return ni; +undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = old_data_initialized; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + goto undo_mftbmp_alloc_nolock; +undo_mftbmp_alloc: + down_write(&vol->mftbmp_lock); +undo_mftbmp_alloc_nolock: + if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->mftbmp_lock); +err_out: + return ERR_PTR(err); +max_err_out: + ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " + "number of inodes (2^32) has already been reached."); + up_write(&vol->mftbmp_lock); + return ERR_PTR(-ENOSPC); +} + +/** + * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume + * @ni: ntfs inode of the mapped extent mft record to free + * @m: mapped extent mft record of the ntfs inode @ni + * + * Free the mapped extent mft record @m of the extent ntfs inode @ni. + * + * Note that this function unmaps the mft record and closes and destroys @ni + * internally and hence you cannot use either @ni or @m any more after this + * function returns success. + * + * On success return 0 and on error return -errno. @ni and @m are still valid + * in this case and have not been freed. + * + * For some errors an error message is displayed and the success code 0 is + * returned and the volume is then left dirty on umount. This makes sense in + * case we could not rollback the changes that were already done since the + * caller no longer wants to reference this mft record so it does not matter to + * the caller if something is wrong with it as long as it is properly detached + * from the base inode.
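+ *
+ * A caller therefore typically looks like this (sketch only, not a
+ * verbatim call site from the driver):
+ *
+ *	err = ntfs_extent_mft_record_free(ni, m);
+ *	if (err) {
+ *		... on failure @ni and @m remain valid and mapped, so
+ *		    the caller keeps ownership and unmaps when done ...
+ *	}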
+ */ +int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) +{ + unsigned long mft_no = ni->mft_no; + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + ntfs_inode **extent_nis; + int i, err; + le16 old_seq_no; + u16 seq_no; + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + + mutex_lock(&ni->extent_lock); + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + + BUG_ON(base_ni->nr_extents <= 0); + + ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", + mft_no, base_ni->mft_no); + + mutex_lock(&base_ni->extent_lock); + + /* Make sure we are holding the only reference to the extent inode. */ + if (atomic_read(&ni->count) > 2) { + ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " + "not freeing.", base_ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EBUSY; + } + + /* Dissociate the ntfs inode from the base inode. */ + extent_nis = base_ni->ext.extent_ntfs_inos; + err = -ENOENT; + for (i = 0; i < base_ni->nr_extents; i++) { + if (ni != extent_nis[i]) + continue; + extent_nis += i; + base_ni->nr_extents--; + memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * + sizeof(ntfs_inode*)); + err = 0; + break; + } + + mutex_unlock(&base_ni->extent_lock); + + if (unlikely(err)) { + ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " + "its base inode 0x%lx.", mft_no, + base_ni->mft_no); + BUG(); + } + + /* + * The extent inode is no longer attached to the base inode so no one + * can get a reference to it any more. + */ + + /* Mark the mft record as not in use. */ + m->flags &= ~MFT_RECORD_IN_USE; + + /* Increment the sequence number, skipping zero, if it is not zero. */ + old_seq_no = m->sequence_number; + seq_no = le16_to_cpu(old_seq_no); + if (seq_no == 0xffff) + seq_no = 1; + else if (seq_no) + seq_no++; + m->sequence_number = cpu_to_le16(seq_no); + + /* + * Set the ntfs inode dirty and write it out. We do not need to worry + * about the base inode here since whatever caused the extent mft + * record to be freed is guaranteed to do it already. + */ + NInoSetDirty(ni); + err = write_mft_record(ni, m, 0); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " + "freeing.", mft_no); + goto rollback; + } +rollback_error: + /* Unmap and throw away the now freed extent inode. */ + unmap_extent_mft_record(ni); + ntfs_clear_extent_inode(ni); + + /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ + down_write(&vol->mftbmp_lock); + err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); + up_write(&vol->mftbmp_lock); + if (unlikely(err)) { + /* + * The extent inode is gone but we failed to deallocate it in + * the mft bitmap. Just emit a warning and leave the volume + * dirty on umount. + */ + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + return 0; +rollback: + /* Rollback what we did... 
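+ * (Extent inode arrays are managed in chunks of four pointers, so
+ * when nr_extents is a multiple of four the code below conservatively
+ * assumes the array is full and enlarges it to nr_extents + 4 slots
+ * before re-attaching @ni; compare the same growth pattern in
+ * map_extent_mft_record().)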
*/ + mutex_lock(&base_ni->extent_lock); + extent_nis = base_ni->ext.extent_ntfs_inos; + if (!(base_ni->nr_extents & 3)) { + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); + + extent_nis = kmalloc(new_size, GFP_NOFS); + if (unlikely(!extent_nis)) { + ntfs_error(vol->sb, "Failed to allocate internal " + "buffer during rollback.%s", es); + mutex_unlock(&base_ni->extent_lock); + NVolSetErrors(vol); + goto rollback_error; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, + new_size - 4 * sizeof(ntfs_inode*)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = extent_nis; + } + m->flags |= MFT_RECORD_IN_USE; + m->sequence_number = old_seq_no; + extent_nis[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + mark_mft_record_dirty(ni); + return err; +} +#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h new file mode 100644 index 000000000000..49c001af16ed --- /dev/null +++ b/fs/ntfs/mft.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * mft.h - Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_MFT_H +#define _LINUX_NTFS_MFT_H + +#include +#include +#include + +#include "inode.h" + +extern MFT_RECORD *map_mft_record(ntfs_inode *ni); +extern void unmap_mft_record(ntfs_inode *ni); + +extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino); + +static inline void unmap_extent_mft_record(ntfs_inode *ni) +{ + unmap_mft_record(ni); + return; +} + +#ifdef NTFS_RW + +/** + * flush_dcache_mft_record_page - flush_dcache_page() for mft records + * @ni: ntfs inode structure of mft record + * + * Call flush_dcache_page() for the page in which an mft record resides. + * + * This must be called every time an mft record is modified, just after the + * modification. + */ +static inline void flush_dcache_mft_record_page(ntfs_inode *ni) +{ + flush_dcache_page(ni->page); +} + +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. + */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync); + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. 
This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->read_folio() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + +extern bool ntfs_may_write_mft_record(ntfs_volume *vol, + const unsigned long mft_no, const MFT_RECORD *m, + ntfs_inode **locked_ni); + +extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec); +extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c new file mode 100644 index 000000000000..16b3c884abfc --- /dev/null +++ b/fs/ntfs/mst.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * mst.c - NTFS multi sector transfer protection handling code. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#include "ntfs.h" + +/** + * post_read_mst_fixup - deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * @size: size in bytes of @b + * + * Perform the necessary post read multi sector transfer fixup and detect the + * presence of incomplete multi sector transfers. - In that case, overwrite the + * magic of the ntfs record header being processed with "BAAD" (in memory only!) + * and abort processing. + * + * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not protected at all and hence doesn't need to + * be fixed up. Thus, we return success and not failure in this case. This is + * in contrast to pre_write_mst_fixup(), see below. + */ +int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + u16 usa_ofs, usa_count, usn; + u16 *usa_pos, *data_pos; + + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return 0; + /* Position of usn in update sequence array. */ + usa_pos = (u16*)b + usa_ofs/sizeof(u16); + /* + * The update sequence number which has to be equal to each of the + * u16 values before they are fixed up. Note no need to care for + * endianness since we are comparing and moving data for on disk + * structures which means the data is consistent. - If it is + * consistently the wrong endianness it doesn't make any difference. + */ + usn = *usa_pos; + /* + * Position in protected data of first u16 that needs fixing up. + */ + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* + * Check for incomplete multi sector transfer(s). + */ + while (usa_count--) { + if (*data_pos != usn) { + /* + * Incomplete multi sector transfer detected! )-: + * Set the magic to "BAAD" and return failure.
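+ * (Worked example of the layout being walked here: for a 1024-byte
+ * record with NTFS_BLOCK_SIZE 512, the header advertises usa_count 3,
+ * i.e. one usn slot plus 1024/512 = 2 fixup slots, so after the
+ * decrement above usa_count == 2 == size >> NTFS_BLOCK_SIZE_BITS and
+ * the two protected u16s are the last le16 of each sector, at byte
+ * offsets 510 and 1022.)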
+ * Note that magic_BAAD is already converted to le32. + */ + b->magic = magic_BAAD; + return -EINVAL; + } + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + /* Re-setup the variables. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + return 0; +} + +/** + * pre_write_mst_fixup - apply multi sector transfer protection + * @b: pointer to the data to protect + * @size: size in bytes of @b + * + * Perform the necessary pre write multi sector transfer fixup on the data + * pointed to by @b of @size. + * + * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed + * (assumed not needed). This is in contrast to post_read_mst_fixup() above. + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not subject to protection and hence doesn't need + * to be fixed up. This means that you have to create a valid update sequence + * array header in the ntfs record before calling this function, otherwise it + * will fail (the header needs to contain the position of the update sequence + * array together with the number of elements in the array). You also need to + * initialise the update sequence number before calling this function + * otherwise a random word will be used (whatever was in the record at that + * position at that time). + */ +int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + le16 *usa_pos, *data_pos; + u16 usa_ofs, usa_count, usn; + le16 le_usn; + + /* Sanity check + only fixup if it makes sense. */ + if (!b || ntfs_is_baad_record(b->magic) || + ntfs_is_hole_record(b->magic)) + return -EINVAL; + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return -EINVAL; + /* Position of usn in update sequence array. */ + usa_pos = (le16*)((u8*)b + usa_ofs); + /* + * Cyclically increment the update sequence number + * (skipping 0 and -1, i.e. 0xffff). + */ + usn = le16_to_cpup(usa_pos) + 1; + if (usn == 0xffff || !usn) + usn = 1; + le_usn = cpu_to_le16(usn); + *usa_pos = le_usn; + /* Position in data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment the position in the usa and save the + * original data from the data buffer into the usa. + */ + *(++usa_pos) = *data_pos; + /* Apply fixup to data. */ + *data_pos = le_usn; + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } + return 0; +} + +/** + * post_write_mst_fixup - fast deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * + * Perform the necessary post write multi sector transfer fixup, not checking + * for any errors, because we assume we have just used pre_write_mst_fixup(), + * thus the data will be fine or we would never have gotten here.
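+ *
+ * The intended life cycle of the three helpers is thus (sketch;
+ * submit_record() stands in for whatever actually writes the buffers
+ * and is not a function of this driver):
+ *
+ *	err = pre_write_mst_fixup(rec, rec_size);
+ *	if (!err) {
+ *		submit_record(rec, rec_size);
+ *		post_write_mst_fixup(rec);
+ *	}
+ *
+ * leaving the in-memory record exactly as it was before the write.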
+ */ +void post_write_mst_fixup(NTFS_RECORD *b) +{ + le16 *usa_pos, *data_pos; + + u16 usa_ofs = le16_to_cpu(b->usa_ofs); + u16 usa_count = le16_to_cpu(b->usa_count) - 1; + + /* Position of usn in update sequence array. */ + usa_pos = (le16*)b + usa_ofs/sizeof(le16); + + /* Position in protected data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } +} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c new file mode 100644 index 000000000000..d7498ddc4a72 --- /dev/null +++ b/fs/ntfs/namei.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include +#include +#include +#include + +#include "attrib.h" +#include "debug.h" +#include "dir.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_lookup - find the inode represented by a dentry in a directory inode + * @dir_ino: directory inode in which to look for the inode + * @dent: dentry representing the inode to look for + * @flags: lookup flags + * + * In short, ntfs_lookup() looks for the inode represented by the dentry @dent + * in the directory inode @dir_ino and if found attaches the inode to the + * dentry @dent. + * + * In more detail, the dentry @dent specifies which inode to look for by + * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() + * converts the name to Unicode and walks the contents of the directory inode + * @dir_ino looking for the converted Unicode name. If the name is found in the + * directory, the corresponding inode is loaded by calling ntfs_iget() on its + * inode number and the inode is associated with the dentry @dent via a call to + * d_splice_alias(). + * + * If the name is not found in the directory, a NULL inode is inserted into the + * dentry @dent via a call to d_add(). The dentry is then termed a negative + * dentry. + * + * Only if an actual error occurs, do we return an error via ERR_PTR(). + * + * In order to handle the case insensitivity issues of NTFS with regards to the + * dcache and the dcache requiring only one dentry per directory, we deal with + * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining + * a case sensitive dcache. This means that we get the full benefit of dcache + * speed when the file/directory is looked up with the same case as returned by + * ->ntfs_readdir() but that a lookup for any other case (or for the short file + * name) will not find anything in dcache and will enter ->ntfs_lookup() + * instead, where we search the directory for a fully matching file name + * (including case) and if that is not found, we search for a file name that + * matches with different case and if that has non-POSIX semantics we return + * that. We actually do only one search (case sensitive) and keep tabs on + * whether we have found a case insensitive match in the process. + * + * To simplify matters for us, we do not treat the short vs long filenames as + * two hard links but instead if the lookup matches a short filename, we + * return the dentry for the corresponding long filename instead. 
+ * + * There are three cases we need to distinguish here: + * + * 1) @dent perfectly matches (i.e. including case) a directory entry with a + * file name in the WIN32 or POSIX namespaces. In this case + * ntfs_lookup_inode_by_name() will return with name set to NULL and we + * just d_splice_alias() @dent. + * 2) @dent matches (not including case) a directory entry with a file name in + * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return + * with name set to point to a kmalloc()ed ntfs_name structure containing + * the properly cased little endian Unicode name. We convert the name to the + * current NLS code page, search if a dentry with this name already exists + * and if so return that instead of @dent. At this point things are + * complicated by the possibility of 'disconnected' dentries due to NFS + * which we deal with appropriately (see the code comments). The VFS will + * then destroy the old @dent and use the one we returned. If a dentry is + * not found, we allocate a new one, d_splice_alias() it, and return it as + * above. + * 3) @dent matches either perfectly or not (i.e. we don't care about case) a + * directory entry with a file name in the DOS namespace. In this case + * ntfs_lookup_inode_by_name() will return with name set to point to a + * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) + * of the inode. We use the mft reference to read the inode and to find the + * file name in the WIN32 namespace corresponding to the matched short file + * name. We then convert the name to the current NLS code page, and proceed + * searching for a dentry with this name, etc, as in case 2), above. + * + * Locking: Caller must hold i_mutex on the directory. + */ +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, + unsigned int flags) +{ + ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); + struct inode *dent_inode; + ntfschar *uname; + ntfs_name *name = NULL; + MFT_REF mref; + unsigned long dent_ino; + int uname_len; + + ntfs_debug("Looking up %pd in directory inode 0x%lx.", + dent, dir_ino->i_ino); + /* Convert the name of the dentry to Unicode. */ + uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, + &uname); + if (uname_len < 0) { + if (uname_len != -ENAMETOOLONG) + ntfs_error(vol->sb, "Failed to convert name to " + "Unicode."); + return ERR_PTR(uname_len); + } + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, + &name); + kmem_cache_free(ntfs_name_cache, uname); + if (!IS_ERR_MREF(mref)) { + dent_ino = MREF(mref); + ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); + dent_inode = ntfs_iget(vol->sb, dent_ino); + if (!IS_ERR(dent_inode)) { + /* Consistency check. */ + if (is_bad_inode(dent_inode) || MSEQNO(mref) == + NTFS_I(dent_inode)->seq_no || + dent_ino == FILE_MFT) { + /* Perfect WIN32/POSIX match. -- Case 1. */ + if (!name) { + ntfs_debug("Done. (Case 1.)"); + return d_splice_alias(dent_inode, dent); + } + /* + * We are too indented. Handle imperfect + * matches and short file names further below. + */ + goto handle_name; + } + ntfs_error(vol->sb, "Found stale reference to inode " + "0x%lx (reference sequence number = " + "0x%x, inode sequence number = 0x%x), " + "returning -EIO. Run chkdsk.", + dent_ino, MSEQNO(mref), + NTFS_I(dent_inode)->seq_no); + iput(dent_inode); + dent_inode = ERR_PTR(-EIO); + } else + ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " + "error code %li.", dent_ino, + PTR_ERR(dent_inode)); + kfree(name); + /* Return the error code. 
*/ + return ERR_CAST(dent_inode); + } + /* It is guaranteed that @name is no longer allocated at this point. */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("Entry was not found, adding negative dentry."); + /* The dcache will handle negative entries. */ + d_add(dent, NULL); + ntfs_debug("Done."); + return NULL; + } + ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " + "code %i.", -MREF_ERR(mref)); + return ERR_PTR(MREF_ERR(mref)); + // TODO: Consider moving this lot to a separate function! (AIA) +handle_name: + { + MFT_RECORD *m; + ntfs_attr_search_ctx *ctx; + ntfs_inode *ni = NTFS_I(dent_inode); + int err; + struct qstr nls_name; + + nls_name.name = NULL; + if (name->type != FILE_NAME_DOS) { /* Case 2. */ + ntfs_debug("Case 2."); + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&name->name, name->len, + (unsigned char**)&nls_name.name, 0); + kfree(name); + } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ + FILE_NAME_ATTR *fn; + + ntfs_debug("Case 3."); + kfree(name); + + /* Find the WIN32 name corresponding to the matched DOS name. */ + ni = NTFS_I(dent_inode); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + do { + ATTR_RECORD *a; + u32 val_len; + + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, + NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Inode corrupt: No WIN32 " + "namespace counterpart to DOS " + "file name. Run chkdsk."); + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + /* Consistency checks. */ + a = ctx->attr; + if (a->non_resident || a->flags) + goto eio_err_out; + val_len = le32_to_cpu(a->data.resident.value_length); + if (le16_to_cpu(a->data.resident.value_offset) + + val_len > le32_to_cpu(a->length)) + goto eio_err_out; + fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( + ctx->attr->data.resident.value_offset)); + if ((u32)(fn->file_name_length * sizeof(ntfschar) + + sizeof(FILE_NAME_ATTR)) > val_len) + goto eio_err_out; + } while (fn->file_name_type != FILE_NAME_WIN32); + + /* Convert the found WIN32 name to current NLS code page. */ + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&fn->file_name, fn->file_name_length, + (unsigned char**)&nls_name.name, 0); + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + } + m = NULL; + ctx = NULL; + + /* Check if a conversion error occurred. */ + if ((signed)nls_name.len < 0) { + err = (signed)nls_name.len; + goto err_out; + } + nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); + + dent = d_add_ci(dent, dent_inode, &nls_name); + kfree(nls_name.name); + return dent; + +eio_err_out: + ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); + err = -EIO; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); + iput(dent_inode); + ntfs_error(vol->sb, "Failed, returning error code %i.", err); + return ERR_PTR(err); + } +} + +/* + * Inode operations for directories. + */ +const struct inode_operations ntfs_dir_inode_ops = { + .lookup = ntfs_lookup, /* VFS: Lookup directory. */ +}; + +/** + * ntfs_get_parent - find the dentry of the parent of a given directory dentry + * @child_dent: dentry of the directory whose parent directory to find + * + * Find the dentry for the parent directory of the directory specified by the + * dentry @child_dent. 
This function is called from + * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the + * default ->decode_fh() which is export_decode_fh() in the same file. + * + * The code is based on the ext3 ->get_parent() implementation found in + * fs/ext3/namei.c::ext3_get_parent(). + * + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. + * + * Return the dentry of the parent directory on success or the error code on + * error (IS_ERR() is true). + */ +static struct dentry *ntfs_get_parent(struct dentry *child_dent) +{ + struct inode *vi = d_inode(child_dent); + ntfs_inode *ni = NTFS_I(vi); + MFT_RECORD *mrec; + ntfs_attr_search_ctx *ctx; + ATTR_RECORD *attr; + FILE_NAME_ATTR *fn; + unsigned long parent_ino; + int err; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + /* Get the mft record of the inode belonging to the child dentry. */ + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return ERR_CAST(mrec); + /* Find the first file name attribute in the mft record. */ + ctx = ntfs_attr_get_search_ctx(ni, mrec); + if (unlikely(!ctx)) { + unmap_mft_record(ni); + return ERR_PTR(-ENOMEM); + } +try_next: + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + if (err == -ENOENT) + ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " + "file name attribute. Run chkdsk.", + vi->i_ino); + return ERR_PTR(err); + } + attr = ctx->attr; + if (unlikely(attr->non_resident)) + goto try_next; + fn = (FILE_NAME_ATTR *)((u8 *)attr + + le16_to_cpu(attr->data.resident.value_offset)); + if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > + (u8*)attr + le32_to_cpu(attr->length))) + goto try_next; + /* Get the inode number of the parent directory. */ + parent_ino = MREF_LE(fn->parent_directory); + /* Release the search context and the mft record of the child. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); +} + +static struct inode *ntfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = ntfs_iget(sb, ino); + if (!IS_ERR(inode)) { + if (is_bad_inode(inode) || inode->i_generation != generation) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +/* + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. 
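+ *
+ * (A 64-bit capable ->encode_fh() would, roughly, have to split the
+ * inode number across two __u32 slots of the file handle, e.g. as a
+ * sketch that this driver does not implement:
+ *
+ *	fh[0] = (__u32)ino;
+ *	fh[1] = (__u32)(ino >> 32);
+ *	fh[2] = generation;
+ *
+ * with the decode side reassembling the number before calling
+ * ntfs_iget().)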
+ * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +const struct export_operations ntfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. */ + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, +}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h new file mode 100644 index 000000000000..e81376ea9152 --- /dev/null +++ b/fs/ntfs/ntfs.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ntfs.h - Defines for NTFS Linux kernel driver. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (C) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_H +#define _LINUX_NTFS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "volume.h" +#include "layout.h" + +typedef enum { + NTFS_BLOCK_SIZE = 512, + NTFS_BLOCK_SIZE_BITS = 9, + NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ + NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, +} NTFS_CONSTANTS; + +/* Global variables. */ + +/* Slab caches (from super.c). */ +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; + +/* The various operations structs defined throughout the driver files. */ +extern const struct address_space_operations ntfs_normal_aops; +extern const struct address_space_operations ntfs_compressed_aops; +extern const struct address_space_operations ntfs_mst_aops; + +extern const struct file_operations ntfs_file_ops; +extern const struct inode_operations ntfs_file_inode_ops; + +extern const struct file_operations ntfs_dir_ops; +extern const struct inode_operations ntfs_dir_inode_ops; + +extern const struct file_operations ntfs_empty_file_ops; +extern const struct inode_operations ntfs_empty_inode_ops; + +extern const struct export_operations ntfs_export_ops; + +/** + * NTFS_SB - return the ntfs volume given a vfs super block + * @sb: VFS super block + * + * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. + */ +static inline ntfs_volume *NTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* Declarations of functions and global variables. 
*/ + +/* From fs/ntfs/compress.c */ +extern int ntfs_read_compressed_block(struct page *page); +extern int allocate_compression_buffers(void); +extern void free_compression_buffers(void); + +/* From fs/ntfs/super.c */ +#define default_upcase_len 0x10000 +extern struct mutex ntfs_lock; + +typedef struct { + int val; + char *str; +} option_t; +extern const option_t on_errors_arr[]; + +/* From fs/ntfs/mst.c */ +extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); +extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); +extern void post_write_mst_fixup(NTFS_RECORD *b); + +/* From fs/ntfs/unistr.c */ +extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, + const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size); +extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); +extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size); +extern void ntfs_upcase_name(ntfschar *name, u32 name_len, + const ntfschar *upcase, const u32 upcase_len); +extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs); +extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len); + +/* From fs/ntfs/upcase.c */ +extern ntfschar *generate_default_upcase(void); + +static inline int ntfs_ffs(int x) +{ + int r = 1; + + if (!x) + return 0; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c new file mode 100644 index 000000000000..9160480222fd --- /dev/null +++ b/fs/ntfs/quota.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include "index.h" +#include "quota.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume + * @vol: ntfs volume on which to mark the quotas out of date + * + * Mark the quotas out of date on the ntfs volume @vol and return 'true' on + * success and 'false' on error. 
+ */ +bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +{ + ntfs_index_context *ictx; + QUOTA_CONTROL_ENTRY *qce; + const le32 qid = QUOTA_DEFAULTS_ID; + int err; + + ntfs_debug("Entering."); + if (NVolQuotaOutOfDate(vol)) + goto done; + if (!vol->quota_ino || !vol->quota_q_ino) { + ntfs_error(vol->sb, "Quota inodes are not open."); + return false; + } + inode_lock(vol->quota_q_ino); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + if (!ictx) { + ntfs_error(vol->sb, "Failed to get index context."); + goto err_out; + } + err = ntfs_index_lookup(&qid, sizeof(qid), ictx); + if (err) { + if (err == -ENOENT) + ntfs_error(vol->sb, "Quota defaults entry is not " + "present."); + else + ntfs_error(vol->sb, "Lookup of quota defaults entry " + "failed."); + goto err_out; + } + if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. " + "Run chkdsk."); + goto err_out; + } + qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + if (le32_to_cpu(qce->version) != QUOTA_VERSION) { + ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " + "supported.", le32_to_cpu(qce->version)); + goto err_out; + } + ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); + /* If quotas are already marked out of date, no need to do anything. */ + if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) + goto set_done; + /* + * If quota tracking is neither requested, nor enabled and there are no + * pending deletes, no need to mark the quotas out of date. + */ + if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | + QUOTA_FLAG_TRACKING_REQUESTED | + QUOTA_FLAG_PENDING_DELETES))) + goto set_done; + /* + * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. + * This is verified on WinXP to be sufficient to cause windows to + * rescan the volume on boot and update all quota entries. + */ + qce->flags |= QUOTA_FLAG_OUT_OF_DATE; + /* Ensure the modified flags are written to disk. */ + ntfs_index_entry_flush_dcache_page(ictx); + ntfs_index_entry_mark_dirty(ictx); +set_done: + ntfs_index_ctx_put(ictx); + inode_unlock(vol->quota_q_ino); + /* + * We set the flag so we do not try to mark the quotas out of date + * again on remount. + */ + NVolSetQuotaOutOfDate(vol); +done: + ntfs_debug("Done."); + return true; +err_out: + if (ictx) + ntfs_index_ctx_put(ictx); + inode_unlock(vol->quota_q_ino); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h new file mode 100644 index 000000000000..fe3132a3d6d2 --- /dev/null +++ b/fs/ntfs/quota.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_QUOTA_H +#define _LINUX_NTFS_QUOTA_H + +#ifdef NTFS_RW + +#include "types.h" +#include "volume.h" + +extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c new file mode 100644 index 000000000000..0d448e9881f7 --- /dev/null +++ b/fs/ntfs/runlist.c @@ -0,0 +1,1893 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. 
+ * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + */ + +#include "debug.h" +#include "dir.h" +#include "endian.h" +#include "malloc.h" +#include "ntfs.h" + +/** + * ntfs_rl_mm - runlist memmove + * + * It is up to the caller to serialize access to the runlist @base. + */ +static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, + int size) +{ + if (likely((dst != src) && (size > 0))) + memmove(base + dst, base + src, size * sizeof(*base)); +} + +/** + * ntfs_rl_mc - runlist memory copy + * + * It is up to the caller to serialize access to the runlists @dstbase and + * @srcbase. + */ +static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, + runlist_element *srcbase, int src, int size) +{ + if (likely(size > 0)) + memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); +} + +/** + * ntfs_rl_realloc - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs(new_size); + if (unlikely(!new_rl)) + return ERR_PTR(-ENOMEM); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_rl_realloc_nofail - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. 
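+ *
+ * For example, on a 64-bit build with 4096-byte pages a runlist
+ * element is 24 bytes, so up to 170 elements round to a single page:
+ * growing from 10 to 100 elements returns @rl unchanged, and only
+ * crossing a page boundary (e.g. 170 -> 171 elements) causes a real
+ * allocation and copy.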
+ */ +static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs_nofail(new_size); + BUG_ON(!new_rl); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_are_rl_mergeable - test if two runlists can be joined together + * @dst: original runlist + * @src: new runlist to test for mergeability with @dst + * + * Test if two runlists can be joined together. For this, their VCNs and LCNs + * must be adjacent. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * Return: true Success, the runlists can be merged. + * false Failure, the runlists cannot be merged. + */ +static inline bool ntfs_are_rl_mergeable(runlist_element *dst, + runlist_element *src) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* We can merge unmapped regions even if they are misaligned. */ + if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) + return true; + /* If the runs are misaligned, we cannot merge them. */ + if ((dst->vcn + dst->length) != src->vcn) + return false; + /* If both runs are non-sparse and contiguous, we can merge them. */ + if ((dst->lcn >= 0) && (src->lcn >= 0) && + ((dst->lcn + dst->length) == src->lcn)) + return true; + /* If we are merging two holes, we can merge them. */ + if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) + return true; + /* Cannot merge. */ + return false; +} + +/** + * __ntfs_rl_merge - merge two runlists without testing if they can be merged + * @dst: original, destination runlist + * @src: new runlist to merge with @dst + * + * Merge the two runlists, writing into the destination runlist @dst. The + * caller must make sure the runlists can be merged or this will corrupt the + * destination runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + */ +static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +{ + dst->length += src->length; +} + +/** + * ntfs_rl_append - append a runlist after a given element + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: runlist to be inserted into @dst + * @ssize: number of elements in @src (excluding end marker) + * @loc: append the new runlist @src after this element in @dst + * + * Append the runlist @src after element @loc in @dst. Merge the right end of + * the new runlist, if necessary. Adjust the size of the hole before the + * appended runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. 
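+ *
+ * Worked example, writing runs as (vcn, lcn, length) triples and
+ * counting sizes as described above: appending @src = [ (4, 100, 2) ]
+ * into @dst = [ (0, 50, 2), (2, HOLE, 4), (6, 102, 2), end ] at
+ * @loc = 1 finds the right end mergeable (4 + 2 == 6 and
+ * 100 + 2 == 102), shrinks the preceding hole to length 2 and yields
+ * [ (0, 50, 2), (2, HOLE, 2), (4, 100, 4), end ].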
+ */ +static inline runlist_element *ntfs_rl_append(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool right = false; /* Right end of @src needs merging. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, check if the right hand end needs merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + + /* Space required: @dst size + @src size, less one if we merged. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the right hand end, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + + /* First run after the @src runs that have been inserted. */ + marker = loc + ssize + 1; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the preceding hole. */ + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + + /* We may have changed the length of the file, so fix the end marker */ + if (dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + + return dst; +} + +/** + * ntfs_rl_insert - insert a runlist into another + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: insert the new runlist @src before this element in @dst + * + * Insert the runlist @src before element @loc in the runlist @dst. Merge the + * left end of the new runlist, if necessary. Adjust the size of the hole + * after the inserted runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_insert(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool left = false; /* Left end of @src needs merging. */ + bool disc = false; /* Discontinuity between @dst and @src. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* + * disc => Discontinuity between the end of @dst and the start of @src. + * This means we might need to insert a "not mapped" run. + */ + if (loc == 0) + disc = (src[0].vcn > 0); + else { + s64 merged_length; + + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + + merged_length = dst[loc - 1].length; + if (left) + merged_length += src->length; + + disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); + } + /* + * Space required: @dst size + @src size, less one if we merged, plus + * one if there was a discontinuity. 
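+ * E.g. if dst[loc - 1] covers vcns up to 6 and @src starts at vcn 10,
+ * @disc is true and an extra LCN_RL_NOT_MAPPED run is materialized
+ * further below to cover vcns 7-9; if instead @src starts exactly at
+ * vcn 7 with an lcn adjacent to dst[loc - 1], @left is true and one
+ * element is saved by the merge.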
+ */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlist. + */ + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * First run after the @src runs that have been inserted. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. And if @disc, then @dst and @src do + * not meet and we need an extra run to fill the gap. + */ + marker = loc + ssize - left + disc; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc, dsize - loc); + ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); + + /* Adjust the VCN of the first run after the insertion... */ + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + /* ... and the length. */ + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; + + /* Writing beyond the end of the file and there is a discontinuity. */ + if (disc) { + if (loc > 0) { + dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + } else { + dst[loc].vcn = 0; + dst[loc].length = dst[loc + 1].vcn; + } + dst[loc].lcn = LCN_RL_NOT_MAPPED; + } + return dst; +} + +/** + * ntfs_rl_replace - overwrite a runlist element with another runlist + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst to overwrite with @src + * + * Replace the runlist element @dst at @loc with @src. Merge the left and + * right ends of the inserted runlist, if necessary. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_replace(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + signed delta; + bool left = false; /* Left end of @src needs merging. */ + bool right = false; /* Right end of @src needs merging. */ + int tail; /* Start of tail of @dst. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, see if the left and right ends need merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + if (loc > 0) + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + /* + * Allocate some space. We will need less if the left, right, or both + * ends get merged. The -1 accounts for the run being replaced. + */ + delta = ssize - 1 - left - right; + if (delta > 0) { + dst = ntfs_rl_realloc(dst, dsize, dsize + delta); + if (IS_ERR(dst)) + return dst; + } + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. 
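+ * (Checking the accounting: replacing one run by a three-run @src of
+ * which only the left end merges gives delta = 3 - 1 - 1 - 0 = 1,
+ * i.e. exactly one extra element had to be allocated above.)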
+ */ + + /* First, merge the left and right ends, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * Offset of the tail of @dst. This needs to be moved out of the way + * to make space for the runs to be copied from @src, i.e. the first + * run of the tail of @dst. + * Nominally, @tail equals @loc + 1, i.e. location, skipping the + * replaced run. However, if @right, then one of @dst's runs is + * already merged into @src. + */ + tail = loc + right + 1; + /* + * First run after the @src runs that have been inserted, i.e. where + * the tail of @dst needs to be moved to. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. + */ + marker = loc + ssize - left; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, tail, dsize - tail); + ntfs_rl_mc(dst, loc, src, left, ssize - left); + + /* We may have changed the length of the file, so fix the end marker. */ + if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + return dst; +} + +/** + * ntfs_rl_split - insert a runlist into the centre of a hole + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst at which to split and insert @src + * + * Split the runlist @dst at @loc into two and insert @new in between the two + * fragments. No merging of runlists is necessary. Adjust the size of the + * holes either side. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, + runlist_element *src, int ssize, int loc) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* Space required: @dst size + @src size + one new hole. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the holes either size of @src. */ + dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; + dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; + dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; + + return dst; +} + +/** + * ntfs_runlists_merge - merge two runlists into one + * @drl: original runlist to be worked on + * @srl: new runlist to be merged into @drl + * + * First we sanity check the two runlists @srl and @drl to make sure that they + * are sensible and can be merged. 
The runlist @srl must be either after the + * runlist @drl or completely within a hole (or unmapped region) in @drl. + * + * It is up to the caller to serialize access to the runlists @drl and @srl. + * + * Merging of runlists is necessary in two cases: + * 1. When attribute lists are used and a further extent is being mapped. + * 2. When new clusters are allocated to fill a hole or extend a file. + * + * There are four possible ways @srl can be merged. It can: + * - be inserted at the beginning of a hole, + * - split the hole in two and be inserted between the two fragments, + * - be appended at the end of a hole, or it can + * - replace the whole hole. + * It can also be appended to the end of the runlist, which is just a variant + * of the insert case. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @drl and @srl are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The runlists overlap and cannot be merged. + */ +runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl) +{ + int di, si; /* Current index into @[ds]rl. */ + int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ + int dins; /* Index into @drl at which to insert @srl. */ + int dend, send; /* Last index into @[ds]rl. */ + int dfinal, sfinal; /* The last index into @[ds]rl with + lcn >= LCN_HOLE. */ + int marker = 0; + VCN marker_vcn = 0; + +#ifdef DEBUG + ntfs_debug("dst:"); + ntfs_debug_dump_runlist(drl); + ntfs_debug("src:"); + ntfs_debug_dump_runlist(srl); +#endif + + /* Check for silly calling... */ + if (unlikely(!srl)) + return drl; + if (IS_ERR(srl) || IS_ERR(drl)) + return ERR_PTR(-EINVAL); + + /* Check for the case where the first mapping is being done now. */ + if (unlikely(!drl)) { + drl = srl; + /* Complete the source runlist if necessary. */ + if (unlikely(drl[0].vcn)) { + /* Scan to the end of the source runlist. */ + for (dend = 0; likely(drl[dend].length); dend++) + ; + dend++; + drl = ntfs_rl_realloc(drl, dend, dend + 1); + if (IS_ERR(drl)) + return drl; + /* Insert start element at the front of the runlist. */ + ntfs_rl_mm(drl, 1, 0, dend); + drl[0].vcn = 0; + drl[0].lcn = LCN_RL_NOT_MAPPED; + drl[0].length = drl[1].vcn; + } + goto finished; + } + + si = di = 0; + + /* Skip any unmapped start element(s) in the source runlist. */ + while (srl[si].length && srl[si].lcn < LCN_HOLE) + si++; + + /* Can't have an entirely unmapped source runlist. */ + BUG_ON(!srl[si].length); + + /* Record the starting points. */ + sstart = si; + + /* + * Skip forward in @drl until we reach the position where @srl needs to + * be inserted. If we reach the end of @drl, @srl just needs to be + * appended to @drl. + */ + for (; drl[di].length; di++) { + if (drl[di].vcn + drl[di].length > srl[sstart].vcn) + break; + } + dins = di; + + /* Sanity check for illegal overlaps. */ + if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && + (srl[si].lcn >= 0)) { + ntfs_error(NULL, "Run lists overlap. Cannot merge!"); + return ERR_PTR(-ERANGE); + } + + /* Scan to the end of both runlists in order to know their sizes. 
*/ + for (send = si; srl[send].length; send++) + ; + for (dend = di; drl[dend].length; dend++) + ; + + if (srl[send].lcn == LCN_ENOENT) + marker_vcn = srl[marker = send].vcn; + + /* Scan to the last element with lcn >= LCN_HOLE. */ + for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) + ; + for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) + ; + + { + bool start; + bool finish; + int ds = dend + 1; /* Number of elements in drl & srl */ + int ss = sfinal - sstart + 1; + + start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ + (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ + finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ + ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ + (srl[send - 1].vcn + srl[send - 1].length))); + + /* Or we will lose an end marker. */ + if (finish && !drl[dins].length) + ss++; + if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) + finish = false; +#if 0 + ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); + ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); + ntfs_debug("start = %i, finish = %i", start, finish); + ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); +#endif + if (start) { + if (finish) + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + } else { + if (finish) + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + } + if (IS_ERR(drl)) { + ntfs_error(NULL, "Merge failed."); + return drl; + } + ntfs_free(srl); + if (marker) { + ntfs_debug("Triggering marker code."); + for (ds = dend; drl[ds].length; ds++) + ; + /* We only need to care if @srl ended after @drl. */ + if (drl[ds].vcn <= marker_vcn) { + int slots = 0; + + if (drl[ds].vcn == marker_vcn) { + ntfs_debug("Old marker = 0x%llx, replacing " + "with LCN_ENOENT.", + (unsigned long long) + drl[ds].lcn); + drl[ds].lcn = LCN_ENOENT; + goto finished; + } + /* + * We need to create an unmapped runlist element in + * @drl or extend an existing one before adding the + * ENOENT terminator. + */ + if (drl[ds].lcn == LCN_ENOENT) { + ds--; + slots = 1; + } + if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { + /* Add an unmapped runlist element. */ + if (!slots) { + drl = ntfs_rl_realloc_nofail(drl, ds, + ds + 2); + slots = 2; + } + ds++; + /* Need to set vcn if it isn't set already. */ + if (slots != 1) + drl[ds].vcn = drl[ds - 1].vcn + + drl[ds - 1].length; + drl[ds].lcn = LCN_RL_NOT_MAPPED; + /* We now used up a slot. */ + slots--; + } + drl[ds].length = marker_vcn - drl[ds].vcn; + /* Finally add the ENOENT terminator. */ + ds++; + if (!slots) + drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + drl[ds].vcn = marker_vcn; + drl[ds].lcn = LCN_ENOENT; + drl[ds].length = (s64)0; + } + } + } + +finished: + /* The merge was completed successfully. */ + ntfs_debug("Merged runlist:"); + ntfs_debug_dump_runlist(drl); + return drl; +} + +/** + * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist + * @vol: ntfs volume on which the attribute resides + * @attr: attribute record whose mapping pairs array to decompress + * @old_rl: optional runlist in which to insert @attr's runlist + * + * It is up to the caller to serialize access to the runlist @old_rl. + * + * Decompress the attribute @attr's mapping pairs array into a runlist. On + * success, return the decompressed runlist. 
+ * + * If @old_rl is not NULL, decompressed runlist is inserted into the + * appropriate place in @old_rl and the resultant, combined runlist is + * returned. The original @old_rl is deallocated. + * + * On error, return -errno. @old_rl is left unmodified in that case. + * + * The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EIO - Corrupt runlist. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The two runlists overlap. + * + * FIXME: For now we take the conceptionally simplest approach of creating the + * new runlist disregarding the already existing one and then splicing the + * two into one, if that is possible (we check for overlap and discard the new + * runlist if overlap present before returning ERR_PTR(-ERANGE)). + */ +runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl) +{ + VCN vcn; /* Current vcn. */ + LCN lcn; /* Current lcn. */ + s64 deltaxcn; /* Change in [vl]cn. */ + runlist_element *rl; /* The output runlist. */ + u8 *buf; /* Current position in mapping pairs array. */ + u8 *attr_end; /* End of attribute. */ + int rlsize; /* Size of runlist buffer. */ + u16 rlpos; /* Current runlist position in units of + runlist_elements. */ + u8 b; /* Current byte offset in buf. */ + +#ifdef DEBUG + /* Make sure attr exists and is non-resident. */ + if (!attr || !attr->non_resident || sle64_to_cpu( + attr->data.non_resident.lowest_vcn) < (VCN)0) { + ntfs_error(vol->sb, "Invalid arguments."); + return ERR_PTR(-EINVAL); + } +#endif + /* Start at vcn = lowest_vcn and lcn 0. */ + vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + lcn = 0; + /* Get start of the mapping pairs array. */ + buf = (u8*)attr + le16_to_cpu( + attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8*)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8*)attr || buf > attr_end)) { + ntfs_error(vol->sb, "Corrupt attribute."); + return ERR_PTR(-EIO); + } + /* If the mapping pairs array is valid but empty, nothing to do. */ + if (!vcn && !*buf) + return old_rl; + /* Current position in runlist array. */ + rlpos = 0; + /* Allocate first page and set current runlist size to one page. */ + rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); + if (unlikely(!rl)) + return ERR_PTR(-ENOMEM); + /* Insert unmapped starting element if necessary. */ + if (vcn) { + rl->vcn = 0; + rl->lcn = LCN_RL_NOT_MAPPED; + rl->length = vcn; + rlpos++; + } + while (buf < attr_end && *buf) { + /* + * Allocate more memory if needed, including space for the + * not-mapped and terminator elements. ntfs_malloc_nofs() + * operates on whole pages only. + */ + if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { + runlist_element *rl2; + + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + ntfs_free(rl); + return ERR_PTR(-ENOMEM); + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + } + /* Enter the current vcn into the current runlist element. */ + rl[rlpos].vcn = vcn; + /* + * Get the change in vcn, i.e. the run length in clusters. + * Doing it this way ensures that we signextend negative values. + * A negative run length doesn't make any sense, but hey, I + * didn't make up the NTFS specs and Windows NT4 treats the run + * length as a signed value so that's how it is... 
+ */ + b = *buf & 0xf; + if (b) { + if (unlikely(buf + b > attr_end)) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + } else { /* The length entry is compulsory. */ + ntfs_error(vol->sb, "Missing length entry in mapping " + "pairs array."); + deltaxcn = (s64)-1; + } + /* + * Assume a negative length to indicate data corruption and + * hence clean-up and return NULL. + */ + if (unlikely(deltaxcn < 0)) { + ntfs_error(vol->sb, "Invalid length in mapping pairs " + "array."); + goto err_out; + } + /* + * Enter the current run length into the current runlist + * element. + */ + rl[rlpos].length = deltaxcn; + /* Increment the current vcn by the current run length. */ + vcn += deltaxcn; + /* + * There might be no lcn change at all, as is the case for + * sparse clusters on NTFS 3.0+, in which case we set the lcn + * to LCN_HOLE. + */ + if (!(*buf & 0xf0)) + rl[rlpos].lcn = LCN_HOLE; + else { + /* Get the lcn change which really can be negative. */ + u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); + if (buf + b > attr_end) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b > b2; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + /* Change the current lcn to its new value. */ + lcn += deltaxcn; +#ifdef DEBUG + /* + * On NTFS 1.2-, apparently can have lcn == -1 to + * indicate a hole. But we haven't verified ourselves + * whether it is really the lcn or the deltaxcn that is + * -1. So if either is found give us a message so we + * can investigate it further! + */ + if (vol->major_ver < 3) { + if (unlikely(deltaxcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn delta == -1"); + if (unlikely(lcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn == -1"); + } +#endif + /* Check lcn is not below -1. */ + if (unlikely(lcn < (LCN)-1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in " + "mapping pairs array."); + goto err_out; + } + /* Enter the current lcn into the runlist element. */ + rl[rlpos].lcn = lcn; + } + /* Get to the next runlist element. */ + rlpos++; + /* Increment the buffer position to the next mapping pair. */ + buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; + } + if (unlikely(buf >= attr_end)) + goto io_error; + /* + * If there is a highest_vcn specified, it must be equal to the final + * vcn in the runlist - 1, or something has gone badly wrong. + */ + deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { +mpa_err: + ntfs_error(vol->sb, "Corrupt mapping pairs array in " + "non-resident attribute."); + goto err_out; + } + /* Setup not mapped runlist element if this is the base extent. */ + if (!attr->data.non_resident.lowest_vcn) { + VCN max_cluster; + + max_cluster = ((sle64_to_cpu( + attr->data.non_resident.allocated_size) + + vol->cluster_size - 1) >> + vol->cluster_size_bits) - 1; + /* + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). + */ + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. 
+ */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } + } + rl[rlpos].lcn = LCN_ENOENT; + } else /* Not the base extent. There may be more extents to follow. */ + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + + /* Setup terminating runlist element. */ + rl[rlpos].vcn = vcn; + rl[rlpos].length = (s64)0; + /* If no existing runlist was specified, we are done. */ + if (!old_rl) { + ntfs_debug("Mapping pairs array successfully decompressed:"); + ntfs_debug_dump_runlist(rl); + return rl; + } + /* Now combine the new and old runlists checking for overlaps. */ + old_rl = ntfs_runlists_merge(old_rl, rl); + if (!IS_ERR(old_rl)) + return old_rl; + ntfs_free(rl); + ntfs_error(vol->sb, "Failed to merge runlists."); + return old_rl; +io_error: + ntfs_error(vol->sb, "Corrupt attribute."); +err_out: + ntfs_free(rl); + return ERR_PTR(-EIO); +} + +/** + * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist + * @rl: runlist to use for conversion + * @vcn: vcn to convert + * + * Convert the virtual cluster number @vcn of an attribute into a logical + * cluster number (lcn) of a device using the runlist @rl to map vcns to their + * corresponding lcns. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ================================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. + * + * Locking: - The caller must have locked the runlist (for reading or writing). + * - This function does not touch the lock, nor does it modify the + * runlist. + */ +LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +{ + int i; + + BUG_ON(vcn < 0); + /* + * If rl is NULL, assume that we have found an unmapped runlist. The + * caller can then attempt to map it and fail appropriately if + * necessary. + */ + if (unlikely(!rl)) + return LCN_RL_NOT_MAPPED; + + /* Catch out of lower bounds vcn. */ + if (unlikely(vcn < rl[0].vcn)) + return LCN_ENOENT; + + for (i = 0; likely(rl[i].length); i++) { + if (unlikely(vcn < rl[i+1].vcn)) { + if (likely(rl[i].lcn >= (LCN)0)) + return rl[i].lcn + (vcn - rl[i].vcn); + return rl[i].lcn; + } + } + /* + * The terminator element is setup to the correct value, i.e. one of + * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. + */ + if (likely(rl[i].lcn < (LCN)0)) + return rl[i].lcn; + /* Just in case... We could replace this with BUG() some day. */ + return LCN_ENOENT; +} + +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. 
+ * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + +/** + * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number + * @n: number for which to get the number of bytes for + * + * Return the number of bytes required to store @n unambiguously as + * a signed number. + * + * This is used in the context of the mapping pairs array to determine how + * many bytes will be needed in the array to store a given logical cluster + * number (lcn) or a specific run length. + * + * Return the number of bytes written. This function cannot fail. + */ +static inline int ntfs_get_nr_significant_bytes(const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if ((n < 0 && j >= 0) || (n > 0 && j < 0)) + i++; + return i; +} + +/** + * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array + * @vol: ntfs volume (needed for the ntfs version) + * @rl: locked runlist to determine the size of the mapping pairs of + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * + * Walk the locked runlist @rl and calculate the size in bytes of the mapping + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. + * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * + * This for example allows us to allocate a buffer of the right size when + * building the mapping pairs array. + * + * If @rl is NULL, just return 1 (for the single terminator byte). + * + * Return the calculated size in bytes on success. On error, return -errno. + * The following error codes are defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) +{ + LCN prev_lcn; + int rls; + bool the_end = false; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + return 1; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + prev_lcn = 0; + /* Always need the termining zero byte. */ + rls = 1; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. 
*/ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(prev_lcn); + } + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(rl->lcn - + prev_lcn); + prev_lcn = rl->lcn; + } + } + return rls; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + rls = -EINVAL; + else + rls = -EIO; + return rls; +} + +/** + * ntfs_write_significant_bytes - write the significant bytes of a number + * @dst: destination buffer to write to + * @dst_max: pointer to last byte of destination buffer for bounds checking + * @n: number whose significant bytes to write + * + * Store in @dst, the minimum bytes of the number @n which are required to + * identify @n unambiguously as a signed number, taking care not to exceed + * @dest_max, the maximum position within @dst to which we are allowed to + * write. + * + * This is used when building the mapping pairs array of a runlist to compress + * a given logical cluster number (lcn) or a specific run length to the minimum + * size possible. + * + * Return the number of bytes written on success. On error, i.e. the + * destination buffer @dst is too small, return -ENOSPC. + */ +static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, + const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + if (unlikely(dst > dst_max)) + goto err_out; + *dst++ = l & 0xffll; + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. 
*/ + if (n < 0 && j >= 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)-1; + } else if (n > 0 && j < 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)0; + } + return i; +err_out: + return -ENOSPC; +} + +/** + * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist + * @vol: ntfs volume (needed for the ntfs version) + * @dst: destination buffer to which to write the mapping pairs array + * @dst_len: size of destination buffer @dst in bytes + * @rl: locked runlist for which to build the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC + * + * Create the mapping pairs array from the locked runlist @rl, starting at vcn + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. + * + * If @rl is NULL, just write a single terminator byte to @dst. + * + * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to + * the first vcn outside the destination buffer. Note that on error, @dst has + * been filled with all the mapping pairs that will fit, thus it can be treated + * as partial success, in that a new attribute extent needs to be created or + * the next extent has to be used and the mapping pairs build has to be + * continued with @first_vcn set to *@stop_vcn. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * -ENOSPC - The destination buffer is too small. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +{ + LCN prev_lcn; + s8 *dst_max, *dst_next; + int err = -ENOSPC; + bool the_end = false; + s8 len_len, lcn_len; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + BUG_ON(dst_len < 1); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + if (stop_vcn) + *stop_vcn = 0; + /* Terminator byte. */ + *dst = 0; + return 0; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + /* + * @dst_max is used for bounds checking in + * ntfs_write_significant_bytes(). + */ + dst_max = dst + dst_len - 1; + prev_lcn = 0; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. 
+ */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length - delta); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, rl->lcn - prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + prev_lcn = rl->lcn; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + } + /* Success. */ + err = 0; +size_err: + /* Set stop vcn. */ + if (stop_vcn) + *stop_vcn = rl->vcn; + /* Add terminator byte. */ + *dst = 0; + return err; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + err = -EINVAL; + else + err = -EIO; + return err; +} + +/** + * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to truncate + * @new_length: the new length of the runlist in VCNs + * + * Truncate the runlist described by @runlist as well as the memory buffer + * holding the runlist elements to a length of @new_length VCNs. 
+ * + * If @new_length lies within the runlist, the runlist elements with VCNs of + * @new_length and above are discarded. As a special case if @new_length is + * zero, the runlist is discarded and set to NULL. + * + * If @new_length lies beyond the runlist, a sparse runlist element is added to + * the end of the runlist @runlist or if the last runlist element is a sparse + * one already, this is extended. + * + * Note, no checking is done for unmapped runlist elements. It is assumed that + * the caller has mapped any elements that need to be mapped already. + * + * Return 0 on success and -errno on error. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, + const s64 new_length) +{ + runlist_element *rl; + int old_size; + + ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); + BUG_ON(!runlist); + BUG_ON(new_length < 0); + rl = runlist->rl; + if (!new_length) { + ntfs_debug("Freeing runlist."); + runlist->rl = NULL; + if (rl) + ntfs_free(rl); + return 0; + } + if (unlikely(!rl)) { + /* + * Create a runlist consisting of a sparse runlist element of + * length @new_length followed by a terminator runlist element. + */ + rl = ntfs_malloc_nofs(PAGE_SIZE); + if (unlikely(!rl)) { + ntfs_error(vol->sb, "Not enough memory to allocate " + "runlist element buffer."); + return -ENOMEM; + } + runlist->rl = rl; + rl[1].length = rl->vcn = 0; + rl->lcn = LCN_HOLE; + rl[1].vcn = rl->length = new_length; + rl[1].lcn = LCN_ENOENT; + return 0; + } + BUG_ON(new_length < rl->vcn); + /* Find @new_length in the runlist. */ + while (likely(rl->length && new_length >= rl[1].vcn)) + rl++; + /* + * If not at the end of the runlist we need to shrink it. + * If at the end of the runlist we need to expand it. + */ + if (rl->length) { + runlist_element *trl; + bool is_end; + + ntfs_debug("Shrinking runlist."); + /* Determine the runlist size. */ + trl = rl + 1; + while (likely(trl->length)) + trl++; + old_size = trl - runlist->rl + 1; + /* Truncate the run. */ + rl->length = new_length - rl->vcn; + /* + * If a run was partially truncated, make the following runlist + * element a terminator. + */ + is_end = false; + if (rl->length) { + rl++; + if (!rl->length) + is_end = true; + rl->vcn = new_length; + rl->length = 0; + } + rl->lcn = LCN_ENOENT; + /* Reallocate memory if necessary. */ + if (!is_end) { + int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { + ntfs_debug("Expanding runlist."); + /* + * If there is a previous runlist element and it is a sparse + * one, extend it. Otherwise need to add a new, sparse runlist + * element. + */ + if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) + (rl - 1)->length = new_length - (rl - 1)->vcn; + else { + /* Determine the runlist size. */ + old_size = rl - runlist->rl + 1; + /* Reallocate memory if necessary. */ + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size + 1); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to expand runlist " + "buffer, aborting."); + return PTR_ERR(rl); + } + runlist->rl = rl; + /* + * Set @rl to the same runlist element in the new + * runlist as before in the old runlist. 
+ */ + rl += old_size - 1; + /* Add a new, sparse runlist element. */ + rl->lcn = LCN_HOLE; + rl->length = new_length - rl->vcn; + /* Add a new terminator runlist element. */ + rl++; + rl->length = 0; + } + rl->vcn = new_length; + rl->lcn = LCN_ENOENT; + } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { + /* Runlist already has same size as requested. */ + rl->lcn = LCN_ENOENT; + } + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_rl_punch_nolock - punch a hole into a runlist + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to punch a hole into + * @start: starting VCN of the hole to be created + * @length: size of the hole to be created in units of clusters + * + * Punch a hole into the runlist @runlist starting at VCN @start and of size + * @length clusters. + * + * Return 0 on success and -errno on error, in which case @runlist has not been + * modified. + * + * If @start and/or @start + @length are outside the runlist return error code + * -ENOENT. + * + * If the runlist contains unmapped or error elements between @start and @start + * + @length return error code -EINVAL. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length) +{ + const VCN end = start + length; + s64 delta; + runlist_element *rl, *rl_end, *rl_real_end, *trl; + int old_size; + bool lcn_fixup = false; + + ntfs_debug("Entering for start 0x%llx, length 0x%llx.", + (long long)start, (long long)length); + BUG_ON(!runlist); + BUG_ON(start < 0); + BUG_ON(length < 0); + BUG_ON(end < 0); + rl = runlist->rl; + if (unlikely(!rl)) { + if (likely(!start && !length)) + return 0; + return -ENOENT; + } + /* Find @start in the runlist. */ + while (likely(rl->length && start >= rl[1].vcn)) + rl++; + rl_end = rl; + /* Find @end in the runlist. */ + while (likely(rl_end->length && end >= rl_end[1].vcn)) { + /* Verify there are no unmapped or error elements. */ + if (unlikely(rl_end->lcn < LCN_HOLE)) + return -EINVAL; + rl_end++; + } + /* Check the last element. */ + if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) + return -EINVAL; + /* This covers @start being out of bounds, too. */ + if (!rl_end->length && end > rl_end->vcn) + return -ENOENT; + if (!length) + return 0; + if (!rl->length) + return -ENOENT; + rl_real_end = rl_end; + /* Determine the runlist size. */ + while (likely(rl_real_end->length)) + rl_real_end++; + old_size = rl_real_end - runlist->rl + 1; + /* If @start is in a hole simply extend the hole. */ + if (rl->lcn == LCN_HOLE) { + /* + * If both @start and @end are in the same sparse run, we are + * done. + */ + if (end <= rl[1].vcn) { + ntfs_debug("Done (requested hole is already sparse)."); + return 0; + } +extend_hole: + /* Extend the hole. */ + rl->length = end - rl->vcn; + /* If @end is in a hole, merge it with the current one. */ + if (rl_end->lcn == LCN_HOLE) { + rl_end++; + rl->length = rl_end->vcn - rl->vcn; + } + /* We have done the hole. Now deal with the remaining tail. */ + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Adjust the beginning of the tail if necessary. */ + if (end > rl->vcn) { + delta = end - rl->vcn; + rl->vcn = end; + rl->length -= delta; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0) + rl->lcn += delta; + } +shrink_allocation: + /* Reallocate memory if the allocation changed. 
*/ + if (rl < rl_end) { + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size - (rl_end - rl)); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + ntfs_debug("Done (extend hole)."); + return 0; + } + /* + * If @start is at the beginning of a run things are easier as there is + * no need to split the first run. + */ + if (start == rl->vcn) { + /* + * @start is at the beginning of a run. + * + * If the previous run is sparse, extend its hole. + * + * If @end is not in the same run, switch the run to be sparse + * and extend the newly created hole. + * + * Thus both of these cases reduce the problem to the above + * case of "@start is in a hole". + */ + if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { + rl--; + goto extend_hole; + } + if (end >= rl[1].vcn) { + rl->lcn = LCN_HOLE; + goto extend_hole; + } + /* + * The final case is when @end is in the same run as @start. + * For this need to split the run into two. One run for the + * sparse region between the beginning of the old run, i.e. + * @start, and @end and one for the remaining non-sparse + * region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } +split_end: + /* Shift all the runs up by one. */ + memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the two split runs. */ + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + rl->vcn += length; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0 || lcn_fixup) + rl->lcn += length; + rl->length -= length; + ntfs_debug("Done (split one)."); + return 0; + } + /* + * @start is neither in a hole nor at the beginning of a run. + * + * If @end is in a hole, things are easier as simply truncating the run + * @start is in to end at @start - 1, deleting all runs after that up + * to @end, and finally extending the beginning of the run @end is in + * to be @start is all that is needed. + */ + if (rl_end->lcn == LCN_HOLE) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Extend the beginning of the run @end is in to be @start. */ + rl->vcn = start; + rl->length = rl[1].vcn - start; + goto shrink_allocation; + } + /* + * If @end is not in a hole there are still two cases to distinguish. + * Either @end is or is not in the same run as @start. + * + * The second case is easier as it can be reduced to an already solved + * problem by truncating the run @start is in to end at @start - 1. + * Then, if @end is in the next run need to split the run into a sparse + * run followed by a non-sparse run (already covered above) and if @end + * is not in the next run switching it to be sparse, again reduces the + * problem to the already covered case of "@start is in a hole". + */ + if (end >= rl[1].vcn) { + /* + * If @end is not in the next run, reduce the problem to the + * case of "@start is in a hole". + */ + if (rl[1].length && end >= rl[2].vcn) { + /* Truncate the run containing @start. 
*/ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + goto extend_hole; + } + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* + * @end is in the next run, reduce the problem to the case + * where "@start is at the beginning of a run and @end is in + * the same run as @start". + */ + delta = rl->vcn - start; + rl->vcn = start; + if (rl->lcn >= 0) { + rl->lcn -= delta; + /* Need this in case the lcn just became negative. */ + lcn_fixup = true; + } + rl->length += delta; + goto split_end; + } + /* + * The first case from above, i.e. @end is in the same run as @start. + * We need to split the run into three. One run for the non-sparse + * region between the beginning of the old run and @start, one for the + * sparse region between @start and @end, and one for the remaining + * non-sparse region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); + if (IS_ERR(trl)) + goto enomem_out; + old_size += 2; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Shift all the runs up by two. */ + memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the three split runs. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + delta = end - rl->vcn; + rl->vcn = end; + rl->lcn += delta; + rl->length -= delta; + ntfs_debug("Done (split both)."); + return 0; +enomem_out: + ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); + return -ENOMEM; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h new file mode 100644 index 000000000000..38de0a375f59 --- /dev/null +++ b/fs/ntfs/runlist.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_RUNLIST_H +#define _LINUX_NTFS_RUNLIST_H + +#include "types.h" +#include "layout.h" +#include "volume.h" + +/** + * runlist_element - in memory vcn to lcn mapping array element + * @vcn: starting vcn of the current array element + * @lcn: starting lcn of the current array element + * @length: length in clusters of the current array element + * + * The last vcn (in fact the last vcn + 1) is reached when length == 0. + * + * When lcn == -1 this means that the count vcns starting at vcn are not + * physically allocated (i.e. this is a hole / data is sparse). + */ +typedef struct { /* In memory vcn to lcn mapping structure element. */ + VCN vcn; /* vcn = Starting virtual cluster number. */ + LCN lcn; /* lcn = Starting logical cluster number. */ + s64 length; /* Run length in clusters. 
*/ +} runlist_element; + +/** + * runlist - in memory vcn to lcn mapping array including a read/write lock + * @rl: pointer to an array of runlist elements + * @lock: read/write spinlock for serializing access to @rl + * + */ +typedef struct { + runlist_element *rl; + struct rw_semaphore lock; +} runlist; + +static inline void ntfs_init_runlist(runlist *rl) +{ + rl->rl = NULL; + init_rwsem(&rl->lock); +} + +typedef enum { + LCN_HOLE = -1, /* Keep this as highest value or die! */ + LCN_RL_NOT_MAPPED = -2, + LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, +} LCN_SPECIAL_VALUES; + +extern runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl); + +extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl); + +extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); + +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + +extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); + +extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); + +extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, + runlist *const runlist, const s64 new_length); + +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c new file mode 100644 index 000000000000..56a7d5bd33e4 --- /dev/null +++ b/fs/ntfs/super.c @@ -0,0 +1,3202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001,2002 Richard Russon + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include /* For bdev_logical_block_size(). */ +#include +#include +#include +#include +#include + +#include "sysctl.h" +#include "logfile.h" +#include "quota.h" +#include "usnjrnl.h" +#include "dir.h" +#include "debug.h" +#include "index.h" +#include "inode.h" +#include "aops.h" +#include "layout.h" +#include "malloc.h" +#include "ntfs.h" + +/* Number of mounted filesystems which have compression enabled. */ +static unsigned long ntfs_nr_compression_users; + +/* A global default upcase table and a corresponding reference count. */ +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; + +/* Error constants/strings used in inode.c::ntfs_show_options(). */ +typedef enum { + /* One of these must be present, default is ON_ERRORS_CONTINUE. */ + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, + /* Optional, can be combined with any of the above. */ + ON_ERRORS_RECOVER = 0x10, +} ON_ERRORS_ACTIONS; + +const option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { ON_ERRORS_RECOVER, "recover" }, + { 0, NULL } +}; + +/** + * simple_getbool - convert input string to a boolean value + * @s: input string to convert + * @setval: where to store the output boolean value + * + * Copied from old ntfs driver (which copied from vfat driver). 
+ * + * "1", "yes", "true", or an empty string are converted to %true. + * "0", "no", and "false" are converted to %false. + * + * Return: %1 if the string is converted or was empty and *setval contains it; + * %0 if the string was not valid. + */ +static int simple_getbool(char *s, bool *setval) +{ + if (s) { + if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) + *setval = true; + else if (!strcmp(s, "0") || !strcmp(s, "no") || + !strcmp(s, "false")) + *setval = false; + else + return 0; + } else + *setval = true; + return 1; +} + +/** + * parse_options - parse the (re)mount options + * @vol: ntfs volume + * @opt: string containing the (re)mount options + * + * Parse the recognized options in @opt for the ntfs volume described by @vol. + */ +static bool parse_options(ntfs_volume *vol, char *opt) +{ + char *p, *v, *ov; + static char *utf8 = "utf8"; + int errors = 0, sloppy = 0; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; + int mft_zone_multiplier = -1, on_errors = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; + struct nls_table *nls_map = NULL, *old_nls; + + /* I am lazy... (-8 */ +#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + variable = default_value; \ + else { \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } \ + } +#define NTFS_GETOPT(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_UID(option, variable) \ + if (!strcmp(p, option)) { \ + uid_t uid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + uid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kuid(current_user_ns(), uid_value); \ + if (!uid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_GID(option, variable) \ + if (!strcmp(p, option)) { \ + gid_t gid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + gid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kgid(current_user_ns(), gid_value); \ + if (!gid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_OCTAL(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 8); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_BOOL(option, variable) \ + if (!strcmp(p, option)) { \ + bool val; \ + if (!simple_getbool(v, &val)) \ + goto needs_bool; \ + variable = val; \ + } +#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ + if (!strcmp(p, option)) { \ + int _i; \ + if (!v || !*v) \ + goto needs_arg; \ + ov = v; \ + if (variable == -1) \ + variable = 0; \ + for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ + if (!strcmp(opt_array[_i].str, v)) { \ + variable |= opt_array[_i].val; \ + break; \ + } \ + if (!opt_array[_i].str || !*opt_array[_i].str) \ + goto needs_val; \ + } + if (!opt || !*opt) + goto no_mount_options; + ntfs_debug("Entering with mount options string: %s", opt); + while ((p = strsep(&opt, ","))) { + if ((v = strchr(p, '='))) + *v++ = 0; + NTFS_GETOPT_UID("uid", uid) + else NTFS_GETOPT_GID("gid", gid) + else NTFS_GETOPT_OCTAL("umask", fmask = dmask) + else NTFS_GETOPT_OCTAL("fmask", fmask) + else NTFS_GETOPT_OCTAL("dmask", dmask) + else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) + else 
NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) + else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) + else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) + else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, + on_errors_arr) + else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) + ntfs_warning(vol->sb, "Ignoring obsolete option %s.", + p); + else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { + if (!strcmp(p, "iocharset")) + ntfs_warning(vol->sb, "Option iocharset is " + "deprecated. Please use " + "option nls= in " + "the future."); + if (!v || !*v) + goto needs_arg; +use_utf8: + old_nls = nls_map; + nls_map = load_nls(v); + if (!nls_map) { + if (!old_nls) { + ntfs_error(vol->sb, "NLS character set " + "%s not found.", v); + return false; + } + ntfs_error(vol->sb, "NLS character set %s not " + "found. Using previous one %s.", + v, old_nls->charset); + nls_map = old_nls; + } else /* nls_map */ { + unload_nls(old_nls); + } + } else if (!strcmp(p, "utf8")) { + bool val = false; + ntfs_warning(vol->sb, "Option utf8 is no longer " + "supported, using option nls=utf8. Please " + "use option nls=utf8 in the future and " + "make sure utf8 is compiled either as a " + "module or into the kernel."); + if (!v || !*v) + val = true; + else if (!simple_getbool(v, &val)) + goto needs_bool; + if (val) { + v = utf8; + goto use_utf8; + } + } else { + ntfs_error(vol->sb, "Unrecognized mount option %s.", p); + if (errors < INT_MAX) + errors++; + } +#undef NTFS_GETOPT_OPTIONS_ARRAY +#undef NTFS_GETOPT_BOOL +#undef NTFS_GETOPT +#undef NTFS_GETOPT_WITH_DEFAULT + } +no_mount_options: + if (errors && !sloppy) + return false; + if (sloppy) + ntfs_warning(vol->sb, "Sloppy option given. Ignoring " + "unrecognized mount option(s) and continuing."); + /* Keep this first! */ + if (on_errors != -1) { + if (!on_errors) { + ntfs_error(vol->sb, "Invalid errors option argument " + "or bug in options parser."); + return false; + } + } + if (nls_map) { + if (vol->nls_map && vol->nls_map != nls_map) { + ntfs_error(vol->sb, "Cannot change NLS character set " + "on remount."); + return false; + } /* else (!vol->nls_map) */ + ntfs_debug("Using NLS character set %s.", nls_map->charset); + vol->nls_map = nls_map; + } else /* (!nls_map) */ { + if (!vol->nls_map) { + vol->nls_map = load_nls_default(); + if (!vol->nls_map) { + ntfs_error(vol->sb, "Failed to load default " + "NLS character set."); + return false; + } + ntfs_debug("Using default NLS character set (%s).", + vol->nls_map->charset); + } + } + if (mft_zone_multiplier != -1) { + if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != + mft_zone_multiplier) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " + "on remount."); + return false; + } + if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { + ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " + "Using default value, i.e. 
1."); + mft_zone_multiplier = 1; + } + vol->mft_zone_multiplier = mft_zone_multiplier; + } + if (!vol->mft_zone_multiplier) + vol->mft_zone_multiplier = 1; + if (on_errors != -1) + vol->on_errors = on_errors; + if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) + vol->on_errors |= ON_ERRORS_CONTINUE; + if (uid_valid(uid)) + vol->uid = uid; + if (gid_valid(gid)) + vol->gid = gid; + if (fmask != (umode_t)-1) + vol->fmask = fmask; + if (dmask != (umode_t)-1) + vol->dmask = dmask; + if (show_sys_files != -1) { + if (show_sys_files) + NVolSetShowSystemFiles(vol); + else + NVolClearShowSystemFiles(vol); + } + if (case_sensitive != -1) { + if (case_sensitive) + NVolSetCaseSensitive(vol); + else + NVolClearCaseSensitive(vol); + } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } + return true; +needs_arg: + ntfs_error(vol->sb, "The %s option requires an argument.", p); + return false; +needs_bool: + ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); + return false; +needs_val: + ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); + return false; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (err) + goto put_unm_err_out; + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. 
+ * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); + return ntfs_write_volume_flags(vol, flags); +} + +#endif /* NTFS_RW */ + +/** + * ntfs_remount - change the mount options of a mounted ntfs filesystem + * @sb: superblock of mounted ntfs filesystem + * @flags: remount flags + * @opt: remount options string + * + * Change the mount options of an already mounted ntfs filesystem. + * + * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after + * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, + * @sb->s_flags are not changed. + */ +static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount options string: %s", opt); + + sync_filesystem(sb); + +#ifndef NTFS_RW + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= SB_RDONLY; +#else /* NTFS_RW */ + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. */ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); + return -EROFS; + } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes. 
*/ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif + if (!ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal $LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transaction log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + // TODO: Deal with *flags. + + if (!parse_options(vol, opt)) + return -EINVAL; + + ntfs_debug("Done."); + return 0; +} + +/** + * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector + * @sb: Super block of the device to which @b belongs. + * @b: Boot sector of device @sb to check. + * @silent: If 'true', all output will be silenced. + * + * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot + * sector. Returns 'true' if it is valid and 'false' if not. + * + * @sb is only needed for warning/error output, i.e. it can be NULL when silent + * is 'true'. + */ +static bool is_boot_sector_ntfs(const struct super_block *sb, + const NTFS_BOOT_SECTOR *b, const bool silent) +{ + /* + * Check that checksum == sum of u32 values from b to the checksum + * field. If checksum is zero, no checking is done. We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. + */ + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { + le32 *u; + u32 i; + + for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + i += le32_to_cpup(u); + if (le32_to_cpu(b->checksum) != i) + ntfs_warning(sb, "Invalid boot sector checksum."); + } + /* Check OEMidentifier is "NTFS " */ + if (b->oem_id != magicNTFS) + goto not_ntfs; + /* Check bytes per sector value is between 256 and 4096. */ + if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) + goto not_ntfs; + /* Check sectors per cluster value is valid. */ + switch (b->bpb.sectors_per_cluster) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: + break; + default: + goto not_ntfs; + } + /* Check the cluster size is not above the maximum (64kiB). */ + if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + goto not_ntfs; + /* Check reserved/unused fields are really zero. */ + if (le16_to_cpu(b->bpb.reserved_sectors) || + le16_to_cpu(b->bpb.root_entries) || + le16_to_cpu(b->bpb.sectors) || + le16_to_cpu(b->bpb.sectors_per_fat) || + le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) + goto not_ntfs; + /* Check clusters per file mft record value is valid. 
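+ * Positive values are a cluster count (one of the powers of two accepted + * by the switch below), while values in the byte range 0xe1-0xf7 are + * negative and encode -log2(record size in bytes): e.g. 0xf6 = -10, so + * the record size is 1 << 10 = 1024 bytes.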
*/ + if ((u8)b->clusters_per_mft_record < 0xe1 || + (u8)b->clusters_per_mft_record > 0xf7) + switch (b->clusters_per_mft_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* Check clusters per index block value is valid. */ + if ((u8)b->clusters_per_index_record < 0xe1 || + (u8)b->clusters_per_index_record > 0xf7) + switch (b->clusters_per_index_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* + * Check for valid end of sector marker. We will work without it, but + * many BIOSes will refuse to boot from a bootsector if the magic is + * incorrect, so we emit a warning. + */ + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + ntfs_warning(sb, "Invalid end of sector marker."); + return true; +not_ntfs: + return false; +} + +/** + * read_ntfs_boot_sector - read the NTFS boot sector of a device + * @sb: super block of device to read the boot sector from + * @silent: if true, suppress all output + * + * Reads the boot sector from the device and validates it. If that fails, tries + * to read the backup boot sector, first from the end of the device a-la NT4 and + * later and then from the middle of the device a-la NT3.51 and before. + * + * If a valid boot sector is found but it is not the primary boot sector, we + * repair the primary boot sector silently (unless the device is read-only or + * the primary boot sector is not accessible). + * + * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super + * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized + * to their respective values. + * + * Return the unlocked buffer head containing the boot sector or NULL on error. + */ +static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, + const int silent) +{ + const char *read_err_str = "Unable to read %s boot sector."; + struct buffer_head *bh_primary, *bh_backup; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; + + /* Try to read primary boot sector. */ + if ((bh_primary = sb_bread(sb, 0))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_primary->b_data, silent)) + return bh_primary; + if (!silent) + ntfs_error(sb, "Primary boot sector is invalid."); + } else if (!silent) + ntfs_error(sb, read_err_str, "primary"); + if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { + if (bh_primary) + brelse(bh_primary); + if (!silent) + ntfs_error(sb, "Mount option errors=recover not used. " + "Aborting without trying to recover."); + return NULL; + } + /* Try to read NT4+ backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* Try to read NT3.51- backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + if (!silent) + ntfs_error(sb, "Could not find a valid backup boot " + "sector."); + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* We failed. Cleanup and return. 
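+ * By this point all candidate locations have been tried: the primary + * boot sector in sector zero, the NT4+ backup in the last sector + * (nr_blocks - 1), and the NT3.51 backup at the halfway point + * (nr_blocks >> 1).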
*/ + if (bh_primary) + brelse(bh_primary); + return NULL; +hotfix_primary_boot_sector: + if (bh_primary) { + /* + * If we managed to read sector zero and the volume is not + * read-only, copy the found, valid backup boot sector to the + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). + */ + if (!sb_rdonly(sb)) { + ntfs_warning(sb, "Hot-fix: Recovering invalid primary " + "boot sector from backup copy."); + memcpy(bh_primary->b_data, bh_backup->b_data, + NTFS_BLOCK_SIZE); + mark_buffer_dirty(bh_primary); + sync_dirty_buffer(bh_primary); + if (buffer_uptodate(bh_primary)) { + brelse(bh_backup); + return bh_primary; + } + ntfs_error(sb, "Hot-fix: Device write error while " + "recovering primary boot sector."); + } else { + ntfs_warning(sb, "Hot-fix: Recovery of primary boot " + "sector failed: Read-only mount."); + } + brelse(bh_primary); + } + ntfs_warning(sb, "Using backup boot sector."); + return bh_backup; +} + +/** + * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol + * @vol: volume structure to initialise with data from boot sector + * @b: boot sector to parse + * + * Parse the ntfs boot sector @b and store all important information therein in + * the ntfs super block @vol. Return 'true' on success and 'false' on error. + */ +static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +{ + unsigned int sectors_per_cluster_bits, nr_hidden_sects; + int clusters_per_mft_record, clusters_per_index_record; + s64 ll; + + vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); + vol->sector_size_bits = ffs(vol->sector_size) - 1; + ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, + vol->sector_size); + ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, + vol->sector_size_bits); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return false; + } + ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); + sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + ntfs_debug("sectors_per_cluster_bits = 0x%x", + sectors_per_cluster_bits); + nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); + ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); + vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; + vol->cluster_size_mask = vol->cluster_size - 1; + vol->cluster_size_bits = ffs(vol->cluster_size) - 1; + ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, + vol->cluster_size); + ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. 
" + "Sorry.", vol->cluster_size, vol->sector_size); + return false; + } + clusters_per_mft_record = b->clusters_per_mft_record; + ntfs_debug("clusters_per_mft_record = %i (0x%x)", + clusters_per_mft_record, clusters_per_mft_record); + if (clusters_per_mft_record > 0) + vol->mft_record_size = vol->cluster_size << + (ffs(clusters_per_mft_record) - 1); + else + /* + * When mft_record_size < cluster_size, clusters_per_mft_record + * = -log2(mft_record_size) bytes. mft_record_size normaly is + * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). + */ + vol->mft_record_size = 1 << -clusters_per_mft_record; + vol->mft_record_size_mask = vol->mft_record_size - 1; + vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; + ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, + vol->mft_record_size); + ntfs_debug("vol->mft_record_size_mask = 0x%x", + vol->mft_record_size_mask); + ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", + vol->mft_record_size_bits, vol->mft_record_size_bits); + /* + * We cannot support mft record sizes above the PAGE_SIZE since + * we store $MFT/$DATA, the table of mft records in the page cache. + */ + if (vol->mft_record_size > PAGE_SIZE) { + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_SIZE on your system (%lu). " + "This is not supported. Sorry.", + vol->mft_record_size, PAGE_SIZE); + return false; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); + return false; + } + clusters_per_index_record = b->clusters_per_index_record; + ntfs_debug("clusters_per_index_record = %i (0x%x)", + clusters_per_index_record, clusters_per_index_record); + if (clusters_per_index_record > 0) + vol->index_record_size = vol->cluster_size << + (ffs(clusters_per_index_record) - 1); + else + /* + * When index_record_size < cluster_size, + * clusters_per_index_record = -log2(index_record_size) bytes. + * index_record_size normaly equals 4096 bytes, which is + * encoded as 0xF4 (-12 in decimal). + */ + vol->index_record_size = 1 << -clusters_per_index_record; + vol->index_record_size_mask = vol->index_record_size - 1; + vol->index_record_size_bits = ffs(vol->index_record_size) - 1; + ntfs_debug("vol->index_record_size = %i (0x%x)", + vol->index_record_size, vol->index_record_size); + ntfs_debug("vol->index_record_size_mask = 0x%x", + vol->index_record_size_mask); + ntfs_debug("vol->index_record_size_bits = %i (0x%x)", + vol->index_record_size_bits, + vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return false; + } + /* + * Get the size of the volume in clusters and check for 64-bit-ness. + * Windows currently only uses 32 bits to save the clusters so we do + * the same as it is much faster on 32-bit CPUs. + */ + ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + if ((u64)ll >= 1ULL << 32) { + ntfs_error(vol->sb, "Cannot handle 64-bit clusters. 
Sorry."); + return false; + } + vol->nr_clusters = ll; + ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); + /* + * On an architecture where unsigned long is 32-bits, we restrict the + * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler + * will hopefully optimize the whole check away. + */ + if (sizeof(unsigned long) < 8) { + if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { + ntfs_error(vol->sb, "Volume size (%lluTiB) is too " + "large for this architecture. " + "Maximum supported is 2TiB. Sorry.", + (unsigned long long)ll >> (40 - + vol->cluster_size_bits)); + return false; + } + } + ll = sle64_to_cpu(b->mft_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mft_lcn = ll; + ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); + ll = sle64_to_cpu(b->mftmirr_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mftmirr_lcn = ll; + ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); +#ifdef NTFS_RW + /* + * Work out the size of the mft mirror in number of mft records. If the + * cluster size is less than or equal to the size taken by four mft + * records, the mft mirror stores the first four mft records. If the + * cluster size is bigger than the size taken by four mft records, the + * mft mirror contains as many mft records as will fit into one + * cluster. + */ + if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) + vol->mftmirr_size = 4; + else + vol->mftmirr_size = vol->cluster_size >> + vol->mft_record_size_bits; + ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); +#endif /* NTFS_RW */ + vol->serial_no = le64_to_cpu(b->volume_serial_number); + ntfs_debug("vol->serial_no = 0x%llx", + (unsigned long long)vol->serial_no); + return true; +} + +/** + * ntfs_setup_allocators - initialize the cluster and mft allocators + * @vol: volume structure for which to setup the allocators + * + * Setup the cluster (lcn) and mft allocators to the starting values. + */ +static void ntfs_setup_allocators(ntfs_volume *vol) +{ +#ifdef NTFS_RW + LCN mft_zone_size, mft_lcn; +#endif /* NTFS_RW */ + + ntfs_debug("vol->mft_zone_multiplier = 0x%x", + vol->mft_zone_multiplier); +#ifdef NTFS_RW + /* Determine the size of the MFT zone. */ + mft_zone_size = vol->nr_clusters; + switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ + case 4: + mft_zone_size >>= 1; /* 50% */ + break; + case 3: + mft_zone_size = (mft_zone_size + + (mft_zone_size >> 1)) >> 2; /* 37.5% */ + break; + case 2: + mft_zone_size >>= 2; /* 25% */ + break; + /* case 1: */ + default: + mft_zone_size >>= 3; /* 12.5% */ + break; + } + /* Setup the mft zone. */ + vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; + ntfs_debug("vol->mft_zone_pos = 0x%llx", + (unsigned long long)vol->mft_zone_pos); + /* + * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs + * source) and if the actual mft_lcn is in the expected place or even + * further to the front of the volume, extend the mft_zone to cover the + * beginning of the volume as well. This is in order to protect the + * area reserved for the mft bitmap as well within the mft_zone itself. 
+ * On non-standard volumes we do not protect it as the overhead would + * be higher than the speed increase we would get by doing it. + */ + mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + if (mft_lcn * vol->cluster_size < 16 * 1024) + mft_lcn = (16 * 1024 + vol->cluster_size - 1) / + vol->cluster_size; + if (vol->mft_zone_start <= mft_lcn) + vol->mft_zone_start = 0; + ntfs_debug("vol->mft_zone_start = 0x%llx", + (unsigned long long)vol->mft_zone_start); + /* + * Need to cap the mft zone on non-standard volumes so that it does + * not point outside the boundaries of the volume. We do this by + * halving the zone size until we are inside the volume. + */ + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + while (vol->mft_zone_end >= vol->nr_clusters) { + mft_zone_size >>= 1; + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + } + ntfs_debug("vol->mft_zone_end = 0x%llx", + (unsigned long long)vol->mft_zone_end); + /* + * Set the current position within each data zone to the start of the + * respective zone. + */ + vol->data1_zone_pos = vol->mft_zone_end; + ntfs_debug("vol->data1_zone_pos = 0x%llx", + (unsigned long long)vol->data1_zone_pos); + vol->data2_zone_pos = 0; + ntfs_debug("vol->data2_zone_pos = 0x%llx", + (unsigned long long)vol->data2_zone_pos); + + /* Set the mft data allocation position to mft record 24. */ + vol->mft_data_pos = 24; + ntfs_debug("vol->mft_data_pos = 0x%llx", + (unsigned long long)vol->mft_data_pos); +#endif /* NTFS_RW */ +} + +#ifdef NTFS_RW + +/** + * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume + * @vol: ntfs super block describing device whose mft mirror to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_mft_mirror(ntfs_volume *vol) +{ + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + + ntfs_debug("Entering."); + /* Get mft mirror inode. */ + tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + /* + * Re-initialize some specifics about $MFTMirr's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + tmp_ino->i_uid = GLOBAL_ROOT_UID; + tmp_ino->i_gid = GLOBAL_ROOT_GID; + /* Regular file. No access for anyone. */ + tmp_ino->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFTMirr. */ + tmp_ino->i_op = &ntfs_empty_inode_ops; + tmp_ino->i_fop = &ntfs_empty_file_ops; + /* Put in our special address space operations. */ + tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ni = NTFS_I(tmp_ino); + /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ + NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); + /* + * Set up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + tmp_ni->itype.index.block_size = vol->mft_record_size; + tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; + vol->mftmirr_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * check_mft_mirror - compare contents of the mft mirror with the mft + * @vol: ntfs super block describing device whose mft mirror to check + * + * Return 'true' on success or 'false' on error. + * + * Note, this function also results in the mft mirror runlist being completely + * mapped into memory. The mft mirror write code requires this and will BUG() + * should it find an unmapped runlist element. 
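+ * + * In sketch form, the runlist it builds by hand is the trivial two + * element one, with N being the mirror size in clusters, rounded up, as + * computed in the function body: + * + *	rl2[0] = (runlist_element){ .vcn = 0, .lcn = vol->mftmirr_lcn, + *			.length = N }; + *	rl2[1] = (runlist_element){ .vcn = N, .lcn = LCN_ENOENT, + *			.length = 0 };	(the terminator element)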
+ */ +static bool check_mft_mirror(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + ntfs_inode *mirr_ni; + struct page *mft_page, *mirr_page; + u8 *kmft, *kmirr; + runlist_element *rl, rl2[2]; + pgoff_t index; + int mrecs_per_page, i; + + ntfs_debug("Entering."); + /* Compare contents of $MFT and $MFTMirr. */ + mrecs_per_page = PAGE_SIZE / vol->mft_record_size; + BUG_ON(!mrecs_per_page); + BUG_ON(!vol->mftmirr_size); + mft_page = mirr_page = NULL; + kmft = kmirr = NULL; + index = i = 0; + do { + u32 bytes; + + /* Switch pages if necessary. */ + if (!(i % mrecs_per_page)) { + if (index) { + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + } + /* Get the $MFT page. */ + mft_page = ntfs_map_page(vol->mft_ino->i_mapping, + index); + if (IS_ERR(mft_page)) { + ntfs_error(sb, "Failed to read $MFT."); + return false; + } + kmft = page_address(mft_page); + /* Get the $MFTMirr page. */ + mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, + index); + if (IS_ERR(mirr_page)) { + ntfs_error(sb, "Failed to read $MFTMirr."); + goto mft_unmap_out; + } + kmirr = page_address(mirr_page); + ++index; + } + /* Do not check the record if it is not in use. */ + if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + /* Make sure the record is ok. */ + if (ntfs_is_baad_recordp((le32*)kmft)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "record %i.", i); +mm_unmap_out: + ntfs_unmap_page(mirr_page); +mft_unmap_out: + ntfs_unmap_page(mft_page); + return false; + } + } + /* Do not check the mirror record if it is not in use. */ + if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((le32*)kmirr)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "mirror record %i.", i); + goto mm_unmap_out; + } + } + /* Get the amount of data in the current record. */ + bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmft)) { + bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmirr)) + bytes = vol->mft_record_size; + } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " + "match. Run ntfsfix or chkdsk.", i); + goto mm_unmap_out; + } + kmft += vol->mft_record_size; + kmirr += vol->mft_record_size; + } while (++i < vol->mftmirr_size); + /* Release the last pages. */ + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + + /* Construct the mft mirror runlist by hand. */ + rl2[0].vcn = 0; + rl2[0].lcn = vol->mftmirr_lcn; + rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1) / vol->cluster_size; + rl2[1].vcn = rl2[0].length; + rl2[1].lcn = LCN_ENOENT; + rl2[1].length = 0; + /* + * Because we have just read all of the mft mirror, we know we have + * mapped the full runlist for it. + */ + mirr_ni = NTFS_I(vol->mftmirr_ino); + down_read(&mirr_ni->runlist.lock); + rl = mirr_ni->runlist.rl; + /* Compare the two runlists. They must be identical. */ + i = 0; + do { + if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || + rl2[i].length != rl[i].length) { + ntfs_error(sb, "$MFTMirr location mismatch. 
" + "Run chkdsk."); + up_read(&mirr_ni->runlist.lock); + return false; + } + } while (rl2[i++].length); + up_read(&mirr_ni->runlist.lock); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_check_logfile - load and check the logfile inode for a volume + * @vol: ntfs super block describing device whose logfile to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_check_logfile(ntfs_volume *vol, + RESTART_PAGE_HEADER **rp) +{ + struct inode *tmp_ino; + + ntfs_debug("Entering."); + tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + if (!ntfs_check_logfile(tmp_ino, rp)) { + iput(tmp_ino); + /* ntfs_check_logfile() will have displayed error output. */ + return false; + } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); + vol->logfile_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. + */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + inode_lock(vol->root_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + inode_unlock(vol->root_ino); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. 
*/ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + +/** + * load_and_init_quota - load and setup the quota file for a volume if present + * @vol: ntfs super block describing device whose quota file to load + * + * Return 'true' on success or 'false' on error. If $Quota is not present, we + * leave vol->quota_ino as NULL and return success. + */ +static bool load_and_init_quota(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_name *name = NULL; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the quota file by looking up the filename + * $Quota in the extended system files directory $Extend. + */ + inode_lock(vol->extend_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, + &name); + inode_unlock(vol->extend_ino); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, quotas are disabled and have + * never been enabled on this volume, just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$Quota not present. Volume does not have " + "quotas enabled."); + /* + * No need to try to set quotas out of date if they are + * not enabled. + */ + NVolSetQuotaOutOfDate(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for $Quota."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $Quota."); + return false; + } + vol->quota_ino = tmp_ino; + /* Get the $Q index allocation attribute. 
*/ + tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); + return false; + } + vol->quota_q_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return 'true' on success or 'false' on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. + * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static bool load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + inode_lock(vol->extend_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + inode_unlock(vol->extend_ino); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return false; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%zx bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return false; + } + /* Get the $DATA/$J attribute. 
*/ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return false; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return false; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. */ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return false; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return false; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_attrdef - load the attribute definitions table for a volume + * @vol: ntfs super block describing device whose attrdef to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_attrdef(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + + ntfs_debug("Entering."); + /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ + ino = ntfs_iget(sb, FILE_AttrDef); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto failed; + } + NInoSetSparseDisabled(NTFS_I(ino)); + /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) + goto iput_failed; + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + if (!vol->attrdef) + goto iput_failed; + index = 0; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; + while (index < max_index) { + /* Read the attrdef table and copy it into the linear buffer. 
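+ * Whole pages are copied in the loop; if i_size is not a multiple of + * PAGE_SIZE, the code jumps back once more with size set to the tail. + * E.g. (sketch) for i_size = 2560 with PAGE_SIZE = 4096, max_index is 0, + * the loop body never runs, and a single partial page of 2560 bytes is + * copied.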
*/ +read_partial_attrdef_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto free_iput_failed; + memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + } + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; + if (size) + goto read_partial_attrdef_page; + } + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); + iput(ino); + return true; +free_iput_failed: + ntfs_free(vol->attrdef); + vol->attrdef = NULL; +iput_failed: + iput(ino); +failed: + ntfs_error(sb, "Failed to initialize attribute definition table."); + return false; +} + +#endif /* NTFS_RW */ + +/** + * load_and_init_upcase - load the upcase table for an ntfs volume + * @vol: ntfs super block describing device whose upcase to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_upcase(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + int i, max; + + ntfs_debug("Entering."); + /* Read upcase table and setup vol->upcase and vol->upcase_len. */ + ino = ntfs_iget(sb, FILE_UpCase); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto upcase_failed; + } + /* + * The upcase size must not be above 64k Unicode characters, must not + * be zero and must be a multiple of sizeof(ntfschar). + */ + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) + goto iput_upcase_failed; + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + if (!vol->upcase) + goto iput_upcase_failed; + index = 0; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; + while (index < max_index) { + /* Read the upcase table and copy it into the linear buffer. */ +read_partial_upcase_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto iput_upcase_failed; + memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + } + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; + if (size) + goto read_partial_upcase_page; + } + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; + ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", + i_size, 64 * 1024 * sizeof(ntfschar)); + iput(ino); + mutex_lock(&ntfs_lock); + if (!default_upcase) { + ntfs_debug("Using volume specified $UpCase since default is " + "not present."); + mutex_unlock(&ntfs_lock); + return true; + } + max = default_upcase_len; + if (max > vol->upcase_len) + max = vol->upcase_len; + for (i = 0; i < max; i++) + if (vol->upcase[i] != default_upcase[i]) + break; + if (i == max) { + ntfs_free(vol->upcase); + vol->upcase = default_upcase; + vol->upcase_len = max; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_debug("Volume specified $UpCase matches default. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_debug("Using volume specified $UpCase since it does not match " + "the default."); + return true; +iput_upcase_failed: + iput(ino); + ntfs_free(vol->upcase); + vol->upcase = NULL; +upcase_failed: + mutex_lock(&ntfs_lock); + if (default_upcase) { + vol->upcase = default_upcase; + vol->upcase_len = default_upcase_len; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to load $UpCase from the volume. 
Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to initialize upcase table."); + return false; +} + +/* + * The lcn and mft bitmap inodes are NTFS-internal inodes with + * their own special locking rules: + */ +static struct lock_class_key + lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, + mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; + +/** + * load_system_files - open the system files using normal functions + * @vol: ntfs super block describing device whose system files to load + * + * Open the system files with normal access functions and complete setting up + * the ntfs super block @vol. + * + * Return 'true' on success or 'false' on error. + */ +static bool load_system_files(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + RESTART_PAGE_HEADER *rp; + int err; +#endif /* NTFS_RW */ + + ntfs_debug("Entering."); +#ifdef NTFS_RW + /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ + if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsfix and/or chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + !vol->mftmirr_ino ? es1 : es2, + es3); + goto iput_mirr_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", + !vol->mftmirr_ino ? es1 : es2, es3); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", + !vol->mftmirr_ino ? es1 : es2, es3); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* Get mft bitmap attribute inode. */ + vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); + if (IS_ERR(vol->mftbmp_ino)) { + ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); + goto iput_mirr_err_out; + } + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, + &mftbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, + &mftbmp_mrec_lock_key); + /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ + if (!load_and_init_upcase(vol)) + goto iput_mftbmp_err_out; +#ifdef NTFS_RW + /* + * Read attribute definitions table and setup @vol->attrdef and + * @vol->attrdef_size. + */ + if (!load_and_init_attrdef(vol)) + goto iput_upcase_err_out; +#endif /* NTFS_RW */ + /* + * Get the cluster allocation bitmap inode and verify the size, no + * need for any locking at this stage as we are already running + * exclusively as we are mount in progress task. 
+ */ + vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); + if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (!IS_ERR(vol->lcnbmp_ino)) + iput(vol->lcnbmp_ino); + goto bitmap_failed; + } + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, + &lcnbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, + &lcnbmp_mrec_lock_key); + + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { + iput(vol->lcnbmp_ino); +bitmap_failed: + ntfs_error(sb, "Failed to load $Bitmap."); + goto iput_attrdef_err_out; + } + /* + * Get the volume inode and setup our cache of the volume flags and + * version. + */ + vol->vol_ino = ntfs_iget(sb, FILE_Volume); + if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (!IS_ERR(vol->vol_ino)) + iput(vol->vol_ino); +volume_failed: + ntfs_error(sb, "Failed to load $Volume."); + goto iput_lcnbmp_err_out; + } + m = map_mft_record(NTFS_I(vol->vol_ino)); + if (IS_ERR(m)) { +iput_volume_failed: + iput(vol->vol_ino); + goto volume_failed; + } + if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + ntfs_error(sb, "Failed to get attribute search context."); + goto get_ctx_vol_failed; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx) || ctx->attr->non_resident || ctx->attr->flags) { +err_put_vol: + ntfs_attr_put_search_ctx(ctx); +get_ctx_vol_failed: + unmap_mft_record(NTFS_I(vol->vol_ino)); + goto iput_volume_failed; + } + vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Some bounds checks. */ + if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + le32_to_cpu(ctx->attr->data.resident.value_length) > + (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + goto err_put_vol; + /* Copy the volume flags and version to the ntfs_volume structure. */ + vol->vol_flags = vi->flags; + vol->major_ver = vi->major_ver; + vol->minor_ver = vi->minor_ver; + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(NTFS_I(vol->vol_ino)); + pr_info("volume version %i.%i.\n", vol->major_ver, + vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } +#ifdef NTFS_RW + /* Make sure that no unsupported volume flags are set. */ + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_vol_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. 
Will not be able to remount " + "read-write%s", es1, es2); + /* + * Do not set NVolErrors() because ntfs_remount() re-checks the + * flags which we need to do in case any flags have changed. + */ + } + /* + * Get the inode for the logfile, check it and determine if the volume + * was shutdown cleanly. + */ + rp = NULL; + if (!load_and_check_logfile(vol, &rp) || + !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + + es1 = !vol->logfile_ino ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + if (vol->logfile_ino) { + BUG_ON(!rp); + ntfs_free(rp); + } + goto iput_logfile_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + ntfs_free(rp); +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. 
+ */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { + static const char *es1 = "Failed to empty $LogFile"; + static const char *es2 = ". Mount in Windows."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* If on NTFS versions before 3.0, we are done. */ + if (unlikely(vol->major_ver < 3)) + return true; + /* NTFS 3.0+ specific initialization. */ + /* Get the security descriptors inode. */ + vol->secure_ino = ntfs_iget(sb, FILE_Secure); + if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (!IS_ERR(vol->secure_ino)) + iput(vol->secure_ino); + ntfs_error(sb, "Failed to load $Secure."); + goto iput_root_err_out; + } + // TODO: Initialize security. + /* Get the extended system files' directory inode. */ + vol->extend_ino = ntfs_iget(sb, FILE_Extend); + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { + if (!IS_ERR(vol->extend_ino)) + iput(vol->extend_ino); + ntfs_error(sb, "Failed to load $Extend."); + goto iput_sec_err_out; + } +#ifdef NTFS_RW + /* Find the quota file, load it if present, and set it up. */ + if (!load_and_init_quota(vol)) { + static const char *es1 = "Failed to load $Quota"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the quotas out of date. */ + if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { + static const char *es1 = "Failed to mark quotas out of date"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. 
*/ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + return true; +#ifdef NTFS_RW +iput_usnjrnl_err_out: + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); +iput_quota_err_out: + iput(vol->quota_q_ino); + iput(vol->quota_ino); + iput(vol->extend_ino); +#endif /* NTFS_RW */ +iput_sec_err_out: + iput(vol->secure_ino); +iput_root_err_out: + iput(vol->root_ino); +iput_logfile_err_out: +#ifdef NTFS_RW + iput(vol->logfile_ino); +iput_vol_err_out: +#endif /* NTFS_RW */ + iput(vol->vol_ino); +iput_lcnbmp_err_out: + iput(vol->lcnbmp_ino); +iput_attrdef_err_out: + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } +#ifdef NTFS_RW +iput_upcase_err_out: +#endif /* NTFS_RW */ + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } +iput_mftbmp_err_out: + iput(vol->mftbmp_ino); +iput_mirr_err_out: +#ifdef NTFS_RW + iput(vol->mftmirr_ino); +#endif /* NTFS_RW */ + return false; +} + +/** + * ntfs_put_super - called by the vfs to unmount a volume + * @sb: vfs superblock of volume to unmount + * + * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when + * the volume is being unmounted (umount system call has been invoked) and it + * releases all inodes and memory belonging to the NTFS specific part of the + * super block. + */ +static void ntfs_put_super(struct super_block *sb) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering."); + +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. 
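+ * Commit the NTFS 3.0+ specific system inodes first ($UsnJrnl, $Quota,
+ * $Extend and $Secure and their attribute inodes); the generic system
+ * inodes are committed below.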
*/ + if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); + if (vol->quota_q_ino) + ntfs_commit_inode(vol->quota_q_ino); + if (vol->quota_ino) + ntfs_commit_inode(vol->quota_ino); + if (vol->extend_ino) + ntfs_commit_inode(vol->extend_ino); + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!sb_rdonly(sb)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + iput(vol->vol_ino); + vol->vol_ino = NULL; + + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + + iput(vol->root_ino); + vol->root_ino = NULL; + + down_write(&vol->lcnbmp_lock); + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; + up_write(&vol->mftbmp_lock); + +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } + /* + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. + */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); +#endif /* NTFS_RW */ + + iput(vol->mft_ino); + vol->mft_ino = NULL; + + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. 
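+ * (Sharing is detected by vol->upcase pointing at default_upcase; a
+ * volume that loaded its own $UpCase table frees it separately below.)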
+ */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + sb->s_fs_info = NULL; + kfree(vol); +} + +/** + * get_nr_free_clusters - return the number of free clusters on a volume + * @vol: ntfs volume for which to obtain free cluster count + * + * Calculate the number of free clusters on the mounted NTFS volume @vol. We + * actually calculate the number of clusters in use instead because this + * allows us to not care about partial pages as these will be just zero filled + * and hence not be counted as allocated clusters. + * + * The only particularity is that clusters beyond the end of the logical ntfs + * volume will be marked as allocated to prevent errors which means we have to + * discount those at the end. This is important as the cluster bitmap always + * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside + * the logical volume and marked in use when they are not as they do not exist. + * + * If any pages cannot be read we assume all clusters in the erroring pages are + * in use. This means we return an underestimate on errors which is better than + * an overestimate. + */ +static s64 get_nr_free_clusters(ntfs_volume *vol) +{ + s64 nr_free = vol->nr_clusters; + struct address_space *mapping = vol->lcnbmp_ino->i_mapping; + struct page *page; + pgoff_t index, max_index; + + ntfs_debug("Entering."); + /* Serialize accesses to the cluster bitmap. */ + down_read(&vol->lcnbmp_lock); + /* + * Convert the number of bits into bytes rounded up, then convert into + * multiples of PAGE_SIZE, rounding up so that if we have one + * full and one partial page max_index = 2. + */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + put_page(page); + } + ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); + /* + * Fixup for eventual bits outside logical ntfs volume (see function + * description above). + */ + if (vol->nr_clusters & 63) + nr_free += 64 - (vol->nr_clusters & 63); + up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. 
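+ * (Each page that failed to read subtracted a full PAGE_SIZE * 8
+ * clusters above, which can overshoot the true number of clusters the
+ * bitmap actually covers.)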
*/ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * __get_nr_free_mft_records - return the number of free inodes on a volume + * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits + * + * Calculate the number of free mft records (inodes) on the mounted NTFS + * volume @vol. We actually calculate the number of mft records in use instead + * because this allows us to not care about partial pages as these will be just + * zero filled and hence not be counted as allocated mft records. + * + * If any pages cannot be read we assume all mft records in the erroring pages + * are in use. This means we return an underestimate on errors which is better + * than an overestimate. + * + * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. + */ +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) +{ + struct address_space *mapping = vol->mftbmp_ino->i_mapping; + struct page *page; + pgoff_t index; + + ntfs_debug("Entering."); + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " + "0x%lx.", max_index, PAGE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + put_page(page); + } + ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", + index - 1); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * ntfs_statfs - return information about mounted NTFS volume + * @dentry: dentry from mounted volume + * @sfs: statfs structure in which to return the information + * + * Return information about the mounted NTFS volume @dentry in the statfs structure + * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is + * called). We interpret the values to be correct as of the moment in time at + * which we are called. Most values are otherwise variable, and this applies + * not just to the free values but to the totals as well. For example we can + * increase the total number of file nodes if we run out and we can keep doing + * this until there is no more space on the volume left at all. + * + * Called from vfs_statfs which is used to handle the statfs, fstatfs, and + * ustat system calls. + * + * Return 0 on success or -errno on error.
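+ *
+ * As a worked example of the unit conversion done below: on a volume with
+ * 1048576 (2^20) clusters of 4096 bytes each and a 4096-byte PAGE_SIZE,
+ * f_blocks = nr_clusters << cluster_size_bits >> PAGE_SHIFT
+ *          = (1048576 << 12) >> 12 = 1048576,
+ * i.e. f_blocks equals the cluster count whenever the cluster size matches
+ * the page size.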
+ */ +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + struct super_block *sb = dentry->d_sb; + s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; + + ntfs_debug("Entering."); + /* Type of filesystem. */ + sfs->f_type = NTFS_SB_MAGIC; + /* Optimal transfer block size. */ + sfs->f_bsize = PAGE_SIZE; + /* + * Total data blocks in filesystem in units of f_bsize and since + * inodes are also stored in data blocks ($MFT is a file) this is just + * the total clusters. + */ + sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> + PAGE_SHIFT; + /* Free data blocks in filesystem in units of f_bsize. */ + size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> + PAGE_SHIFT; + if (size < 0LL) + size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ + sfs->f_bavail = sfs->f_bfree = size; + /* Serialize accesses to the inode bitmap. */ + down_read(&vol->mftbmp_lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); + up_read(&vol->mftbmp_lock); + /* + * File system id. This is extremely *nix flavour dependent and even + * within Linux itself all fs do their own thing. I interpret this to + * mean a unique id associated with the mounted fs and not the id + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit + * volume serial number splitting it into two 32-bit parts. We enter + * the least significant 32-bits in f_fsid[0] and the most significant + * 32-bits in f_fsid[1]. + */ + sfs->f_fsid = u64_to_fsid(vol->serial_no); + /* Maximum length of filenames. */ + sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; +} + +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + +/* + * The complete super operations. + */ +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ + .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ +#ifdef NTFS_RW + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ +#endif /* NTFS_RW */ + .put_super = ntfs_put_super, /* Syscall: umount. */ + .statfs = ntfs_statfs, /* Syscall: statfs */ + .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ + .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is + removed from memory. */ + .show_options = ntfs_show_options, /* Show mount options in + proc.
*/ +}; + +/** + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount + * @opt: string containing the mount options + * @silent: silence error output + * + * ntfs_fill_super() is called by the VFS to mount the device described by @sb + * with the mount options in @opt with the NTFS filesystem. + * + * If @silent is true, remain silent even if errors are detected. This is used + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and + * expectedly return an error, but nobody wants to see error messages when in + * fact this is what is supposed to happen. + * + * NOTE: @sb->s_flags contains the mount options flags. + */ +static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +{ + ntfs_volume *vol; + struct buffer_head *bh; + struct inode *tmp_ino; + int blocksize, result; + + /* + * We do a pretty difficult piece of bootstrap by reading the + * MFT (and other metadata) from disk into memory. We'll only + * release this metadata during umount, so the locking patterns + * observed during bootstrap do not count. So turn off the + * observation of locking patterns (strictly for this context + * only) while mounting NTFS. [The validator is still active + * otherwise, even for this context: it will for example record + * lock class registrations.] + */ + lockdep_off(); + ntfs_debug("Entering."); +#ifndef NTFS_RW + sb->s_flags |= SB_RDONLY; +#endif /* ! NTFS_RW */ + /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ + sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); + vol = NTFS_SB(sb); + if (!vol) { + if (!silent) + ntfs_error(sb, "Allocation of NTFS volume structure " + "failed. Aborting mount..."); + lockdep_on(); + return -ENOMEM; + } + /* Initialize ntfs_volume structure. */ + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. + */ + .fmask = 0177, + .dmask = 0077, + }; + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + + /* Important to get the mount options dealt with now. */ + if (!parse_options(vol, (char*)opt)) + goto err_out_now; + + /* We support sector sizes up to the PAGE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_SIZE); + goto err_out_now; + } + /* + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. + */ + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { + if (!silent) + ntfs_error(sb, "Unable to set device block size."); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes.
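+ * (sb_bdev_nr_blocks() reports the capacity in sb->s_blocksize units;
+ * e.g. a 512MiB device with the 4096-byte block size negotiated above
+ * gives nr_blocks = 536870912 / 4096 = 131072.)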
*/ + vol->nr_blocks = sb_bdev_nr_blocks(sb); + if (!vol->nr_blocks) { + if (!silent) + ntfs_error(sb, "Unable to determine device size."); + goto err_out_now; + } + /* Read the boot sector and return unlocked buffer head to it. */ + if (!(bh = read_ntfs_boot_sector(sb, silent))) { + if (!silent) + ntfs_error(sb, "Not an NTFS volume."); + goto err_out_now; + } + /* + * Extract the data from the boot sector and setup the ntfs volume + * using it. + */ + result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); + brelse(bh); + if (!result) { + if (!silent) + ntfs_error(sb, "Unsupported NTFS filesystem."); + goto err_out_now; + } + /* + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. + */ + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = sb_bdev_nr_blocks(sb); + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); + /* Setup remaining fields in the super block. */ + sb->s_magic = NTFS_SB_MAGIC; + /* + * Ntfs allows 63 bits for the file size, i.e. correct would be: + * sb->s_maxbytes = ~0ULL >> 1; + * But the kernel uses a long as the page cache page index which on + * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel + * defined to the maximum the page cache page index can cope with + * without overflowing the index or to 2^63 - 1, whichever is smaller. + */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Ntfs measures time in 100ns intervals. */ + sb->s_time_gran = 100; + /* + * Now load the metadata required for the page cache and our address + * space operations to function. We do this by setting up a specialised + * read_inode method and then just calling the normal iget() to obtain + * the inode for $MFT which is sufficient to allow our normal inode + * operations and associated address space operations to function. + */ + sb->s_op = &ntfs_sops; + tmp_ino = new_inode(sb); + if (!tmp_ino) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto err_out_now; + } + tmp_ino->i_ino = FILE_MFT; + insert_inode_hash(tmp_ino); + if (ntfs_read_inode_mount(tmp_ino) < 0) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto iput_tmp_ino_err_out_now; + } + mutex_lock(&ntfs_lock); + /* + * The current mount is a compression user if the cluster size is + * less than or equal to 4kiB.
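+ * (NTFS only compresses data on volumes whose cluster size is at most
+ * 4kiB because a compression block is 16 clusters, i.e. at most 64kiB;
+ * for bigger cluster sizes compression is never used and the shared
+ * decompression buffer is not needed.)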
+ */ + if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { + result = allocate_compression_buffers(); + if (result) { + ntfs_error(NULL, "Failed to allocate buffers " + "for compression engine."); + ntfs_nr_compression_users--; + mutex_unlock(&ntfs_lock); + goto iput_tmp_ino_err_out_now; + } + } + /* + * Generate the global default upcase table if necessary. Also + * temporarily increment the number of upcase users to avoid race + * conditions with concurrent (u)mounts. + */ + if (!default_upcase) + default_upcase = generate_default_upcase(); + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + /* + * From now on, ignore @silent parameter. If we fail below this line, + * it will be due to a corrupt fs or a system error, so we report it. + */ + /* + * Open the system files with normal access functions and complete + * setting up the ntfs super block. + */ + if (!load_system_files(vol)) { + ntfs_error(sb, "Failed to load system files."); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); + if ((sb->s_root = d_make_root(vol->root_ino))) { + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + mutex_unlock(&ntfs_lock); + sb->s_export_op = &ntfs_export_ops; + lockdep_on(); + return 0; + } + ntfs_error(sb, "Failed to allocate root directory."); + /* Clean up after the successful load_system_files() call from above. */ + // TODO: Use ntfs_put_super() instead of repeating all this code... + // FIXME: Should mark the volume clean as the error is most likely + // -ENOMEM. + iput(vol->vol_ino); + vol->vol_ino = NULL; + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + iput(vol->root_ino); + vol->root_ino = NULL; + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } +#endif /* NTFS_RW */ + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + if (vol->nls_map) { + unload_nls(vol->nls_map); + vol->nls_map = NULL; + } + /* Error exit code path. */ +unl_upcase_iput_tmp_ino_err_out_now: + /* + * Decrease the number of upcase users and destroy the global default + * upcase table if necessary. 
+ */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); +iput_tmp_ino_err_out_now: + iput(tmp_ino); + if (vol->mft_ino && vol->mft_ino != tmp_ino) + iput(vol->mft_ino); + vol->mft_ino = NULL; + /* Errors at this stage are irrelevant. */ +err_out_now: + sb->s_fs_info = NULL; + kfree(vol); + ntfs_debug("Failed, returning -EINVAL."); + lockdep_on(); + return -EINVAL; +} + +/* + * This is a slab cache to optimize allocations and deallocations of Unicode + * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN + * (255) Unicode characters + a terminating NULL Unicode character. + */ +struct kmem_cache *ntfs_name_cache; + +/* Slab caches for efficient allocation/deallocation of inodes. */ +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; + +/* Init once constructor for the inode slab cache. */ +static void ntfs_big_inode_init_once(void *foo) +{ + ntfs_inode *ni = (ntfs_inode *)foo; + + inode_init_once(VFS_I(ni)); +} + +/* + * Slab caches to optimize allocations and deallocations of attribute search + * contexts and index contexts, respectively. + */ +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; + +/* Driver wide mutex. */ +DEFINE_MUTEX(ntfs_lock); + +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); +} + +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs", + .mount = ntfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ntfs"); + +/* Stable names for the slab caches. */ +static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; +static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; +static const char ntfs_name_cache_name[] = "ntfs_name_cache"; +static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; +static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; + +static int __init init_ntfs_fs(void) +{ + int err = 0; + + /* This may be ugly but it results in pretty output so who cares. 
(-8 */ + pr_info("driver " NTFS_VERSION " [Flags: R/" +#ifdef NTFS_RW + "W" +#else + "O" +#endif +#ifdef DEBUG + " DEBUG" +#endif +#ifdef MODULE + " MODULE" +#endif + "].\n"); + + ntfs_debug("Debug messages are enabled."); + + ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, + sizeof(ntfs_index_context), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_index_ctx_cache) { + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); + goto ictx_err_out; + } + ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, + sizeof(ntfs_attr_search_ctx), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_attr_ctx_cache) { + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); + goto actx_err_out; + } + + ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, + (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ntfs_name_cache) { + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); + goto name_err_out; + } + + ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, + sizeof(ntfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!ntfs_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); + goto inode_err_out; + } + + ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, + sizeof(big_ntfs_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); + if (!ntfs_big_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); + goto big_inode_err_out; + } + + /* Register the ntfs sysctls. */ + err = ntfs_sysctl(1); + if (err) { + pr_crit("Failed to register NTFS sysctls!\n"); + goto sysctl_err_out; + } + + err = register_filesystem(&ntfs_fs_type); + if (!err) { + ntfs_debug("NTFS driver registered successfully."); + return 0; /* Success! */ + } + pr_crit("Failed to register NTFS filesystem driver!\n"); + + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +sysctl_err_out: + kmem_cache_destroy(ntfs_big_inode_cache); +big_inode_err_out: + kmem_cache_destroy(ntfs_inode_cache); +inode_err_out: + kmem_cache_destroy(ntfs_name_cache); +name_err_out: + kmem_cache_destroy(ntfs_attr_ctx_cache); +actx_err_out: + kmem_cache_destroy(ntfs_index_ctx_cache); +ictx_err_out: + if (!err) { + pr_crit("Aborting NTFS filesystem driver registration...\n"); + err = -ENOMEM; + } + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + ntfs_debug("Unregistering NTFS driver."); + + unregister_filesystem(&ntfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); + /* Unregister the ntfs sysctls. 
*/ + ntfs_sysctl(0); +} + +MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); +MODULE_VERSION(NTFS_VERSION); +MODULE_LICENSE("GPL"); +#ifdef DEBUG +module_param(debug_msgs, bint, 0); +MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); +#endif + +module_init(init_ntfs_fs) +module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c new file mode 100644 index 000000000000..4e980170d86a --- /dev/null +++ b/fs/ntfs/sysctl.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2005 Anton Altaparmakov + */ + +#ifdef DEBUG + +#include <linux/module.h> + +#ifdef CONFIG_SYSCTL + +#include <linux/proc_fs.h> +#include <linux/sysctl.h> + +#include "sysctl.h" +#include "debug.h" + +/* Definition of the ntfs sysctl. */ +static struct ctl_table ntfs_sysctls[] = { + { + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = proc_dointvec + }, +}; + +/* Storage for the sysctls header. */ +static struct ctl_table_header *sysctls_root_table; + +/** + * ntfs_sysctl - add or remove the debug sysctl + * @add: add (1) or remove (0) the sysctl + * + * Add or remove the debug sysctl. Return 0 on success or -errno on error. + */ +int ntfs_sysctl(int add) +{ + if (add) { + BUG_ON(sysctls_root_table); + sysctls_root_table = register_sysctl("fs", ntfs_sysctls); + if (!sysctls_root_table) + return -ENOMEM; + } else { + BUG_ON(!sysctls_root_table); + unregister_sysctl_table(sysctls_root_table); + sysctls_root_table = NULL; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ +#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h new file mode 100644 index 000000000000..96bb2299d2d5 --- /dev/null +++ b/fs/ntfs/sysctl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_SYSCTL_H +#define _LINUX_NTFS_SYSCTL_H + + +#if defined(DEBUG) && defined(CONFIG_SYSCTL) + +extern int ntfs_sysctl(int add); + +#else + +/* Just return success. */ +static inline int ntfs_sysctl(int add) +{ + return 0; +} + +#endif /* DEBUG && CONFIG_SYSCTL */ +#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h new file mode 100644 index 000000000000..6b63261300cc --- /dev/null +++ b/fs/ntfs/time.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_TIME_H +#define _LINUX_NTFS_TIME_H + +#include <linux/time.h> /* For current_kernel_time(). */ +#include <asm/div64.h> /* For do_div(). */ + +#include "endian.h" + +#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) + +/** + * utc2ntfs - convert Linux UTC time to NTFS time + * @ts: Linux UTC time to convert to NTFS time + * + * Convert the Linux UTC time @ts to its corresponding NTFS time and return + * that in little endian format.
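+ *
+ * For example, the Linux epoch, i.e. ts = { .tv_sec = 0, .tv_nsec = 0 }
+ * (1st January 1970, 00:00:00 UTC), converts to NTFS_TIME_OFFSET, the
+ * number of 100-nano-second intervals between 1601 and 1970:
+ * (369 * 365 + 89) * 24 * 3600 * 10000000 = 116444736000000000.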
+ * + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline sle64 utc2ntfs(const struct timespec64 ts) +{ + /* + * Convert the seconds to 100ns intervals, add the nano-seconds + * converted to 100ns intervals, and then add the NTFS time offset. + */ + return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + NTFS_TIME_OFFSET); +} + +/** + * get_current_ntfs_time - get the current time in little endian NTFS format + * + * Get the current time from the Linux kernel, convert it to its corresponding + * NTFS time and return that in little endian format. + */ +static inline sle64 get_current_ntfs_time(void) +{ + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + return utc2ntfs(ts); +} + +/** + * ntfs2utc - convert NTFS time to Linux time + * @time: NTFS time (little endian) to convert to Linux UTC + * + * Convert the little endian NTFS time @time to its corresponding Linux UTC + * time and return that in cpu format. + * + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline struct timespec64 ntfs2utc(const sle64 time) +{ + struct timespec64 ts; + + /* Subtract the NTFS time offset. */ + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + /* + * Convert the time to 1-second intervals and the remainder to + * 1-nano-second intervals. + */ + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h new file mode 100644 index 000000000000..9a47859e7a06 --- /dev/null +++ b/fs/ntfs/types.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * types.h - Defines for NTFS Linux kernel driver specific types. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_TYPES_H +#define _LINUX_NTFS_TYPES_H + +#include <linux/types.h> + +typedef __le16 le16; +typedef __le32 le32; +typedef __le64 le64; +typedef __u16 __bitwise sle16; +typedef __u32 __bitwise sle32; +typedef __u64 __bitwise sle64; + +/* 2-byte Unicode character type. */ +typedef le16 ntfschar; +#define UCHAR_T_SIZE_BITS 1 + +/* + * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN + * and VCN, to allow for type checking and better code readability. + */ +typedef s64 VCN; +typedef sle64 leVCN; +typedef s64 LCN; +typedef sle64 leLCN; + +/* + * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit + * values. We define our own type LSN, to allow for type checking and better + * code readability. + */ +typedef s64 LSN; +typedef sle64 leLSN; + +/* + * The NTFS transaction log $UsnJrnl uses usns, which are signed 64-bit values.
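+ * (A usn is the byte offset of the corresponding change record within the
+ * $UsnJrnl/$J data stream.)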
+ * We define our own type USN, to allow for type checking and better code + * readability. + */ +typedef s64 USN; +typedef sle64 leUSN; + +typedef enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +} IGNORE_CASE_BOOL; + +#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c new file mode 100644 index 000000000000..a6b6c64f14a9 --- /dev/null +++ b/fs/ntfs/unistr.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include <linux/slab.h> + +#include "types.h" +#include "debug.h" +#include "ntfs.h" + +/* + * IMPORTANT + * ========= + * + * All these routines assume that the Unicode characters are in little endian + * encoding inside the strings!!! + */ + +/* + * This is used by the name collation functions to quickly determine what + * characters are (in)valid. + */ +static const u8 legal_ansi_char_array[0x40] = { + 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, + + 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, +}; + +/** + * ntfs_are_names_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return 'true' (1) if the names are + * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to perform a case insensitive comparison. + */ +bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (ic == CASE_SENSITIVE) + return !ntfs_ucsncmp(s1, s2, s1_len); + return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); +} + +/** + * ntfs_collate_names - collate two Unicode names + * @name1: first Unicode name to compare + * @name2: second Unicode name to compare + * @err_val: if @name1 contains an invalid character return this value + * @ic: either CASE_SENSITIVE or IGNORE_CASE + * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) + * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) + * + * ntfs_collate_names collates two Unicode names and returns: + * + * -1 if the first name collates before the second one, + * 0 if the names match, + * 1 if the second name collates before the first one, or + * @err_val if an invalid character is found in @name1 during the comparison. + * + * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
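+ * (These are exactly the code points below 0x40 whose entry in
+ * legal_ansi_char_array above has bit 3 (value 8) set, which is what the
+ * "& 8" tests in the implementation below check for.)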
+ */ +int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + u32 cnt, min_len; + u16 c1, c2; + + min_len = name1_len; + if (name1_len > name2_len) + min_len = name2_len; + for (cnt = 0; cnt < min_len; ++cnt) { + c1 = le16_to_cpu(*name1++); + c2 = le16_to_cpu(*name2++); + if (ic) { + if (c1 < upcase_len) + c1 = le16_to_cpu(upcase[c1]); + if (c2 < upcase_len) + c2 = le16_to_cpu(upcase[c2]); + } + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + } + if (name1_len < name2_len) + return -1; + if (name1_len == name2_len) + return 0; + /* name1_len > name2_len */ + c1 = le16_to_cpu(*name1); + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + return 1; +} + +/** + * ntfs_ucsncmp - compare two little endian Unicode strings + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * + * Compare the first @n characters of the Unicode strings @s1 and @s2. + * The strings are in little endian format and appropriate le16_to_cpu() + * conversion is performed on non-little endian machines. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +{ + u16 c1, c2; + size_t i; + + for (i = 0; i < n; ++i) { + c1 = le16_to_cpu(s1[i]); + c2 = le16_to_cpu(s2[i]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +/** + * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * @upcase: upcase table + * @upcase_size: upcase table size in Unicode characters + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * ignoring case. The strings are in little endian format and appropriate + * le16_to_cpu() conversion is performed on non-little endian machines. + * + * Each character is uppercased using the @upcase table before the comparison. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2.
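+ *
+ * For example, with the default upcase table, "bar" and "BAR" compare
+ * equal (both upcase to "BAR", so 0 is returned), while "abc" against
+ * "ABD" returns -1 because 'C' < 'D' after upcasing.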
+ */ +int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size) +{ + size_t i; + u16 c1, c2; + + for (i = 0; i < n; ++i) { + if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(upcase[c1]); + if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(upcase[c2]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, + const u32 upcase_len) +{ + u32 i; + u16 u; + + for (i = 0; i < name_len; i++) + if ((u = le16_to_cpu(name[i])) < upcase_len) + name[i] = upcase[u]; +} + +void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len) +{ + ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, + file_name_attr->file_name_length, upcase, upcase_len); +} + +int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + file_name_attr1->file_name_length, + (ntfschar*)&file_name_attr2->file_name, + file_name_attr2->file_name_length, + err_val, ic, upcase, upcase_len); +} + +/** + * ntfs_nlstoucs - convert NLS string to little endian Unicode string + * @vol: ntfs volume which we are working with + * @ins: input NLS string buffer + * @ins_len: length of input string in bytes + * @outs: on return contains the allocated output Unicode string buffer + * + * Convert the input string @ins, which is in whatever format the loaded NLS + * map dictates, into a little endian, 2-byte Unicode string. + * + * This function allocates the string and the caller is responsible for + * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. + * + * On success the function returns the number of Unicode characters written to + * the output string *@outs (>= 0), not counting the terminating Unicode NULL + * character. *@outs is set to the allocated output string buffer. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs) +{ + struct nls_table *nls = vol->nls_map; + ntfschar *ucs; + wchar_t wc; + int i, o, wc_len; + + /* We do not trust outside sources.
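+ * (A NULL @ins is rejected below with -EINVAL rather than dereferenced.)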
*/ + if (likely(ins)) { + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (likely(ucs)) { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < NTFS_MAX_NAME_LEN)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } /* else if (wc_len < 0 || + o >= NTFS_MAX_NAME_LEN) */ + goto name_err; + } + ucs[o] = 0; + *outs = ucs; + return o; + } /* else if (!ucs) */ + ntfs_error(vol->sb, "Failed to allocate buffer for converted " + "name from ntfs_name_cache."); + return -ENOMEM; + } /* else if (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +name_err: + kmem_cache_free(ntfs_name_cache, ucs); + if (wc_len < 0) { + ntfs_error(vol->sb, "Name using character set %s contains " + "characters that cannot be converted to " + "Unicode.", nls->charset); + i = -EILSEQ; + } else /* if (o >= NTFS_MAX_NAME_LEN) */ { + ntfs_error(vol->sb, "Name is too long (maximum length for a " + "name on NTFS is %d Unicode characters).", + NTFS_MAX_NAME_LEN); + i = -ENAMETOOLONG; + } + return i; +} + +/** + * ntfs_ucstonls - convert little endian Unicode string to NLS string + * @vol: ntfs volume which we are working with + * @ins: input Unicode string buffer + * @ins_len: length of input string in Unicode characters + * @outs: on return contains the (allocated) output NLS string buffer + * @outs_len: length of output string buffer in bytes + * + * Convert the input little endian, 2-byte Unicode string @ins, of length + * @ins_len into the string format dictated by the loaded NLS. + * + * If *@outs is NULL, this function allocates the string and the caller is + * responsible for calling kfree(*@outs); when finished with it. In this case + * @outs_len is ignored and can be 0. + * + * On success the function returns the number of bytes written to the output + * string *@outs (>= 0), not counting the terminating NULL byte. If the output + * string buffer was allocated, *@outs is set to it. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len) +{ + struct nls_table *nls = vol->nls_map; + unsigned char *ns; + int i, o, ns_len, wc; + + /* We don't trust outside sources. */ + if (ins) { + ns = *outs; + ns_len = outs_len; + if (ns && !ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + if (!ns) { + ns_len = ins_len * NLS_MAX_CHARSET_SIZE; + ns = kmalloc(ns_len + 1, GFP_NOFS); + if (!ns) + goto mem_err_out; + } + for (i = o = 0; i < ins_len; i++) { +retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, + ns_len - o); + if (wc > 0) { + o += wc; + continue; + } else if (!wc) + break; + else if (wc == -ENAMETOOLONG && ns != *outs) { + unsigned char *tc; + /* Grow in multiples of 64 bytes. */ + tc = kmalloc((ns_len + 64) & + ~63, GFP_NOFS); + if (tc) { + memcpy(tc, ns, ns_len); + ns_len = ((ns_len + 64) & ~63) - 1; + kfree(ns); + ns = tc; + goto retry; + } /* No memory so goto conversion_error; */ + } /* wc < 0, real error.
*/ + goto conversion_err; + } + ns[o] = 0; + *outs = ns; + return o; + } /* else (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +conversion_err: + ntfs_error(vol->sb, "Unicode name contains characters that cannot be " + "converted to character set %s. You might want to " + "try to use the mount option nls=utf8.", nls->charset); + if (ns != *outs) + kfree(ns); + if (wc != -ENAMETOOLONG) + wc = -EILSEQ; + return wc; +mem_err_out: + ntfs_error(vol->sb, "Failed to allocate name!"); + return -ENOMEM; +} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c new file mode 100644 index 000000000000..4ebe84a78dea --- /dev/null +++ b/fs/ntfs/upcase.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * upcase.c - Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001 Richard Russon + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include "malloc.h" +#include "ntfs.h" + +ntfschar *generate_default_upcase(void) +{ + static const int uc_run_table[][3] = { /* Start, End, Add */ + {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, + {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, + {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, + {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, + {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, + {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, + {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, + {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, + {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, + {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, + {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, + {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, + {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, + {0} + }; + + static const int uc_dup_table[][2] = { /* Start, End */ + {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, + {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, + {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, + {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, + {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, + {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, + {0} + }; + + static const int uc_word_table[][2] = { /* Offset, Value */ + {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, + {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, + {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, + {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, + {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, + {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, + {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, + {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, + {0} + }; + + int i, r; + ntfschar *uc; + + uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); + if (!uc) + return uc; + memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. 
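+ * Three passes follow over the identity mapping set up just below: the
+ * run table adds a fixed offset to contiguous ranges (e.g. 0x0061-0x007A,
+ * ASCII 'a'-'z', gets -32 and so maps onto 'A'-'Z'), the dup table walks
+ * ranges of lower/upper case pairs decrementing each odd code point onto
+ * its even partner, and the word table patches the individual code points
+ * that follow neither pattern.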
*/ + for (i = 0; i < default_upcase_len; i++) + uc[i] = cpu_to_le16(i); + for (r = 0; uc_run_table[r][0]; r++) + for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) + le16_add_cpu(&uc[i], uc_run_table[r][2]); + for (r = 0; uc_dup_table[r][0]; r++) + for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) + le16_add_cpu(&uc[i + 1], -1); + for (r = 0; uc_word_table[r][0]; r++) + uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); + return uc; +} diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h new file mode 100644 index 000000000000..930a9ae8a053 --- /dev/null +++ b/fs/ntfs/volume.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part + * of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_VOLUME_H +#define _LINUX_NTFS_VOLUME_H + +#include +#include + +#include "types.h" +#include "layout.h" + +/* + * The NTFS in memory super block structure. + */ +typedef struct { + /* + * FIXME: Reorder to have commonly used together element within the + * same cache line, aiming at a cache line size of 32 bytes. Aim for + * 64 bytes for less commonly used together elements. Put most commonly + * used elements to front of structure. Obviously do this only when the + * structure has stabilized... (AIA) + */ + /* Device specifics. */ + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes + sized blocks on the device. */ + /* Configuration provided by user at mount time. */ + unsigned long flags; /* Miscellaneous flags, see below. */ + kuid_t uid; /* uid that files will be mounted as. */ + kgid_t gid; /* gid that files will be mounted as. */ + umode_t fmask; /* The mask for file permissions. */ + umode_t dmask; /* The mask for directory + permissions. */ + u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ + u8 on_errors; /* What to do on filesystem errors. */ + /* NTFS bootsector provided information. */ + u16 sector_size; /* in bytes */ + u8 sector_size_bits; /* log2(sector_size) */ + u32 cluster_size; /* in bytes */ + u32 cluster_size_mask; /* cluster_size - 1 */ + u8 cluster_size_bits; /* log2(cluster_size) */ + u32 mft_record_size; /* in bytes */ + u32 mft_record_size_mask; /* mft_record_size - 1 */ + u8 mft_record_size_bits; /* log2(mft_record_size) */ + u32 index_record_size; /* in bytes */ + u32 index_record_size_mask; /* index_record_size - 1 */ + u8 index_record_size_bits; /* log2(index_record_size) */ + LCN nr_clusters; /* Volume size in clusters == number of + bits in lcn bitmap. */ + LCN mft_lcn; /* Cluster location of mft data. */ + LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + u64 serial_no; /* The volume serial number. */ + /* Mount specific NTFS information. */ + u32 upcase_len; /* Number of entries in upcase[]. */ + ntfschar *upcase; /* The upcase table. */ + + s32 attrdef_size; /* Size of the attribute definition + table in bytes. */ + ATTR_DEF *attrdef; /* Table of attribute definitions. + Obtained from FILE_AttrDef. */ + +#ifdef NTFS_RW + /* Variables used by the cluster and mft allocators. */ + s64 mft_data_pos; /* Mft record number at which to + allocate the next mft record. */ + LCN mft_zone_start; /* First cluster of the mft zone. */ + LCN mft_zone_end; /* First cluster beyond the mft zone. */ + LCN mft_zone_pos; /* Current position in the mft zone. 
*/ + LCN data1_zone_pos; /* Current position in the first data + zone. */ + LCN data2_zone_pos; /* Current position in the second data + zone. */ +#endif /* NTFS_RW */ + + struct inode *mft_ino; /* The VFS inode of $MFT. */ + + struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ + struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the + mft record bitmap ($MFT/$BITMAP). */ +#ifdef NTFS_RW + struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ + int mftmirr_size; /* Size of mft mirror in mft records. */ + + struct inode *logfile_ino; /* The VFS inode of $LogFile. */ +#endif /* NTFS_RW */ + + struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ + struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the + cluster bitmap ($Bitmap/$DATA). */ + + struct inode *vol_ino; /* The VFS inode of $Volume. */ + VOLUME_FLAGS vol_flags; /* Volume flags. */ + u8 major_ver; /* Ntfs major version of volume. */ + u8 minor_ver; /* Ntfs minor version of volume. */ + + struct inode *root_ino; /* The VFS inode of the root + directory. */ + struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ + only, otherwise NULL). */ + struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ + only, otherwise NULL). */ +#ifdef NTFS_RW + /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *quota_ino; /* The VFS inode of $Quota. */ + struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ +#endif /* NTFS_RW */ + struct nls_table *nls_map; +} ntfs_volume; + +/* + * Defined bits for the flags field in the ntfs_volume structure. + */ +typedef enum { + NV_Errors, /* 1: Volume has errors, prevent remount rw. */ + NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ + NV_CaseSensitive, /* 1: Treat file names as case sensitive and + create filenames in the POSIX namespace. + Otherwise be case insensitive but still + create file names in POSIX namespace. */ + NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ +} ntfs_volume_flags; + +/* + * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() + * functions. + */ +#define DEFINE_NVOL_BIT_OPS(flag) \ +static inline int NVol##flag(ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ +} + +/* Emit the ntfs volume bitops functions. */ +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) + +#endif /* _LINUX_NTFS_VOLUME_H */ -- 2.25.1 This updates in-memory, on-disk structures, headers and documentation. 
Signed-off-by: Namjae Jeon
---
 Documentation/filesystems/index.rst | 1 +
 Documentation/filesystems/ntfs.rst | 203 +++
 fs/ntfs/aops.h | 84 +-
 fs/ntfs/attrib.h | 181 +-
 fs/ntfs/attrlist.h | 21 +
 fs/ntfs/bitmap.h | 21 +-
 fs/ntfs/collate.h | 23 +-
 fs/ntfs/debug.h | 14 +-
 fs/ntfs/dir.h | 23 +-
 fs/ntfs/ea.h | 25 +
 fs/ntfs/index.h | 97 +-
 fs/ntfs/inode.h | 378 +++--
 fs/ntfs/iomap.h | 22 +
 fs/ntfs/layout.h | 2436 +++++++++++++--------------
 fs/ntfs/lcnalloc.h | 48 +-
 fs/ntfs/logfile.h | 461 ++---
 fs/ntfs/malloc.h | 33 +-
 fs/ntfs/mft.h | 74 +-
 fs/ntfs/ntfs.h | 142 +-
 fs/ntfs/quota.h | 11 +-
 fs/ntfs/reparse.h | 15 +
 fs/ntfs/runlist.h | 105 +-
 fs/ntfs/sysctl.h | 8 +-
 fs/ntfs/time.h | 16 +-
 fs/ntfs/volume.h | 248 ++-
 include/uapi/linux/ntfs.h | 23 +
 26 files changed, 2539 insertions(+), 2174 deletions(-)
 create mode 100644 Documentation/filesystems/ntfs.rst
 create mode 100644 fs/ntfs/attrlist.h
 create mode 100644 fs/ntfs/ea.h
 create mode 100644 fs/ntfs/iomap.h
 create mode 100644 fs/ntfs/reparse.h
 create mode 100644 include/uapi/linux/ntfs.h

diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index f4873197587d..0d1f88185b73 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -98,6 +98,7 @@ Documentation for filesystem implementations.
    isofs
    nilfs2
    nfs/index
+   ntfs
    ntfs3
    ocfs2
    ocfs2-online-filecheck
diff --git a/Documentation/filesystems/ntfs.rst b/Documentation/filesystems/ntfs.rst
new file mode 100644
index 000000000000..1ae44772cd53
--- /dev/null
+++ b/Documentation/filesystems/ntfs.rst
@@ -0,0 +1,203 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================
+The Linux NTFS filesystem driver
+=================================
+
+
+.. Table of contents
+
+   - Overview
+   - Features
+   - Utilities support
+   - Supported mount options
+
+
+Overview
+========
+
+The new ntfs driver is a write-capable implementation built on top of
+the classic read-only NTFS code, following current kernel trends
+(iomap, no buffer heads). The old read-only ntfs code is clean and
+extensively commented, which makes NTFS easier to understand. The goal
+is to combine that base with current infrastructure (iomap, folios, no
+buffer heads), enhanced performance, stable maintenance, and utility
+support including fsck.
+
+Features
+========
+
+- Write support:
+  Implement write support on top of the classic read-only NTFS code.
+  Additionally, integrate delayed allocation to enhance write
+  performance through multi-cluster allocation and reduced
+  fragmentation of the cluster bitmap.
+
+- Switch to using iomap:
+  Use iomap for buffered writes, reads, direct I/O, file extent
+  mapping, readpages and writepages operations.
+
+- Stop using buffer heads:
+  Remove the use of buffer heads from the old ntfs code and switch to
+  folios instead. As a result, the driver no longer enables the
+  CONFIG_BUFFER_HEAD option in Kconfig.
+
+- Performance enhancements:
+  Write, file list browsing and mount performance are improved by the
+  following:
+
+  - Use iomap address space operations.
+  - Support delayed allocation.
+  - Optimize zeroing of newly allocated clusters.
+  - Reduce runlist merge overhead for small chunk sizes.
+  - Pre-load mft (inode) blocks and index (dentry) blocks to improve
+    readdir + stat performance.
+  - Load the lcn bitmap in the background.
+
+- Stability improvements:
+
+  a) Pass more xfstests tests:
+     ntfs implements fallocate, idmapped mounts and permissions, etc.,
+     resulting in a significantly higher number (287) of passing
+     xfstests.
+
+  b) Bonnie++ issue[3]:
+     The Bonnie++ benchmark fails on ntfs3 with a "Directory not empty"
+     error during file deletion. ntfs3 currently iterates directory
+     entries by reading index blocks one by one. When entries are
+     deleted concurrently, index block merging or entry relocation can
+     cause readdir() to skip some entries, leaving files undeleted in
+     workloads (such as bonnie++) that mix unlink and directory scans.
+     ntfs implements leaf chain traversal in readdir to avoid skipping
+     entries on deletion; a rough sketch of such a walk follows below.
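+
+The following sketch uses only helpers that are declared in
+fs/ntfs/index.h later in this patch. It is illustrative rather than the
+driver's actual ntfs_readdir(): how the real code positions the walk
+for a given f_pos and detects the end of the index is not shown, and
+ntfs_index_next() returning NULL at the end of the leaf chain is an
+assumption here::
+
+	/* Illustrative only: visit $I30 entries in collation order. */
+	static int ntfs_dir_walk_example(struct ntfs_inode *dir_ni,
+			const void *key, int key_len)
+	{
+		struct ntfs_index_context *ictx;
+		struct index_entry *ie;
+		int err;
+
+		ictx = ntfs_index_ctx_get(dir_ni, I30, 4);
+		if (!ictx)
+			return -ENOMEM;
+		/* Position the context at the entry matching @key. */
+		err = ntfs_index_lookup(key, key_len, ictx);
+		if (!err)
+			/* Step along the leaf chain, entry by entry. */
+			for (ie = ictx->entry; ie;
+					ie = ntfs_index_next(ie, ictx))
+				; /* emit one directory entry here */
+		ntfs_index_ctx_put(ictx);
+		return err;
+	}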
+
+
+Utilities support
+=================
+
+While ntfs-3g includes ntfsprogs as a component, it notably lacks an
+fsck implementation. We have therefore launched a new ntfs utilities
+project, ntfsprogs-plus, forked from ntfs-3g with the unneeded ntfs
+fuse implementation removed. fsck.ntfs can be used both for ntfs
+testing with xfstests and for recovering a corrupted NTFS device.
+Download ntfsprogs-plus from the following location to get mkfs.ntfs
+and fsck.ntfs:
+
+	https://github.com/ntfsprogs-plus/ntfsprogs-plus
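+
+For basic use, pointing the tools at the target block device should be
+enough; the exact command line options are outside the scope of this
+document, so treat the invocations below as illustrative and check the
+ntfsprogs-plus documentation for details::
+
+	# mkfs.ntfs /dev/sdb1
+	# fsck.ntfs /dev/sdb1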
+
+
+Supported mount options
+=======================
+
+The NTFS+ driver supports the following mount options:
+
+======================= ===================================================
+iocharset=name          Character set to use for converting between the
+                        encoding used for user visible file names and the
+                        16-bit Unicode characters stored on disk.
+
+nls=name                Deprecated option. Still supported, but please use
+                        iocharset=name in the future.
+
+uid=
+gid=
+umask=                  Provide default owner, group, and access mode mask.
+                        These options work as documented in mount(8). By
+                        default, the files/directories are owned by root
+                        and he/she has read and write permissions, as well
+                        as browse permission for directories. No one else
+                        has any access permissions. I.e. the mode on all
+                        files is by default rw------- and for directories
+                        rwx------, a consequence of the default fmask=0177
+                        and dmask=0077. Using a umask of zero will grant
+                        all permissions to everyone, i.e. all files and
+                        directories will have mode rwxrwxrwx.
+
+fmask=
+dmask=                  Instead of specifying umask which applies both to
+                        files and directories, fmask applies only to files
+                        and dmask only to directories.
+
+showmeta=
+show_sys_files=         If show_sys_files is specified, show the system
+                        files in directory listings. Otherwise the default
+                        behaviour is to hide the system files.
+                        Note that even when show_sys_files is specified,
+                        "$MFT" will not be visible due to bugs/mis-features
+                        in glibc. Further, note that irrespective of
+                        show_sys_files, all files are accessible by name,
+                        i.e. you can always do "ls -l \$UpCase" for example
+                        to specifically show the system file containing
+                        the Unicode upcase table.
+
+case_sensitive=         If case_sensitive is specified, treat all filenames
+                        as case sensitive and create file names in
+                        the POSIX namespace (default behavior). Note,
+                        the Linux NTFS driver will never create short
+                        filenames and will remove them on rename/delete of
+                        the corresponding long file name. Note that files
+                        remain accessible via their short file name, if it
+                        exists.
+
+nocase=                 If nocase is specified, treat filenames
+                        case-insensitively.
+
+disable_sparse=         If disable_sparse is specified, creation of sparse
+                        regions, i.e. holes, inside files is disabled for
+                        the volume (for the duration of this mount only).
+                        By default, creation of sparse regions is enabled,
+                        which is consistent with the behaviour of
+                        traditional Unix filesystems.
+
+errors=opt              Specify NTFS+ behavior on critical errors: panic,
+                        remount the partition in read-only mode or
+                        continue without doing anything (default behavior).
+
+mft_zone_multiplier=    Set the MFT zone multiplier for the volume (this
+                        setting is not persistent across mounts and can be
+                        changed from mount to mount but cannot be changed
+                        on remount). Values of 1 to 4 are allowed, 1 being
+                        the default. The MFT zone multiplier determines
+                        how much space is reserved for the MFT on the
+                        volume. If all other space is used up, then the
+                        MFT zone will be shrunk dynamically, so this has no
+                        impact on the amount of free space. However, it
+                        can have an impact on performance by affecting
+                        fragmentation of the MFT. In general use the
+                        default. If you have a lot of small files then use
+                        a higher value. The values have the following
+                        meaning:
+
+                        ===== =================================
+                        Value MFT zone size (% of volume size)
+                        ===== =================================
+                          1   12.5%
+                          2   25%
+                          3   37.5%
+                          4   50%
+                        ===== =================================
+
+                        Note this option is irrelevant for read-only
+                        mounts.
+
+preallocated_size=      Set the preallocation size used to reduce runlist
+                        merge overhead with small chunk sizes (64KB by
+                        default).
+
+acl=                    Enable POSIX ACL support. When specified, POSIX
+                        ACLs stored in extended attributes are enforced.
+                        Default is off. Requires kernel config
+                        NTFSPLUS_FS_POSIX_ACL enabled.
+
+sys_immutable=          Make NTFS system files (e.g. $MFT, $LogFile,
+                        $Bitmap, $UpCase, etc.) immutable to user initiated
+                        modifications for extra safety. Default is off.
+
+nohidden=               Hide files and directories marked with the Windows
+                        "hidden" attribute. By default hidden items are
+                        shown.
+
+hide_dot_files=         Hide names beginning with a dot ("."). By default
+                        dot files are shown. When enabled, files and
+                        directories created with a leading '.' will be
+                        hidden from directory listings.
+
+windows_names=          Refuse creation/rename of files with characters or
+                        reserved device names disallowed on Windows (e.g.
+                        CON, NUL, AUX, COM1, LPT1, etc.). Default is off.
+
+discard=                Issue block device discards for clusters freed on
+                        file deletion/truncation to inform the underlying
+                        storage.
+======================= ===================================================
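+
+As an example of the options above, assuming the filesystem type is
+registered as "ntfs" (the mount syntax below is illustrative only), a
+volume could be mounted for a normal user like this::
+
+	# mount -t ntfs -o iocharset=utf8,uid=1000,gid=1000 /dev/sdb1 /mnt
+
+The default permissions documented for the uid=/gid=/umask= options
+follow from simple mask arithmetic::
+
+	0777 & ~0177 = 0600	(files:       rw-------)
+	0777 & ~0077 = 0700	(directories: rwx------)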
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 8d0958a149cb..43e4ef7722a5 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -1,88 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * aops.h - Defines for NTFS kernel address space operations and page cache
- *	    handling. Part of the Linux-NTFS project.
+/**
+ * Defines for NTFS kernel address space operations and page cache
+ * handling.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
+ * Copyright (c) 2025 LG Electronics Co., Ltd.
  */
 
 #ifndef _LINUX_NTFS_AOPS_H
 #define _LINUX_NTFS_AOPS_H
 
-#include
-#include
 #include
-#include
+#include
+#include "volume.h"
 #include "inode.h"
 
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page: the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping: address space for which to obtain the page
- * @index: index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the
- * read_folio method defined in the address space operations of @mapping
- * and the page is added to the page cache of @mapping in the process.
- *
- * If the page belongs to an mst protected attribute and it is marked as such
- * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
- * error checking is performed. This means the caller has to verify whether
- * the ntfs record(s) contained in the page are valid or not using one of the
- * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
- * expecting to see. (For details of the macros, see fs/ntfs/layout.h.)
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to 'true', the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
-		unsigned long index)
-{
-	struct page *page = read_mapping_page(mapping, index, NULL);
-
-	if (!IS_ERR(page))
-		kmap(page);
-	return page;
-}
-
-#ifdef NTFS_RW
-
-extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
-
-#endif /* NTFS_RW */
-
+void mark_ntfs_record_dirty(struct folio *folio);
+int ntfs_dev_read(struct super_block *sb, void *buf, loff_t start, loff_t end);
+int ntfs_dev_write(struct super_block *sb, void *buf, loff_t start,
+		loff_t size, bool wait);
 #endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index fe0890d3d072..e7991851dc9a 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -1,21 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
- *	      Part of the Linux-NTFS project.
+ * Defines for attribute handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
+ * Copyright (c) 2025 LG Electronics Co., Ltd.
*/ #ifndef _LINUX_NTFS_ATTRIB_H #define _LINUX_NTFS_ATTRIB_H -#include "endian.h" -#include "types.h" -#include "layout.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" +#include "ntfs.h" +#include "dir.h" + +extern __le16 AT_UNNAMED[]; /** * ntfs_attr_search_ctx - used in attribute search functions @@ -35,68 +34,126 @@ * any modification of the search context, to automagically get the next * matching attribute. */ -typedef struct { - MFT_RECORD *mrec; - ATTR_RECORD *attr; +struct ntfs_attr_search_ctx { + struct mft_record *mrec; + bool mapped_mrec; + struct attr_record *attr; bool is_first; - ntfs_inode *ntfs_ino; - ATTR_LIST_ENTRY *al_entry; - ntfs_inode *base_ntfs_ino; - MFT_RECORD *base_mrec; - ATTR_RECORD *base_attr; -} ntfs_attr_search_ctx; - -extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, - ntfs_attr_search_ctx *ctx); -extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); - -extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + struct ntfs_inode *ntfs_ino; + struct attr_list_entry *al_entry; + struct ntfs_inode *base_ntfs_ino; + struct mft_record *base_mrec; + bool mapped_base_mrec; + struct attr_record *base_attr; +}; + +enum { /* ways of processing holes when expanding */ + HOLES_NO, + HOLES_OK, +}; + +int ntfs_map_runlist_nolock(struct ntfs_inode *ni, s64 vcn, + struct ntfs_attr_search_ctx *ctx); +int ntfs_map_runlist(struct ntfs_inode *ni, s64 vcn); +s64 ntfs_attr_vcn_to_lcn_nolock(struct ntfs_inode *ni, const s64 vcn, const bool write_locked); - -extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, - const VCN vcn, ntfs_attr_search_ctx *ctx); - -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx); - -extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, - const s64 size, const s64 initialized_size); - -static inline s64 ntfs_attr_size(const ATTR_RECORD *a) +struct runlist_element *ntfs_attr_find_vcn_nolock(struct ntfs_inode *ni, + const s64 vcn, struct ntfs_attr_search_ctx *ctx); +struct runlist_element *__ntfs_attr_find_vcn_nolock(struct runlist *runlist, + const s64 vcn); +int ntfs_attr_map_whole_runlist(struct ntfs_inode *ni); +int ntfs_attr_lookup(const __le32 type, const __le16 *name, + const u32 name_len, const u32 ic, + const s64 lowest_vcn, const u8 *val, const u32 val_len, + struct ntfs_attr_search_ctx *ctx); +int load_attribute_list(struct ntfs_inode *base_ni, + u8 *al_start, const s64 size); + +static inline s64 ntfs_attr_size(const struct attr_record *a) { if (!a->non_resident) return (s64)le32_to_cpu(a->data.resident.value_length); - return sle64_to_cpu(a->data.non_resident.data_size); + return le64_to_cpu(a->data.non_resident.data_size); } -extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); -extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, - MFT_RECORD *mrec); -extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); - -#ifdef NTFS_RW - -extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, - const ATTR_TYPE type, const s64 size); -extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, - const ATTR_TYPE type); -extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, - const ATTR_TYPE type); - -extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); -extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, +void 
ntfs_attr_reinit_search_ctx(struct ntfs_attr_search_ctx *ctx); +struct ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(struct ntfs_inode *ni, + struct mft_record *mrec); +void ntfs_attr_put_search_ctx(struct ntfs_attr_search_ctx *ctx); +int ntfs_attr_size_bounds_check(const struct ntfs_volume *vol, + const __le32 type, const s64 size); +int ntfs_attr_can_be_resident(const struct ntfs_volume *vol, + const __le32 type); +int ntfs_attr_map_cluster(struct ntfs_inode *ni, s64 vcn_start, s64 *lcn_start, + s64 *lcn_count, s64 max_clu_count, bool *balloc, bool update_mp, bool skip_holes); +int ntfs_attr_record_resize(struct mft_record *m, struct attr_record *a, u32 new_size); +int ntfs_resident_attr_value_resize(struct mft_record *m, struct attr_record *a, const u32 new_size); - -extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); - -extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start); - -extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, +int ntfs_attr_make_non_resident(struct ntfs_inode *ni, const u32 data_size); +int ntfs_attr_set(struct ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val); +int ntfs_attr_set_initialized_size(struct ntfs_inode *ni, loff_t new_size); +int ntfs_attr_open(struct ntfs_inode *ni, const __le32 type, + __le16 *name, u32 name_len); +void ntfs_attr_close(struct ntfs_inode *n); +int ntfs_attr_fallocate(struct ntfs_inode *ni, loff_t start, loff_t byte_len, bool keep_size); +int ntfs_non_resident_attr_insert_range(struct ntfs_inode *ni, s64 start_vcn, s64 len); +int ntfs_non_resident_attr_collapse_range(struct ntfs_inode *ni, s64 start_vcn, s64 len); +int ntfs_non_resident_attr_punch_hole(struct ntfs_inode *ni, s64 start_vcn, s64 len); +int __ntfs_attr_truncate_vfs(struct ntfs_inode *ni, const s64 newsize, + const s64 i_size); +int ntfs_attr_expand(struct ntfs_inode *ni, const s64 newsize, const s64 prealloc_size); +int ntfs_attr_truncate_i(struct ntfs_inode *ni, const s64 newsize, unsigned int holes); +int ntfs_attr_truncate(struct ntfs_inode *ni, const s64 newsize); +int ntfs_attr_rm(struct ntfs_inode *ni); +int ntfs_attr_exist(struct ntfs_inode *ni, const __le32 type, __le16 *name, + u32 name_len); +int ntfs_attr_remove(struct ntfs_inode *ni, const __le32 type, __le16 *name, + u32 name_len); +int ntfs_attr_record_rm(struct ntfs_attr_search_ctx *ctx); +int ntfs_attr_record_move_to(struct ntfs_attr_search_ctx *ctx, struct ntfs_inode *ni); +int ntfs_attr_add(struct ntfs_inode *ni, __le32 type, + __le16 *name, u8 name_len, u8 *val, s64 size); +int ntfs_attr_record_move_away(struct ntfs_attr_search_ctx *ctx, int extra); +char *ntfs_attr_name_get(const struct ntfs_volume *vol, const __le16 *uname, + const int uname_len); +void ntfs_attr_name_free(unsigned char **name); +void *ntfs_attr_readall(struct ntfs_inode *ni, const __le32 type, + __le16 *name, u32 name_len, s64 *data_size); +int ntfs_resident_attr_record_add(struct ntfs_inode *ni, __le32 type, + __le16 *name, u8 name_len, u8 *val, u32 size, + __le16 flags); +int ntfs_attr_update_mapping_pairs(struct ntfs_inode *ni, s64 from_vcn); +struct runlist_element *ntfs_attr_vcn_to_rl(struct ntfs_inode *ni, s64 vcn, s64 *lcn); -#endif /* NTFS_RW */ - +/** + * ntfs_attrs_walk - syntactic sugar for walking all attributes in an inode + * @ctx: initialised attribute search context + * + * Syntactic sugar for walking attributes in an inode. 
+ *
+ * Return 0 on success and a negative error code from ntfs_attr_lookup()
+ * on error.
+ *
+ * Example: When you want to enumerate all attributes in an open ntfs inode
+ * @ni, you can simply do:
+ *
+ *	int err;
+ *	struct ntfs_attr_search_ctx *ctx = ntfs_attr_get_search_ctx(ni, NULL);
+ *	if (!ctx)
+ *		// Allocation failed. Handle this case.
+ *	while (!(err = ntfs_attrs_walk(ctx))) {
+ *		struct attr_record *attr = ctx->attr;
+ *		// attr now contains the next attribute. Do whatever you want
+ *		// with it and then just continue with the while loop.
+ *	}
+ *	if (err && err != -ENOENT)
+ *		// Ooops. An error occurred! You should handle this case.
+ *	// Now finished with all attributes in the inode.
+ */
+static inline int ntfs_attrs_walk(struct ntfs_attr_search_ctx *ctx)
+{
+	return ntfs_attr_lookup(AT_UNUSED, NULL, 0, CASE_SENSITIVE, 0,
+			NULL, 0, ctx);
+}
 #endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/attrlist.h b/fs/ntfs/attrlist.h
new file mode 100644
index 000000000000..d0eadc5db1b0
--- /dev/null
+++ b/fs/ntfs/attrlist.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Exports for attribute list attribute handling.
+ * Originated from Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004 Yura Pakhuchiy
+ * Copyright (c) 2025 LG Electronics Co., Ltd.
+ */
+
+#ifndef _NTFS_ATTRLIST_H
+#define _NTFS_ATTRLIST_H
+
+#include "attrib.h"
+
+int ntfs_attrlist_need(struct ntfs_inode *ni);
+int ntfs_attrlist_entry_add(struct ntfs_inode *ni, struct attr_record *attr);
+int ntfs_attrlist_entry_rm(struct ntfs_attr_search_ctx *ctx);
+int ntfs_attrlist_update(struct ntfs_inode *base_ni);
+
+#endif /* defined _NTFS_ATTRLIST_H */
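As a usage note for the bitmap wrappers touched in the next hunk, a
caller frees a run of clusters by clearing the matching bits in the
$Bitmap attribute inode. The sketch below is illustrative only: it
assumes the wrappers still return 0 or -errno as the removed kernel-doc
sentences stated, and it omits the vol->lcnbmp_lock serialization a
real caller needs.

	static int example_free_clusters(struct ntfs_volume *vol,
			s64 first_lcn, s64 nr_clusters)
	{
		/* Clear one bit per freed cluster in $Bitmap/$DATA. */
		return ntfs_bitmap_clear_run(vol->lcnbmp_ino, first_lcn,
				nr_clusters);
	}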
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
index 9dd2224ca9c4..d58b3ebe5944 100644
--- a/fs/ntfs/bitmap.h
+++ b/fs/ntfs/bitmap.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
- *	      project.
+ * Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
+ * project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
  */
@@ -9,13 +9,12 @@
 #ifndef _LINUX_NTFS_BITMAP_H
 #define _LINUX_NTFS_BITMAP_H
 
-#ifdef NTFS_RW
-
 #include
 
-#include "types.h"
+#include "volume.h"
 
-extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+int ntfsp_trim_fs(struct ntfs_volume *vol, struct fstrim_range *range);
+int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	const s64 count, const u8 value, const bool is_rollback);
 
 /**
@@ -27,8 +26,6 @@ extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
  *
  * Set @count bits starting at bit @start_bit in the bitmap described by the
  * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * Return 0 on success and -errno on error.
  */
 static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
 	const s64 start_bit, const s64 count, const u8 value)
@@ -62,8 +59,6 @@ static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
  *
  * Clear @count bits starting at bit @start_bit in the bitmap described by the
  * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
  */
 static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
 	const s64 count)
@@ -77,8 +72,6 @@ static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
  * @bit: bit to set
  *
  * Set bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
  */
 static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
 {
@@ -91,14 +84,10 @@ static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
  * @bit: bit to clear
  *
  * Clear bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
  */
 static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
 {
 	return ntfs_bitmap_clear_run(vi, bit, 1);
 }
 
-#endif /* NTFS_RW */
-
 #endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
index f2255619b4f4..cf04508340f0 100644
--- a/fs/ntfs/collate.h
+++ b/fs/ntfs/collate.h
@@ -1,26 +1,27 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * collate.h - Defines for NTFS kernel collation handling. Part of the
- *	       Linux-NTFS project.
+ * Defines for NTFS kernel collation handling.
+ * Part of the Linux-NTFS project.
  *
  * Copyright (c) 2004 Anton Altaparmakov
+ *
+ * Part of this file is based on code from the NTFS-3G project
+ * and is copyrighted by the respective authors below:
+ * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2005 Yura Pakhuchiy
  */
 
 #ifndef _LINUX_NTFS_COLLATE_H
 #define _LINUX_NTFS_COLLATE_H
 
-#include "types.h"
 #include "volume.h"
 
-static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+static inline bool ntfs_is_collation_rule_supported(__le32 cr)
+{
 	int i;
 
-	/*
-	 * FIXME: At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we return false for everything else for
-	 * now.
-	 */
-	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
+	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG &&
+			cr != COLLATION_FILE_NAME && cr != COLLATION_NTOFS_ULONGS))
 		return false;
 	i = le32_to_cpu(cr);
 	if (likely(((i >= 0) && (i <= 0x02)) ||
@@ -29,7 +30,7 @@ static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
 	return false;
 }
 
-extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+int ntfs_collate(struct ntfs_volume *vol, __le32 cr,
 	const void *data1, const int data1_len,
 	const void *data2, const int data2_len);
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 6fdef388f129..fc8ae7b5c28e 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
+ * NTFS kernel debug support. Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2004 Anton Altaparmakov
  */
@@ -30,7 +30,7 @@ void __ntfs_debug(const char *file, int line, const char *function,
 #define ntfs_debug(f, a...) \
 	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
 
-extern void ntfs_debug_dump_runlist(const runlist_element *rl);
+void ntfs_debug_dump_runlist(const struct runlist_element *rl);
 
 #else /* !DEBUG */
 
@@ -40,7 +40,11 @@
 do { \
 	no_printk(fmt, ##__VA_ARGS__); \
 } while (0)
 
-#define ntfs_debug_dump_runlist(rl) do {} while (0)
+#define ntfs_debug_dump_runlist(rl) \
+do { \
+	if (0) \
+		(void)rl; \
+} while (0)
 
 #endif /* !DEBUG */
 
@@ -50,8 +54,10 @@
 void __ntfs_warning(const char *function, const struct super_block *sb,
 	const char *fmt, ...);
 #define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
 
 extern __printf(3, 4)
-void __ntfs_error(const char *function, const struct super_block *sb,
+void __ntfs_error(const char *function, struct super_block *sb,
 	const char *fmt, ...);
 #define ntfs_error(sb, f, a...)
__ntfs_error(__func__, sb, f, ##a) +void ntfs_handle_error(struct super_block *sb); + #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h index 0e326753df40..5abe21c3d938 100644 --- a/fs/ntfs/dir.h +++ b/fs/ntfs/dir.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. + * Defines for directory handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. * * Copyright (c) 2002-2004 Anton Altaparmakov */ @@ -9,26 +9,25 @@ #ifndef _LINUX_NTFS_DIR_H #define _LINUX_NTFS_DIR_H -#include "layout.h" #include "inode.h" -#include "types.h" /* * ntfs_name is used to return the file name to the caller of * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) * to be able to deal with dcache aliasing issues. */ -typedef struct { - MFT_REF mref; - FILE_NAME_TYPE_FLAGS type; +struct ntfs_name { + u64 mref; + u8 type; u8 len; - ntfschar name[0]; -} __attribute__ ((__packed__)) ntfs_name; + __le16 name[]; +} __packed; /* The little endian Unicode string $I30 as a global constant. */ -extern ntfschar I30[5]; +extern __le16 I30[5]; -extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, - const ntfschar *uname, const int uname_len, ntfs_name **res); +u64 ntfs_lookup_inode_by_name(struct ntfs_inode *dir_ni, + const __le16 *uname, const int uname_len, struct ntfs_name **res); +int ntfs_check_empty_dir(struct ntfs_inode *ni, struct mft_record *ni_mrec); #endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/ea.h b/fs/ntfs/ea.h new file mode 100644 index 000000000000..a4302f98d359 --- /dev/null +++ b/fs/ntfs/ea.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#define NTFS_EA_UID BIT(1) +#define NTFS_EA_GID BIT(2) +#define NTFS_EA_MODE BIT(3) + +extern const struct xattr_handler *const ntfsp_xattr_handlers[]; + +int ntfs_ea_set_wsl_not_symlink(struct ntfs_inode *ni, mode_t mode, dev_t dev); +int ntfs_ea_get_wsl_inode(struct inode *inode, dev_t *rdevp, unsigned int flags); +int ntfs_ea_set_wsl_inode(struct inode *inode, dev_t rdev, __le16 *ea_size, + unsigned int flags); +ssize_t ntfsp_listxattr(struct dentry *dentry, char *buffer, size_t size); + +#ifdef CONFIG_NTFS_FS_POSIX_ACL +struct posix_acl *ntfsp_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + int type); +int ntfsp_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +int ntfsp_init_acl(struct mnt_idmap *idmap, struct inode *inode, + struct inode *dir); +#else +#define ntfsp_get_acl NULL +#define ntfsp_set_acl NULL +#endif diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h index bb3c3ae55138..b5c719910ab6 100644 --- a/fs/ntfs/index.h +++ b/fs/ntfs/index.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS - * project. + * Defines for NTFS kernel index handling. Part of the Linux-NTFS + * project. * * Copyright (c) 2004 Anton Altaparmakov */ @@ -11,13 +11,14 @@ #include -#include "types.h" -#include "layout.h" -#include "inode.h" #include "attrib.h" #include "mft.h" #include "aops.h" +#define VCN_INDEX_ROOT_PARENT ((s64)-2) + +#define MAX_PARENT_VCN 32 + /** * @idx_ni: index inode containing the @entry described by this context * @entry: index entry (points into @ir or @ia) @@ -58,26 +59,38 @@ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to * ensure that the changes are written to disk. 
*/ -typedef struct { - ntfs_inode *idx_ni; - INDEX_ENTRY *entry; +struct ntfs_index_context { + struct ntfs_inode *idx_ni; + __le16 *name; + u32 name_len; + struct index_entry *entry; + __le32 cr; void *data; u16 data_len; bool is_in_root; - INDEX_ROOT *ir; - ntfs_attr_search_ctx *actx; - ntfs_inode *base_ni; - INDEX_ALLOCATION *ia; + struct index_root *ir; + struct ntfs_attr_search_ctx *actx; + struct index_block *ib; + struct ntfs_inode *base_ni; + struct index_block *ia; struct page *page; -} ntfs_index_context; - -extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); -extern void ntfs_index_ctx_put(ntfs_index_context *ictx); - -extern int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx); + struct ntfs_inode *ia_ni; + int parent_pos[MAX_PARENT_VCN]; /* parent entries' positions */ + s64 parent_vcn[MAX_PARENT_VCN]; /* entry's parent nodes */ + int pindex; /* maximum it's the number of the parent nodes */ + bool ib_dirty; + u32 block_size; + u8 vcn_size_bits; + bool sync_write; +}; -#ifdef NTFS_RW +int ntfs_index_entry_inconsistent(struct ntfs_index_context *icx, struct ntfs_volume *vol, + const struct index_entry *ie, __le32 collation_rule, u64 inum); +struct ntfs_index_context *ntfs_index_ctx_get(struct ntfs_inode *ni, __le16 *name, + u32 name_len); +void ntfs_index_ctx_put(struct ntfs_index_context *ictx); +int ntfs_index_lookup(const void *key, const int key_len, + struct ntfs_index_context *ictx); /** * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries @@ -94,41 +107,21 @@ extern int ntfs_index_lookup(const void *key, const int key_len, * If the index entry is in an index block belonging to the index allocation * attribute, simply flush the page cache page containing the index block. */ -static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +static inline void ntfs_index_entry_flush_dcache_page(struct ntfs_index_context *ictx) { - if (ictx->is_in_root) - flush_dcache_mft_record_page(ictx->actx->ntfs_ino); - else + if (!ictx->is_in_root) flush_dcache_page(ictx->page); } -/** - * ntfs_index_entry_mark_dirty - mark an index entry dirty - * @ictx: ntfs index context describing the index entry - * - * Mark the index entry described by the index entry context @ictx dirty. - * - * If the index entry is in the index root attribute, simply mark the mft - * record containing the index root attribute dirty. This ensures the mft - * record, and hence the index root attribute, will be written out to disk - * later. - * - * If the index entry is in an index block belonging to the index allocation - * attribute, mark the buffers belonging to the index record as well as the - * page cache page the index block is in dirty. This automatically marks the - * VFS inode of the ntfs index inode to which the index entry belongs dirty, - * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the - * dirty index block, will be written out to disk later. 
- */ -static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) -{ - if (ictx->is_in_root) - mark_mft_record_dirty(ictx->actx->ntfs_ino); - else - mark_ntfs_record_dirty(ictx->page, - (u8*)ictx->ia - (u8*)page_address(ictx->page)); -} - -#endif /* NTFS_RW */ +void ntfs_index_entry_mark_dirty(struct ntfs_index_context *ictx); +int ntfs_index_add_filename(struct ntfs_inode *ni, struct file_name_attr *fn, u64 mref); +int ntfs_index_remove(struct ntfs_inode *ni, const void *key, const int keylen); +struct ntfs_inode *ntfs_ia_open(struct ntfs_index_context *icx, struct ntfs_inode *ni); +struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_index_context *ictx); +struct index_entry *ntfs_index_next(struct index_entry *ie, struct ntfs_index_context *ictx); +int ntfs_index_rm(struct ntfs_index_context *icx); +void ntfs_index_ctx_reinit(struct ntfs_index_context *icx); +int ntfs_ie_add(struct ntfs_index_context *icx, struct index_entry *ie); +int ntfs_icx_ib_sync_write(struct ntfs_index_context *icx); #endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 147ef4ddb691..a22798e1d756 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -1,45 +1,43 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of - * the Linux-NTFS project. + * Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. */ #ifndef _LINUX_NTFS_INODE_H #define _LINUX_NTFS_INODE_H -#include - -#include -#include -#include -#include -#include - -#include "layout.h" -#include "volume.h" -#include "types.h" -#include "runlist.h" #include "debug.h" -typedef struct _ntfs_inode ntfs_inode; +enum ntfs_inode_mutex_lock_class { + NTFS_INODE_MUTEX_PARENT, + NTFS_INODE_MUTEX_NORMAL, + NTFS_INODE_MUTEX_PARENT_2, + NTFS_INODE_MUTEX_NORMAL_2, + NTFS_REPARSE_MUTEX_PARENT, + NTFS_EA_MUTEX_NORMAL +}; /* * The NTFS in-memory inode structure. It is just used as an extension to the * fields already provided in the VFS inode. */ -struct _ntfs_inode { +struct ntfs_inode { rwlock_t size_lock; /* Lock serializing access to inode sizes. */ - s64 initialized_size; /* Copy from the attribute record. */ - s64 allocated_size; /* Copy from the attribute record. */ - unsigned long state; /* NTFS specific flags describing this inode. - See ntfs_inode_state_bits below. */ + unsigned long state; /* + * NTFS specific flags describing this inode. + * See ntfs_inode_state_bits below. + */ + __le32 flags; /* Flags describing the file. (Copy from STANDARD_INFORMATION) */ unsigned long mft_no; /* Number of the mft record / inode. */ u16 seq_no; /* Sequence number of the mft record. */ atomic_t count; /* Inode reference count for book keeping. */ - ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + struct ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* * If NInoAttr() is true, the below fields describe the attribute which * this fake inode belongs to. The actual inode of this attribute is @@ -49,111 +47,153 @@ struct _ntfs_inode { * name_len = 0 for files and name = I30 (global constant) and * name_len = 4 for directories. */ - ATTR_TYPE type; /* Attribute type of this fake inode. */ - ntfschar *name; /* Attribute name of this fake inode. */ + __le32 type; /* Attribute type of this fake inode. 
*/ + __le16 *name; /* Attribute name of this fake inode. */ u32 name_len; /* Attribute name length of this fake inode. */ - runlist runlist; /* If state has the NI_NonResident bit set, - the runlist of the unnamed data attribute - (if a file) or of the index allocation - attribute (directory) or of the attribute - described by the fake inode (if NInoAttr()). - If runlist.rl is NULL, the runlist has not - been read in yet or has been unmapped. If - NI_NonResident is clear, the attribute is - resident (file and fake inode) or there is - no $I30 index allocation attribute - (small directory). In the latter case - runlist.rl is always NULL.*/ + struct runlist runlist; /* + * If state has the NI_NonResident bit set, + * the runlist of the unnamed data attribute + * (if a file) or of the index allocation + * attribute (directory) or of the attribute + * described by the fake inode (if NInoAttr()). + * If runlist.rl is NULL, the runlist has not + * been read in yet or has been unmapped. If + * NI_NonResident is clear, the attribute is + * resident (file and fake inode) or there is + * no $I30 index allocation attribute + * (small directory). In the latter case + * runlist.rl is always NULL. + */ + s64 lcn_seek_trunc; + + s64 data_size; /* Copy from the attribute record. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + + struct timespec64 i_crtime; + /* * The following fields are only valid for real inodes and extent * inodes. */ - struct mutex mrec_lock; /* Lock for serializing access to the - mft record belonging to this inode. */ - struct page *page; /* The page containing the mft record of the - inode. This should only be touched by the - (un)map_mft_record*() functions. */ - int page_ofs; /* Offset into the page at which the mft record - begins. This should only be touched by the - (un)map_mft_record*() functions. */ + void *mrec; + struct mutex mrec_lock; /* + * Lock for serializing access to the + * mft record belonging to this inode. + */ + struct folio *folio; /* + * The folio containing the mft record of the + * inode. This should only be touched by the + * (un)map_mft_record*() functions. + */ + int folio_ofs; /* + * Offset into the folio at which the mft record + * begins. This should only be touched by the + * (un)map_mft_record*() functions. + */ + s64 mft_lcn[2]; /* s64 number containing the mft record */ + unsigned int mft_lcn_count; + /* * Attribute list support (only for use by the attribute lookup * functions). Setup during read_inode for all inodes with attribute - * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is - * further only valid if NI_AttrListNonResident is set. + * lists. Only valid if NI_AttrList is set in state. */ u32 attr_list_size; /* Length of attribute list value in bytes. */ u8 *attr_list; /* Attribute list value itself. */ - runlist attr_list_rl; /* Run list for the attribute list value. */ + union { struct { /* It is a directory, $MFT, or an index inode. */ u32 block_size; /* Size of an index block. */ - u32 vcn_size; /* Size of a vcn in this - index. */ - COLLATION_RULE collation_rule; /* The collation rule - for the index. */ - u8 block_size_bits; /* Log2 of the above. */ + u32 vcn_size; /* Size of a vcn in this index. */ + __le32 collation_rule; /* The collation rule for the index. */ + u8 block_size_bits; /* Log2 of the above. */ u8 vcn_size_bits; /* Log2 of the above. */ } index; struct { /* It is a compressed/sparse file/attribute inode. 
*/ - s64 size; /* Copy of compressed_size from - $DATA. */ - u32 block_size; /* Size of a compression block - (cb). */ + s64 size; /* Copy of compressed_size from $DATA. */ + u32 block_size; /* Size of a compression block (cb). */ u8 block_size_bits; /* Log2 of the size of a cb. */ u8 block_clusters; /* Number of clusters per cb. */ } compressed; } itype; - struct mutex extent_lock; /* Lock for accessing/modifying the - below . */ - s32 nr_extents; /* For a base mft record, the number of attached extent - inodes (0 if none), for extent records and for fake - inodes describing an attribute this is -1. */ + struct mutex extent_lock; /* Lock for accessing/modifying the below . */ + s32 nr_extents; /* + * For a base mft record, the number of attached extent\ + * inodes (0 if none), for extent records and for fake + * inodes describing an attribute this is -1. + */ union { /* This union is only used if nr_extents != 0. */ - ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of - the ntfs inodes of the extent - mft records belonging to - this base inode which have - been loaded. */ - ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the - ntfs inode of the base mft - record. For fake inodes, the - real (base) inode to which - the attribute belongs. */ + struct ntfs_inode **extent_ntfs_inos; /* + * For nr_extents > 0, array of + * the ntfs inodes of the extent + * mft records belonging to + * this base inode which have + * been loaded. + */ + struct ntfs_inode *base_ntfs_ino; /* + * For nr_extents == -1, the + * ntfs inode of the base mft + * record. For fake inodes, the + * real (base) inode to which + * the attribute belongs. + */ } ext; + + unsigned int i_dealloc_clusters; + char *target; }; /* * Defined bits for the state field in the ntfs_inode structure. * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only */ -typedef enum { +enum { NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrListDirty, /* 1: Mft record contains an attribute list. */ NI_AttrList, /* 1: Mft record contains an attribute list. */ - NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies - NI_AttrList is set. */ - - NI_Attr, /* 1: Fake inode for attribute i/o. - 0: Real inode or extent inode. */ - - NI_MstProtected, /* 1: Attribute is protected by MST fixups. - 0: Attribute is not protected by fixups. */ - NI_NonResident, /* 1: Unnamed data attr is non-resident (f). - 1: Attribute is non-resident (a). */ - NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is - present (d). */ - NI_Compressed, /* 1: Unnamed data attr is compressed (f). - 1: Create compressed files by default (d). - 1: Attribute is compressed (a). */ - NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). - 1: Create encrypted files by default (d). - 1: Attribute is encrypted (a). */ - NI_Sparse, /* 1: Unnamed data attr is sparse (f). - 1: Create sparse files by default (d). - 1: Attribute is sparse (a). */ + NI_AttrListNonResident, /* + * 1: Attribute list is non-resident. Implies + * NI_AttrList is set. + */ + + NI_Attr, /* + * 1: Fake inode for attribute i/o. + * 0: Real inode or extent inode. + */ + + NI_MstProtected, /* + * 1: Attribute is protected by MST fixups. + * 0: Attribute is not protected by fixups. + */ + NI_NonResident, /* + * 1: Unnamed data attr is non-resident (f). + * 1: Attribute is non-resident (a). + */ + NI_IndexAllocPresent, /* 1: $I30 index alloc attr is present (d). */ + NI_Compressed, /* + * 1: Unnamed data attr is compressed (f). 
+ * 1: Create compressed files by default (d). + * 1: Attribute is compressed (a). + */ + NI_Encrypted, /* + * 1: Unnamed data attr is encrypted (f). + * 1: Create encrypted files by default (d). + * 1: Attribute is encrypted (a). + */ + NI_Sparse, /* + * 1: Unnamed data attr is sparse (f). + * 1: Create sparse files by default (d). + * 1: Attribute is sparse (a). + */ NI_SparseDisabled, /* 1: May not create sparse regions. */ - NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ -} ntfs_inode_state_bits; + NI_FullyMapped, + NI_FileNameDirty, + NI_BeingDeleted, + NI_BeingCreated, + NI_HasEA, + NI_RunlistDirty, +}; /* * NOTE: We should be adding dirty mft records to a list somewhere and they @@ -165,37 +205,38 @@ typedef enum { * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() * functions. */ -#define NINO_FNS(flag) \ -static inline int NIno##flag(ntfs_inode *ni) \ -{ \ - return test_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoSet##flag(ntfs_inode *ni) \ -{ \ - set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline void NInoClear##flag(ntfs_inode *ni) \ -{ \ - clear_bit(NI_##flag, &(ni)->state); \ +#define NINO_FNS(flag) \ +static inline int NIno##flag(struct ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(struct ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(struct ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ } /* * As above for NInoTestSetFoo() and NInoTestClearFoo(). */ -#define TAS_NINO_FNS(flag) \ -static inline int NInoTestSet##flag(ntfs_inode *ni) \ -{ \ - return test_and_set_bit(NI_##flag, &(ni)->state); \ -} \ -static inline int NInoTestClear##flag(ntfs_inode *ni) \ -{ \ - return test_and_clear_bit(NI_##flag, &(ni)->state); \ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(struct ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(struct ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ } /* Emit the ntfs inode bitops functions. */ NINO_FNS(Dirty) TAS_NINO_FNS(Dirty) NINO_FNS(AttrList) +NINO_FNS(AttrListDirty) NINO_FNS(AttrListNonResident) NINO_FNS(Attr) NINO_FNS(MstProtected) @@ -205,17 +246,22 @@ NINO_FNS(Compressed) NINO_FNS(Encrypted) NINO_FNS(Sparse) NINO_FNS(SparseDisabled) -NINO_FNS(TruncateFailed) +NINO_FNS(FullyMapped) +NINO_FNS(FileNameDirty) +TAS_NINO_FNS(FileNameDirty) +NINO_FNS(BeingDeleted) +NINO_FNS(HasEA) +NINO_FNS(RunlistDirty) /* * The full structure containing a ntfs_inode and a vfs struct inode. Used for * all real and fake inodes but not for extent inodes which lack the vfs struct * inode. */ -typedef struct { - ntfs_inode ntfs_inode; +struct big_ntfs_inode { + struct ntfs_inode ntfs_inode; struct inode vfs_inode; /* The vfs inode structure. */ -} big_ntfs_inode; +}; /** * NTFS_I - return the ntfs inode given a vfs inode @@ -223,22 +269,18 @@ typedef struct { * * NTFS_I() returns the ntfs inode associated with the VFS @inode. 
*/ -static inline ntfs_inode *NTFS_I(struct inode *inode) +static inline struct ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); + return (struct ntfs_inode *)container_of(inode, struct big_ntfs_inode, vfs_inode); } -static inline struct inode *VFS_I(ntfs_inode *ni) +static inline struct inode *VFS_I(struct ntfs_inode *ni) { - return &((big_ntfs_inode *)ni)->vfs_inode; + return &((struct big_ntfs_inode *)ni)->vfs_inode; } /** * ntfs_attr - ntfs in memory attribute structure - * @mft_no: mft record number of the base mft record of this attribute - * @name: Unicode name of the attribute (NULL if unnamed) - * @name_len: length of @name in Unicode characters (0 if unnamed) - * @type: attribute type (see layout.h) * * This structure exists only to provide a small structure for the * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. @@ -246,65 +288,67 @@ static inline struct inode *VFS_I(ntfs_inode *ni) * NOTE: Elements are ordered by size to make the structure as compact as * possible on all architectures. */ -typedef struct { +struct ntfs_attr { unsigned long mft_no; - ntfschar *name; + __le16 *name; u32 name_len; - ATTR_TYPE type; -} ntfs_attr; - -extern int ntfs_test_inode(struct inode *vi, void *data); + __le32 type; + unsigned long state; +}; -extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); -extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, - ntfschar *name, u32 name_len); -extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, +int ntfs_test_inode(struct inode *vi, void *data); +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); +struct inode *ntfs_attr_iget(struct inode *base_vi, __le32 type, + __le16 *name, u32 name_len); +struct inode *ntfs_index_iget(struct inode *base_vi, __le16 *name, u32 name_len); - -extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); -extern void ntfs_free_big_inode(struct inode *inode); -extern void ntfs_evict_big_inode(struct inode *vi); - -extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); +struct inode *ntfs_alloc_big_inode(struct super_block *sb); +void ntfs_free_big_inode(struct inode *inode); +int ntfs_drop_big_inode(struct inode *inode); +void ntfs_evict_big_inode(struct inode *vi); +void __ntfs_init_inode(struct super_block *sb, struct ntfs_inode *ni); static inline void ntfs_init_big_inode(struct inode *vi) { - ntfs_inode *ni = NTFS_I(vi); + struct ntfs_inode *ni = NTFS_I(vi); ntfs_debug("Entering."); __ntfs_init_inode(vi->i_sb, ni); ni->mft_no = vi->i_ino; } -extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, +struct ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, unsigned long mft_no); -extern void ntfs_clear_extent_inode(ntfs_inode *ni); - -extern int ntfs_read_inode_mount(struct inode *vi); - -extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); - -#ifdef NTFS_RW - -extern int ntfs_truncate(struct inode *vi); -extern void ntfs_truncate_vfs(struct inode *vi); - -extern int ntfs_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr); - -extern int __ntfs_write_inode(struct inode *vi, int sync); +void ntfs_clear_extent_inode(struct ntfs_inode *ni); +int ntfs_read_inode_mount(struct inode *vi); +int ntfs_show_options(struct seq_file *sf, struct dentry *root); +int ntfs_truncate_vfs(struct inode *vi, loff_t new_size, loff_t i_size); + +int ntfsp_setattr(struct mnt_idmap 
*idmap, struct dentry *dentry, + struct iattr *attr); +int ntfsp_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags); + +int ntfs_get_block_mft_record(struct ntfs_inode *mft_ni, struct ntfs_inode *ni); +int __ntfs_write_inode(struct inode *vi, int sync); +int ntfs_inode_attach_all_extents(struct ntfs_inode *ni); +int ntfs_inode_add_attrlist(struct ntfs_inode *ni); +void ntfs_destroy_ext_inode(struct ntfs_inode *ni); +int ntfs_inode_free_space(struct ntfs_inode *ni, int size); +s64 ntfs_inode_attr_pread(struct inode *vi, s64 pos, s64 count, u8 *buf); +s64 ntfs_inode_attr_pwrite(struct inode *vi, s64 pos, s64 count, u8 *buf, + bool sync); +int ntfs_inode_close(struct ntfs_inode *ni); static inline void ntfs_commit_inode(struct inode *vi) { - if (!is_bad_inode(vi)) - __ntfs_write_inode(vi, 1); - return; + __ntfs_write_inode(vi, 1); } -#else - -static inline void ntfs_truncate_vfs(struct inode *vi) {} - -#endif /* NTFS_RW */ +int ntfs_inode_sync_filename(struct ntfs_inode *ni); +int ntfs_extend_initialized_size(struct inode *vi, const loff_t offset, + const loff_t new_size); +void ntfs_set_vfs_operations(struct inode *inode, umode_t mode, dev_t dev); #endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/iomap.h b/fs/ntfs/iomap.h new file mode 100644 index 000000000000..c2602f3ff11a --- /dev/null +++ b/fs/ntfs/iomap.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025 LG Electronics Co., Ltd. + */ + +#ifndef _LINUX_NTFS_IOMAP_H +#define _LINUX_NTFS_IOMAP_H + +#include +#include + +#include "volume.h" +#include "inode.h" + +extern const struct iomap_ops ntfs_write_iomap_ops; +extern const struct iomap_ops ntfs_read_iomap_ops; +extern const struct iomap_ops ntfs_page_mkwrite_iomap_ops; +extern const struct iomap_ops ntfs_dio_iomap_ops; +extern const struct iomap_writeback_ops ntfs_writeback_ops; +extern const struct iomap_write_ops ntfs_iomap_folio_ops; +int ntfs_zero_range(struct inode *inode, loff_t offset, loff_t length, bool bdirect); +#endif /* _LINUX_NTFS_IOMAP_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 5d4bf7a3259f..a29ea10d9a37 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS - * project. + * All NTFS associated on-disk structures. Part of the Linux-NTFS + * project. * * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon @@ -15,8 +15,6 @@ #include #include -#include "types.h" - /* The NTFS oem_id "NTFS " */ #define magicNTFS cpu_to_le64(0x202020205346544eULL) @@ -34,54 +32,60 @@ /* * BIOS parameter block (bpb) structure. */ -typedef struct { - le16 bytes_per_sector; /* Size of a sector in bytes. */ +struct bios_parameter_block { + __le16 bytes_per_sector; /* Size of a sector in bytes. */ u8 sectors_per_cluster; /* Size of a cluster in sectors.
*/ - le16 reserved_sectors; /* zero */ + __le16 reserved_sectors; /* zero */ u8 fats; /* zero */ - le16 root_entries; /* zero */ - le16 sectors; /* zero */ + __le16 root_entries; /* zero */ + __le16 sectors; /* zero */ u8 media_type; /* 0xf8 = hard disk */ - le16 sectors_per_fat; /* zero */ - le16 sectors_per_track; /* irrelevant */ - le16 heads; /* irrelevant */ - le32 hidden_sectors; /* zero */ - le32 large_sectors; /* zero */ -} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; + __le16 sectors_per_fat; /* zero */ + __le16 sectors_per_track; /* irrelevant */ + __le16 heads; /* irrelevant */ + __le32 hidden_sectors; /* zero */ + __le32 large_sectors; /* zero */ +} __packed; /* * NTFS boot sector structure. */ -typedef struct { +struct ntfs_boot_sector { u8 jump[3]; /* Irrelevant (jump to boot up code).*/ - le64 oem_id; /* Magic "NTFS ". */ - BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ - u8 unused[4]; /* zero, NTFS diskedit.exe states that - this is actually: - __u8 physical_drive; // 0x80 - __u8 current_head; // zero - __u8 extended_boot_signature; - // 0x80 - __u8 unused; // zero + __le64 oem_id; /* Magic "NTFS ". */ + struct bios_parameter_block bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* + * zero, NTFS diskedit.exe states that + * this is actually: + * __u8 physical_drive; // 0x80 + * __u8 current_head; // zero + * __u8 extended_boot_signature; + * // 0x80 + * __u8 unused; // zero + */ + __le64 number_of_sectors; /* + * Number of sectors in volume. Gives + * maximum volume size of 2^63 sectors. + * Assuming standard sector size of 512 + * bytes, the maximum byte size is + * approx. 4.7x10^21 bytes. (-; */ -/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives - maximum volume size of 2^63 sectors. - Assuming standard sector size of 512 - bytes, the maximum byte size is - approx. 4.7x10^21 bytes. (-; */ - sle64 mft_lcn; /* Cluster location of mft data. */ - sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + __le64 mft_lcn; /* Cluster location of mft data. */ + __le64 mftmirr_lcn; /* Cluster location of copy of mft. */ s8 clusters_per_mft_record; /* Mft record size in clusters. */ u8 reserved0[3]; /* zero */ s8 clusters_per_index_record; /* Index block size in clusters. */ u8 reserved1[3]; /* zero */ - le64 volume_serial_number; /* Irrelevant (serial number). */ - le32 checksum; /* Boot sector checksum. */ -/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ - le16 end_of_sector_marker; /* End of bootsector magic. Always is - 0xaa55 in little endian. */ -/* sizeof() = 512 (0x200) bytes */ -} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + __le64 volume_serial_number; /* Irrelevant (serial number). */ + __le32 checksum; /* Boot sector checksum. */ + u8 bootstrap[426]; /* Irrelevant (boot up code). */ + __le16 end_of_sector_marker; /* + * End of bootsector magic. Always is + * 0xaa55 in little endian. + */ +} __packed; + +static_assert(sizeof(struct ntfs_boot_sector) == 512); /* * Magic identifiers present at the beginning of all ntfs record containing @@ -93,37 +97,37 @@ enum { magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ - /* Found in $LogFile/$DATA. */ + /* Found in LogFile/DATA. */ magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ - /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + /* Found in LogFile/DATA. (May be found in $MFT/$DATA, also?) 
*/ magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ /* Found in all ntfs record containing records. */ - magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector - transfer was detected. */ + magic_BAAD = cpu_to_le32(0x44414142), /* + * Failed multi sector + * transfer was detected. + */ /* - * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * Found in LogFile/DATA when a page is full of 0xff bytes and is * thus not initialized. Page must be initialized before using it. */ magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ }; -typedef le32 NTFS_RECORD_TYPE; - /* * Generic magic comparison macros. Finally found a use for the ## preprocessor * operator! (-8 */ -static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +static inline bool __ntfs_is_magic(__le32 x, __le32 r) { return (x == r); } #define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) -static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +static inline bool __ntfs_is_magicp(__le32 *p, __le32 r) { return (*p == r); } @@ -132,31 +136,31 @@ static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) /* * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. */ -#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) -#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) -#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) -#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) -#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) -#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) -#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) -#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) +#define ntfs_is_file_record(x) (ntfs_is_magic(x, FILE)) +#define ntfs_is_file_recordp(p) (ntfs_is_magicp(p, FILE)) +#define ntfs_is_mft_record(x) (ntfs_is_file_record(x)) +#define ntfs_is_mft_recordp(p) (ntfs_is_file_recordp(p)) +#define ntfs_is_indx_record(x) (ntfs_is_magic(x, INDX)) +#define ntfs_is_indx_recordp(p) (ntfs_is_magicp(p, INDX)) +#define ntfs_is_hole_record(x) (ntfs_is_magic(x, HOLE)) +#define ntfs_is_hole_recordp(p) (ntfs_is_magicp(p, HOLE)) -#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) -#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) -#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) -#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) +#define ntfs_is_rstr_record(x) (ntfs_is_magic(x, RSTR)) +#define ntfs_is_rstr_recordp(p) (ntfs_is_magicp(p, RSTR)) +#define ntfs_is_rcrd_record(x) (ntfs_is_magic(x, RCRD)) +#define ntfs_is_rcrd_recordp(p) (ntfs_is_magicp(p, RCRD)) -#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) -#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) +#define ntfs_is_chkd_record(x) (ntfs_is_magic(x, CHKD)) +#define ntfs_is_chkd_recordp(p) (ntfs_is_magicp(p, CHKD)) -#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) -#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) +#define ntfs_is_baad_record(x) (ntfs_is_magic(x, BAAD)) +#define ntfs_is_baad_recordp(p) (ntfs_is_magicp(p, BAAD)) -#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) -#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) +#define ntfs_is_empty_record(x) (ntfs_is_magic(x, empty)) +#define ntfs_is_empty_recordp(p) (ntfs_is_magicp(p, empty)) /* - * The Update Sequence Array (usa) is an array of the le16 values which belong + * The Update Sequence Array (usa) is an array of the __le16 values which belong * to the end of each 
sector protected by the update sequence record in which * this array is contained. Note that the first entry is the Update Sequence * Number (usn), a cyclic counter of how many times the protected record has @@ -166,21 +170,27 @@ static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) * transfer has occurred when the data was written. * The maximum size for the update sequence array is fixed to: * maximum size = usa_ofs + (usa_count * 2) = 510 bytes - * The 510 bytes comes from the fact that the last le16 in the array has to - * (obviously) finish before the last le16 of the first 512-byte sector. + * The 510 bytes comes from the fact that the last __le16 in the array has to + * (obviously) finish before the last __le16 of the first 512-byte sector. * This formula can be used as a consistency check in that usa_ofs + * (usa_count * 2) has to be less than or equal to 510. */ -typedef struct { - NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record - type and/or status. */ - le16 usa_ofs; /* Offset to the Update Sequence Array (usa) - from the start of the ntfs record. */ - le16 usa_count; /* Number of le16 sized entries in the usa - including the Update Sequence Number (usn), - thus the number of fixups is the usa_count - minus 1. */ -} __attribute__ ((__packed__)) NTFS_RECORD; +struct ntfs_record { + __le32 magic; /* + * A four-byte magic identifying the record + * type and/or status. + */ + __le16 usa_ofs; /* + * Offset to the Update Sequence Array (usa) + * from the start of the ntfs record. + */ + __le16 usa_count; /* + * Number of __le16 sized entries in the usa + * including the Update Sequence Number (usn), + * thus the number of fixups is the usa_count + * minus 1. + */ +} __packed; /* * System files mft record numbers. All these files are always marked as used @@ -189,55 +199,77 @@ typedef struct { * of the system files is always equal to their mft record number and it is * never modified. */ -typedef enum { - FILE_MFT = 0, /* Master file table (mft). Data attribute - contains the entries and bitmap attribute - records which ones are in use (bit==1). */ +enum { + FILE_MFT = 0, /* + * Master file table (mft). Data attribute + * contains the entries and bitmap attribute + * records which ones are in use (bit==1). + */ FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records - in data attribute. If cluster size > 4kiB, - copy of first N mft records, with - N = cluster_size / mft_record_size. */ + * in data attribute. If cluster size > 4kiB, + * copy of first N mft records, with + * N = cluster_size / mft_record_size. + */ FILE_LogFile = 2, /* Journalling log in data attribute. */ - FILE_Volume = 3, /* Volume name attribute and volume information - attribute (flags and ntfs version). Windows - refers to this file as volume DASD (Direct - Access Storage Device). */ - FILE_AttrDef = 4, /* Array of attribute definitions in data - attribute. */ + FILE_Volume = 3, /* + * Volume name attribute and volume information + * attribute (flags and ntfs version). Windows + * refers to this file as volume DASD (Direct + * Access Storage Device). + */ + FILE_AttrDef = 4, /* + * Array of attribute definitions in data + * attribute. + */ FILE_root = 5, /* Root directory. */ - FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in - data attribute. */ - FILE_Boot = 7, /* Boot sector (always at cluster 0) in data - attribute. */ - FILE_BadClus = 8, /* Contains all bad clusters in the non-resident - data attribute. 
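The usa_ofs + (usa_count * 2) <= 510 consistency check described above translates directly into code; this is a minimal sketch with a hypothetical helper name.

static bool ntfs_usa_is_sane(const struct ntfs_record *r)
{
	u16 usa_ofs = le16_to_cpu(r->usa_ofs);
	u16 usa_count = le16_to_cpu(r->usa_count);

	/*
	 * The array must finish before the last __le16 of the first
	 * 512-byte sector, i.e. on or before byte 510.
	 */
	return usa_ofs + usa_count * 2 <= 510;
}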
*/ - FILE_Secure = 9, /* Shared security descriptors in data attribute - and two indexes into the descriptors. - Appeared in Windows 2000. Before that, this - file was named $Quota but was unused. */ - FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode - characters in data attribute. */ - FILE_Extend = 11, /* Directory containing other system files (eg. - $ObjId, $Quota, $Reparse and $UsnJrnl). This - is new to NTFS3.0. */ + FILE_Bitmap = 6, /* + * Allocation bitmap of all clusters (lcns) in + * data attribute. + */ + FILE_Boot = 7, /* + * Boot sector (always at cluster 0) in data + * attribute. + */ + FILE_BadClus = 8, /* + * Contains all bad clusters in the non-resident + * data attribute. + */ + FILE_Secure = 9, /* + * Shared security descriptors in data attribute + * and two indexes into the descriptors. + * Appeared in Windows 2000. Before that, this + * file was named $Quota but was unused. + */ + FILE_UpCase = 10, /* + * Uppercase equivalents of all 65536 Unicode + * characters in data attribute. + */ + FILE_Extend = 11, /* + * Directory containing other system files (eg. + * $ObjId, $Quota, $Reparse and $UsnJrnl). This + * is new to NTFS3.0. + */ FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ FILE_reserved13 = 13, FILE_reserved14 = 14, FILE_reserved15 = 15, - FILE_first_user = 16, /* First user file, used as test limit for - whether to allow opening a file or not. */ -} NTFS_SYSTEM_FILES; + FILE_first_user = 16, /* + * First user file, used as test limit for + * whether to allow opening a file or not. + */ +}; /* * These are the so far known MFT_RECORD_* flags (16-bit) which contain * information about the mft record in which they are present. */ enum { - MFT_RECORD_IN_USE = cpu_to_le16(0x0001), - MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), -} __attribute__ ((__packed__)); - -typedef le16 MFT_RECORD_FLAGS; + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), + MFT_RECORD_IS_4 = cpu_to_le16(0x0004), + MFT_RECORD_IS_VIEW_INDEX = cpu_to_le16(0x0008), + MFT_REC_SPACE_FILLER = 0xffff, /*Just to make flags 16-bit.*/ +} __packed; /* * mft references (aka file references or file record segment references) are @@ -251,34 +283,14 @@ typedef le16 MFT_RECORD_FLAGS; * The sequence number is a circular counter (skipping 0) describing how many * times the referenced mft record has been (re)used. This has to match the * sequence number of the mft record being referenced, otherwise the reference - * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * is considered stale and removed. * * If the sequence number is zero it is assumed that no sequence number * consistency checking should be performed. - * - * FIXME: Since inodes are 32-bit as of now, the driver needs to always check - * for high_part being 0 and if not either BUG(), cause a panic() or handle - * the situation in some other way. This shouldn't be a problem as a volume has - * to become HUGE in order to need more than 32-bits worth of mft records. - * Assuming the standard mft record size of 1kb only the records (never mind - * the non-resident attributes, etc.) would require 4Tb of space on their own - * for the first 32 bits worth of records. This is only if some strange person - * doesn't decide to foul play and make the mft sparse which would be a really - * horrible thing to do as it would trash our current driver implementation. )-: - * Do I hear screams "we want 64-bit inodes!" ?!? 
(-; - * - * FIXME: The mft zone is defined as the first 12% of the volume. This space is - * reserved so that the mft can grow contiguously and hence doesn't become - * fragmented. Volume free space includes the empty part of the mft zone and - * when the volume's free 88% are used up, the mft zone is shrunk by a factor - * of 2, thus making more space available for more files/data. This process is - * repeated every time there is no more free space except for the mft zone until - * there really is no more free space. - */ - -/* - * Typedef the MFT_REF as a 64-bit value for easier handling. - * Also define two unpacking macros to get to the reference (MREF) and + */ + +/* + * Define two unpacking macros to get to the reference (MREF) and * sequence number (MSEQNO) respectively. * The _LE versions are to be applied on little endian MFT_REFs. * Note: The _LE versions will return a CPU endian formatted value! @@ -286,16 +298,14 @@ typedef le16 MFT_RECORD_FLAGS; #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL #define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) -typedef u64 MFT_REF; -typedef le64 leMFT_REF; - -#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ - ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_MREF(m, s) ((u64)(((u64)(s) << 48) | \ + ((u64)(m) & MFT_REF_MASK_CPU))) #define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) #define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) #define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) #define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MREF_INO(x) ((unsigned long)MREF_LE(x)) #define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) #define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) @@ -309,63 +319,77 @@ typedef le64 leMFT_REF; * in that it only consists of the attribute type code AT_END and none of the * other members of the attribute structure are present. */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. 
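A quick round trip through the packing and unpacking macros above, with arbitrary example values (illustrative only, not from this patch):

static void example_mref(void)
{
	u64 mref = MK_MREF(0x1234, 2);		/* record 0x1234, seqno 2 */
	__le64 le_mref = MK_LE_MREF(0x1234, 2);	/* same, little endian */

	pr_debug("record 0x%lx, seqno %u\n", MREF(mref), MSEQNO(mref));
	pr_debug("record 0x%lx, seqno %u\n", MREF_LE(le_mref),
		 MSEQNO_LE(le_mref));
}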
*/ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. - When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ +struct mft_record { + __le32 magic; /* Usually the magic is "FILE". */ + __le16 usa_ofs; /* See ntfs_record struct definition above. */ + __le16 usa_count; /* See ntfs_record struct definition above. */ + + __le64 lsn; /* + * LogFile sequence number for this record. + * Changed every time the record is modified. + */ + __le16 sequence_number; /* + * Number of times this mft record has been + * reused. (See description for MFT_REF + * above.) NOTE: The increment (skipping zero) + * is done when the file is deleted. NOTE: If + * this is zero it is left zero. + */ + __le16 link_count; /* + * Number of hard links, i.e. the number of + * directory entries referencing this record. + * NOTE: Only used in mft base records. + * NOTE: When deleting a directory entry we + * check the link_count and if it is 1 we + * delete the file. Otherwise we delete the + * struct file_name_attr being referenced by the + * directory entry from the mft record and + * decrement the link_count. + */ + __le16 attrs_offset; /* + * Byte offset to the first attribute in this + * mft record from the start of the mft record. + * NOTE: Must be aligned to 8-byte boundary. + */ + __le16 flags; /* + * Bit array of MFT_RECORD_FLAGS. When a file + * is deleted, the MFT_RECORD_IN_USE flag is + * set to zero. + */ + __le32 bytes_in_use; /* + * Number of bytes used in this mft record. + * NOTE: Must be aligned to 8-byte boundary. + */ + __le32 bytes_allocated; /* + * Number of bytes allocated for this mft + * record. This should be equal to the mft + * record size. + */ + __le64 base_mft_record; /* + * This is zero for base mft records. + * When it is not zero it is a mft reference + * pointing to the base mft record to which + * this record belongs (this is then used to + * locate the attribute list attribute present + * in the base record which describes this + * extension record and hence might need + * modification when the extension record + * itself is modified, also locating the + * attribute list also means finding the other + * potential extents, belonging to the non-base + * mft record). + */ + __le16 next_attr_instance; /* + * The instance number that will be assigned to + * the next attribute added to this mft record. + * NOTE: Incremented each time after it is used. + * NOTE: Every time the mft record is reused + * this number is set to zero. NOTE: The first + * instance number is always 0. + */ /* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ -/* 42*/ le16 reserved; /* Reserved/alignment. */ -/* 44*/ le32 mft_record_number; /* Number of this mft record. */ -/* sizeof() = 48 bytes */ + __le16 reserved; /* Reserved/alignment. 
*/ + __le32 mft_record_number; /* Number of this mft record. */ /* * When (re)using the mft record, we place the update sequence array at this * offset, i.e. before we start with the attributes. This also makes sense, @@ -375,63 +399,79 @@ typedef struct { * by overwriting it since you then can't get it back... * When reading we obviously use the data from the ntfs record header. */ -} __attribute__ ((__packed__)) MFT_RECORD; +} __packed; + +static_assert(sizeof(struct mft_record) == 48); /* This is the version without the NTFS 3.1+ specific fields. */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ - le16 usa_ofs; /* See NTFS_RECORD definition above. */ - le16 usa_count; /* See NTFS_RECORD definition above. */ - -/* 8*/ le64 lsn; /* $LogFile sequence number for this record. - Changed every time the record is modified. */ -/* 16*/ le16 sequence_number; /* Number of times this mft record has been - reused. (See description for MFT_REF - above.) NOTE: The increment (skipping zero) - is done when the file is deleted. NOTE: If - this is zero it is left zero. */ -/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of - directory entries referencing this record. - NOTE: Only used in mft base records. - NOTE: When deleting a directory entry we - check the link_count and if it is 1 we - delete the file. Otherwise we delete the - FILE_NAME_ATTR being referenced by the - directory entry from the mft record and - decrement the link_count. - FIXME: Careful with Win32 + DOS names! */ -/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this - mft record from the start of the mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file - is deleted, the MFT_RECORD_IN_USE flag is - set to zero. */ -/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. - NOTE: Must be aligned to 8-byte boundary. */ -/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft - record. This should be equal to the mft - record size. */ -/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. - When it is not zero it is a mft reference - pointing to the base mft record to which - this record belongs (this is then used to - locate the attribute list attribute present - in the base record which describes this - extension record and hence might need - modification when the extension record - itself is modified, also locating the - attribute list also means finding the other - potential extents, belonging to the non-base - mft record). */ -/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to - the next attribute added to this mft record. - NOTE: Incremented each time after it is used. - NOTE: Every time the mft record is reused - this number is set to zero. NOTE: The first - instance number is always 0. */ -/* sizeof() = 42 bytes */ +struct mft_record_old { + __le32 magic; /* Usually the magic is "FILE". */ + __le16 usa_ofs; /* See ntfs_record struct definition above. */ + __le16 usa_count; /* See ntfs_record struct definition above. */ + + __le64 lsn; /* + * LogFile sequence number for this record. + * Changed every time the record is modified. + */ + __le16 sequence_number; /* + * Number of times this mft record has been + * reused. (See description for MFT_REF + * above.) 
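The staleness rule described earlier (the sequence numbers must match, but a zero sequence number in the reference disables the check) can be written as a small predicate; ntfs_mref_is_stale() is a hypothetical name, not a helper from this patch.

static bool ntfs_mref_is_stale(__le64 mref, const struct mft_record *m)
{
	u16 seqno = MSEQNO_LE(mref);

	/* A zero sequence number means: do not check. */
	return seqno && seqno != le16_to_cpu(m->sequence_number);
}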
NOTE: The increment (skipping zero) + * is done when the file is deleted. NOTE: If + * this is zero it is left zero. + */ + __le16 link_count; /* + * Number of hard links, i.e. the number of + * directory entries referencing this record. + * NOTE: Only used in mft base records. + * NOTE: When deleting a directory entry we + * check the link_count and if it is 1 we + * delete the file. Otherwise we delete the + * struct file_name_attr being referenced by the + * directory entry from the mft record and + * decrement the link_count. + */ + __le16 attrs_offset; /* + * Byte offset to the first attribute in this + * mft record from the start of the mft record. + * NOTE: Must be aligned to 8-byte boundary. + */ + __le16 flags; /* + * Bit array of MFT_RECORD_FLAGS. When a file + * is deleted, the MFT_RECORD_IN_USE flag is + * set to zero. + */ + __le32 bytes_in_use; /* + * Number of bytes used in this mft record. + * NOTE: Must be aligned to 8-byte boundary. + */ + __le32 bytes_allocated; /* + * Number of bytes allocated for this mft + * record. This should be equal to the mft + * record size. + */ + __le64 base_mft_record; /* + * This is zero for base mft records. + * When it is not zero it is a mft reference + * pointing to the base mft record to which + * this record belongs (this is then used to + * locate the attribute list attribute present + * in the base record which describes this + * extension record and hence might need + * modification when the extension record + * itself is modified, also locating the + * attribute list also means finding the other + * potential extents, belonging to the non-base + * mft record). + */ + __le16 next_attr_instance; /* + * The instance number that will be assigned to + * the next attribute added to this mft record. + * NOTE: Incremented each time after it is used. + * NOTE: Every time the mft record is reused + * this number is set to zero. NOTE: The first + * instance number is always 0. + */ /* * When (re)using the mft record, we place the update sequence array at this * offset, i.e. before we start with the attributes. This also makes sense, @@ -441,7 +481,9 @@ typedef struct { * by overwriting it since you then can't get it back... * When reading we obviously use the data from the ntfs record header. */ -} __attribute__ ((__packed__)) MFT_RECORD_OLD; +} __packed; + +static_assert(sizeof(struct mft_record_old) == 42); /* * System defined attributes (32-bit). Each attribute type has a corresponding @@ -452,29 +494,27 @@ typedef struct { * a revealing choice of symbol I do not know what is... 
(-; */ enum { - AT_UNUSED = cpu_to_le32( 0), - AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), - AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), - AT_FILE_NAME = cpu_to_le32( 0x30), - AT_OBJECT_ID = cpu_to_le32( 0x40), - AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), - AT_VOLUME_NAME = cpu_to_le32( 0x60), - AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), - AT_DATA = cpu_to_le32( 0x80), - AT_INDEX_ROOT = cpu_to_le32( 0x90), - AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), - AT_BITMAP = cpu_to_le32( 0xb0), - AT_REPARSE_POINT = cpu_to_le32( 0xc0), - AT_EA_INFORMATION = cpu_to_le32( 0xd0), - AT_EA = cpu_to_le32( 0xe0), - AT_PROPERTY_SET = cpu_to_le32( 0xf0), - AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), - AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_UNUSED = cpu_to_le32(0), + AT_STANDARD_INFORMATION = cpu_to_le32(0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32(0x20), + AT_FILE_NAME = cpu_to_le32(0x30), + AT_OBJECT_ID = cpu_to_le32(0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32(0x50), + AT_VOLUME_NAME = cpu_to_le32(0x60), + AT_VOLUME_INFORMATION = cpu_to_le32(0x70), + AT_DATA = cpu_to_le32(0x80), + AT_INDEX_ROOT = cpu_to_le32(0x90), + AT_INDEX_ALLOCATION = cpu_to_le32(0xa0), + AT_BITMAP = cpu_to_le32(0xb0), + AT_REPARSE_POINT = cpu_to_le32(0xc0), + AT_EA_INFORMATION = cpu_to_le32(0xd0), + AT_EA = cpu_to_le32(0xe0), + AT_PROPERTY_SET = cpu_to_le32(0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32(0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32(0x1000), AT_END = cpu_to_le32(0xffffffff) }; -typedef le32 ATTR_TYPE; - /* * The collation rules for sorting views/indexes/etc (32-bit). * @@ -490,7 +530,7 @@ typedef le32 ATTR_TYPE; * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] * for what I mean but COLLATION_UNICODE_STRING would not give any special * treatment to any characters at all, but this is speculation. - * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * COLLATION_NTOFS_ULONG - Sorting is done according to ascending __le32 key * values. E.g. used for $SII index in FILE_Secure, which sorts by * security_id (le32). * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. @@ -499,19 +539,19 @@ typedef le32 ATTR_TYPE; * values and second by ascending security_id values. E.g. used for $SDH * index in FILE_Secure. * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending - * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * __le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which * sorts by object_id (16-byte), by splitting up the object_id in four - * le32 values and using them as individual keys. E.g. take the following + * __le32 values and using them as individual keys. E.g. take the following * two security_ids, stored as follows on disk: * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 - * To compare them, they are split into four le32 values each, like so: + * To compare them, they are split into four __le32 values each, like so: * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 * Now, it is apparent why the 2nd object_id collates after the 1st: the - * first le32 value of the 1st object_id is less than the first le32 of - * the 2nd object_id. If the first le32 values of both object_ids were - * equal then the second le32 values would be compared, etc. + * first __le32 value of the 1st object_id is less than the first __le32 of + * the 2nd object_id. 
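The object_id comparison walked through above reduces to comparing four __le32 keys in order. A sketch of such a comparator follows (hypothetical helper, not the driver's actual collation routine):

static int example_collate_ntofs_ulongs(const __le32 *k1, const __le32 *k2)
{
	int i;

	/* The first differing __le32 key decides the ordering. */
	for (i = 0; i < 4; i++) {
		u32 a = le32_to_cpu(k1[i]);
		u32 b = le32_to_cpu(k2[i]);

		if (a != b)
			return a < b ? -1 : 1;
	}
	return 0;
}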
If the first __le32 values of both object_ids were + * equal then the second __le32 values would be compared, etc. */ enum { COLLATION_BINARY = cpu_to_le32(0x00), @@ -523,45 +563,45 @@ enum { COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), }; -typedef le32 COLLATION_RULE; - /* * The flags (32-bit) describing attribute properties in the attribute - * definition structure. FIXME: This information is based on Regis's - * information and, according to him, it is not certain and probably - * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * definition structure. + * The INDEXABLE flag is fairly certainly correct as only the file * name attribute has this flag set and this is the only attribute indexed in * NT4. */ enum { - ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be - indexed. */ - ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type - can be present multiple times in the - mft records of an inode. */ - ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value - must contain at least one non-zero - byte. */ - ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be - indexed and the attribute value must be - unique for the attribute type in all of - the mft records of an inode. */ - ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be - named and the name must be unique for - the attribute type in all of the mft - records of an inode. */ - ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be - resident. */ - ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log - modifications to this attribute, - regardless of whether it is resident or - non-resident. Without this, only log - modifications if the attribute is - resident. */ + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* + * Attribute type can be present + * multiple times in the mft records + * of an inode. + */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* + * Attribute value must contain + * at least one non-zero byte. + */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* + * Attribute must be indexed and + * the attribute value must be unique + * for the attribute type in all of + * the mft records of an inode. + */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* + * Attribute must be named and + * the name must be unique for + * the attribute type in all of the mft + * records of an inode. + */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* + * Always log modifications to this attribute, + * regardless of whether it is resident or + * non-resident. Without this, only log + * modifications if the attribute is resident. + */ }; -typedef le32 ATTR_DEF_FLAGS; - /* * The data attribute of FILE_AttrDef contains a sequence of attribute * definitions for the NTFS volume. With this, it is supposed to be safe for an @@ -571,33 +611,30 @@ typedef le32 ATTR_DEF_FLAGS; * attribute can be resident/non-resident and possibly other things, but the * actual bits are unknown. */ -typedef struct { -/*hex ofs*/ -/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero - terminated. */ -/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ -/* 84*/ le32 display_rule; /* Default display rule. - FIXME: What does it mean? (AIA) */ -/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ -/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ -/* 90*/ sle64 min_size; /* Optional minimum attribute size. 
*/ -/* 98*/ sle64 max_size; /* Maximum size of attribute. */ -/* sizeof() = 0xa0 or 160 bytes */ -} __attribute__ ((__packed__)) ATTR_DEF; +struct attr_def { + __le16 name[0x40]; /* Unicode name of the attribute. Zero terminated. */ + __le32 type; /* Type of the attribute. */ + __le32 display_rule; /* Default display rule. */ + __le32 collation_rule; /* Default collation rule. */ + __le32 flags; /* Flags describing the attribute. */ + __le64 min_size; /* Optional minimum attribute size. */ + __le64 max_size; /* Maximum size of attribute. */ +} __packed; + +static_assert(sizeof(struct attr_def) == 160); /* * Attribute flags (16-bit). */ enum { ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), - ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method - mask. Also, first - illegal value. */ + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* + * Compression method mask. + * Also, first illegal value. + */ ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), ATTR_IS_SPARSE = cpu_to_le16(0x8000), -} __attribute__ ((__packed__)); - -typedef le16 ATTR_FLAGS; +} __packed; /* * Attribute compression. @@ -670,110 +707,133 @@ typedef le16 ATTR_FLAGS; * Flags of resident attributes (8-bit). */ enum { - RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index - (has implications for deleting and - modifying the attribute). */ -} __attribute__ ((__packed__)); - -typedef u8 RESIDENT_ATTR_FLAGS; + RESIDENT_ATTR_IS_INDEXED = 0x01, /* + * Attribute is referenced in an index + * (has implications for deleting and + * modifying the attribute). + */ +} __packed; /* * Attribute record header. Always aligned to 8-byte boundary. */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ -/* 4*/ le32 length; /* Byte size of the resident part of the - attribute (aligned to 8-byte boundary). - Used to get to the next attribute. */ -/* 8*/ u8 non_resident; /* If 0, attribute is resident. - If 1, attribute is non-resident. */ -/* 9*/ u8 name_length; /* Unicode character size of name of attribute. - 0 if unnamed. */ -/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the - beginning of the name from the attribute - record. Note that the name is stored as a - Unicode string. When creating, place offset - just at the end of the record header. Then, - follow with attribute value or mapping pairs - array, resident and non-resident attributes - respectively, aligning to an 8-byte - boundary. */ -/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ -/* 14*/ le16 instance; /* The instance of this attribute record. This - number is unique within this mft record (see - MFT_RECORD/next_attribute_instance notes in - mft.h for more details). */ -/* 16*/ union { +struct attr_record { + __le32 type; /* The (32-bit) type of the attribute. */ + __le32 length; /* + * Byte size of the resident part of the + * attribute (aligned to 8-byte boundary). + * Used to get to the next attribute. + */ + u8 non_resident; /* + * If 0, attribute is resident. + * If 1, attribute is non-resident. + */ + u8 name_length; /* Unicode character size of name of attribute. 0 if unnamed. */ + __le16 name_offset; /* + * If name_length != 0, the byte offset to the + * beginning of the name from the attribute + * record. Note that the name is stored as a + * Unicode string. When creating, place offset + * just at the end of the record header. 
Then, + * follow with attribute value or mapping pairs + * array, resident and non-resident attributes + * respectively, aligning to an 8-byte + * boundary. + */ + __le16 flags; /* Flags describing the attribute. */ + __le16 instance; /* + * The instance of this attribute record. This + * number is unique within this mft record (see + * MFT_RECORD/next_attribute_instance notes in + * mft.h for more details). + */ + union { /* Resident attributes. */ struct { -/* 16 */ le32 value_length;/* Byte size of attribute value. */ -/* 20 */ le16 value_offset;/* Byte offset of the attribute - value from the start of the - attribute record. When creating, - align to 8-byte boundary if we - have a name present as this might - not have a length of a multiple - of 8-bytes. */ -/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ -/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) resident; + __le32 value_length; /* Byte size of attribute value. */ + __le16 value_offset; /* + * Byte offset of the attribute + * value from the start of the + * attribute record. When creating, + * align to 8-byte boundary if we + * have a name present as this might + * not have a length of a multiple + * of 8-bytes. + */ + u8 flags; /* See above. */ + s8 reserved; /* Reserved/alignment to 8-byte boundary. */ + } __packed resident; /* Non-resident attributes. */ struct { -/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number - for this portion of the attribute value or - 0 if this is the only extent (usually the - case). - Only when an attribute list is used - does lowest_vcn != 0 ever occur. */ -/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of - the attribute value. - Usually there is only one - portion, so this usually equals the attribute - value size in clusters minus 1. Can be -1 for - zero length files. Can be 0 for "single extent" - attributes. */ -/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the - beginning of the structure to the mapping pairs - array which contains the mappings between the - vcns and the logical cluster numbers (lcns). - When creating, place this at the end of this - record header aligned to 8-byte boundary. */ -/* 34*/ u8 compression_unit; /* The compression unit expressed - as the log to the base 2 of the number of - clusters in a compression unit. 0 means not - compressed. (This effectively limits the - compression unit size to be a power of two - clusters.) WinNT4 only uses a value of 4. - Sparse files have this set to 0 on XPSP2. */ -/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ -/* The sizes below are only used when lowest_vcn is zero, as otherwise it would - be difficult to keep them up-to-date.*/ -/* 40*/ sle64 allocated_size; /* Byte size of disk space - allocated to hold the attribute value. Always - is a multiple of the cluster size. When a file - is compressed, this field is a multiple of the - compression block size (2^compression_unit) and - it represents the logically allocated space - rather than the actual on disk usage. For this - use the compressed_size (see below). */ -/* 48*/ sle64 data_size; /* Byte size of the attribute - value. Can be larger than allocated_size if - attribute value is compressed or sparse. */ -/* 56*/ sle64 initialized_size; /* Byte size of initialized - portion of the attribute value. Usually equals - data_size. */ -/* sizeof(uncompressed attr) = 64*/ -/* 64*/ sle64 compressed_size; /* Byte size of the attribute - value after compression. 
Only present when - compressed or sparse. Always is a multiple of - the cluster size. Represents the actual amount - of disk space being used on the disk. */ -/* sizeof(compressed attr) = 72*/ - } __attribute__ ((__packed__)) non_resident; - } __attribute__ ((__packed__)) data; -} __attribute__ ((__packed__)) ATTR_RECORD; - -typedef ATTR_RECORD ATTR_REC; + __le64 lowest_vcn; /* + * Lowest valid virtual cluster number + * for this portion of the attribute value or + * 0 if this is the only extent (usually the + * case). - Only when an attribute list is used + * does lowest_vcn != 0 ever occur. + */ + __le64 highest_vcn; /* + * Highest valid vcn of this extent of + * the attribute value. - Usually there is only one + * portion, so this usually equals the attribute + * value size in clusters minus 1. Can be -1 for + * zero length files. Can be 0 for "single extent" + * attributes. + */ + __le16 mapping_pairs_offset; /* + * Byte offset from the beginning of + * the structure to the mapping pairs + * array which contains the mappings + * between the vcns and the logical cluster + * numbers (lcns). + * When creating, place this at the end of + * this record header aligned to 8-byte + * boundary. + */ + u8 compression_unit; /* + * The compression unit expressed as the log + * to the base 2 of the number of + * clusters in a compression unit. 0 means not + * compressed. (This effectively limits the + * compression unit size to be a power of two + * clusters.) WinNT4 only uses a value of 4. + * Sparse files have this set to 0 on XPSP2. + */ + u8 reserved[5]; /* Align to 8-byte boundary. */ +/* + * The sizes below are only used when lowest_vcn is zero, as otherwise it would + * be difficult to keep them up-to-date. + */ + __le64 allocated_size; /* + * Byte size of disk space allocated + * to hold the attribute value. Always + * is a multiple of the cluster size. + * When a file is compressed, this field + * is a multiple of the compression block + * size (2^compression_unit) and it represents + * the logically allocated space rather than + * the actual on disk usage. For this use + * the compressed_size (see below). + */ + __le64 data_size; /* + * Byte size of the attribute value. Can be + * larger than allocated_size if attribute value + * is compressed or sparse. + */ + __le64 initialized_size; /* + * Byte size of initialized portion of + * the attribute value. Usually equals data_size. + */ + __le64 compressed_size; /* + * Byte size of the attribute value after + * compression. Only present when compressed + * or sparse. Always is a multiple of the cluster + * size. Represents the actual amount of disk + * space being used on the disk. + */ + } __packed non_resident; + } __packed data; +} __packed; /* * File attribute flags (32-bit) appearing in the file_attributes fields of the @@ -792,8 +852,10 @@ enum { /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), - /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is - reserved for the DOS SUBDIRECTORY flag. */ + /* + * Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + * reserved for the DOS SUBDIRECTORY flag. 
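To show how the fields above work together, here is a hedged sketch of walking the attributes of an mft record: attrs_offset locates the first attr_record, length steps to the next one, and the resident value_offset/value_length pair locates a resident value. example_find_resident() is illustrative only and does far less validation than real driver code would.

static u8 *example_find_resident(struct mft_record *m, __le32 type,
				 u32 *value_len)
{
	u8 *end = (u8 *)m + le32_to_cpu(m->bytes_in_use);
	struct attr_record *a = (struct attr_record *)((u8 *)m +
			le16_to_cpu(m->attrs_offset));

	/* 8 bytes is enough room to read the type and length fields. */
	while ((u8 *)a + 8 <= end && a->type != AT_END) {
		u32 len = le32_to_cpu(a->length);

		if (a->type == type && !a->non_resident) {
			*value_len = le32_to_cpu(
					a->data.resident.value_length);
			return (u8 *)a +
			       le16_to_cpu(a->data.resident.value_offset);
		}
		if (!len)	/* Corrupt record; avoid looping forever. */
			break;
		a = (struct attr_record *)((u8 *)a + len);
	}
	return NULL;
}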
+ */ FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), @@ -808,32 +870,40 @@ enum { FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), - /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the - FILE_ATTR_DEVICE and preserves everything else. This mask is used - to obtain all flags that are valid for reading. */ + /* + * Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + * FILE_ATTR_DEVICE and preserves everything else. This mask is used + * to obtain all flags that are valid for reading. + */ FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), - /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the - F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, - F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask - is used to obtain all flags that are valid for setting. */ + /* + * Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + * F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + * F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + * is used to obtain all flags that are valid for setting. + */ + /* Supposed to mean no data locally, possibly repurposed */ + FILE_ATTRIBUTE_RECALL_ON_OPEN = cpu_to_le32(0x00040000), /* * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION * attribute of an mft record. */ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this is a directory or not, i.e. whether it has - an index root attribute or not. */ + /* + * Note, this is a copy of the corresponding bit from the mft record, + * telling us whether this is a directory or not, i.e. whether it has + * an index root attribute or not. + */ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), - /* Note, this is a copy of the corresponding bit from the mft record, - telling us whether this file has a view index present (eg. object id - index, quota index, one of the security indexes or the encrypting - filesystem related indexes). */ + /* + * Note, this is a copy of the corresponding bit from the mft record, + * telling us whether this file has a view index present (eg. object id + * index, quota index, one of the security indexes or the encrypting + * filesystem related indexes). + */ }; -typedef le32 FILE_ATTR_FLAGS; - /* * NOTE on times in NTFS: All times are in MS standard time format, i.e. they * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 @@ -851,30 +921,29 @@ typedef le32 FILE_ATTR_FLAGS; * correct by practical experimentation on Windows NT4 SP6a and is hence * assumed to be the one and only correct interpretation. */ -typedef struct { -/*Ofs*/ -/* 0*/ sle64 creation_time; /* Time file was created. Updated when - a filename is changed(?). */ -/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 24*/ sle64 last_access_time; /* Approximate time when the file was - last accessed (obviously this is not - updated on read-only volumes). In - Windows this is only updated when - accessed if some time delta has - passed since the last update. Also, - last access time updates can be - disabled altogether for speed. 
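A worked conversion for the time format just described, assuming the usual 11644473600-second offset between 1601-01-01 and the Unix epoch; the constant and helper names are illustrative, not from this patch.

#include <linux/time64.h>

/* Seconds between 1601-01-01 and 1970-01-01, scaled to 100ns units. */
#define EXAMPLE_NTFS_TIME_OFFSET (11644473600LL * 10000000)

static s64 example_utc2ntfs(const struct timespec64 *ts)
{
	/* 100-nanosecond intervals since 1601-01-01 00:00:00 UTC. */
	return ts->tv_sec * 10000000 + ts->tv_nsec / 100 +
	       EXAMPLE_NTFS_TIME_OFFSET;
}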
*/ -/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 36*/ union { +struct standard_information { + __le64 creation_time; /* + * Time file was created. Updated when + * a filename is changed(?). + */ + __le64 last_data_change_time; /* Time the data attribute was last modified. */ + __le64 last_mft_change_time; /* Time this mft record was last modified. */ + __le64 last_access_time; /* + * Approximate time when the file was + * last accessed (obviously this is not + * updated on read-only volumes). In + * Windows this is only updated when + * accessed if some time delta has + * passed since the last update. Also, + * last access time updates can be + * disabled altogether for speed. + */ + __le32 file_attributes; /* Flags describing the file. */ + union { /* NTFS 1.2 */ struct { - /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte - boundary. */ - } __attribute__ ((__packed__)) v1; - /* sizeof() = 48 bytes */ + u8 reserved12[12]; /* Reserved/alignment to 8-byte boundary. */ + } __packed v1; /* NTFS 3.x */ struct { /* @@ -883,7 +952,7 @@ typedef struct { * Recognize the difference by comparing the length of the resident attribute * value. If it is 48, then the following fields are missing. If it is 72 then * the fields are present. Maybe just check like this: - * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * if (resident.ValueLength < sizeof(struct standard_information)) { * Assume NTFS 1.2- format. * If (volume version is 3.x) * Upgrade attribute to NTFS 3.x format. @@ -896,32 +965,48 @@ typedef struct { * views that as a corruption, assuming that it behaves like this for all * attributes. */ - /* 36*/ le32 maximum_versions; /* Maximum allowed versions for - file. Zero if version numbering is disabled. */ - /* 40*/ le32 version_number; /* This file's version (if any). - Set to zero if maximum_versions is zero. */ - /* 44*/ le32 class_id; /* Class id from bidirectional - class id index (?). */ - /* 48*/ le32 owner_id; /* Owner_id of the user owning - the file. Translate via $Q index in FILE_Extend - /$Quota to the quota control entry for the user - owning the file. Zero if quotas are disabled. */ - /* 52*/ le32 security_id; /* Security_id for the file. - Translate via $SII index and $SDS data stream - in FILE_Secure to the security descriptor. */ - /* 56*/ le64 quota_charged; /* Byte size of the charge to - the quota for all streams of the file. Note: Is - zero if quotas are disabled. */ - /* 64*/ leUSN usn; /* Last update sequence number - of the file. This is a direct index into the - transaction log file ($UsnJrnl). It is zero if - the usn journal is disabled or this file has - not been subject to logging yet. See usnjrnl.h - for details. */ - } __attribute__ ((__packed__)) v3; - /* sizeof() = 72 bytes (NTFS 3.x) */ - } __attribute__ ((__packed__)) ver; -} __attribute__ ((__packed__)) STANDARD_INFORMATION; + __le32 maximum_versions; /* + * Maximum allowed versions for + * file. Zero if version numbering + * is disabled. + */ + __le32 version_number; /* + * This file's version (if any). + * Set to zero if maximum_versions + * is zero. + */ + __le32 class_id; /* + * Class id from bidirectional + * class id index (?). + */ + __le32 owner_id; /* + * Owner_id of the user owning + * the file. Translate via $Q index + * in FILE_Extend /$Quota to the quota + * control entry for the user owning + * the file. Zero if quotas are disabled. + */ + __le32 security_id; /* + * Security_id for the file. 
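The length test sketched in the comment above, written out as a hypothetical helper: with the union in place sizeof(struct standard_information) is 72, so any resident value shorter than that must be the 48-byte NTFS 1.2 layout.

static bool example_si_is_v3(const struct attr_record *a)
{
	return le32_to_cpu(a->data.resident.value_length) >=
	       sizeof(struct standard_information);
}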
Translate via + * $SII index and $SDS data stream in + * FILE_Secure to the security descriptor. + */ + __le64 quota_charged; /* + * Byte size of the charge to the quota for + * all streams of the file. Note: Is zero + * if quotas are disabled. + */ + __le64 usn; /* + * Last update sequence number of the file. + * This is a direct index into the transaction + * log file ($UsnJrnl). It is zero if the usn + * journal is disabled or this file has not been + * subject to logging yet. See usnjrnl.h + * for details. + */ + } __packed v3; + } __packed ver; +} __packed; /* * Attribute: Attribute list (0x20). @@ -937,7 +1022,7 @@ typedef struct { * itself. The list is sorted: first by attribute type, second by attribute * name (if present), third by instance number. The extents of one * non-resident attribute (if present) immediately follow after the initial - * extent. They are ordered by lowest_vcn and have their instace set to zero. + * extent. They are ordered by lowest_vcn and have their instance set to zero. * It is not allowed to have two attributes with all sorting keys equal. * - Further restrictions: * - If not resident, the vcn to lcn mapping array has to fit inside the @@ -955,37 +1040,46 @@ typedef struct { * NTFS 3.0 volumes). * - There are many named streams. */ -typedef struct { -/*Ofs*/ -/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ -/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ -/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the - attribute or 0 if unnamed. */ -/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name - (always set this to where the name would - start even if unnamed). */ -/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion - of the attribute value. This is usually 0. It - is non-zero for the case where one attribute - does not fit into one mft record and thus - several mft records are allocated to hold - this attribute. In the latter case, each mft - record holds one extent of the attribute and - there is one attribute list entry for each - extent. NOTE: This is DEFINITELY a signed - value! The windows driver uses cmp, followed - by jg when comparing this, thus it treats it - as signed. */ -/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding - the ATTR_RECORD for this portion of the - attribute value. */ -/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the - attribute being referenced; otherwise 0. */ -/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use - name_offset to determine the location of the - name. */ -/* sizeof() = 26 + (attribute_name_length * 2) bytes */ -} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; +struct attr_list_entry { + __le32 type; /* Type of referenced attribute. */ + __le16 length; /* Byte size of this entry (8-byte aligned). */ + u8 name_length; /* + * Size in Unicode chars of the name of the + * attribute or 0 if unnamed. + */ + u8 name_offset; /* + * Byte offset to beginning of attribute name + * (always set this to where the name would + * start even if unnamed). + */ + __le64 lowest_vcn; /* + * Lowest virtual cluster number of this portion + * of the attribute value. This is usually 0. It + * is non-zero for the case where one attribute + * does not fit into one mft record and thus + * several mft records are allocated to hold + * this attribute. In the latter case, each mft + * record holds one extent of the attribute and + * there is one attribute list entry for each + * extent. 
NOTE: This is DEFINITELY a signed + * value! The Windows driver uses cmp, followed + * by jg when comparing this, thus it treats it + * as signed. + */ + __le64 mft_reference; /* + * The reference of the mft record holding + * the attr record for this portion of the + * attribute value. + */ + __le16 instance; /* + * If lowest_vcn = 0, the instance of the + * attribute being referenced; otherwise 0. + */ + __le16 name[]; /* + * Use when creating only. When reading use + * name_offset to determine the location of the name. + */ +} __packed; /* * The maximum allowed length for a file name. @@ -997,33 +1091,39 @@ typedef struct { */ enum { FILE_NAME_POSIX = 0x00, - /* This is the largest namespace. It is case sensitive and allows all - Unicode characters except for: '\0' and '/'. Beware that in - WinNT/2k/2003 by default files which eg have the same name except - for their case will not be distinguished by the standard utilities - and thus a "del filename" will delete both "filename" and "fileName" - without warning. However if for example Services For Unix (SFU) are - installed and the case sensitive option was enabled at installation - time, then you can create/access/delete such files. - Note that even SFU places restrictions on the filenames beyond the - '\0' and '/' and in particular the following set of characters is - not allowed: '"', '/', '<', '>', '\'. All other characters, - including the ones no allowed in WIN32 namespace are allowed. - Tested with SFU 3.5 (this is now free) running on Windows XP. */ + /* + * This is the largest namespace. It is case sensitive and allows all + * Unicode characters except for: '\0' and '/'. Beware that in + * WinNT/2k/2003 by default files which e.g. have the same name except + * for their case will not be distinguished by the standard utilities + * and thus a "del filename" will delete both "filename" and "fileName" + * without warning. However if for example Services For Unix (SFU) are + * installed and the case sensitive option was enabled at installation + * time, then you can create/access/delete such files. + * Note that even SFU places restrictions on the filenames beyond the + * '\0' and '/' and in particular the following set of characters is + * not allowed: '"', '/', '<', '>', '\'. All other characters, + * including the ones not allowed in the WIN32 namespace, are allowed. + * Tested with SFU 3.5 (this is now free) running on Windows XP. + */ FILE_NAME_WIN32 = 0x01, - /* The standard WinNT/2k NTFS long filenames. Case insensitive. All - Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', - and '|'. Further, names cannot end with a '.' or a space. */ + /* + * The standard WinNT/2k NTFS long filenames. Case insensitive. All + * Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + * and '|'. Further, names cannot end with a '.' or a space. + */ FILE_NAME_DOS = 0x02, - /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit - characters greater space, except: '"', '*', '+', ',', '/', ':', ';', - '<', '=', '>', '?', and '\'. */ + /* + * The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + * characters greater than space, except: '"', '*', '+', ',', '/', ':', + * ';', '<', '=', '>', '?', and '\'. + */ FILE_NAME_WIN32_AND_DOS = 0x03, - /* 3 means that both the Win32 and the DOS filenames are identical and - hence have been saved in this single filename record. 
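
Attribute list entries like struct attr_list_entry above are walked purely by their length field. A minimal sketch, with bounds checking against the attribute value size mostly omitted and the helper name invented for illustration:

	static void ntfs_walk_attr_list(const u8 *al, unsigned int al_len)
	{
		unsigned int ofs = 0;

		while (ofs + sizeof(struct attr_list_entry) <= al_len) {
			const struct attr_list_entry *ale =
					(const struct attr_list_entry *)(al + ofs);

			if (!ale->length)
				break;	/* Corrupt list; entries are never empty. */
			/* ... match ale->type, name and lowest_vcn here ... */
			ofs += le16_to_cpu(ale->length);	/* 8-byte aligned size. */
		}
	}
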
*/ -} __attribute__ ((__packed__)); - -typedef u8 FILE_NAME_TYPE_FLAGS; + /* + * 3 means that both the Win32 and the DOS filenames are identical and + * hence have been saved in this single filename record. + */ +} __packed; /* * Attribute: Filename (0x30). @@ -1038,53 +1138,54 @@ typedef u8 FILE_NAME_TYPE_FLAGS; * correct by practical experimentation on Windows NT4 SP6a and is hence * assumed to be the one and only correct interpretation. */ -typedef struct { +struct file_name_attr { /*hex ofs*/ -/* 0*/ leMFT_REF parent_directory; /* Directory this filename is - referenced from. */ -/* 8*/ sle64 creation_time; /* Time file was created. */ -/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last - modified. */ -/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last - modified. */ -/* 20*/ sle64 last_access_time; /* Time this mft record was last - accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space - for the unnamed data attribute. So - for normal $DATA, this is the - allocated_size from the unnamed - $DATA attribute and for compressed - and/or sparse $DATA, this is the - compressed_size from the unnamed - $DATA attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. NOTE: - This is a multiple of the cluster - size. */ -/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed - data attribute. For a directory or - other inode without an unnamed $DATA - attribute, this is always 0. */ -/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ -/* 3c*/ union { - /* 3c*/ struct { - /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to - pack the extended attributes - (EAs), if such are present.*/ - /* 3e*/ le16 reserved; /* Reserved for alignment. */ - } __attribute__ ((__packed__)) ea; - /* 3c*/ struct { - /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, - present only in reparse - points and only if there are - no EAs. */ - } __attribute__ ((__packed__)) rp; - } __attribute__ ((__packed__)) type; -/* 40*/ u8 file_name_length; /* Length of file name in - (Unicode) characters. */ -/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ -/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ -} __attribute__ ((__packed__)) FILE_NAME_ATTR; + __le64 parent_directory; /* Directory this filename is referenced from. */ + __le64 creation_time; /* Time file was created. */ + __le64 last_data_change_time; /* Time the data attribute was last modified. */ + __le64 last_mft_change_time; /* Time this mft record was last modified. */ + __le64 last_access_time; /* Time this mft record was last accessed. */ + __le64 allocated_size; /* + * Byte size of on-disk allocated space + * for the unnamed data attribute. So for normal + * $DATA, this is the allocated_size from + * the unnamed $DATA attribute and for compressed + * and/or sparse $DATA, this is the + * compressed_size from the unnamed + * $DATA attribute. For a directory or + * other inode without an unnamed $DATA attribute, + * this is always 0. NOTE: This is a multiple of + * the cluster size. + */ + __le64 data_size; /* + * Byte size of actual data in unnamed + * data attribute. For a directory or + * other inode without an unnamed $DATA + * attribute, this is always 0. + */ + __le32 file_attributes; /* Flags describing the file. */ + union { + struct { + __le16 packed_ea_size; /* + * Size of the buffer needed to + * pack the extended attributes + * (EAs), if such are present. 
+ */ + __le16 reserved; /* Reserved for alignment. */ + } __packed ea; + struct { + __le32 reparse_point_tag; /* + * Type of reparse point, + * present only in reparse + * points and only if there are + * no EAs. + */ + } __packed rp; + } __packed type; + u8 file_name_length; /* Length of file name in (Unicode) characters. */ + u8 file_name_type; /* Namespace of the file name.*/ + __le16 file_name[]; /* File name in Unicode. */ +} __packed; /* * GUID structures store globally unique identifiers (GUID). A GUID is a @@ -1095,75 +1196,16 @@ typedef struct { * Example of a GUID: * 1F010768-5A73-BC91-0010A52216A7 */ -typedef struct { - le32 data1; /* The first eight hexadecimal digits of the GUID. */ - le16 data2; /* The first group of four hexadecimal digits. */ - le16 data3; /* The second group of four hexadecimal digits. */ - u8 data4[8]; /* The first two bytes are the third group of four - hexadecimal digits. The remaining six bytes are the - final 12 hexadecimal digits. */ -} __attribute__ ((__packed__)) GUID; - -/* - * FILE_Extend/$ObjId contains an index named $O. This index contains all - * object_ids present on the volume as the index keys and the corresponding - * mft_record numbers as the index entry data parts. The data part (defined - * below) also contains three other object_ids: - * birth_volume_id - object_id of FILE_Volume on which the file was first - * created. Optional (i.e. can be zero). - * birth_object_id - object_id of file when it was first created. Usually - * equals the object_id. Optional (i.e. can be zero). - * domain_id - Reserved (always zero). - */ -typedef struct { - leMFT_REF mft_reference;/* Mft record containing the object_id in - the index entry key. */ - union { - struct { - GUID birth_volume_id; - GUID birth_object_id; - GUID domain_id; - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; - -/* - * Attribute: Object id (NTFS 3.0+) (0x40). - * - * NOTE: Always resident. - */ -typedef struct { - GUID object_id; /* Unique id assigned to the - file.*/ - /* The following fields are optional. The attribute value size is 16 - bytes, i.e. sizeof(GUID), if these are not present at all. Note, - the entries can be present but one or more (or all) can be zero - meaning that that particular value(s) is(are) not defined. */ - union { - struct { - GUID birth_volume_id; /* Unique id of volume on which - the file was first created.*/ - GUID birth_object_id; /* Unique id of file when it was - first created. */ - GUID domain_id; /* Reserved, zero. */ - } __attribute__ ((__packed__)) origin; - u8 extended_info[48]; - } __attribute__ ((__packed__)) opt; -} __attribute__ ((__packed__)) OBJECT_ID_ATTR; - -/* - * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in - * the SID structure (see below). - */ -//typedef enum { /* SID string prefix. */ -// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ -// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ -// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ -// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ -// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ -// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ -//} IDENTIFIER_AUTHORITIES; +struct guid { + __le32 data1; /* The first eight hexadecimal digits of the GUID. */ + __le16 data2; /* The first group of four hexadecimal digits. 
*/ + __le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* + * The first two bytes are the third group of four + * hexadecimal digits. The remaining six bytes are the + * final 12 hexadecimal digits. + */ +} __packed; /* * These relative identifiers (RIDs) are used with the above identifier @@ -1175,75 +1217,75 @@ typedef struct { * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and * the relative identifier SECURITY_CREATOR_OWNER_RID (0). */ -typedef enum { /* Identifier authority. */ - SECURITY_NULL_RID = 0, /* S-1-0 */ - SECURITY_WORLD_RID = 0, /* S-1-1 */ - SECURITY_LOCAL_RID = 0, /* S-1-2 */ +enum { /* Identifier authority. */ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ - SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ - SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ - SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ - SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ - SECURITY_DIALUP_RID = 1, - SECURITY_NETWORK_RID = 2, - SECURITY_BATCH_RID = 3, - SECURITY_INTERACTIVE_RID = 4, - SECURITY_SERVICE_RID = 6, - SECURITY_ANONYMOUS_LOGON_RID = 7, - SECURITY_PROXY_RID = 8, - SECURITY_ENTERPRISE_CONTROLLERS_RID=9, - SECURITY_SERVER_LOGON_RID = 9, - SECURITY_PRINCIPAL_SELF_RID = 0xa, - SECURITY_AUTHENTICATED_USER_RID = 0xb, - SECURITY_RESTRICTED_CODE_RID = 0xc, - SECURITY_TERMINAL_SERVER_RID = 0xd, + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID = 9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, - SECURITY_LOGON_IDS_RID = 5, - SECURITY_LOGON_IDS_RID_COUNT = 3, + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, - SECURITY_LOCAL_SYSTEM_RID = 0x12, + SECURITY_LOCAL_SYSTEM_RID = 0x12, - SECURITY_NT_NON_UNIQUE = 0x15, + SECURITY_NT_NON_UNIQUE = 0x15, - SECURITY_BUILTIN_DOMAIN_RID = 0x20, + SECURITY_BUILTIN_DOMAIN_RID = 0x20, /* * Well-known domain relative sub-authority values (RIDs). */ /* Users. */ - DOMAIN_USER_RID_ADMIN = 0x1f4, - DOMAIN_USER_RID_GUEST = 0x1f5, - DOMAIN_USER_RID_KRBTGT = 0x1f6, + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, /* Groups. */ - DOMAIN_GROUP_RID_ADMINS = 0x200, - DOMAIN_GROUP_RID_USERS = 0x201, - DOMAIN_GROUP_RID_GUESTS = 0x202, - DOMAIN_GROUP_RID_COMPUTERS = 0x203, - DOMAIN_GROUP_RID_CONTROLLERS = 0x204, - DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, - DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, - DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, - DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS = 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, /* Aliases. 
*/ - DOMAIN_ALIAS_RID_ADMINS = 0x220, - DOMAIN_ALIAS_RID_USERS = 0x221, - DOMAIN_ALIAS_RID_GUESTS = 0x222, - DOMAIN_ALIAS_RID_POWER_USERS = 0x223, - - DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, - DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, - DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, - DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, - - DOMAIN_ALIAS_RID_REPLICATOR = 0x228, - DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, - DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, -} RELATIVE_IDENTIFIERS; + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +}; /* * The universal well-known SIDs: @@ -1282,20 +1324,6 @@ typedef enum { /* Identifier authority. */ * (Built-in domain) S-1-5-0x20 */ -/* - * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. - * - * NOTE: This is stored as a big endian number, hence the high_part comes - * before the low_part. - */ -typedef union { - struct { - u16 high_part; /* High 16-bits. */ - u32 low_part; /* Low 32-bits. */ - } __attribute__ ((__packed__)) parts; - u8 value[6]; /* Value as individual bytes. */ -} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; - /* * The SID structure is a variable-length structure used to uniquely identify * users or groups. SID stands for security identifier. @@ -1320,52 +1348,46 @@ typedef union { * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS */ -typedef struct { +struct ntfs_sid { u8 revision; u8 sub_authority_count; - SID_IDENTIFIER_AUTHORITY identifier_authority; - le32 sub_authority[1]; /* At least one sub_authority. */ -} __attribute__ ((__packed__)) SID; - -/* - * Current constants for SIDs. - */ -typedef enum { - SID_REVISION = 1, /* Current revision level. */ - SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ - SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in - a future revision. */ -} SID_CONSTANTS; + union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __packed parts; + u8 value[6]; /* Value as individual bytes. */ + } identifier_authority; + __le32 sub_authority[]; /* At least one sub_authority. */ +} __packed; /* * The predefined ACE types (8-bit, see below). */ enum { - ACCESS_MIN_MS_ACE_TYPE = 0, - ACCESS_ALLOWED_ACE_TYPE = 0, - ACCESS_DENIED_ACE_TYPE = 1, - SYSTEM_AUDIT_ACE_TYPE = 2, - SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ - ACCESS_MAX_MS_V2_ACE_TYPE = 3, + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, - ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, - ACCESS_MAX_MS_V3_ACE_TYPE = 4, + ACCESS_ALLOWED_COMPOUND_ACE_TYPE = 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, /* The following are Win2k only. 
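
The on-disk size of the new variable-length struct ntfs_sid follows directly from sub_authority_count; revision 1 and the 15 sub-authority cap below are taken from the SID_CONSTANTS enum this patch deletes. A sketch with invented helper names:

	static inline unsigned int ntfs_sid_size(const struct ntfs_sid *sid)
	{
		/* 8-byte header followed by one __le32 per sub-authority. */
		return sizeof(struct ntfs_sid) +
		       sid->sub_authority_count * sizeof(__le32);
	}

	static inline bool ntfs_sid_is_valid(const struct ntfs_sid *sid)
	{
		return sid->revision == 1 &&		/* SID_REVISION */
		       sid->sub_authority_count >= 1 &&
		       sid->sub_authority_count <= 15;	/* SID_MAX_SUB_AUTHORITIES */
	}
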
*/ - ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, - ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, - ACCESS_DENIED_OBJECT_ACE_TYPE = 6, - SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, - SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, - ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, - ACCESS_MAX_MS_V4_ACE_TYPE = 8, + ACCESS_MAX_MS_V4_ACE_TYPE = 8, /* This one is for WinNT/2k. */ - ACCESS_MAX_MS_ACE_TYPE = 8, -} __attribute__ ((__packed__)); - -typedef u8 ACE_TYPES; + ACCESS_MAX_MS_ACE_TYPE = 8, +} __packed; /* * The ACE flags (8-bit) for audit and inheritance (see below). @@ -1389,27 +1411,7 @@ enum { /* The audit flags. */ SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, FAILED_ACCESS_ACE_FLAG = 0x80, -} __attribute__ ((__packed__)); - -typedef u8 ACE_FLAGS; - -/* - * An ACE is an access-control entry in an access-control list (ACL). - * An ACE defines access to an object for a specific user or group or defines - * the types of access that generate system-administration messages or alarms - * for a specific user or group. The user or group is identified by a security - * identifier (SID). - * - * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), - * which specifies the type and size of the ACE. The format of the subsequent - * data depends on the ACE type. - */ -typedef struct { -/*Ofs*/ -/* 0*/ ACE_TYPES type; /* Type of the ACE. */ -/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */ -/* 2*/ le16 size; /* Size in bytes of the ACE. */ -} __attribute__ ((__packed__)) ACE_HEADER; +} __packed; /* * The access mask (32-bit). Defines the access rights. @@ -1542,38 +1544,17 @@ enum { GENERIC_READ = cpu_to_le32(0x80000000), }; -typedef le32 ACCESS_MASK; - -/* - * The generic mapping array. Used to denote the mapping of each generic - * access right to a specific access mask. - * - * FIXME: What exactly is this and what is it for? (AIA) - */ -typedef struct { - ACCESS_MASK generic_read; - ACCESS_MASK generic_write; - ACCESS_MASK generic_execute; - ACCESS_MASK generic_all; -} __attribute__ ((__packed__)) GENERIC_MAPPING; - /* * The predefined ACE type structures are as defined below. */ -/* - * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE - */ -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. */ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, - SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; +struct ntfs_ace { + u8 type; /* Type of the ACE. */ + u8 flags; /* Flags describing the ACE. */ + __le16 size; /* Size in bytes of the ACE. */ + __le32 mask; /* Access mask associated with the ACE. */ + struct ntfs_sid sid; /* The SID associated with the ACE. */ +} __packed; /* * The object ACE flags (32-bit). @@ -1583,25 +1564,6 @@ enum { ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), }; -typedef le32 OBJECT_ACE_FLAGS; - -typedef struct { -/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ - ACE_TYPES type; /* Type of the ACE. */ - ACE_FLAGS flags; /* Flags describing the ACE. */ - le16 size; /* Size in bytes of the ACE. 
*/ -/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ - -/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ -/* 12*/ GUID object_type; -/* 28*/ GUID inherited_object_type; - -/* 44*/ SID sid; /* The SID associated with the ACE. */ -} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, - ACCESS_DENIED_OBJECT_ACE, - SYSTEM_AUDIT_OBJECT_ACE, - SYSTEM_ALARM_OBJECT_ACE; - /* * An ACL is an access-control list (ACL). * An ACL starts with an ACL header structure, which specifies the size of @@ -1609,32 +1571,18 @@ typedef struct { * zero or more access control entries (ACEs). The ACL as well as each ACE * are aligned on 4-byte boundaries. */ -typedef struct { +struct ntfs_acl { u8 revision; /* Revision of this ACL. */ u8 alignment1; - le16 size; /* Allocated space in bytes for ACL. Includes this - header, the ACEs and the remaining free space. */ - le16 ace_count; /* Number of ACEs in the ACL. */ - le16 alignment2; -/* sizeof() = 8 bytes */ -} __attribute__ ((__packed__)) ACL; + __le16 size; /* + * Allocated space in bytes for ACL. Includes this + * header, the ACEs and the remaining free space. + */ + __le16 ace_count; /* Number of ACEs in the ACL. */ + __le16 alignment2; +} __packed; -/* - * Current constants for ACLs. - */ -typedef enum { - /* Current revision. */ - ACL_REVISION = 2, - ACL_REVISION_DS = 4, - - /* History of revisions. */ - ACL_REVISION1 = 1, - MIN_ACL_REVISION = 2, - ACL_REVISION2 = 2, - ACL_REVISION3 = 3, - ACL_REVISION4 = 4, - MAX_ACL_REVISION = 4, -} ACL_CONSTANTS; +static_assert(sizeof(struct ntfs_acl) == 8); /* * The security descriptor control flags (16-bit). @@ -1698,87 +1646,44 @@ enum { SE_SACL_PROTECTED = cpu_to_le16(0x2000), SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), SE_SELF_RELATIVE = cpu_to_le16(0x8000) -} __attribute__ ((__packed__)); - -typedef le16 SECURITY_DESCRIPTOR_CONTROL; +} __packed; /* * Self-relative security descriptor. Contains the owner and group SIDs as well * as the sacl and dacl ACLs inside the security descriptor itself. */ -typedef struct { - u8 revision; /* Revision level of the security descriptor. */ - u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - le32 owner; /* Byte offset to a SID representing an object's - owner. If this is NULL, no owner SID is present in - the descriptor. */ - le32 group; /* Byte offset to a SID representing an object's - primary group. If this is NULL, no primary group - SID is present in the descriptor. */ - le32 sacl; /* Byte offset to a system ACL. Only valid, if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -/* sizeof() = 0x14 bytes */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; - -/* - * Absolute security descriptor. Does not contain the owner and group SIDs, nor - * the sacl and dacl ACLs inside the security descriptor. Instead, it contains - * pointers to these structures in memory. Obviously, absolute security - * descriptors are only useful for in memory representations of security - * descriptors. On disk, a self-relative security descriptor is used. 
- */ -typedef struct { +struct security_descriptor_relative { u8 revision; /* Revision level of the security descriptor. */ u8 alignment; - SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of - the descriptor as well as the following fields. */ - SID *owner; /* Points to a SID representing an object's owner. If - this is NULL, no owner SID is present in the - descriptor. */ - SID *group; /* Points to a SID representing an object's primary - group. If this is NULL, no primary group SID is - present in the descriptor. */ - ACL *sacl; /* Points to a system ACL. Only valid, if - SE_SACL_PRESENT is set in the control field. If - SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL - is specified. */ - ACL *dacl; /* Points to a discretionary ACL. Only valid, if - SE_DACL_PRESENT is set in the control field. If - SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL - (unconditionally granting access) is specified. */ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; - -/* - * Current constants for security descriptors. - */ -typedef enum { - /* Current revision. */ - SECURITY_DESCRIPTOR_REVISION = 1, - SECURITY_DESCRIPTOR_REVISION1 = 1, - - /* The sizes of both the absolute and relative security descriptors is - the same as pointers, at least on ia32 architecture are 32-bit. */ - SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), -} SECURITY_DESCRIPTOR_CONSTANTS; - -/* - * Attribute: Security descriptor (0x50). A standard self-relative security - * descriptor. - * - * NOTE: Can be resident or non-resident. - * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally - * in FILE_Secure and the correct descriptor is found using the security_id - * from the standard information attribute. - */ -typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + __le16 control; /* + * Flags qualifying the type of the descriptor + * as well as the following fields. + */ + __le32 owner; /* + * Byte offset to a SID representing an object's + * owner. If this is NULL, no owner SID is present in + * the descriptor. + */ + __le32 group; /* + * Byte offset to a SID representing an object's + * primary group. If this is NULL, no primary group + * SID is present in the descriptor. + */ + __le32 sacl; /* + * Byte offset to a system ACL. Only valid, if + * SE_SACL_PRESENT is set in the control field. If + * SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + * is specified. + */ + __le32 dacl; /* + * Byte offset to a discretionary ACL. Only valid, if + * SE_DACL_PRESENT is set in the control field. If + * SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + * (unconditionally granting access) is specified. + */ +} __packed; + +static_assert(sizeof(struct security_descriptor_relative) == 20); /* * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one @@ -1819,67 +1724,23 @@ typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; * references an entry any more. */ -/* - * This header precedes each security descriptor in the $SDS data stream. - * This is also the index entry data part of both the $SII and $SDH indexes. - */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. 
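
The byte offsets in struct security_descriptor_relative above resolve against the start of the descriptor, with zero meaning "absent". A minimal sketch; the helper name is invented and bounds checking against the attribute size is omitted:

	static const struct ntfs_sid *ntfs_sd_owner(
			const struct security_descriptor_relative *sd)
	{
		u32 ofs = le32_to_cpu(sd->owner);

		if (!ofs)		/* No owner SID present. */
			return NULL;
		return (const struct ntfs_sid *)((const u8 *)sd + ofs);
	}
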
*/ -} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; - -/* - * The $SDS data stream contains the security descriptors, aligned on 16-byte - * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot - * cross 256kib boundaries (this restriction is imposed by the Windows cache - * manager). Each security descriptor is contained in a SDS_ENTRY structure. - * Also, each security descriptor is stored twice in the $SDS stream with a - * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) - * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * first copy of the security descriptor will be at offset 0x51d0 in the - * $SDS data stream and the second copy will be at offset 0x451d0. - */ -typedef struct { -/*Ofs*/ -/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like - unnamed structs. */ - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ - le64 offset; /* Byte offset of this entry in the $SDS stream. */ - le32 length; /* Size in bytes of this entry in $SDS stream. */ -/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security - descriptor. */ -} __attribute__ ((__packed__)) SDS_ENTRY; - /* * The index entry key used in the $SII index. The collation type is * COLLATION_NTOFS_ULONG. */ -typedef struct { - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SII_INDEX_KEY; +struct sii_index_key { + __le32 security_id; /* The security_id assigned to the descriptor. */ +} __packed; /* * The index entry key used in the $SDH index. The keys are sorted first by * hash and then by security_id. The collation rule is * COLLATION_NTOFS_SECURITY_HASH. */ -typedef struct { - le32 hash; /* Hash of the security descriptor. */ - le32 security_id; /* The security_id assigned to the descriptor. */ -} __attribute__ ((__packed__)) SDH_INDEX_KEY; - -/* - * Attribute: Volume name (0x60). - * - * NOTE: Always resident. - * NOTE: Present only in FILE_Volume. - */ -typedef struct { - ntfschar name[0]; /* The name of the volume in Unicode. */ -} __attribute__ ((__packed__)) VOLUME_NAME; +struct sdh_index_key { + __le32 hash; /* Hash of the security descriptor. */ + __le32 security_id; /* The security_id assigned to the descriptor. */ +} __packed; /* * Possible flags for the volume (16-bit). @@ -1900,9 +1761,7 @@ enum { /* To make our life easier when checking if we must mount read-only. */ VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), -} __attribute__ ((__packed__)); - -typedef le16 VOLUME_FLAGS; +} __packed; /* * Attribute: Volume information (0x70). @@ -1912,23 +1771,12 @@ typedef le16 VOLUME_FLAGS; * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses * NTFS 1.2. I haven't personally seen other values yet. */ -typedef struct { - le64 reserved; /* Not used (yet?). */ +struct volume_information { + __le64 reserved; /* Not used (yet?). */ u8 major_ver; /* Major version of the ntfs format. */ u8 minor_ver; /* Minor version of the ntfs format. */ - VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ -} __attribute__ ((__packed__)) VOLUME_INFORMATION; - -/* - * Attribute: Data attribute (0x80). - * - * NOTE: Can be resident or non-resident. - * - * Data contents of a file (i.e. the unnamed stream) or of a named stream. - */ -typedef struct { - u8 data[0]; /* The file's data contents. */ -} __attribute__ ((__packed__)) DATA_ATTR; + __le16 flags; /* Bit array of VOLUME_* flags. 
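
VOLUME_MUST_MOUNT_RO_MASK above is meant to be applied directly to the little-endian flags word of struct volume_information; a sketch with an invented helper name:

	static inline bool ntfs_volume_must_mount_ro(
			const struct volume_information *vinf)
	{
		/* Both operands are __le16, so no byte swapping is needed. */
		return (vinf->flags & VOLUME_MUST_MOUNT_RO_MASK) != 0;
	}
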
*/ +} __packed; /* * Index header flags (8-bit). @@ -1937,52 +1785,66 @@ enum { /* * When index header is in an index root attribute: */ - SMALL_INDEX = 0, /* The index is small enough to fit inside the index - root attribute and there is no index allocation - attribute present. */ - LARGE_INDEX = 1, /* The index is too large to fit in the index root - attribute and/or an index allocation attribute is - present. */ + SMALL_INDEX = 0, /* + * The index is small enough to fit inside the index + * root attribute and there is no index allocation + * attribute present. + */ + LARGE_INDEX = 1, /* + * The index is too large to fit in the index root + * attribute and/or an index allocation attribute is + * present. + */ /* * When index header is in an index block, i.e. is part of index * allocation attribute: */ - LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes - branching off it. */ - INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf - node. */ + LEAF_NODE = 0, /* + * This is a leaf node, i.e. there are no more nodes + * branching off it. + */ + INDEX_NODE = 1, /* + * This node indexes other nodes, i.e. it is not a leaf + * node. + */ NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ -} __attribute__ ((__packed__)); - -typedef u8 INDEX_HEADER_FLAGS; +} __packed; /* * This is the header for indexes, describing the INDEX_ENTRY records, which - * follow the INDEX_HEADER. Together the index header and the index entries + * follow the index_header. Together the index header and the index entries * make up a complete index. * * IMPORTANT NOTE: The offset, length and size structure members are counted * relative to the start of the index header structure and not relative to the * start of the index root or index allocation structures themselves. */ -typedef struct { - le32 entries_offset; /* Byte offset to first INDEX_ENTRY - aligned to 8-byte boundary. */ - le32 index_length; /* Data size of the index in bytes, - i.e. bytes used from allocated - size, aligned to 8-byte boundary. */ - le32 allocated_size; /* Byte size of this index (block), - multiple of 8 bytes. */ - /* NOTE: For the index root attribute, the above two numbers are always - equal, as the attribute is resident and it is resized as needed. In - the case of the index allocation attribute the attribute is not - resident and hence the allocated_size is a fixed value and must - equal the index_block_size specified by the INDEX_ROOT attribute - corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK - belongs to. */ - INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ +struct index_header { + __le32 entries_offset; /* + * Byte offset to first INDEX_ENTRY + * aligned to 8-byte boundary. + */ + __le32 index_length; /* + * Data size of the index in bytes, + * i.e. bytes used from allocated + * size, aligned to 8-byte boundary. + */ + __le32 allocated_size; /* + * Byte size of this index (block), + * multiple of 8 bytes. + */ + /* + * NOTE: For the index root attribute, the above two numbers are always + * equal, as the attribute is resident and it is resized as needed. In + * the case of the index allocation attribute the attribute is not + * resident and hence the allocated_size is a fixed value and must + * equal the index_block_size specified by the INDEX_ROOT attribute + * corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + * belongs to. + */ + u8 flags; /* Bit field of INDEX_HEADER_FLAGS. */ u8 reserved[3]; /* Reserved/align to 8-byte boundary. 
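
Since entries_offset and index_length are both relative to the start of struct index_header, the usual walker takes the shape sketched below. struct index_entry and the INDEX_ENTRY_* flags are defined further down in this header; the function itself is illustrative, with most validation omitted:

	static void ntfs_for_each_index_entry(const struct index_header *ih)
	{
		const u8 *base = (const u8 *)ih;
		u32 ofs = le32_to_cpu(ih->entries_offset);
		u32 end = le32_to_cpu(ih->index_length);

		while (ofs + sizeof(struct index_entry_header) <= end) {
			const struct index_entry *ie =
					(const struct index_entry *)(base + ofs);

			if (ie->flags & INDEX_ENTRY_END)
				break;			/* Last entry of this node. */
			if (!ie->length)
				break;			/* Corrupt node. */
			/* ... consume ie->key here ... */
			ofs += le16_to_cpu(ie->length);	/* 8-byte aligned size. */
		}
	}
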
*/ -} __attribute__ ((__packed__)) INDEX_HEADER; +} __packed; /* * Attribute: Index root (0x90). @@ -2003,28 +1865,35 @@ typedef struct { * NOTE: The root directory (FILE_root) contains an entry for itself. Other * directories do not contain entries for themselves, though. */ -typedef struct { - ATTR_TYPE type; /* Type of the indexed attribute. Is - $FILE_NAME for directories, zero - for view indexes. No other values - allowed. */ - COLLATION_RULE collation_rule; /* Collation rule used to sort the - index entries. If type is $FILE_NAME, - this must be COLLATION_FILE_NAME. */ - le32 index_block_size; /* Size of each index block in bytes (in - the index allocation attribute). */ - u8 clusters_per_index_block; /* Cluster size of each index block (in - the index allocation attribute), when - an index block is >= than a cluster, - otherwise this will be the log of - the size (like how the encoding of - the mft record size and the index - record size found in the boot sector - work). Has to be a power of 2. */ +struct index_root { + __le32 type; /* + * Type of the indexed attribute. Is + * $FILE_NAME for directories, zero + * for view indexes. No other values + * allowed. + */ + __le32 collation_rule; /* + * Collation rule used to sort the index + * entries. If type is $FILE_NAME, this + * must be COLLATION_FILE_NAME. + */ + __le32 index_block_size; /* + * Size of each index block in bytes (in + * the index allocation attribute). + */ + u8 clusters_per_index_block; /* + * Cluster size of each index block (in + * the index allocation attribute), when + * an index block is >= a cluster, + * otherwise this will be the log of + * the size (like how the encoding of + * the mft record size and the index + * record size found in the boot sector + * work). Has to be a power of 2. + */ u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ - INDEX_HEADER index; /* Index header describing the - following index entries. */ -} __attribute__ ((__packed__)) INDEX_ROOT; + struct index_header index; /* Index header describing the following index entries. */ +} __packed; /* * Attribute: Index allocation (0xa0). @@ -2032,24 +1901,26 @@ typedef struct { * NOTE: Always non-resident (doesn't make sense to be resident anyway!). * * This is an array of index blocks. Each index block starts with an - * INDEX_BLOCK structure containing an index header, followed by a sequence of - * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. - */ -typedef struct { -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ - NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ - le16 usa_ofs; /* See NTFS_RECORD definition. */ - le16 usa_count; /* See NTFS_RECORD definition. */ - -/* 8*/ sle64 lsn; /* $LogFile sequence number of the last - modification of this index block. */ -/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. - If the cluster_size on the volume is <= the - index_block_size of the directory, - index_block_vcn counts in units of clusters, - and in units of sectors otherwise. */ -/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ -/* sizeof()= 40 (0x28) bytes */ + * index_block structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the struct index_header. + */ +struct index_block { + __le32 magic; /* Magic is "INDX". */ + __le16 usa_ofs; /* See ntfs_record struct definition. */ + __le16 usa_count; /* See ntfs_record struct definition. 
*/ + + __le64 lsn; /* + * LogFile sequence number of the last + * modification of this index block. + */ + __le64 index_block_vcn; /* + * Virtual cluster number of the index block. + * If the cluster_size on the volume is <= the + * index_block_size of the directory, + * index_block_vcn counts in units of clusters, + * and in units of sectors otherwise. + */ + struct index_header index; /* Describes the following index entries. */ /* * When creating the index block, we place the update sequence array at this * offset, i.e. before we start with the index entries. This also makes sense, @@ -2059,9 +1930,9 @@ typedef struct { * by overwriting it since you then can't get it back... * When reading use the data from the ntfs record header. */ -} __attribute__ ((__packed__)) INDEX_BLOCK; +} __packed; -typedef INDEX_BLOCK INDEX_ALLOCATION; +static_assert(sizeof(struct index_block) == 40); /* * The system file FILE_Extend/$Reparse contains an index named $R listing @@ -2069,14 +1940,15 @@ typedef INDEX_BLOCK INDEX_ALLOCATION; * below. Note, that there is no index data associated with the index entries. * * The index entries are sorted by the index key file_id. The collation rule is - * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the - * primary key / is not a key at all. (AIA) + * COLLATION_NTOFS_ULONGS. */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - leMFT_REF file_id; /* Mft record of the file containing the - reparse point attribute. */ -} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; +struct reparse_index_key { + __le32 reparse_tag; /* Reparse point type (inc. flags). */ + __le64 file_id; /* + * Mft record of the file containing + * the reparse point attribute. + */ +} __packed; /* * Quota flags (32-bit). @@ -2106,8 +1978,6 @@ enum { QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), }; -typedef le32 QUOTA_FLAGS; - /* * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas * are on a per volume and per user basis. @@ -2129,19 +1999,21 @@ typedef le32 QUOTA_FLAGS; * * The $Q index entry data is the quota control entry and is defined below. */ -typedef struct { - le32 version; /* Currently equals 2. */ - QUOTA_FLAGS flags; /* Flags describing this quota entry. */ - le64 bytes_used; /* How many bytes of the quota are in use. */ - sle64 change_time; /* Last time this quota entry was changed. */ - sle64 threshold; /* Soft quota (-1 if not limited). */ - sle64 limit; /* Hard quota (-1 if not limited). */ - sle64 exceeded_time; /* How long the soft quota has been exceeded. */ - SID sid; /* The SID of the user/object associated with - this quota entry. Equals zero for the quota - defaults entry (and in fact on a WinXP - volume, it is not present at all). */ -} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; +struct quota_control_entry { + __le32 version; /* Currently equals 2. */ + __le32 flags; /* Flags describing this quota entry. */ + __le64 bytes_used; /* How many bytes of the quota are in use. */ + __le64 change_time; /* Last time this quota entry was changed. */ + __le64 threshold; /* Soft quota (-1 if not limited). */ + __le64 limit; /* Hard quota (-1 if not limited). */ + __le64 exceeded_time; /* How long the soft quota has been exceeded. */ + struct ntfs_sid sid; /* + * The SID of the user/object associated with + * this quota entry. Equals zero for the quota + * defaults entry (and in fact on a WinXP + * volume, it is not present at all). + */ +} __packed; /* * Predefined owner_id values (32-bit). 
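
The unit rule for index_block_vcn above is easy to get wrong, so here is the conversion spelled out. This is a sketch; the 512-byte sector size is an assumption for illustration (a real caller would use the volume's sector size) and the helper name is invented:

	static inline s64 ntfs_ib_vcn_to_byte_ofs(s64 index_block_vcn,
			u32 cluster_size, u32 index_block_size)
	{
		if (cluster_size <= index_block_size)
			return index_block_vcn * cluster_size;	/* Units: clusters. */
		return index_block_vcn * 512;			/* Units: sectors. */
	}
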
@@ -2155,138 +2027,141 @@ enum { /* * Current constants for quota control entries. */ -typedef enum { +enum { /* Current version. */ QUOTA_VERSION = 2, -} QUOTA_CONTROL_ENTRY_CONSTANTS; +}; /* * Index entry flags (16-bit). */ enum { - INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a - sub-node, i.e. a reference to an index block in form of - a virtual cluster number (see below). */ - INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last - entry in an index block. The index entry does not - represent a file but it can point to a sub-node. */ - - INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force - enum bit width to 16-bit. */ -} __attribute__ ((__packed__)); - -typedef le16 INDEX_ENTRY_FLAGS; + INDEX_ENTRY_NODE = cpu_to_le16(1), /* + * This entry contains a sub-node, + * i.e. a reference to an index block + * in form of a virtual cluster number + * (see below). + */ + INDEX_ENTRY_END = cpu_to_le16(2), /* + * This signifies the last entry in an + * index block. The index entry does not + * represent a file but it can point + * to a sub-node. + */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16-bit. */ +} __packed; /* * This the index entry header (see below). */ -typedef struct { +struct index_entry_header { /* 0*/ union { struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes. */ - } __attribute__ ((__packed__)) dir; - struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; -/* 8*/ le16 length; /* Byte size of this index entry, multiple of - 8-bytes. */ -/* 10*/ le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows field reserved. Not - multiple of 8-bytes. */ -/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ -/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ -/* sizeof() = 16 bytes */ -} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; - -/* - * This is an index entry. A sequence of such entries follows each INDEX_HEADER + __le64 indexed_file; /* + * The mft reference of the file + * described by this index entry. + * Used for directory indexes. + */ + } __packed dir; + struct { + /* Used for views/indexes to find the entry's data. */ + __le16 data_offset; /* + * Data byte offset from this + * INDEX_ENTRY. Follows the index key. + */ + __le16 data_length; /* Data length in bytes. */ + __le32 reservedV; /* Reserved (zero). */ + } __packed vi; + } __packed data; + __le16 length; /* Byte size of this index entry, multiple of 8-bytes. */ + __le16 key_length; /* + * Byte size of the key value, which is in the index entry. + * It follows field reserved. Not multiple of 8-bytes. + */ + __le16 flags; /* Bit field of INDEX_ENTRY_* flags. */ + __le16 reserved; /* Reserved/align to 8-byte boundary. */ +} __packed; + +static_assert(sizeof(struct index_entry_header) == 16); + +/* + * This is an index entry. A sequence of such entries follows each index_header * structure. Together they make up a complete index. The index follows either * an index root attribute or an index allocation attribute. * * NOTE: Before NTFS 3.0 only filename attributes were indexed. 
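
When INDEX_ENTRY_NODE is set, the sub-node vcn sits in the last eight bytes of the entry, per the address formula quoted in the index entry comment below. A sketch with an invented helper name; -1 stands in for "no sub-node":

	static s64 ntfs_ie_subnode_vcn(const struct index_entry_header *ie)
	{
		const u8 *end;

		if (!(ie->flags & INDEX_ENTRY_NODE))
			return -1;
		end = (const u8 *)ie + le16_to_cpu(ie->length);
		/* The vcn is 8-byte aligned by layout, so a plain load is fine. */
		return (s64)le64_to_cpu(*(const __le64 *)(end - sizeof(__le64)));
	}
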
*/ -typedef struct { -/*Ofs*/ -/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ +struct index_entry { union { struct { /* Only valid when INDEX_ENTRY_END is not set. */ - leMFT_REF indexed_file; /* The mft reference of the file - described by this index - entry. Used for directory - indexes. */ - } __attribute__ ((__packed__)) dir; + __le64 indexed_file; /* + * The mft reference of the file + * described by this index entry. + * Used for directory indexes. + */ + } __packed dir; struct { /* Used for views/indexes to find the entry's data. */ - le16 data_offset; /* Data byte offset from this - INDEX_ENTRY. Follows the - index key. */ - le16 data_length; /* Data length in bytes. */ - le32 reservedV; /* Reserved (zero). */ - } __attribute__ ((__packed__)) vi; - } __attribute__ ((__packed__)) data; - le16 length; /* Byte size of this index entry, multiple of - 8-bytes. */ - le16 key_length; /* Byte size of the key value, which is in the - index entry. It follows field reserved. Not - multiple of 8-bytes. */ - INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ - le16 reserved; /* Reserved/align to 8-byte boundary. */ - -/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present - if INDEX_ENTRY_END bit in flags is not set. NOTE: On - NTFS versions before 3.0 the only valid key is the - FILE_NAME_ATTR. On NTFS 3.0+ the following - additional index keys are defined: */ - FILE_NAME_ATTR file_name;/* $I30 index in directories. */ - SII_INDEX_KEY sii; /* $SII index in $Secure. */ - SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ - GUID object_id; /* $O index in FILE_Extend/$ObjId: The - object_id of the mft record found in - the data part of the index. */ - REPARSE_INDEX_KEY reparse; /* $R index in - FILE_Extend/$Reparse. */ - SID sid; /* $O index in FILE_Extend/$Quota: - SID of the owner of the user_id. */ - le32 owner_id; /* $Q index in FILE_Extend/$Quota: - user_id of the owner of the quota - control entry in the data part of - the index. */ - } __attribute__ ((__packed__)) key; - /* The (optional) index data is inserted here when creating. */ - // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last - // eight bytes of this index entry contain the virtual - // cluster number of the index block that holds the - // entries immediately preceding the current entry (the - // vcn references the corresponding cluster in the data - // of the non-resident index allocation attribute). If - // the key_length is zero, then the vcn immediately - // follows the INDEX_ENTRY_HEADER. Regardless of - // key_length, the address of the 8-byte boundary - // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by - // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), - // where sizeof(VCN) can be hardcoded as 8 if wanted. */ -} __attribute__ ((__packed__)) INDEX_ENTRY; - -/* - * Attribute: Bitmap (0xb0). - * - * Contains an array of bits (aka a bitfield). - * - * When used in conjunction with the index allocation attribute, each bit - * corresponds to one index block within the index allocation attribute. Thus - * the number of bits in the bitmap * index block size / cluster size is the - * number of clusters in the index allocation attribute. - */ -typedef struct { - u8 bitmap[0]; /* Array of bits. */ -} __attribute__ ((__packed__)) BITMAP_ATTR; + __le16 data_offset; /* + * Data byte offset from this INDEX_ENTRY. + * Follows the index key. + */ + __le16 data_length; /* Data length in bytes. */ + __le32 reservedV; /* Reserved (zero). 
*/ + } __packed vi; + } __packed data; + __le16 length; /* Byte size of this index entry, multiple of 8-bytes. */ + __le16 key_length; /* + * Byte size of the key value, which is in the index entry. + * It follows field reserved. Not multiple of 8-bytes. + */ + __le16 flags; /* Bit field of INDEX_ENTRY_* flags. */ + __le16 reserved; /* Reserved/align to 8-byte boundary. */ + + union { + /* + * The key of the indexed attribute. NOTE: Only present + * if INDEX_ENTRY_END bit in flags is not set. NOTE: On + * NTFS versions before 3.0 the only valid key is the + * struct file_name_attr. On NTFS 3.0+ the following + * additional index keys are defined: + */ + struct file_name_attr file_name; /* $I30 index in directories. */ + struct sii_index_key sii; /* $SII index in $Secure. */ + struct sdh_index_key sdh; /* $SDH index in $Secure. */ + struct guid object_id; /* + * $O index in FILE_Extend/$ObjId: The object_id + * of the mft record found in the data part of + * the index. + */ + struct reparse_index_key reparse; /* $R index in FILE_Extend/$Reparse. */ + struct ntfs_sid sid; /* + * $O index in FILE_Extend/$Quota: + * SID of the owner of the user_id. + */ + __le32 owner_id; /* + * $Q index in FILE_Extend/$Quota: + * user_id of the owner of the quota + * control entry in the data part of + * the index. + */ + } __packed key; + /* + * The (optional) index data is inserted here when creating. + * __le64 vcn; If INDEX_ENTRY_NODE bit in flags is set, the last + * eight bytes of this index entry contain the virtual + * cluster number of the index block that holds the + * entries immediately preceding the current entry (the + * vcn references the corresponding cluster in the data + * of the non-resident index allocation attribute). If + * the key_length is zero, then the vcn immediately + * follows the INDEX_ENTRY_HEADER. Regardless of + * key_length, the address of the 8-byte boundary + * aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + * (char*)ie + le16_to_cpu(ie->length) - sizeof(VCN), + * where sizeof(VCN) can be hardcoded as 8 if wanted. + */ +} __packed; /* * The reparse point tag defines the type of the reparse point. It also @@ -2294,21 +2169,25 @@ typedef struct { * * The reparse point tag is an unsigned 32-bit value divided in three parts: * - * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * 1. The least significant 16 bits (i.e. bits 0 to 15) specify the type of * the reparse point. - * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. - * 3. The most significant three bits are flags describing the reparse point. + * 2. The 12 bits after this (i.e. bits 16 to 27) are reserved for future use. + * 3. The most significant four bits are flags describing the reparse point. * They are defined as follows: + * bit 28: Directory bit. If set, the directory is not a surrogate + * and can be used the usual way. * bit 29: Name surrogate bit. If set, the filename is an alias for * another object in the system. * bit 30: High-latency bit. If set, accessing the first byte of data will * be slow. (E.g. the data is stored on a tape drive.) * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User * defined tags have to use zero here. - * - * These are the predefined reparse point tags: + * 4. Moreover, on Windows 10: + * some flags may be used in bits 12 to 15 to further describe the + * reparse point. 
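
The bit layout above splits mechanically; the flag constants come from the enum that follows and the helper names are illustrative:

	static inline u16 ntfs_reparse_type(__le32 tag)
	{
		return le32_to_cpu(tag) & 0xffff;	/* Bits 0-15: tag type. */
	}

	static inline bool ntfs_reparse_is_microsoft(__le32 tag)
	{
		return (tag & IO_REPARSE_TAG_IS_MICROSOFT) != 0;	/* Bit 31. */
	}
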
*/ enum { + IO_REPARSE_TAG_DIRECTORY = cpu_to_le32(0x10000000), IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), @@ -2317,18 +2196,31 @@ enum { IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), - IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), - IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), - IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), - IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), - - IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), - - IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), - - IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), - - IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), + IO_REPARSE_TAG_CSV = cpu_to_le32(0x80000009), + IO_REPARSE_TAG_DEDUP = cpu_to_le32(0x80000013), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x8000000A), + IO_REPARSE_TAG_DFSR = cpu_to_le32(0x80000012), + IO_REPARSE_TAG_HSM = cpu_to_le32(0xC0000004), + IO_REPARSE_TAG_HSM2 = cpu_to_le32(0x80000006), + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0xA0000003), + IO_REPARSE_TAG_NFS = cpu_to_le32(0x80000014), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x80000007), + IO_REPARSE_TAG_SYMLINK = cpu_to_le32(0xA000000C), + IO_REPARSE_TAG_WIM = cpu_to_le32(0x80000008), + IO_REPARSE_TAG_DFM = cpu_to_le32(0x80000016), + IO_REPARSE_TAG_WOF = cpu_to_le32(0x80000017), + IO_REPARSE_TAG_WCI = cpu_to_le32(0x80000018), + IO_REPARSE_TAG_CLOUD = cpu_to_le32(0x9000001A), + IO_REPARSE_TAG_APPEXECLINK = cpu_to_le32(0x8000001B), + IO_REPARSE_TAG_GVFS = cpu_to_le32(0x9000001C), + IO_REPARSE_TAG_LX_SYMLINK = cpu_to_le32(0xA000001D), + IO_REPARSE_TAG_AF_UNIX = cpu_to_le32(0x80000023), + IO_REPARSE_TAG_LX_FIFO = cpu_to_le32(0x80000024), + IO_REPARSE_TAG_LX_CHR = cpu_to_le32(0x80000025), + IO_REPARSE_TAG_LX_BLK = cpu_to_le32(0x80000026), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xf000ffff), + IO_REPARSE_PLUGIN_SELECT = cpu_to_le32(0xffff0fff), }; /* @@ -2336,40 +2228,42 @@ enum { * * NOTE: Can be resident or non-resident. */ -typedef struct { - le32 reparse_tag; /* Reparse point type (inc. flags). */ - le16 reparse_data_length; /* Byte size of reparse data. */ - le16 reserved; /* Align to 8-byte boundary. */ +struct reparse_point { + __le32 reparse_tag; /* Reparse point type (inc. flags). */ + __le16 reparse_data_length; /* Byte size of reparse data. */ + __le16 reserved; /* Align to 8-byte boundary. */ u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ -} __attribute__ ((__packed__)) REPARSE_POINT; +} __packed; /* * Attribute: Extended attribute (EA) information (0xd0). * * NOTE: Always resident. (Is this true???) */ -typedef struct { - le16 ea_length; /* Byte size of the packed extended - attributes. */ - le16 need_ea_count; /* The number of extended attributes which have - the NEED_EA bit set. */ - le32 ea_query_length; /* Byte size of the buffer required to query - the extended attributes when calling - ZwQueryEaFile() in Windows NT/2k. I.e. the - byte size of the unpacked extended - attributes. */ -} __attribute__ ((__packed__)) EA_INFORMATION; +struct ea_information { + __le16 ea_length; /* Byte size of the packed extended attributes. */ + __le16 need_ea_count; /* + * The number of extended attributes which have + * the NEED_EA bit set. + */ + __le32 ea_query_length; /* + * Byte size of the buffer required to query + * the extended attributes when calling + * ZwQueryEaFile() in Windows NT/2k. I.e. 
+ * the byte size of the unpacked extended attributes. + */ +} __packed; /* * Extended attribute flags (8-bit). */ enum { - NEED_EA = 0x80 /* If set the file to which the EA belongs - cannot be interpreted without understanding - the associates extended attributes. */ -} __attribute__ ((__packed__)); - -typedef u8 EA_FLAGS; + NEED_EA = 0x80 /* + * If set the file to which the EA belongs + * cannot be interpreted without understanding + * the associated extended attributes. + */ +} __packed; /* * Attribute: Extended attribute (EA) (0xe0). @@ -2379,43 +2273,19 @@ typedef u8 EA_FLAGS; * Like the attribute list and the index buffer list, the EA attribute value is * a sequence of EA_ATTR variable length records. */ -typedef struct { - le32 next_entry_offset; /* Offset to the next EA_ATTR. */ - EA_FLAGS flags; /* Flags describing the EA. */ - u8 ea_name_length; /* Length of the name of the EA in bytes - excluding the '\0' byte terminator. */ - le16 ea_value_length; /* Byte size of the EA's value. */ - u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not - Unicode and it is zero terminated. */ - u8 ea_value[0]; /* The value of the EA. Immediately follows - the name. */ -} __attribute__ ((__packed__)) EA_ATTR; - -/* - * Attribute: Property set (0xf0). - * - * Intended to support Native Structure Storage (NSS) - a feature removed from - * NTFS 3.0 during beta testing. - */ -typedef struct { - /* Irrelevant as feature unused. */ -} __attribute__ ((__packed__)) PROPERTY_SET; - -/* - * Attribute: Logged utility stream (0x100). - * - * NOTE: Can be resident or non-resident. - * - * Operations on this attribute are logged to the journal ($LogFile) like - * normal metadata changes. - * - * Used by the Encrypting File System (EFS). All encrypted files have this - * attribute with the name $EFS. - */ -typedef struct { - /* Can be anything the creator chooses. */ - /* EFS uses it as follows: */ - // FIXME: Type this info, verifying it along the way. (AIA) -} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; +struct ea_attr { + __le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + u8 flags; /* Flags describing the EA. */ + u8 ea_name_length; /* + * Length of the name of the EA in bytes + * excluding the '\0' byte terminator. + */ + __le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[]; /* + * Name of the EA. Note this is ASCII, not + * Unicode and it is zero terminated. + */ + /* u8 ea_value[]; */ /* The value of the EA. Immediately follows the name. */ +} __packed; #endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h index 1589a6d8434b..a1c66b8b73ac 100644 --- a/fs/ntfs/lcnalloc.h +++ b/fs/ntfs/lcnalloc.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the - * Linux-NTFS project. + * Exports for NTFS kernel cluster (de)allocation. + * Part of the Linux-NTFS project. * * Copyright (c) 2004-2005 Anton Altaparmakov */ @@ -9,30 +9,25 @@ #ifndef _LINUX_NTFS_LCNALLOC_H #define _LINUX_NTFS_LCNALLOC_H -#ifdef NTFS_RW - -#include <linux/fs.h> +#include <linux/sched/mm.h> #include "attrib.h" -#include "types.h" -#include "inode.h" -#include "runlist.h" -#include "volume.h" -typedef enum { +enum { FIRST_ZONE = 0, /* For sanity checking. */ MFT_ZONE = 0, /* Allocate from $MFT zone. */ DATA_ZONE = 1, /* Allocate from $DATA zone. */ LAST_ZONE = 1, /* For sanity checking. 
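
A sketch of a caller of the reworked ntfs_cluster_alloc() prototype below, requesting clusters from the data zone. The value -1 is assumed to keep its historical "no lcn hint" meaning, and the wrapper name is invented:

	static struct runlist_element *example_alloc_data(struct ntfs_volume *vol,
			s64 start_vcn, s64 count)
	{
		return ntfs_cluster_alloc(vol, start_vcn, count,
				-1 /* no lcn hint */, DATA_ZONE,
				false /* is_extension */, false /* is_contig */,
				false /* is_dealloc */);
	}
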
*/ -} NTFS_CLUSTER_ALLOCATION_ZONES; - -extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, - const VCN start_vcn, const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension); +}; -extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); +struct runlist_element *ntfs_cluster_alloc(struct ntfs_volume *vol, + const s64 start_vcn, const s64 count, const s64 start_lcn, + const int zone, + const bool is_extension, + const bool is_contig, + const bool is_dealloc); +s64 __ntfs_cluster_free(struct ntfs_inode *ni, const s64 start_vcn, + s64 count, struct ntfs_attr_search_ctx *ctx, const bool is_rollback); /** * ntfs_cluster_free - free clusters on an ntfs volume @@ -90,14 +85,14 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, * - If @ctx is not NULL, the base mft record must be mapped on entry * and it will be left mapped on return. */ -static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, - s64 count, ntfs_attr_search_ctx *ctx) +static inline s64 ntfs_cluster_free(struct ntfs_inode *ni, const s64 start_vcn, + s64 count, struct ntfs_attr_search_ctx *ctx) { return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); } -extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl); +int ntfs_cluster_free_from_rl_nolock(struct ntfs_volume *vol, + const struct runlist_element *rl); /** * ntfs_cluster_free_from_rl - free clusters from runlist @@ -115,17 +110,18 @@ extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, * - The caller must have locked the runlist @rl for reading or * writing. */ -static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, - const runlist_element *rl) +static inline int ntfs_cluster_free_from_rl(struct ntfs_volume *vol, + const struct runlist_element *rl) { int ret; + unsigned int memalloc_flags; + memalloc_flags = memalloc_nofs_save(); down_write(&vol->lcnbmp_lock); ret = ntfs_cluster_free_from_rl_nolock(vol, rl); up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); return ret; } -#endif /* NTFS_RW */ - #endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h index 429d4909cc72..3c7e42425503 100644 --- a/fs/ntfs/logfile.h +++ b/fs/ntfs/logfile.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of - * the Linux-NTFS project. + * Defines for NTFS kernel journal (LogFile) handling. + * Part of the Linux-NTFS project. * * Copyright (c) 2000-2005 Anton Altaparmakov */ @@ -9,16 +9,10 @@ #ifndef _LINUX_NTFS_LOGFILE_H #define _LINUX_NTFS_LOGFILE_H -#ifdef NTFS_RW - -#include - -#include "types.h" -#include "endian.h" #include "layout.h" /* - * Journal ($LogFile) organization: + * Journal (LogFile) organization: * * Two restart areas present in the first two pages (restart pages, one restart * area in each page). When the volume is dismounted they should be identical, @@ -42,7 +36,7 @@ * reinitialize the logfile and start again with version 1.1. */ -/* Some $LogFile related constants. */ +/* Some LogFile related constants. */ #define MaxLogFileSize 0x100000000ULL #define DefaultLogPageSize 4096 #define MinLogRecordPages 48 @@ -50,40 +44,42 @@ /* * Log file restart page header (begins the restart area). */ -typedef struct { -/*Ofs*/ -/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. 
*/ -/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ -/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. - When creating, set this to be immediately - after this header structure (without any - alignment). */ -/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ +struct restart_page_header { + __le32 magic; /* The magic is "RSTR". */ + __le16 usa_ofs; /* + * See ntfs_record struct definition in layout.h. + * When creating, set this to be immediately after + * this header structure (without any alignment). + */ + __le16 usa_count; /* See ntfs_record struct definition in layout.h. */ -/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by - chkdsk. Only used when the magic is changed - to "CHKD". Otherwise this is zero. */ -/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file - was created, has to be >= 512 and a power of - 2. Use this to calculate the required size - of the usa (usa_count) and add it to usa_ofs. - Then verify that the result is less than the - value of the restart_area_offset. */ -/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= - 512 and a power of 2. The default is 4096 - and is used when the system page size is - between 4096 and 8192. Otherwise this is - set to the system page size instead. */ -/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to - the RESTART_AREA. Value has to be aligned - to 8-byte boundary. When creating, set this - to be after the usa. */ -/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major - version is 1. */ -/* 28*/ sle16 major_ver; /* Log file major version. We only support - version 1.1. */ + __le64 chkdsk_lsn; /* + * The last log file sequence number found by chkdsk. + * Only used when the magic is changed to "CHKD". + * Otherwise this is zero. + */ + __le32 system_page_size; /* + * Byte size of system pages when the log file was created, + * has to be >= 512 and a power of 2. Use this to calculate + * the required size of the usa (usa_count) and add it to + * usa_ofs. Then verify that the result is less than + * the value of the restart_area_offset. + */ + __le32 log_page_size; /* + * Byte size of log file pages, has to be >= 512 and + * a power of 2. The default is 4096 and is used + * when the system page size is between 4096 and 8192. + * Otherwise this is set to the system page size instead. + */ + __le16 restart_area_offset; /* + * Byte offset from the start of this header to + * the RESTART_AREA. Value has to be aligned to 8-byte + * boundary. When creating, set this to be after the usa. + */ + __le16 minor_ver; /* Log file minor version. Only check if major version is 1. */ + __le16 major_ver; /* Log file major version. We only support version 1.1. */ /* sizeof() = 30 (0x1e) bytes */ -} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; +} __packed; /* * Constant for the log client indices meaning that there are no client records @@ -100,196 +96,221 @@ typedef struct { enum { RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ -} __attribute__ ((__packed__)); - -typedef le16 RESTART_AREA_FLAGS; +} __packed; /* * Log file restart area record. The offset of this record is found by adding * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found * in it. See notes at restart_area_offset above. */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN current_lsn; /* The current, i.e. 
last LSN inside the log - when the restart area was last written. - This happens often but what is the interval? - Is it just fixed time or is it every time a - check point is written or somethine else? - On create set to 0. */ -/* 8*/ le16 log_clients; /* Number of log client records in the array of - log client records which follows this - restart area. Must be 1. */ -/* 10*/ le16 client_free_list; /* The index of the first free log client record - in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - free log client records in the array. - If != LOGFILE_NO_CLIENT, check that - log_clients > client_free_list. On Win2k - and presumably earlier, on a clean volume - this is != LOGFILE_NO_CLIENT, and it should - be 0, i.e. the first (and only) client - record is free and thus the logfile is - closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be LOGFILE_NO_CLIENT. On WinXP - and presumably later, the logfile is always - open, even on clean shutdown so this should - always be LOGFILE_NO_CLIENT. */ -/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client - record in the array of log client records. - LOGFILE_NO_CLIENT means that there are no - in-use log client records in the array. If - != LOGFILE_NO_CLIENT check that log_clients - > client_in_use_list. On Win2k and - presumably earlier, on a clean volume this - is LOGFILE_NO_CLIENT, i.e. there are no - client records in use and thus the logfile - is closed and hence clean. A dirty volume - would have left the logfile open and hence - this would be != LOGFILE_NO_CLIENT, and it - should be 0, i.e. the first (and only) - client record is in use. On WinXP and - presumably later, the logfile is always - open, even on clean shutdown so this should - always be 0. */ -/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k - and presumably earlier this is always 0. On - WinXP and presumably later, if the logfile - was shutdown cleanly, the second bit, - RESTART_VOLUME_IS_CLEAN, is set. This bit - is cleared when the volume is mounted by - WinXP and set when the volume is dismounted, - thus if the logfile is dirty, this bit is - clear. Thus we don't need to check the - Windows version to determine if the logfile - is clean. Instead if the logfile is closed, - we know it must be clean. If it is open and - this bit is set, we also know it must be - clean. If on the other hand the logfile is - open and this bit is clear, we can be almost - certain that the logfile is dirty. */ -/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence - number. This is calculated as 67 - the - number of bits required to store the logfile - size in bytes and this can be used in with - the specified file_size as a consistency - check. */ -/* 20*/ le16 restart_area_length;/* Length of the restart area including the - client array. Following checks required if - version matches. Otherwise, skip them. - restart_area_offset + restart_area_length - has to be <= system_page_size. Also, - restart_area_length has to be >= - client_array_offset + (log_clients * - sizeof(log client record)). */ -/* 22*/ le16 client_array_offset;/* Offset from the start of this record to - the first log client record if versions are - matched. When creating, set this to be - after this restart area structure, aligned - to 8-bytes boundary. If the versions do not - match, this is ignored and the offset is - assumed to be (sizeof(RESTART_AREA) + 7) & - ~7, i.e. 
rounded up to first 8-byte - boundary. Either way, client_array_offset - has to be aligned to an 8-byte boundary. - Also, restart_area_offset + - client_array_offset has to be <= 510. - Finally, client_array_offset + (log_clients - * sizeof(log client record)) has to be <= - system_page_size. On Win2k and presumably - earlier, this is 0x30, i.e. immediately - following this record. On WinXP and - presumably later, this is 0x40, i.e. there - are 16 extra bytes between this record and - the client array. This probably means that - the RESTART_AREA record is actually bigger - in WinXP and later. */ -/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the - restart_area_offset + the offset of the - file_size are > 510 then corruption has - occurred. This is the very first check when - starting with the restart_area as if it - fails it means that some of the above values - will be corrupted by the multi sector - transfer protection. The file_size has to - be rounded down to be a multiple of the - log_page_size in the RESTART_PAGE_HEADER and - then it has to be at least big enough to - store the two restart pages and 48 (0x30) - log record pages. */ -/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including - the log record header. On create set to - 0. */ -/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. - If the version matches then check that the - value of log_record_header_length is a - multiple of 8, i.e. - (log_record_header_length + 7) & ~7 == - log_record_header_length. When creating set - it to sizeof(LOG_RECORD_HEADER), aligned to - 8 bytes. */ -/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record - page. Must be a multiple of 8. On create - set it to immediately after the update - sequence array of the log record page. */ -/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every - time the logfile is restarted which happens - at mount time when the logfile is opened. - When creating set to a random value. Win2k - sets it to the low 32 bits of the current - system time in NTFS format (see time.h). */ -/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +struct restart_area { + __le64 current_lsn; /* + * The current, i.e. last LSN inside the log + * when the restart area was last written. + * This happens often but what is the interval? + * Is it just fixed time or is it every time a + * check point is written or somethine else? + * On create set to 0. + */ + __le16 log_clients; /* + * Number of log client records in the array of + * log client records which follows this + * restart area. Must be 1. + */ + __le16 client_free_list; /* + * The index of the first free log client record + * in the array of log client records. + * LOGFILE_NO_CLIENT means that there are no + * free log client records in the array. + * If != LOGFILE_NO_CLIENT, check that + * log_clients > client_free_list. On Win2k + * and presumably earlier, on a clean volume + * this is != LOGFILE_NO_CLIENT, and it should + * be 0, i.e. the first (and only) client + * record is free and thus the logfile is + * closed and hence clean. A dirty volume + * would have left the logfile open and hence + * this would be LOGFILE_NO_CLIENT. On WinXP + * and presumably later, the logfile is always + * open, even on clean shutdown so this should + * always be LOGFILE_NO_CLIENT. + */ + __le16 client_in_use_list; /* + * The index of the first in-use log client + * record in the array of log client records. 
+ * LOGFILE_NO_CLIENT means that there are no + * in-use log client records in the array. If + * != LOGFILE_NO_CLIENT check that log_clients + * > client_in_use_list. On Win2k and + * presumably earlier, on a clean volume this + * is LOGFILE_NO_CLIENT, i.e. there are no + * client records in use and thus the logfile + * is closed and hence clean. A dirty volume + * would have left the logfile open and hence + * this would be != LOGFILE_NO_CLIENT, and it + * should be 0, i.e. the first (and only) + * client record is in use. On WinXP and + * presumably later, the logfile is always + * open, even on clean shutdown so this should + * always be 0. + */ + __le16 flags; /* + * Flags modifying LFS behaviour. On Win2k + * and presumably earlier this is always 0. On + * WinXP and presumably later, if the logfile + * was shutdown cleanly, the second bit, + * RESTART_VOLUME_IS_CLEAN, is set. This bit + * is cleared when the volume is mounted by + * WinXP and set when the volume is dismounted, + * thus if the logfile is dirty, this bit is + * clear. Thus we don't need to check the + * Windows version to determine if the logfile + * is clean. Instead if the logfile is closed, + * we know it must be clean. If it is open and + * this bit is set, we also know it must be + * clean. If on the other hand the logfile is + * open and this bit is clear, we can be almost + * certain that the logfile is dirty. + */ + __le32 seq_number_bits; /* + * How many bits to use for the sequence + * number. This is calculated as 67 - the + * number of bits required to store the logfile + * size in bytes and this can be used in with + * the specified file_size as a consistency + * check. + */ + __le16 restart_area_length; /* + * Length of the restart area including the + * client array. Following checks required if + * version matches. Otherwise, skip them. + * restart_area_offset + restart_area_length + * has to be <= system_page_size. Also, + * restart_area_length has to be >= + * client_array_offset + (log_clients * + * sizeof(log client record)). + */ + __le16 client_array_offset; /* + * Offset from the start of this record to + * the first log client record if versions are + * matched. When creating, set this to be + * after this restart area structure, aligned + * to 8-bytes boundary. If the versions do not + * match, this is ignored and the offset is + * assumed to be (sizeof(RESTART_AREA) + 7) & + * ~7, i.e. rounded up to first 8-byte + * boundary. Either way, client_array_offset + * has to be aligned to an 8-byte boundary. + * Also, restart_area_offset + + * client_array_offset has to be <= 510. + * Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + * system_page_size. On Win2k and presumably + * earlier, this is 0x30, i.e. immediately + * following this record. On WinXP and + * presumably later, this is 0x40, i.e. there + * are 16 extra bytes between this record and + * the client array. This probably means that + * the RESTART_AREA record is actually bigger + * in WinXP and later. + */ + __le64 file_size; /* + * Usable byte size of the log file. If the + * restart_area_offset + the offset of the + * file_size are > 510 then corruption has + * occurred. This is the very first check when + * starting with the restart_area as if it + * fails it means that some of the above values + * will be corrupted by the multi sector + * transfer protection. 
The file_size has to + * be rounded down to be a multiple of the + * log_page_size in the RESTART_PAGE_HEADER and + * then it has to be at least big enough to + * store the two restart pages and 48 (0x30) + * log record pages. + */ + __le32 last_lsn_data_length; /* + * Length of data of last LSN, not including + * the log record header. On create set to 0. + */ + __le16 log_record_header_length; /* + * Byte size of the log record header. + * If the version matches then check that the + * value of log_record_header_length is a + * multiple of 8, + * i.e. (log_record_header_length + 7) & ~7 == + * log_record_header_length. When creating set + * it to sizeof(LOG_RECORD_HEADER), aligned to + * 8 bytes. + */ + __le16 log_page_data_offset; /* + * Offset to the start of data in a log record + * page. Must be a multiple of 8. On create + * set it to immediately after the update sequence + * array of the log record page. + */ + __le32 restart_log_open_count; /* + * A counter that gets incremented every time + * the logfile is restarted which happens at mount + * time when the logfile is opened. When creating + * set to a random value. Win2k sets it to the low + * 32 bits of the current system time in NTFS format + * (see time.h). + */ + __le32 reserved; /* Reserved/alignment to 8-byte boundary. */ /* sizeof() = 48 (0x30) bytes */ -} __attribute__ ((__packed__)) RESTART_AREA; +} __packed; /* * Log client record. The offset of this record is found by adding the offset * of the RESTART_AREA to the client_array_offset value found in it. */ -typedef struct { -/*Ofs*/ -/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create - set to 0. */ -/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart - the volume, i.e. the current position within - the log file. At present, if clean this - should = current_lsn in restart area but it - probably also = current_lsn when dirty most - of the time. At create set to 0. */ -/* 16*/ le16 prev_client; /* The offset to the previous log client record - in the array of log client records. - LOGFILE_NO_CLIENT means there is no previous - client record, i.e. this is the first one. - This is always LOGFILE_NO_CLIENT. */ -/* 18*/ le16 next_client; /* The offset to the next log client record in - the array of log client records. - LOGFILE_NO_CLIENT means there are no next - client records, i.e. this is the last one. - This is always LOGFILE_NO_CLIENT. */ -/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set - to zero every time the logfile is restarted - and it is incremented when the logfile is - closed at dismount time. Thus it is 0 when - dirty and 1 when clean. On WinXP and - presumably later, this is always 0. */ -/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ -/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should - always be 8. */ -/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should - always be "NTFS" with the remaining bytes - set to 0. */ +struct log_client_record { + __le64 oldest_lsn; /* + * Oldest LSN needed by this client. On create + * set to 0. + */ + __le64 client_restart_lsn; /* + * LSN at which this client needs to restart + * the volume, i.e. the current position within + * the log file. At present, if clean this + * should = current_lsn in restart area but it + * probably also = current_lsn when dirty most + * of the time. At create set to 0. + */ + __le16 prev_client; /* + * The offset to the previous log client record + * in the array of log client records. 
+ * LOGFILE_NO_CLIENT means there is no previous + * client record, i.e. this is the first one. + * This is always LOGFILE_NO_CLIENT. + */ + __le16 next_client; /* + * The offset to the next log client record in + * the array of log client records. + * LOGFILE_NO_CLIENT means there are no next + * client records, i.e. this is the last one. + * This is always LOGFILE_NO_CLIENT. + */ + __le16 seq_number; /* + * On Win2k and presumably earlier, this is set + * to zero every time the logfile is restarted + * and it is incremented when the logfile is + * closed at dismount time. Thus it is 0 when + * dirty and 1 when clean. On WinXP and + * presumably later, this is always 0. + */ + u8 reserved[6]; /* Reserved/alignment. */ + __le32 client_name_length; /* Length of client name in bytes. Should always be 8. */ + __le16 client_name[64]; /* + * Name of the client in Unicode. + * Should always be "NTFS" with the remaining bytes + * set to 0. + */ /* sizeof() = 160 (0xa0) bytes */ -} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; - -extern bool ntfs_check_logfile(struct inode *log_vi, - RESTART_PAGE_HEADER **rp); - -extern bool ntfs_is_logfile_clean(struct inode *log_vi, - const RESTART_PAGE_HEADER *rp); - -extern bool ntfs_empty_logfile(struct inode *log_vi); - -#endif /* NTFS_RW */ +} __packed; +bool ntfs_check_logfile(struct inode *log_vi, + struct restart_page_header **rp); +bool ntfs_empty_logfile(struct inode *log_vi); #endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 7068425735f1..52afe4e1ffcc 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * NTFS kernel memory handling. Part of the Linux-NTFS project. * * Copyright (c) 2001-2005 Anton Altaparmakov */ @@ -11,6 +11,7 @@ #include #include #include +#include /** * __ntfs_malloc - allocate memory in multiples of pages @@ -28,9 +29,10 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) { if (likely(size <= PAGE_SIZE)) { - BUG_ON(!size); + if (!size) + return NULL; /* kmalloc() has per-CPU caches so is faster for now. 
*/ - return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(PAGE_SIZE, gfp_mask); /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) @@ -49,7 +51,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) */ static inline void *ntfs_malloc_nofs(unsigned long size) { - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); + return __ntfs_malloc(size, GFP_NOFS | __GFP_ZERO); } /** @@ -66,7 +68,7 @@ static inline void *ntfs_malloc_nofs(unsigned long size) */ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) { - return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); + return __ntfs_malloc(size, GFP_NOFS | __GFP_NOFAIL); } static inline void ntfs_free(void *addr) @@ -74,4 +76,25 @@ static inline void ntfs_free(void *addr) kvfree(addr); } +static inline void *ntfs_realloc_nofs(void *addr, unsigned long new_size, + unsigned long cpy_size) +{ + void *pnew_addr; + + if (new_size == 0) { + ntfs_free(addr); + return NULL; + } + + pnew_addr = ntfs_malloc_nofs(new_size); + if (pnew_addr == NULL) + return NULL; + if (addr) { + cpy_size = min(cpy_size, new_size); + if (cpy_size) + memcpy(pnew_addr, addr, cpy_size); + ntfs_free(addr); + } + return pnew_addr; +} #endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h index 49c001af16ed..cce944242f89 100644 --- a/fs/ntfs/mft.h +++ b/fs/ntfs/mft.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * mft.h - Defines for mft record handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. + * Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. * * Copyright (c) 2001-2004 Anton Altaparmakov */ @@ -9,41 +9,22 @@ #ifndef _LINUX_NTFS_MFT_H #define _LINUX_NTFS_MFT_H -#include #include #include #include "inode.h" -extern MFT_RECORD *map_mft_record(ntfs_inode *ni); -extern void unmap_mft_record(ntfs_inode *ni); +struct mft_record *map_mft_record(struct ntfs_inode *ni); +void unmap_mft_record(struct ntfs_inode *ni); +struct mft_record *map_extent_mft_record(struct ntfs_inode *base_ni, u64 mref, + struct ntfs_inode **ntfs_ino); -extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, - ntfs_inode **ntfs_ino); - -static inline void unmap_extent_mft_record(ntfs_inode *ni) +static inline void unmap_extent_mft_record(struct ntfs_inode *ni) { unmap_mft_record(ni); - return; } -#ifdef NTFS_RW - -/** - * flush_dcache_mft_record_page - flush_dcache_page() for mft records - * @ni: ntfs inode structure of mft record - * - * Call flush_dcache_page() for the page in which an mft record resides. - * - * This must be called every time an mft record is modified, just after the - * modification. - */ -static inline void flush_dcache_mft_record_page(ntfs_inode *ni) -{ - flush_dcache_page(ni->page); -} - -extern void __mark_mft_record_dirty(ntfs_inode *ni); +void __mark_mft_record_dirty(struct ntfs_inode *ni); /** * mark_mft_record_dirty - set the mft record and the page containing it dirty @@ -56,16 +37,15 @@ extern void __mark_mft_record_dirty(ntfs_inode *ni); * * NOTE: Do not do anything if the mft record is already marked dirty. 
*/ -static inline void mark_mft_record_dirty(ntfs_inode *ni) +static inline void mark_mft_record_dirty(struct ntfs_inode *ni) { if (!NInoTestSetDirty(ni)) __mark_mft_record_dirty(ni); } -extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, - MFT_RECORD *m, int sync); - -extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); +int ntfs_sync_mft_mirror(struct ntfs_volume *vol, const unsigned long mft_no, + struct mft_record *m); +int write_mft_record_nolock(struct ntfs_inode *ni, struct mft_record *m, int sync); /** * write_mft_record - write out a mapped (extent) mft record @@ -85,26 +65,28 @@ extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); * On success, clean the mft record and return 0. On error, leave the mft * record dirty and return -errno. */ -static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +static inline int write_mft_record(struct ntfs_inode *ni, struct mft_record *m, int sync) { - struct page *page = ni->page; + struct folio *folio = ni->folio; int err; - BUG_ON(!page); - lock_page(page); + folio_lock(folio); err = write_mft_record_nolock(ni, m, sync); - unlock_page(page); + folio_unlock(folio); + return err; } -extern bool ntfs_may_write_mft_record(ntfs_volume *vol, - const unsigned long mft_no, const MFT_RECORD *m, - ntfs_inode **locked_ni); - -extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, - ntfs_inode *base_ni, MFT_RECORD **mrec); -extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); - -#endif /* NTFS_RW */ +bool ntfs_may_write_mft_record(struct ntfs_volume *vol, + const unsigned long mft_no, const struct mft_record *m, + struct ntfs_inode **locked_ni); +int ntfs_mft_record_alloc(struct ntfs_volume *vol, const int mode, + struct ntfs_inode **ni, struct ntfs_inode *base_ni, + struct mft_record **ni_mrec); +int ntfs_mft_record_free(struct ntfs_volume *vol, struct ntfs_inode *ni); +int ntfs_mft_records_write(const struct ntfs_volume *vol, const u64 mref, + const s64 count, struct mft_record *b); +int ntfs_mft_record_check(const struct ntfs_volume *vol, struct mft_record *m, + unsigned long mft_no); #endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index e81376ea9152..00720645fae9 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * ntfs.h - Defines for NTFS Linux kernel driver. + * Defines for NTFS Linux kernel driver. * * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * Copyright (C) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. 
*/ #ifndef _LINUX_NTFS_H @@ -17,20 +18,59 @@ #include #include #include +#include -#include "types.h" #include "volume.h" #include "layout.h" +#include "inode.h" -typedef enum { +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define NTFS_DEF_PREALLOC_SIZE (64*1024*1024) + +#define STANDARD_COMPRESSION_UNIT 4 +#define MAX_COMPRESSION_CLUSTER_SIZE 4096 + +#define UCHAR_T_SIZE_BITS 1 + +#define NTFS_B_TO_CLU(vol, b) ((b) >> (vol)->cluster_size_bits) +#define NTFS_CLU_TO_B(vol, clu) ((u64)(clu) << (vol)->cluster_size_bits) +#define NTFS_B_TO_CLU_OFS(vol, clu) ((u64)(clu) & (vol)->cluster_size_mask) + +#define NTFS_MFT_NR_TO_CLU(vol, mft_no) (((u64)mft_no << (vol)->mft_record_size_bits) >> \ + (vol)->cluster_size_bits) +#define NTFS_MFT_NR_TO_PIDX(vol, mft_no) (mft_no >> (PAGE_SHIFT - \ + (vol)->mft_record_size_bits)) +#define NTFS_MFT_NR_TO_POFS(vol, mft_no) (((u64)mft_no << (vol)->mft_record_size_bits) & \ + ~PAGE_MASK) + +#define NTFS_PIDX_TO_BLK(vol, idx) (((u64)idx << PAGE_SHIFT) >> \ + ((vol)->sb)->s_blocksize_bits) +#define NTFS_PIDX_TO_CLU(vol, idx) (((u64)idx << PAGE_SHIFT) >> \ + (vol)->cluster_size_bits) +#define NTFS_CLU_TO_PIDX(vol, clu) (((u64)(clu) << (vol)->cluster_size_bits) >> \ + PAGE_SHIFT) +#define NTFS_CLU_TO_POFS(vol, clu) (((u64)(clu) << (vol)->cluster_size_bits) & \ + ~PAGE_MASK) + +#define NTFS_B_TO_SECTOR(vol, b) ((b) >> ((vol)->sb)->s_blocksize_bits) + +enum { NTFS_BLOCK_SIZE = 512, NTFS_BLOCK_SIZE_BITS = 9, NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ NTFS_MAX_NAME_LEN = 255, - NTFS_MAX_ATTR_NAME_LEN = 255, - NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ - NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, -} NTFS_CONSTANTS; + NTFS_MAX_LABEL_LEN = 128, +}; + +enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +}; /* Global variables. */ @@ -42,12 +82,12 @@ extern struct kmem_cache *ntfs_attr_ctx_cache; extern struct kmem_cache *ntfs_index_ctx_cache; /* The various operations structs defined throughout the driver files. */ -extern const struct address_space_operations ntfs_normal_aops; -extern const struct address_space_operations ntfs_compressed_aops; -extern const struct address_space_operations ntfs_mst_aops; +extern const struct address_space_operations ntfs_aops; extern const struct file_operations ntfs_file_ops; extern const struct inode_operations ntfs_file_inode_ops; +extern const struct inode_operations ntfs_symlink_inode_operations; +extern const struct inode_operations ntfsp_special_inode_operations; extern const struct file_operations ntfs_dir_ops; extern const struct inode_operations ntfs_dir_inode_ops; @@ -63,7 +103,7 @@ extern const struct export_operations ntfs_export_ops; * * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. */ -static inline ntfs_volume *NTFS_SB(struct super_block *sb) +static inline struct ntfs_volume *NTFS_SB(struct super_block *sb) { return sb->s_fs_info; } @@ -71,52 +111,64 @@ static inline ntfs_volume *NTFS_SB(struct super_block *sb) /* Declarations of functions and global variables. 
*/ /* From fs/ntfs/compress.c */ -extern int ntfs_read_compressed_block(struct page *page); -extern int allocate_compression_buffers(void); -extern void free_compression_buffers(void); +int ntfs_read_compressed_block(struct folio *folio); +int allocate_compression_buffers(void); +void free_compression_buffers(void); +int ntfs_compress_write(struct ntfs_inode *ni, loff_t pos, size_t count, + struct iov_iter *from); /* From fs/ntfs/super.c */ #define default_upcase_len 0x10000 extern struct mutex ntfs_lock; -typedef struct { +struct option_t { int val; char *str; -} option_t; -extern const option_t on_errors_arr[]; +}; +extern const struct option_t on_errors_arr[]; +int ntfs_set_volume_flags(struct ntfs_volume *vol, __le16 flags); +int ntfs_clear_volume_flags(struct ntfs_volume *vol, __le16 flags); +int ntfs_write_volume_label(struct ntfs_volume *vol, char *label); /* From fs/ntfs/mst.c */ -extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); -extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); -extern void post_write_mst_fixup(NTFS_RECORD *b); +int post_read_mst_fixup(struct ntfs_record *b, const u32 size); +int pre_write_mst_fixup(struct ntfs_record *b, const u32 size); +void post_write_mst_fixup(struct ntfs_record *b); /* From fs/ntfs/unistr.c */ -extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, - const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size); -extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); -extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size); -extern void ntfs_upcase_name(ntfschar *name, u32 name_len, - const ntfschar *upcase, const u32 upcase_len); -extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len); -extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs); -extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, +bool ntfs_are_names_equal(const __le16 *s1, size_t s1_len, + const __le16 *s2, size_t s2_len, + const u32 ic, + const __le16 *upcase, const u32 upcase_size); +int ntfs_collate_names(const __le16 *name1, const u32 name1_len, + const __le16 *name2, const u32 name2_len, + const int err_val, const u32 ic, + const __le16 *upcase, const u32 upcase_len); +int ntfs_ucsncmp(const __le16 *s1, const __le16 *s2, size_t n); +int ntfs_ucsncasecmp(const __le16 *s1, const __le16 *s2, size_t n, + const __le16 *upcase, const u32 upcase_size); +int ntfs_file_compare_values(const struct file_name_attr *file_name_attr1, + const struct file_name_attr *file_name_attr2, + const int err_val, const u32 ic, + const __le16 *upcase, const u32 upcase_len); +int ntfs_nlstoucs(const struct ntfs_volume *vol, const char *ins, + const int ins_len, __le16 **outs, int max_name_len); +int ntfs_ucstonls(const struct ntfs_volume *vol, const __le16 *ins, const int ins_len, unsigned char **outs, int outs_len); +__le16 *ntfs_ucsndup(const __le16 *s, u32 maxlen); +bool 
ntfs_names_are_equal(const __le16 *s1, size_t s1_len, + const __le16 *s2, size_t s2_len, + const u32 ic, + const __le16 *upcase, const u32 upcase_size); +int ntfs_force_shutdown(struct super_block *sb, u32 flags); +long ntfsp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +long ntfsp_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); +#endif /* From fs/ntfs/upcase.c */ -extern ntfschar *generate_default_upcase(void); +__le16 *generate_default_upcase(void); static inline int ntfs_ffs(int x) { @@ -140,10 +192,8 @@ static inline int ntfs_ffs(int x) x >>= 2; r += 2; } - if (!(x & 1)) { - x >>= 1; + if (!(x & 1)) r += 1; - } return r; } diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h index fe3132a3d6d2..7d70a0f5aa0e 100644 --- a/fs/ntfs/quota.h +++ b/fs/ntfs/quota.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the - * Linux-NTFS project. + * Defines for NTFS kernel quota ($Quota) handling. + * Part of the Linux-NTFS project. * * Copyright (c) 2004 Anton Altaparmakov */ @@ -9,13 +9,8 @@ #ifndef _LINUX_NTFS_QUOTA_H #define _LINUX_NTFS_QUOTA_H -#ifdef NTFS_RW - -#include "types.h" #include "volume.h" -extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); - -#endif /* NTFS_RW */ +bool ntfs_mark_quotas_out_of_date(struct ntfs_volume *vol); #endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/reparse.h b/fs/ntfs/reparse.h new file mode 100644 index 000000000000..a1f3829a89da --- /dev/null +++ b/fs/ntfs/reparse.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/** + * Copyright (c) 2008-2021 Jean-Pierre Andre + * Copyright (c) 2025 LG Electronics Co., Ltd. + */ + +extern __le16 reparse_index_name[]; + +unsigned int ntfs_make_symlink(struct ntfs_inode *ni); +unsigned int ntfs_reparse_tag_dt_types(struct ntfs_volume *vol, unsigned long mref); +int ntfs_reparse_set_wsl_symlink(struct ntfs_inode *ni, + const __le16 *target, int target_len); +int ntfs_reparse_set_wsl_not_symlink(struct ntfs_inode *ni, mode_t mode); +int ntfs_delete_reparse_index(struct ntfs_inode *ni); +int ntfs_remove_ntfs_reparse_data(struct ntfs_inode *ni); diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h index 38de0a375f59..cac8b5a26a79 100644 --- a/fs/ntfs/runlist.h +++ b/fs/ntfs/runlist.h @@ -1,17 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. - * Part of the Linux-NTFS project. + * Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. * * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. */ #ifndef _LINUX_NTFS_RUNLIST_H #define _LINUX_NTFS_RUNLIST_H -#include "types.h" -#include "layout.h" #include "volume.h" /** @@ -25,64 +24,70 @@ * When lcn == -1 this means that the count vcns starting at vcn are not * physically allocated (i.e. this is a hole / data is sparse). */ -typedef struct { /* In memory vcn to lcn mapping structure element. */ - VCN vcn; /* vcn = Starting virtual cluster number. */ - LCN lcn; /* lcn = Starting logical cluster number. */ +struct runlist_element { /* In memory vcn to lcn mapping structure element. */ + s64 vcn; /* vcn = Starting virtual cluster number. */ + s64 lcn; /* lcn = Starting logical cluster number. */ s64 length; /* Run length in clusters. 
*/ -} runlist_element; +}; /** * runlist - in memory vcn to lcn mapping array including a read/write lock * @rl: pointer to an array of runlist elements * @lock: read/write spinlock for serializing access to @rl - * + * @rl_hint: hint/cache pointing to the last accessed runlist element */ -typedef struct { - runlist_element *rl; +struct runlist { + struct runlist_element *rl; struct rw_semaphore lock; -} runlist; + size_t count; + int rl_hint; +}; -static inline void ntfs_init_runlist(runlist *rl) +static inline void ntfs_init_runlist(struct runlist *rl) { rl->rl = NULL; init_rwsem(&rl->lock); + rl->count = 0; + rl->rl_hint = -1; } -typedef enum { - LCN_HOLE = -1, /* Keep this as highest value or die! */ - LCN_RL_NOT_MAPPED = -2, - LCN_ENOENT = -3, - LCN_ENOMEM = -4, - LCN_EIO = -5, -} LCN_SPECIAL_VALUES; - -extern runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl); - -extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl); - -extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); - -#ifdef NTFS_RW - -extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, - const VCN vcn); - -extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn); - -extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); - -extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, - runlist *const runlist, const s64 new_length); - -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length); - -#endif /* NTFS_RW */ - +enum { + LCN_DELALLOC = -1, + LCN_HOLE = -2, + LCN_RL_NOT_MAPPED = -3, + LCN_ENOENT = -4, + LCN_ENOMEM = -5, + LCN_EIO = -6, + LCN_EINVAL = -7, +}; + +struct runlist_element *ntfs_runlists_merge(struct runlist *d_runlist, + struct runlist_element *srl, size_t s_rl_count, + size_t *new_rl_count); +struct runlist_element *ntfs_mapping_pairs_decompress(const struct ntfs_volume *vol, + const struct attr_record *attr, struct runlist *old_runlist, + size_t *new_rl_count); +s64 ntfs_rl_vcn_to_lcn(const struct runlist_element *rl, const s64 vcn); +struct runlist_element *ntfs_rl_find_vcn_nolock(struct runlist_element *rl, const s64 vcn); +int ntfs_get_size_for_mapping_pairs(const struct ntfs_volume *vol, + const struct runlist_element *rl, const s64 first_vcn, + const s64 last_vcn, int max_mp_size); +int ntfs_mapping_pairs_build(const struct ntfs_volume *vol, s8 *dst, + const int dst_len, const struct runlist_element *rl, + const s64 first_vcn, const s64 last_vcn, s64 *const stop_vcn, + struct runlist_element **stop_rl, unsigned int *de_cluster_count); +int ntfs_rl_truncate_nolock(const struct ntfs_volume *vol, + struct runlist *const runlist, const s64 new_length); +int ntfs_rl_sparse(struct runlist_element *rl); +s64 ntfs_rl_get_compressed_size(struct ntfs_volume *vol, struct runlist_element *rl); +struct runlist_element *ntfs_rl_insert_range(struct runlist_element *dst_rl, int dst_cnt, + struct runlist_element *src_rl, int src_cnt, size_t *new_cnt); +struct runlist_element *ntfs_rl_punch_hole(struct runlist_element *dst_rl, int dst_cnt, + s64 start_vcn, s64 len, struct runlist_element **punch_rl, + size_t *new_rl_cnt); +struct runlist_element *ntfs_rl_collapse_range(struct runlist_element *dst_rl, int dst_cnt, + s64 
start_vcn, s64 len, struct runlist_element **punch_rl, + size_t *new_rl_cnt); +struct runlist_element *ntfs_rl_realloc(struct runlist_element *rl, int old_size, + int new_size); #endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h index 96bb2299d2d5..50fa1095ad7f 100644 --- a/fs/ntfs/sysctl.h +++ b/fs/ntfs/sysctl.h @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver. * + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne * Copyright (c) 2002-2004 Anton Altaparmakov */ @@ -13,7 +13,7 @@ #if defined(DEBUG) && defined(CONFIG_SYSCTL) -extern int ntfs_sysctl(int add); +int ntfs_sysctl(int add); #else diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h index 6b63261300cc..0d56ae5dc0c5 100644 --- a/fs/ntfs/time.h +++ b/fs/ntfs/time.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * NTFS time conversion functions. Part of the Linux-NTFS project. * * Copyright (c) 2001-2005 Anton Altaparmakov */ @@ -8,11 +8,9 @@ #ifndef _LINUX_NTFS_TIME_H #define _LINUX_NTFS_TIME_H -#include /* For current_kernel_time(). */ +#include #include /* For do_div(). */ -#include "endian.h" - #define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) /** @@ -31,13 +29,13 @@ * measured as the number of 100-nano-second intervals since 1st January 1601, * 00:00:00 UTC. */ -static inline sle64 utc2ntfs(const struct timespec64 ts) +static inline __le64 utc2ntfs(const struct timespec64 ts) { /* * Convert the seconds to 100ns intervals, add the nano-seconds * converted to 100ns intervals, and then add the NTFS time offset. */ - return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + return cpu_to_le64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + NTFS_TIME_OFFSET); } @@ -47,7 +45,7 @@ static inline sle64 utc2ntfs(const struct timespec64 ts) * Get the current time from the Linux kernel, convert it to its corresponding * NTFS time and return that in little endian format. */ -static inline sle64 get_current_ntfs_time(void) +static inline __le64 get_current_ntfs_time(void) { struct timespec64 ts; @@ -71,12 +69,12 @@ static inline sle64 get_current_ntfs_time(void) * measured as the number of 100 nano-second intervals since 1st January 1601, * 00:00:00 UTC. */ -static inline struct timespec64 ntfs2utc(const sle64 time) +static inline struct timespec64 ntfs2utc(const __le64 time) { struct timespec64 ts; /* Subtract the NTFS time offset. */ - u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + u64 t = (u64)(le64_to_cpu(time) - NTFS_TIME_OFFSET); /* * Convert the time to 1-second intervals and the remainder to * 1-nano-second intervals. diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 930a9ae8a053..b934c88e5e11 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -1,45 +1,47 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part - * of the Linux-NTFS project. + * Defines for volume structures in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. 
* * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. */ #ifndef _LINUX_NTFS_VOLUME_H #define _LINUX_NTFS_VOLUME_H #include +#include +#include #include +#include +#include -#include "types.h" #include "layout.h" +#define NTFS_VOL_UID BIT(1) +#define NTFS_VOL_GID BIT(2) + /* * The NTFS in memory super block structure. */ -typedef struct { - /* - * FIXME: Reorder to have commonly used together element within the - * same cache line, aiming at a cache line size of 32 bytes. Aim for - * 64 bytes for less commonly used together elements. Put most commonly - * used elements to front of structure. Obviously do this only when the - * structure has stabilized... (AIA) - */ +struct ntfs_volume { /* Device specifics. */ struct super_block *sb; /* Pointer back to the super_block. */ - LCN nr_blocks; /* Number of sb->s_blocksize bytes - sized blocks on the device. */ + s64 nr_blocks; /* + * Number of sb->s_blocksize bytes + * sized blocks on the device. + */ /* Configuration provided by user at mount time. */ unsigned long flags; /* Miscellaneous flags, see below. */ kuid_t uid; /* uid that files will be mounted as. */ kgid_t gid; /* gid that files will be mounted as. */ umode_t fmask; /* The mask for file permissions. */ - umode_t dmask; /* The mask for directory - permissions. */ + umode_t dmask; /* The mask for directory permissions. */ u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ u8 on_errors; /* What to do on filesystem errors. */ + errseq_t wb_err; /* NTFS bootsector provided information. */ u16 sector_size; /* in bytes */ u8 sector_size_bits; /* log2(sector_size) */ @@ -52,104 +54,126 @@ typedef struct { u32 index_record_size; /* in bytes */ u32 index_record_size_mask; /* index_record_size - 1 */ u8 index_record_size_bits; /* log2(index_record_size) */ - LCN nr_clusters; /* Volume size in clusters == number of - bits in lcn bitmap. */ - LCN mft_lcn; /* Cluster location of mft data. */ - LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + s64 nr_clusters; /* + * Volume size in clusters == number of + * bits in lcn bitmap. + */ + s64 mft_lcn; /* Cluster location of mft data. */ + s64 mftmirr_lcn; /* Cluster location of copy of mft. */ u64 serial_no; /* The volume serial number. */ /* Mount specific NTFS information. */ u32 upcase_len; /* Number of entries in upcase[]. */ - ntfschar *upcase; /* The upcase table. */ + __le16 *upcase; /* The upcase table. */ - s32 attrdef_size; /* Size of the attribute definition - table in bytes. */ - ATTR_DEF *attrdef; /* Table of attribute definitions. - Obtained from FILE_AttrDef. */ + s32 attrdef_size; /* Size of the attribute definition table in bytes. */ + struct attr_def *attrdef; /* + * Table of attribute definitions. + * Obtained from FILE_AttrDef. + */ -#ifdef NTFS_RW /* Variables used by the cluster and mft allocators. */ - s64 mft_data_pos; /* Mft record number at which to - allocate the next mft record. */ - LCN mft_zone_start; /* First cluster of the mft zone. */ - LCN mft_zone_end; /* First cluster beyond the mft zone. */ - LCN mft_zone_pos; /* Current position in the mft zone. */ - LCN data1_zone_pos; /* Current position in the first data - zone. */ - LCN data2_zone_pos; /* Current position in the second data - zone. */ -#endif /* NTFS_RW */ + s64 mft_data_pos; /* + * Mft record number at which to + * allocate the next mft record. + */ + s64 mft_zone_start; /* First cluster of the mft zone. */ + s64 mft_zone_end; /* First cluster beyond the mft zone. 
*/ + s64 mft_zone_pos; /* Current position in the mft zone. */ + s64 data1_zone_pos; /* Current position in the first data zone. */ + s64 data2_zone_pos; /* Current position in the second data zone. */ struct inode *mft_ino; /* The VFS inode of $MFT. */ struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ - struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the - mft record bitmap ($MFT/$BITMAP). */ -#ifdef NTFS_RW + struct rw_semaphore mftbmp_lock; /* + * Lock for serializing accesses to the + * mft record bitmap ($MFT/$BITMAP). + */ struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ int mftmirr_size; /* Size of mft mirror in mft records. */ - struct inode *logfile_ino; /* The VFS inode of $LogFile. */ -#endif /* NTFS_RW */ + struct inode *logfile_ino; /* The VFS inode of LogFile. */ struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ - struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the - cluster bitmap ($Bitmap/$DATA). */ + struct rw_semaphore lcnbmp_lock; /* + * Lock for serializing accesses to the + * cluster bitmap ($Bitmap/$DATA). + */ struct inode *vol_ino; /* The VFS inode of $Volume. */ - VOLUME_FLAGS vol_flags; /* Volume flags. */ + __le16 vol_flags; /* Volume flags. */ u8 major_ver; /* Ntfs major version of volume. */ u8 minor_ver; /* Ntfs minor version of volume. */ - - struct inode *root_ino; /* The VFS inode of the root - directory. */ - struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ - only, otherwise NULL). */ - struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ - only, otherwise NULL). */ -#ifdef NTFS_RW + unsigned char *volume_label; + + struct inode *root_ino; /* The VFS inode of the root directory. */ + struct inode *secure_ino; /* + * The VFS inode of $Secure (NTFS3.0+ + * only, otherwise NULL). + */ + struct inode *extend_ino; /* + * The VFS inode of $Extend (NTFS3.0+ + * only, otherwise NULL). + */ /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ struct inode *quota_ino; /* The VFS inode of $Quota. */ struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ - /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ - struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ - struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ - struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ -#endif /* NTFS_RW */ struct nls_table *nls_map; -} ntfs_volume; + bool nls_utf8; + wait_queue_head_t free_waitq; + + atomic64_t free_clusters; /* Track the number of free clusters */ + atomic64_t free_mft_records; /* Track the free mft records */ + atomic64_t dirty_clusters; + u8 sparse_compression_unit; + unsigned int *lcn_empty_bits_per_page; + struct work_struct precalc_work; + loff_t preallocated_size; +}; /* * Defined bits for the flags field in the ntfs_volume structure. */ -typedef enum { +enum { NV_Errors, /* 1: Volume has errors, prevent remount rw. */ NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ - NV_CaseSensitive, /* 1: Treat file names as case sensitive and - create filenames in the POSIX namespace. - Otherwise be case insensitive but still - create file names in POSIX namespace. */ - NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ - NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ - NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ - NV_SparseEnabled, /* 1: May create sparse files. 
*/ -} ntfs_volume_flags; + NV_CaseSensitive, /* + * 1: Treat file names as case sensitive and + * create filenames in the POSIX namespace. + * Otherwise be case insensitive but still + * create file names in POSIX namespace. + */ + NV_LogFileEmpty, /* 1: LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: UsnJrnl has been stamped. */ + NV_ReadOnly, + NV_Compression, + NV_FreeClusterKnown, + NV_Shutdown, + NV_SysImmutable, /* 1: Protect system files from deletion. */ + NV_ShowHiddenFiles, /* 1: Return hidden files in ntfs_readdir(). */ + NV_HideDotFiles, + NV_CheckWindowsNames, + NV_Discard, + NV_DisableSparse, +}; /* * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() * functions. */ #define DEFINE_NVOL_BIT_OPS(flag) \ -static inline int NVol##flag(ntfs_volume *vol) \ -{ \ - return test_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolSet##flag(ntfs_volume *vol) \ -{ \ - set_bit(NV_##flag, &(vol)->flags); \ -} \ -static inline void NVolClear##flag(ntfs_volume *vol) \ -{ \ - clear_bit(NV_##flag, &(vol)->flags); \ +static inline int NVol##flag(struct ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(struct ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(struct ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ } /* Emit the ntfs volume bitops functions. */ @@ -159,6 +183,72 @@ DEFINE_NVOL_BIT_OPS(CaseSensitive) DEFINE_NVOL_BIT_OPS(LogFileEmpty) DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) -DEFINE_NVOL_BIT_OPS(SparseEnabled) +DEFINE_NVOL_BIT_OPS(ReadOnly) +DEFINE_NVOL_BIT_OPS(Compression) +DEFINE_NVOL_BIT_OPS(FreeClusterKnown) +DEFINE_NVOL_BIT_OPS(Shutdown) +DEFINE_NVOL_BIT_OPS(SysImmutable) +DEFINE_NVOL_BIT_OPS(ShowHiddenFiles) +DEFINE_NVOL_BIT_OPS(HideDotFiles) +DEFINE_NVOL_BIT_OPS(CheckWindowsNames) +DEFINE_NVOL_BIT_OPS(Discard) +DEFINE_NVOL_BIT_OPS(DisableSparse) + +static inline void ntfs_inc_free_clusters(struct ntfs_volume *vol, s64 nr) +{ + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + atomic64_add(nr, &vol->free_clusters); +} + +static inline void ntfs_dec_free_clusters(struct ntfs_volume *vol, s64 nr) +{ + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + atomic64_sub(nr, &vol->free_clusters); +} + +static inline void ntfs_inc_free_mft_records(struct ntfs_volume *vol, s64 nr) +{ + if (!NVolFreeClusterKnown(vol)) + return; + + atomic64_add(nr, &vol->free_mft_records); +} + +static inline void ntfs_dec_free_mft_records(struct ntfs_volume *vol, s64 nr) +{ + if (!NVolFreeClusterKnown(vol)) + return; + + atomic64_sub(nr, &vol->free_mft_records); +} + +static inline void ntfs_set_lcn_empty_bits(struct ntfs_volume *vol, unsigned long index, + u8 val, unsigned int count) +{ + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + + if (val) + vol->lcn_empty_bits_per_page[index] -= count; + else + vol->lcn_empty_bits_per_page[index] += count; +} + +static __always_inline void ntfs_hold_dirty_clusters(struct ntfs_volume *vol, s64 nr_clusters) +{ + atomic64_add(nr_clusters, &vol->dirty_clusters); +} + +static __always_inline void ntfs_release_dirty_clusters(struct ntfs_volume *vol, s64 nr_clusters) +{ + if (atomic64_read(&vol->dirty_clusters) < nr_clusters) + atomic64_set(&vol->dirty_clusters, 0); + else + 
atomic64_sub(nr_clusters, &vol->dirty_clusters); +} +s64 ntfs_available_clusters_count(struct ntfs_volume *vol, s64 nr_clusters); +s64 get_nr_free_clusters(struct ntfs_volume *vol); #endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/include/uapi/linux/ntfs.h b/include/uapi/linux/ntfs.h new file mode 100644 index 000000000000..e76957285280 --- /dev/null +++ b/include/uapi/linux/ntfs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2025 LG Electronics Co., Ltd. + */ + +#ifndef _UAPI_LINUX_NTFS_H +#define _UAPI_LINUX_NTFS_H +#include +#include + +/* + * ntfs-specific ioctl commands + */ +#define NTFS_IOC_SHUTDOWN _IOR('X', 125, __u32) + +/* + * Flags used by NTFS_IOC_SHUTDOWN + */ +#define NTFS_GOING_DOWN_DEFAULT 0x0 /* default with full sync */ +#define NTFS_GOING_DOWN_FULLSYNC 0x1 /* going down with full sync*/ +#define NTFS_GOING_DOWN_NOSYNC 0x2 /* going down */ + +#endif /* _UAPI_LINUX_NTFS_H */ -- 2.25.1 This updates the implementation of superblock operations. Signed-off-by: Namjae Jeon --- fs/ntfs/super.c | 2505 ++++++++++++++++++++--------------------------- 1 file changed, 1040 insertions(+), 1465 deletions(-) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 56a7d5bd33e4..3dff06b9ce9f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,355 +1,324 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * NTFS kernel super block handling. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2001,2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include #include /* For bdev_logical_block_size(). */ #include -#include #include -#include -#include +#include +#include +#include +#include +#include #include "sysctl.h" #include "logfile.h" #include "quota.h" -#include "usnjrnl.h" -#include "dir.h" -#include "debug.h" #include "index.h" -#include "inode.h" -#include "aops.h" -#include "layout.h" -#include "malloc.h" #include "ntfs.h" - -/* Number of mounted filesystems which have compression enabled. */ -static unsigned long ntfs_nr_compression_users; +#include "ea.h" +#include "volume.h" +#include "malloc.h" /* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase; +static __le16 *default_upcase; static unsigned long ntfs_nr_upcase_users; +static struct workqueue_struct *ntfs_wq; + /* Error constants/strings used in inode.c::ntfs_show_options(). */ -typedef enum { +enum { /* One of these must be present, default is ON_ERRORS_CONTINUE. */ - ON_ERRORS_PANIC = 0x01, - ON_ERRORS_REMOUNT_RO = 0x02, - ON_ERRORS_CONTINUE = 0x04, - /* Optional, can be combined with any of the above. */ - ON_ERRORS_RECOVER = 0x10, -} ON_ERRORS_ACTIONS; - -const option_t on_errors_arr[] = { - { ON_ERRORS_PANIC, "panic" }, - { ON_ERRORS_REMOUNT_RO, "remount-ro", }, - { ON_ERRORS_CONTINUE, "continue", }, - { ON_ERRORS_RECOVER, "recover" }, - { 0, NULL } + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, }; -/** - * simple_getbool - convert input string to a boolean value - * @s: input string to convert - * @setval: where to store the output boolean value - * - * Copied from old ntfs driver (which copied from vfat driver). - * - * "1", "yes", "true", or an empty string are converted to %true. 
- * "0", "no", and "false" are converted to %false. - * - * Return: %1 if the string is converted or was empty and *setval contains it; - * %0 if the string was not valid. - */ -static int simple_getbool(char *s, bool *setval) -{ - if (s) { - if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) - *setval = true; - else if (!strcmp(s, "0") || !strcmp(s, "no") || - !strcmp(s, "false")) - *setval = false; - else - return 0; - } else - *setval = true; - return 1; -} +static const struct constant_table ntfs_param_enums[] = { + { "panic", ON_ERRORS_PANIC }, + { "remount-ro", ON_ERRORS_REMOUNT_RO }, + { "continue", ON_ERRORS_CONTINUE }, + {} +}; -/** - * parse_options - parse the (re)mount options - * @vol: ntfs volume - * @opt: string containing the (re)mount options - * - * Parse the recognized options in @opt for the ntfs volume described by @vol. - */ -static bool parse_options(ntfs_volume *vol, char *opt) +enum { + Opt_uid, + Opt_gid, + Opt_umask, + Opt_dmask, + Opt_fmask, + Opt_errors, + Opt_nls, + Opt_charset, + Opt_show_sys_files, + Opt_show_meta, + Opt_case_sensitive, + Opt_disable_sparse, + Opt_sparse, + Opt_mft_zone_multiplier, + Opt_preallocated_size, + Opt_sys_immutable, + Opt_nohidden, + Opt_hide_dot_files, + Opt_check_windows_names, + Opt_acl, + Opt_discard, + Opt_nocase, +}; + +static const struct fs_parameter_spec ntfs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_string("nls", Opt_nls), + fsparam_string("iocharset", Opt_charset), + fsparam_enum("errors", Opt_errors, ntfs_param_enums), + fsparam_flag("show_sys_files", Opt_show_sys_files), + fsparam_flag("showmeta", Opt_show_meta), + fsparam_flag("case_sensitive", Opt_case_sensitive), + fsparam_flag("disable_sparse", Opt_disable_sparse), + fsparam_s32("mft_zone_multiplier", Opt_mft_zone_multiplier), + fsparam_u64("preallocated_size", Opt_preallocated_size), + fsparam_flag("sys_immutable", Opt_sys_immutable), + fsparam_flag("nohidden", Opt_nohidden), + fsparam_flag("hide_dot_files", Opt_hide_dot_files), + fsparam_flag("windows_names", Opt_check_windows_names), + fsparam_flag("acl", Opt_acl), + fsparam_flag("discard", Opt_discard), + fsparam_flag("sparse", Opt_sparse), + fsparam_flag("nocase", Opt_nocase), + {} +}; + +static int ntfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p, *v, *ov; - static char *utf8 = "utf8"; - int errors = 0, sloppy = 0; - kuid_t uid = INVALID_UID; - kgid_t gid = INVALID_GID; - umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; - int mft_zone_multiplier = -1, on_errors = -1; - int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; - struct nls_table *nls_map = NULL, *old_nls; - - /* I am lazy... 
(-8 */ -#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - variable = default_value; \ - else { \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } \ - } -#define NTFS_GETOPT(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_UID(option, variable) \ - if (!strcmp(p, option)) { \ - uid_t uid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - uid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kuid(current_user_ns(), uid_value); \ - if (!uid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_GID(option, variable) \ - if (!strcmp(p, option)) { \ - gid_t gid_value; \ - if (!v || !*v) \ - goto needs_arg; \ - gid_value = simple_strtoul(ov = v, &v, 0); \ - if (*v) \ - goto needs_val; \ - variable = make_kgid(current_user_ns(), gid_value); \ - if (!gid_valid(variable)) \ - goto needs_val; \ - } -#define NTFS_GETOPT_OCTAL(option, variable) \ - if (!strcmp(p, option)) { \ - if (!v || !*v) \ - goto needs_arg; \ - variable = simple_strtoul(ov = v, &v, 8); \ - if (*v) \ - goto needs_val; \ - } -#define NTFS_GETOPT_BOOL(option, variable) \ - if (!strcmp(p, option)) { \ - bool val; \ - if (!simple_getbool(v, &val)) \ - goto needs_bool; \ - variable = val; \ - } -#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ - if (!strcmp(p, option)) { \ - int _i; \ - if (!v || !*v) \ - goto needs_arg; \ - ov = v; \ - if (variable == -1) \ - variable = 0; \ - for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ - if (!strcmp(opt_array[_i].str, v)) { \ - variable |= opt_array[_i].val; \ - break; \ - } \ - if (!opt_array[_i].str || !*opt_array[_i].str) \ - goto needs_val; \ - } - if (!opt || !*opt) - goto no_mount_options; - ntfs_debug("Entering with mount options string: %s", opt); - while ((p = strsep(&opt, ","))) { - if ((v = strchr(p, '='))) - *v++ = 0; - NTFS_GETOPT_UID("uid", uid) - else NTFS_GETOPT_GID("gid", gid) - else NTFS_GETOPT_OCTAL("umask", fmask = dmask) - else NTFS_GETOPT_OCTAL("fmask", fmask) - else NTFS_GETOPT_OCTAL("dmask", dmask) - else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) - else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) - else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) - else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) - else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) - else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, - on_errors_arr) - else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) - ntfs_warning(vol->sb, "Ignoring obsolete option %s.", - p); - else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { - if (!strcmp(p, "iocharset")) - ntfs_warning(vol->sb, "Option iocharset is " - "deprecated. Please use " - "option nls= in " - "the future."); - if (!v || !*v) - goto needs_arg; -use_utf8: - old_nls = nls_map; - nls_map = load_nls(v); - if (!nls_map) { - if (!old_nls) { - ntfs_error(vol->sb, "NLS character set " - "%s not found.", v); - return false; - } - ntfs_error(vol->sb, "NLS character set %s not " - "found. Using previous one %s.", - v, old_nls->charset); - nls_map = old_nls; - } else /* nls_map */ { - unload_nls(old_nls); - } - } else if (!strcmp(p, "utf8")) { - bool val = false; - ntfs_warning(vol->sb, "Option utf8 is no longer " - "supported, using option nls=utf8. 
Please " - "use option nls=utf8 in the future and " - "make sure utf8 is compiled either as a " - "module or into the kernel."); - if (!v || !*v) - val = true; - else if (!simple_getbool(v, &val)) - goto needs_bool; - if (val) { - v = utf8; - goto use_utf8; - } - } else { - ntfs_error(vol->sb, "Unrecognized mount option %s.", p); - if (errors < INT_MAX) - errors++; - } -#undef NTFS_GETOPT_OPTIONS_ARRAY -#undef NTFS_GETOPT_BOOL -#undef NTFS_GETOPT -#undef NTFS_GETOPT_WITH_DEFAULT - } -no_mount_options: - if (errors && !sloppy) - return false; - if (sloppy) - ntfs_warning(vol->sb, "Sloppy option given. Ignoring " - "unrecognized mount option(s) and continuing."); - /* Keep this first! */ - if (on_errors != -1) { - if (!on_errors) { - ntfs_error(vol->sb, "Invalid errors option argument " - "or bug in options parser."); - return false; - } - } - if (nls_map) { - if (vol->nls_map && vol->nls_map != nls_map) { - ntfs_error(vol->sb, "Cannot change NLS character set " - "on remount."); - return false; - } /* else (!vol->nls_map) */ - ntfs_debug("Using NLS character set %s.", nls_map->charset); - vol->nls_map = nls_map; - } else /* (!nls_map) */ { + struct ntfs_volume *vol = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + vol->uid = make_kuid(current_user_ns(), result.uint_32); + break; + case Opt_gid: + vol->gid = make_kgid(current_user_ns(), result.uint_32); + break; + case Opt_umask: + vol->fmask = vol->dmask = result.uint_32; + break; + case Opt_dmask: + vol->dmask = result.uint_32; + break; + case Opt_fmask: + vol->fmask = result.uint_32; + break; + case Opt_errors: + vol->on_errors = result.uint_32; + break; + case Opt_nls: + case Opt_charset: + if (vol->nls_map) + unload_nls(vol->nls_map); + vol->nls_map = load_nls(param->string); if (!vol->nls_map) { - vol->nls_map = load_nls_default(); - if (!vol->nls_map) { - ntfs_error(vol->sb, "Failed to load default " - "NLS character set."); - return false; - } - ntfs_debug("Using default NLS character set (%s).", - vol->nls_map->charset); + ntfs_error(vol->sb, "Failed to load NLS table '%s'.", + param->string); + return -EINVAL; } - } - if (mft_zone_multiplier != -1) { + break; + case Opt_mft_zone_multiplier: if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != - mft_zone_multiplier) { - ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " - "on remount."); - return false; - } - if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { - ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " - "Using default value, i.e. 1."); - mft_zone_multiplier = 1; + result.int_32) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier on remount."); + return -EINVAL; } - vol->mft_zone_multiplier = mft_zone_multiplier; - } - if (!vol->mft_zone_multiplier) - vol->mft_zone_multiplier = 1; - if (on_errors != -1) - vol->on_errors = on_errors; - if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) - vol->on_errors |= ON_ERRORS_CONTINUE; - if (uid_valid(uid)) - vol->uid = uid; - if (gid_valid(gid)) - vol->gid = gid; - if (fmask != (umode_t)-1) - vol->fmask = fmask; - if (dmask != (umode_t)-1) - vol->dmask = dmask; - if (show_sys_files != -1) { - if (show_sys_files) + if (result.int_32 < 1 || result.int_32 > 4) { + ntfs_error(vol->sb, + "Invalid mft_zone_multiplier. Using default value, i.e. 
1."); + vol->mft_zone_multiplier = 1; + } else + vol->mft_zone_multiplier = result.int_32; + break; + case Opt_show_sys_files: + case Opt_show_meta: + if (result.boolean) NVolSetShowSystemFiles(vol); else NVolClearShowSystemFiles(vol); - } - if (case_sensitive != -1) { - if (case_sensitive) + break; + case Opt_case_sensitive: + if (result.boolean) NVolSetCaseSensitive(vol); else NVolClearCaseSensitive(vol); + break; + case Opt_nocase: + if (result.boolean) + NVolClearCaseSensitive(vol); + else + NVolSetCaseSensitive(vol); + break; + case Opt_preallocated_size: + vol->preallocated_size = (loff_t)result.uint_64; + break; + case Opt_sys_immutable: + if (result.boolean) + NVolSetSysImmutable(vol); + else + NVolClearSysImmutable(vol); + break; + case Opt_nohidden: + if (result.boolean) + NVolClearShowHiddenFiles(vol); + else + NVolSetShowHiddenFiles(vol); + break; + case Opt_hide_dot_files: + if (result.boolean) + NVolSetHideDotFiles(vol); + else + NVolClearHideDotFiles(vol); + break; + case Opt_check_windows_names: + if (result.boolean) + NVolSetCheckWindowsNames(vol); + else + NVolClearCheckWindowsNames(vol); + break; + case Opt_acl: + if (result.boolean) + fc->sb_flags |= SB_POSIXACL; + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_discard: + if (result.boolean) + NVolSetDiscard(vol); + else + NVolClearDiscard(vol); + break; + case Opt_disable_sparse: + if (result.boolean) + NVolSetDisableSparse(vol); + else + NVolClearDisableSparse(vol); + break; + case Opt_sparse: + break; + default: + return -EINVAL; } - if (disable_sparse != -1) { - if (disable_sparse) - NVolClearSparseEnabled(vol); - else { - if (!NVolSparseEnabled(vol) && - vol->major_ver && vol->major_ver < 3) - ntfs_warning(vol->sb, "Not enabling sparse " - "support due to NTFS volume " - "version %i.%i (need at least " - "version 3.0).", vol->major_ver, - vol->minor_ver); - else - NVolSetSparseEnabled(vol); + + return 0; +} + +static int ntfs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount"); + + sync_filesystem(sb); + + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. 
*/ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set (0x%x) and is read-only%s", + le16_to_cpu(vol->vol_flags), es); + return -EROFS; + } + if (vol->logfile_ino && !ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, + "Failed to clear dirty bit in volume information flags. Run chkdsk."); } } - return true; -needs_arg: - ntfs_error(vol->sb, "The %s option requires an argument.", p); - return false; -needs_bool: - ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); - return false; -needs_val: - ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); - return false; + + ntfs_debug("Done."); + return 0; } -#ifdef NTFS_RW +const struct option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { 0, NULL } +}; + +void ntfs_handle_error(struct super_block *sb) +{ + struct ntfs_volume *vol = NTFS_SB(sb); + + if (sb_rdonly(sb)) + return; + + if (vol->on_errors == ON_ERRORS_REMOUNT_RO) { + sb->s_flags |= SB_RDONLY; + pr_crit("(device %s): Filesystem has been set read-only\n", + sb->s_id); + } else if (vol->on_errors == ON_ERRORS_PANIC) { + panic("ntfs: (device %s): panic from previous error\n", + sb->s_id); + } else if (vol->on_errors == ON_ERRORS_CONTINUE) { + if (errseq_check(&sb->s_wb_err, vol->wb_err) == -ENODEV) { + NVolSetShutdown(vol); + vol->wb_err = sb->s_wb_err; + } + } +} /** * ntfs_write_volume_flags - write new flags to the volume information flags @@ -366,48 +335,43 @@ static bool parse_options(ntfs_volume *vol, char *opt) * * Return 0 on success and -errno on error. 
*/ -static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +static int ntfs_write_volume_flags(struct ntfs_volume *vol, const __le16 flags) { - ntfs_inode *ni = NTFS_I(vol->vol_ino); - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; + struct ntfs_inode *ni = NTFS_I(vol->vol_ino); + struct volume_information *vi; + struct ntfs_attr_search_ctx *ctx; int err; ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + mutex_lock(&ni->mrec_lock); if (vol->vol_flags == flags) goto done; - BUG_ON(!ni); - m = map_mft_record(ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ni, m); + + ctx = ntfs_attr_get_search_ctx(ni, NULL); if (!ctx) { err = -ENOMEM; goto put_unm_err_out; } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, ctx); if (err) goto put_unm_err_out; - vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + + vi = (struct volume_information *)((u8 *)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); vol->vol_flags = vi->flags = flags; - flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); done: + mutex_unlock(&ni->mrec_lock); ntfs_debug("Done."); return 0; put_unm_err_out: if (ctx) ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ni); -err_out: + mutex_unlock(&ni->mrec_lock); ntfs_error(vol->sb, "Failed with error code %i.", -err); return err; } @@ -421,7 +385,7 @@ static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) * * Return 0 on success and -errno on error. */ -static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +int ntfs_set_volume_flags(struct ntfs_volume *vol, __le16 flags) { flags &= VOLUME_FLAGS_MASK; return ntfs_write_volume_flags(vol, vol->vol_flags | flags); @@ -436,130 +400,62 @@ static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) * * Return 0 on success and -errno on error. */ -static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +int ntfs_clear_volume_flags(struct ntfs_volume *vol, __le16 flags) { flags &= VOLUME_FLAGS_MASK; flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); return ntfs_write_volume_flags(vol, flags); } -#endif /* NTFS_RW */ - -/** - * ntfs_remount - change the mount options of a mounted ntfs filesystem - * @sb: superblock of mounted ntfs filesystem - * @flags: remount flags - * @opt: remount options string - * - * Change the mount options of an already mounted ntfs filesystem. - * - * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after - * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, - * @sb->s_flags are not changed. - */ -static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +int ntfs_write_volume_label(struct ntfs_volume *vol, char *label) { - ntfs_volume *vol = NTFS_SB(sb); - - ntfs_debug("Entering with remount options string: %s", opt); - - sync_filesystem(sb); - -#ifndef NTFS_RW - /* For read-only compiled driver, enforce read-only flag. */ - *flags |= SB_RDONLY; -#else /* NTFS_RW */ - /* - * For the read-write compiled driver, if we are remounting read-write, - * make sure there are no volume errors and that no unsupported volume - * flags are set. 
Also, empty the logfile journal as it would become - * stale as soon as something is written to the volume and mark the - * volume dirty so that chkdsk is run if the volume is not umounted - * cleanly. Finally, mark the quotas out of date so Windows rescans - * the volume on boot and updates them. - * - * When remounting read-only, mark the volume clean if no volume errors - * have occurred. - */ - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { - static const char *es = ". Cannot remount read-write."; + struct ntfs_inode *vol_ni = NTFS_I(vol->vol_ino); + struct ntfs_attr_search_ctx *ctx; + __le16 *uname; + int uname_len, ret; + + uname_len = ntfs_nlstoucs(vol, label, strlen(label), + &uname, FSLABEL_MAX); + if (uname_len < 0) { + ntfs_error(vol->sb, + "Failed to convert volume label '%s' to Unicode.", + label); + return uname_len; + } + + if (uname_len > NTFS_MAX_LABEL_LEN) { + ntfs_error(vol->sb, + "Volume label is too long (max %d characters).", + NTFS_MAX_LABEL_LEN); + kvfree(uname); + return -EINVAL; + } - /* Remounting read-write. */ - if (NVolErrors(vol)) { - ntfs_error(sb, "Volume has errors and is read-only%s", - es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_IS_DIRTY) { - ntfs_error(sb, "Volume is dirty and read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { - ntfs_error(sb, "Volume has been modified by chkdsk " - "and is read-only%s", es); - return -EROFS; - } - if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set " - "(0x%x) and is read-only%s", - (unsigned)le16_to_cpu(vol->vol_flags), - es); - return -EROFS; - } - if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - ntfs_error(sb, "Failed to set dirty bit in volume " - "information flags%s", es); - return -EROFS; - } -#if 0 - // TODO: Enable this code once we start modifying anything that - // is different between NTFS 1.2 and 3.x... - /* Set NT4 compatibility flag on newer NTFS version volumes. */ - if ((vol->major_ver > 1)) { - if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - ntfs_error(sb, "Failed to set NT4 " - "compatibility flag%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } -#endif - if (!ntfs_empty_logfile(vol->logfile_ino)) { - ntfs_error(sb, "Failed to empty journal $LogFile%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_mark_quotas_out_of_date(vol)) { - ntfs_error(sb, "Failed to mark quotas out of date%s", - es); - NVolSetErrors(vol); - return -EROFS; - } - if (!ntfs_stamp_usnjrnl(vol)) { - ntfs_error(sb, "Failed to stamp transaction log " - "($UsnJrnl)%s", es); - NVolSetErrors(vol); - return -EROFS; - } - } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { - /* Remounting read-only. */ - if (!NVolErrors(vol)) { - if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); - } + mutex_lock(&vol_ni->mrec_lock); + ctx = ntfs_attr_get_search_ctx(vol_ni, NULL); + if (!ctx) { + ret = -ENOMEM; + goto out; } -#endif /* NTFS_RW */ - // TODO: Deal with *flags. 
+ if (!ntfs_attr_lookup(AT_VOLUME_NAME, NULL, 0, 0, 0, NULL, 0, + ctx)) + ntfs_attr_record_rm(ctx); + ntfs_attr_put_search_ctx(ctx); - if (!parse_options(vol, opt)) - return -EINVAL; + ret = ntfs_resident_attr_record_add(vol_ni, AT_VOLUME_NAME, AT_UNNAMED, 0, + (u8 *)uname, uname_len * sizeof(__le16), 0); +out: + mutex_unlock(&vol_ni->mrec_lock); + kvfree(uname); + mark_inode_dirty_sync(vol->vol_ino); - ntfs_debug("Done."); - return 0; + if (ret >= 0) { + kfree(vol->volume_label); + vol->volume_label = kstrdup(label, GFP_KERNEL); + ret = 0; + } + return ret; } /** @@ -575,7 +471,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * is 'true'. */ static bool is_boot_sector_ntfs(const struct super_block *sb, - const NTFS_BOOT_SECTOR *b, const bool silent) + const struct ntfs_boot_sector *b, const bool silent) { /* * Check that checksum == sum of u32 values from b to the checksum @@ -584,11 +480,11 @@ static bool is_boot_sector_ntfs(const struct super_block *sb, * ignoring the checksum which leaves the checksum out-of-date. We * report a warning if this is the case. */ - if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { - le32 *u; + if ((void *)b < (void *)&b->checksum && b->checksum && !silent) { + __le32 *u; u32 i; - for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + for (i = 0, u = (__le32 *)b; u < (__le32 *)(&b->checksum); ++u) i += le32_to_cpup(u); if (le32_to_cpu(b->checksum) != i) ntfs_warning(sb, "Invalid boot sector checksum."); @@ -598,19 +494,16 @@ static bool is_boot_sector_ntfs(const struct super_block *sb, goto not_ntfs; /* Check bytes per sector value is between 256 and 4096. */ if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || - le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) - goto not_ntfs; - /* Check sectors per cluster value is valid. */ - switch (b->bpb.sectors_per_cluster) { - case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: - break; - default: + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) goto not_ntfs; - } - /* Check the cluster size is not above the maximum (64kiB). */ - if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * - b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + /* + * Check sectors per cluster value is valid and the cluster size + * is not above the maximum (2MB). + */ + if (b->bpb.sectors_per_cluster > 0x80 && + b->bpb.sectors_per_cluster < 0xf4) goto not_ntfs; + /* Check reserved/unused fields are really zero. */ if (le16_to_cpu(b->bpb.reserved_sectors) || le16_to_cpu(b->bpb.root_entries) || @@ -653,100 +546,33 @@ static bool is_boot_sector_ntfs(const struct super_block *sb, * @sb: super block of device to read the boot sector from * @silent: if true, suppress all output * - * Reads the boot sector from the device and validates it. If that fails, tries - * to read the backup boot sector, first from the end of the device a-la NT4 and - * later and then from the middle of the device a-la NT3.51 and before. - * - * If a valid boot sector is found but it is not the primary boot sector, we - * repair the primary boot sector silently (unless the device is read-only or - * the primary boot sector is not accessible). - * - * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super - * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized - * to their respective values. - * - * Return the unlocked buffer head containing the boot sector or NULL on error. + * Reads the boot sector from the device and validates it. 
*/ -static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, +static char *read_ntfs_boot_sector(struct super_block *sb, const int silent) { - const char *read_err_str = "Unable to read %s boot sector."; - struct buffer_head *bh_primary, *bh_backup; - sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; - - /* Try to read primary boot sector. */ - if ((bh_primary = sb_bread(sb, 0))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_primary->b_data, silent)) - return bh_primary; - if (!silent) - ntfs_error(sb, "Primary boot sector is invalid."); - } else if (!silent) - ntfs_error(sb, read_err_str, "primary"); - if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { - if (bh_primary) - brelse(bh_primary); + char *boot_sector; + + boot_sector = ntfs_malloc_nofs(PAGE_SIZE); + if (!boot_sector) + return NULL; + + if (ntfs_dev_read(sb, boot_sector, 0, PAGE_SIZE)) { if (!silent) - ntfs_error(sb, "Mount option errors=recover not used. " - "Aborting without trying to recover."); + ntfs_error(sb, "Unable to read primary boot sector."); + kfree(boot_sector); return NULL; } - /* Try to read NT4+ backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* Try to read NT3.51- backup boot sector. */ - if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { - if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) - bh_backup->b_data, silent)) - goto hotfix_primary_boot_sector; + + if (!is_boot_sector_ntfs(sb, (struct ntfs_boot_sector *)boot_sector, + silent)) { if (!silent) - ntfs_error(sb, "Could not find a valid backup boot " - "sector."); - brelse(bh_backup); - } else if (!silent) - ntfs_error(sb, read_err_str, "backup"); - /* We failed. Cleanup and return. */ - if (bh_primary) - brelse(bh_primary); - return NULL; -hotfix_primary_boot_sector: - if (bh_primary) { - /* - * If we managed to read sector zero and the volume is not - * read-only, copy the found, valid backup boot sector to the - * primary boot sector. Note we only copy the actual boot - * sector structure, not the actual whole device sector as that - * may be bigger and would potentially damage the $Boot system - * file (FIXME: Would be nice to know if the backup boot sector - * on a large sector device contains the whole boot loader or - * just the first 512 bytes). - */ - if (!sb_rdonly(sb)) { - ntfs_warning(sb, "Hot-fix: Recovering invalid primary " - "boot sector from backup copy."); - memcpy(bh_primary->b_data, bh_backup->b_data, - NTFS_BLOCK_SIZE); - mark_buffer_dirty(bh_primary); - sync_dirty_buffer(bh_primary); - if (buffer_uptodate(bh_primary)) { - brelse(bh_backup); - return bh_primary; - } - ntfs_error(sb, "Hot-fix: Device write error while " - "recovering primary boot sector."); - } else { - ntfs_warning(sb, "Hot-fix: Recovery of primary boot " - "sector failed: Read-only mount."); - } - brelse(bh_primary); + ntfs_error(sb, "Primary boot sector is invalid."); + kfree(boot_sector); + return NULL; } - ntfs_warning(sb, "Using backup boot sector."); - return bh_backup; + + return boot_sector; } /** @@ -757,9 +583,10 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, * Parse the ntfs boot sector @b and store all imporant information therein in * the ntfs super block @vol. Return 'true' on success and 'false' on error. 
*/ -static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +static bool parse_ntfs_boot_sector(struct ntfs_volume *vol, + const struct ntfs_boot_sector *b) { - unsigned int sectors_per_cluster_bits, nr_hidden_sects; + unsigned int sectors_per_cluster, sectors_per_cluster_bits, nr_hidden_sects; int clusters_per_mft_record, clusters_per_index_record; s64 ll; @@ -770,14 +597,18 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, vol->sector_size_bits); if (vol->sector_size < vol->sb->s_blocksize) { - ntfs_error(vol->sb, "Sector size (%i) is smaller than the " - "device block size (%lu). This is not " - "supported. Sorry.", vol->sector_size, - vol->sb->s_blocksize); + ntfs_error(vol->sb, + "Sector size (%i) is smaller than the device block size (%lu). This is not supported.", + vol->sector_size, vol->sb->s_blocksize); return false; } + + if (b->bpb.sectors_per_cluster >= 0xf4) + sectors_per_cluster = 1U << -(s8)b->bpb.sectors_per_cluster; + else + sectors_per_cluster = b->bpb.sectors_per_cluster; ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); - sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + sectors_per_cluster_bits = ffs(sectors_per_cluster) - 1; ntfs_debug("sectors_per_cluster_bits = 0x%x", sectors_per_cluster_bits); nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); @@ -790,9 +621,9 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); if (vol->cluster_size < vol->sector_size) { - ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " - "sector size (%i). This is not supported. " - "Sorry.", vol->cluster_size, vol->sector_size); + ntfs_error(vol->sb, + "Cluster size (%i) is smaller than the sector size (%i). This is not supported.", + vol->cluster_size, vol->sector_size); return false; } clusters_per_mft_record = b->clusters_per_mft_record; @@ -821,19 +652,15 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) * we store $MFT/$DATA, the table of mft records in the page cache. */ if (vol->mft_record_size > PAGE_SIZE) { - ntfs_error(vol->sb, "Mft record size (%i) exceeds the " - "PAGE_SIZE on your system (%lu). " - "This is not supported. Sorry.", - vol->mft_record_size, PAGE_SIZE); + ntfs_error(vol->sb, + "Mft record size (%i) exceeds the PAGE_SIZE on your system (%lu). This is not supported.", + vol->mft_record_size, PAGE_SIZE); return false; } /* We cannot support mft record sizes below the sector size. */ if (vol->mft_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " - "sector size (%i). This is not supported. " - "Sorry.", vol->mft_record_size, - vol->sector_size); - return false; + ntfs_warning(vol->sb, "Mft record size (%i) is smaller than the sector size (%i).", + vol->mft_record_size, vol->sector_size); } clusters_per_index_record = b->clusters_per_index_record; ntfs_debug("clusters_per_index_record = %i (0x%x)", @@ -860,10 +687,9 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) vol->index_record_size_bits); /* We cannot support index record sizes below the sector size. */ if (vol->index_record_size < vol->sector_size) { - ntfs_error(vol->sb, "Index record size (%i) is smaller than " - "the sector size (%i). This is not " - "supported. 
Sorry.", vol->index_record_size, - vol->sector_size); + ntfs_error(vol->sb, + "Index record size (%i) is smaller than the sector size (%i). This is not supported.", + vol->index_record_size, vol->sector_size); return false; } /* @@ -871,47 +697,29 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) * Windows currently only uses 32 bits to save the clusters so we do * the same as it is much faster on 32-bit CPUs. */ - ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + ll = le64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; if ((u64)ll >= 1ULL << 32) { - ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry."); + ntfs_error(vol->sb, "Cannot handle 64-bit clusters."); return false; } vol->nr_clusters = ll; - ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); - /* - * On an architecture where unsigned long is 32-bits, we restrict the - * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler - * will hopefully optimize the whole check away. - */ - if (sizeof(unsigned long) < 8) { - if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { - ntfs_error(vol->sb, "Volume size (%lluTiB) is too " - "large for this architecture. " - "Maximum supported is 2TiB. Sorry.", - (unsigned long long)ll >> (40 - - vol->cluster_size_bits)); - return false; - } - } - ll = sle64_to_cpu(b->mft_lcn); + ntfs_debug("vol->nr_clusters = 0x%llx", vol->nr_clusters); + ll = le64_to_cpu(b->mft_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " - "volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of volume. Weird.", + ll, ll); return false; } vol->mft_lcn = ll; - ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); - ll = sle64_to_cpu(b->mftmirr_lcn); + ntfs_debug("vol->mft_lcn = 0x%llx", vol->mft_lcn); + ll = le64_to_cpu(b->mftmirr_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " - "of volume. Weird.", (unsigned long long)ll, - (unsigned long long)ll); + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end of volume. Weird.", + ll, ll); return false; } vol->mftmirr_lcn = ll; - ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); -#ifdef NTFS_RW + ntfs_debug("vol->mftmirr_lcn = 0x%llx", vol->mftmirr_lcn); /* * Work out the size of the mft mirror in number of mft records. If the * cluster size is less than or equal to the size taken by four mft @@ -926,10 +734,27 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) vol->mftmirr_size = vol->cluster_size >> vol->mft_record_size_bits; ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); -#endif /* NTFS_RW */ vol->serial_no = le64_to_cpu(b->volume_serial_number); - ntfs_debug("vol->serial_no = 0x%llx", - (unsigned long long)vol->serial_no); + ntfs_debug("vol->serial_no = 0x%llx", vol->serial_no); + + vol->sparse_compression_unit = 4; + if (vol->cluster_size > 4096) { + switch (vol->cluster_size) { + case 65536: + vol->sparse_compression_unit = 0; + break; + case 32768: + vol->sparse_compression_unit = 1; + break; + case 16384: + vol->sparse_compression_unit = 2; + break; + case 8192: + vol->sparse_compression_unit = 3; + break; + } + } + return true; } @@ -939,15 +764,12 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) * * Setup the cluster (lcn) and mft allocators to the starting values. 
*/ -static void ntfs_setup_allocators(ntfs_volume *vol) +static void ntfs_setup_allocators(struct ntfs_volume *vol) { -#ifdef NTFS_RW - LCN mft_zone_size, mft_lcn; -#endif /* NTFS_RW */ + s64 mft_zone_size, mft_lcn; ntfs_debug("vol->mft_zone_multiplier = 0x%x", vol->mft_zone_multiplier); -#ifdef NTFS_RW /* Determine the size of the MFT zone. */ mft_zone_size = vol->nr_clusters; switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ @@ -968,8 +790,7 @@ static void ntfs_setup_allocators(ntfs_volume *vol) } /* Setup the mft zone. */ vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; - ntfs_debug("vol->mft_zone_pos = 0x%llx", - (unsigned long long)vol->mft_zone_pos); + ntfs_debug("vol->mft_zone_pos = 0x%llx", vol->mft_zone_pos); /* * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs * source) and if the actual mft_lcn is in the expected place or even @@ -979,14 +800,13 @@ static void ntfs_setup_allocators(ntfs_volume *vol) * On non-standard volumes we do not protect it as the overhead would * be higher than the speed increase we would get by doing it. */ - mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + mft_lcn = NTFS_B_TO_CLU(vol, 8192 + 2 * vol->cluster_size - 1); if (mft_lcn * vol->cluster_size < 16 * 1024) - mft_lcn = (16 * 1024 + vol->cluster_size - 1) / - vol->cluster_size; + mft_lcn = (16 * 1024 + vol->cluster_size - 1) >> + vol->cluster_size_bits; if (vol->mft_zone_start <= mft_lcn) vol->mft_zone_start = 0; - ntfs_debug("vol->mft_zone_start = 0x%llx", - (unsigned long long)vol->mft_zone_start); + ntfs_debug("vol->mft_zone_start = 0x%llx", vol->mft_zone_start); /* * Need to cap the mft zone on non-standard volumes so that it does * not point outside the boundaries of the volume. We do this by @@ -997,48 +817,47 @@ static void ntfs_setup_allocators(ntfs_volume *vol) mft_zone_size >>= 1; vol->mft_zone_end = vol->mft_lcn + mft_zone_size; } - ntfs_debug("vol->mft_zone_end = 0x%llx", - (unsigned long long)vol->mft_zone_end); + ntfs_debug("vol->mft_zone_end = 0x%llx", vol->mft_zone_end); /* * Set the current position within each data zone to the start of the * respective zone. */ vol->data1_zone_pos = vol->mft_zone_end; - ntfs_debug("vol->data1_zone_pos = 0x%llx", - (unsigned long long)vol->data1_zone_pos); + ntfs_debug("vol->data1_zone_pos = 0x%llx", vol->data1_zone_pos); vol->data2_zone_pos = 0; - ntfs_debug("vol->data2_zone_pos = 0x%llx", - (unsigned long long)vol->data2_zone_pos); + ntfs_debug("vol->data2_zone_pos = 0x%llx", vol->data2_zone_pos); /* Set the mft data allocation position to mft record 24. */ vol->mft_data_pos = 24; - ntfs_debug("vol->mft_data_pos = 0x%llx", - (unsigned long long)vol->mft_data_pos); -#endif /* NTFS_RW */ + ntfs_debug("vol->mft_data_pos = 0x%llx", vol->mft_data_pos); } -#ifdef NTFS_RW - +static struct lock_class_key mftmirr_runlist_lock_key, + mftmirr_mrec_lock_key; /** * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume * @vol: ntfs super block describing device whose mft mirror to load * * Return 'true' on success or 'false' on error. */ -static bool load_and_init_mft_mirror(ntfs_volume *vol) +static bool load_and_init_mft_mirror(struct ntfs_volume *vol) { struct inode *tmp_ino; - ntfs_inode *tmp_ni; + struct ntfs_inode *tmp_ni; ntfs_debug("Entering."); /* Get mft mirror inode. */ tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (IS_ERR(tmp_ino)) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); /* Caller will display error message. 
*/ return false; } + lockdep_set_class(&NTFS_I(tmp_ino)->runlist.lock, + &mftmirr_runlist_lock_key); + lockdep_set_class(&NTFS_I(tmp_ino)->mrec_lock, + &mftmirr_mrec_lock_key); /* * Re-initialize some specifics about $MFTMirr's inode as * ntfs_read_inode() will have set up the default ones. @@ -1052,7 +871,7 @@ static bool load_and_init_mft_mirror(ntfs_volume *vol) tmp_ino->i_op = &ntfs_empty_inode_ops; tmp_ino->i_fop = &ntfs_empty_file_ops; /* Put in our special address space operations. */ - tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ino->i_mapping->a_ops = &ntfs_aops; tmp_ni = NTFS_I(tmp_ino); /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ NInoSetMstProtected(tmp_ni); @@ -1078,23 +897,19 @@ static bool load_and_init_mft_mirror(ntfs_volume *vol) * mapped into memory. The mft mirror write code requires this and will BUG() * should it find an unmapped runlist element. */ -static bool check_mft_mirror(ntfs_volume *vol) +static bool check_mft_mirror(struct ntfs_volume *vol) { struct super_block *sb = vol->sb; - ntfs_inode *mirr_ni; - struct page *mft_page, *mirr_page; - u8 *kmft, *kmirr; - runlist_element *rl, rl2[2]; + struct ntfs_inode *mirr_ni; + struct folio *mft_folio = NULL, *mirr_folio = NULL; + u8 *kmft = NULL, *kmirr = NULL; + struct runlist_element *rl, rl2[2]; pgoff_t index; int mrecs_per_page, i; ntfs_debug("Entering."); /* Compare contents of $MFT and $MFTMirr. */ mrecs_per_page = PAGE_SIZE / vol->mft_record_size; - BUG_ON(!mrecs_per_page); - BUG_ON(!vol->mftmirr_size); - mft_page = mirr_page = NULL; - kmft = kmirr = NULL; index = i = 0; do { u32 bytes; @@ -1102,79 +917,80 @@ static bool check_mft_mirror(ntfs_volume *vol) /* Switch pages if necessary. */ if (!(i % mrecs_per_page)) { if (index) { - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); + kunmap_local(kmirr); + folio_put(mirr_folio); + kunmap_local(kmft); + folio_put(mft_folio); } /* Get the $MFT page. */ - mft_page = ntfs_map_page(vol->mft_ino->i_mapping, - index); - if (IS_ERR(mft_page)) { + mft_folio = read_mapping_folio(vol->mft_ino->i_mapping, + index, NULL); + if (IS_ERR(mft_folio)) { ntfs_error(sb, "Failed to read $MFT."); return false; } - kmft = page_address(mft_page); + kmft = kmap_local_folio(mft_folio, 0); /* Get the $MFTMirr page. */ - mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, - index); - if (IS_ERR(mirr_page)) { + mirr_folio = read_mapping_folio(vol->mftmirr_ino->i_mapping, + index, NULL); + if (IS_ERR(mirr_folio)) { ntfs_error(sb, "Failed to read $MFTMirr."); goto mft_unmap_out; } - kmirr = page_address(mirr_page); + kmirr = kmap_local_folio(mirr_folio, 0); ++index; } + /* Do not check the record if it is not in use. */ - if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + if (((struct mft_record *)kmft)->flags & MFT_RECORD_IN_USE) { /* Make sure the record is ok. */ - if (ntfs_is_baad_recordp((le32*)kmft)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "record %i.", i); + if (ntfs_is_baad_recordp((__le32 *)kmft)) { + ntfs_error(sb, + "Incomplete multi sector transfer detected in mft record %i.", + i); mm_unmap_out: - ntfs_unmap_page(mirr_page); + kunmap_local(kmirr); + folio_put(mirr_folio); mft_unmap_out: - ntfs_unmap_page(mft_page); + kunmap_local(kmft); + folio_put(mft_folio); return false; } } /* Do not check the mirror record if it is not in use. 
*/ - if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { - if (ntfs_is_baad_recordp((le32*)kmirr)) { - ntfs_error(sb, "Incomplete multi sector " - "transfer detected in mft " - "mirror record %i.", i); + if (((struct mft_record *)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((__le32 *)kmirr)) { + ntfs_error(sb, + "Incomplete multi sector transfer detected in mft mirror record %i.", + i); goto mm_unmap_out; } } /* Get the amount of data in the current record. */ - bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmft)) { - bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); - if (bytes < sizeof(MFT_RECORD_OLD) || - bytes > vol->mft_record_size || - ntfs_is_baad_recordp((le32*)kmirr)) + bytes = le32_to_cpu(((struct mft_record *)kmft)->bytes_in_use); + if (bytes < sizeof(struct mft_record_old) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((__le32 *)kmft)) { + bytes = le32_to_cpu(((struct mft_record *)kmirr)->bytes_in_use); + if (bytes < sizeof(struct mft_record_old) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((__le32 *)kmirr)) bytes = vol->mft_record_size; } - /* Compare the two records. */ - if (memcmp(kmft, kmirr, bytes)) { - ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " - "match. Run ntfsfix or chkdsk.", i); - goto mm_unmap_out; - } kmft += vol->mft_record_size; kmirr += vol->mft_record_size; } while (++i < vol->mftmirr_size); - /* Release the last pages. */ - ntfs_unmap_page(mft_page); - ntfs_unmap_page(mirr_page); + /* Release the last folios. */ + kunmap_local(kmirr); + folio_put(mirr_folio); + kunmap_local(kmft); + folio_put(mft_folio); /* Construct the mft mirror runlist by hand. */ rl2[0].vcn = 0; rl2[0].lcn = vol->mftmirr_lcn; - rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + - vol->cluster_size - 1) / vol->cluster_size; + rl2[0].length = NTFS_B_TO_CLU(vol, vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1); rl2[1].vcn = rl2[0].length; rl2[1].lcn = LCN_ENOENT; rl2[1].length = 0; @@ -1190,8 +1006,7 @@ static bool check_mft_mirror(ntfs_volume *vol) do { if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || rl2[i].length != rl[i].length) { - ntfs_error(sb, "$MFTMirr location mismatch. " - "Run chkdsk."); + ntfs_error(sb, "$MFTMirr location mismatch. Run chkdsk."); up_read(&mirr_ni->runlist.lock); return false; } @@ -1203,32 +1018,29 @@ static bool check_mft_mirror(ntfs_volume *vol) /** * load_and_check_logfile - load and check the logfile inode for a volume - * @vol: ntfs super block describing device whose logfile to load * - * Return 'true' on success or 'false' on error. + * Return 0 on success or errno on error. */ -static bool load_and_check_logfile(ntfs_volume *vol, - RESTART_PAGE_HEADER **rp) +static int load_and_check_logfile(struct ntfs_volume *vol, + struct restart_page_header **rp) { struct inode *tmp_ino; + int err = 0; ntfs_debug("Entering."); tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (IS_ERR(tmp_ino)) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); /* Caller will display error message. */ - return false; - } - if (!ntfs_check_logfile(tmp_ino, rp)) { - iput(tmp_ino); - /* ntfs_check_logfile() will have displayed error output. 
*/ - return false; + return -ENOENT; } + if (!ntfs_check_logfile(tmp_ino, rp)) + err = -EINVAL; NInoSetSparseDisabled(NTFS_I(tmp_ino)); vol->logfile_ino = tmp_ino; ntfs_debug("Done."); - return true; + return err; } #define NTFS_HIBERFIL_HEADER_SIZE 4096 @@ -1257,21 +1069,21 @@ static bool load_and_check_logfile(ntfs_volume *vol, * Return 0 if Windows is not hibernated on the volume, >0 if Windows is * hibernated on the volume, and -errno on error. */ -static int check_windows_hibernation_status(ntfs_volume *vol) +static int check_windows_hibernation_status(struct ntfs_volume *vol) { - MFT_REF mref; - struct inode *vi; - struct page *page; - u32 *kaddr, *kend; - ntfs_name *name = NULL; - int ret = 1; - static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + static const __le16 hiberfil[13] = { cpu_to_le16('h'), cpu_to_le16('i'), cpu_to_le16('b'), cpu_to_le16('e'), cpu_to_le16('r'), cpu_to_le16('f'), cpu_to_le16('i'), cpu_to_le16('l'), cpu_to_le16('.'), cpu_to_le16('s'), cpu_to_le16('y'), cpu_to_le16('s'), 0 }; + u64 mref; + struct inode *vi; + struct folio *folio; + u32 *kaddr, *kend, *start_addr = NULL; + struct ntfs_name *name = NULL; + int ret = 1; ntfs_debug("Entering."); /* @@ -1282,66 +1094,57 @@ static int check_windows_hibernation_status(ntfs_volume *vol) mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); inode_unlock(vol->root_ino); + kfree(name); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ if (ret == -ENOENT) { - ntfs_debug("hiberfil.sys not present. Windows is not " - "hibernated on the volume."); + ntfs_debug("hiberfil.sys not present. Windows is not hibernated on the volume."); return 0; } /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "hiberfil.sys."); + ntfs_error(vol->sb, "Failed to find inode number for hiberfil.sys."); return ret; } - /* We do not care for the type of match that was found. */ - kfree(name); /* Get the inode. */ vi = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(vi) || is_bad_inode(vi)) { + if (IS_ERR(vi)) { if (!IS_ERR(vi)) iput(vi); ntfs_error(vol->sb, "Failed to load hiberfil.sys."); return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; } if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { - ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " - "Windows is hibernated on the volume. This " - "is not the system volume.", i_size_read(vi)); + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). Windows is hibernated on the volume. This is not the system volume.", + i_size_read(vi)); goto iput_out; } - page = ntfs_map_page(vi->i_mapping, 0); - if (IS_ERR(page)) { + + folio = read_mapping_folio(vi->i_mapping, 0, NULL); + if (IS_ERR(folio)) { ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); - ret = PTR_ERR(page); + ret = PTR_ERR(folio); goto iput_out; } - kaddr = (u32*)page_address(page); - if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { - ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " - "hibernated on the volume. This is the " - "system volume."); + start_addr = (u32 *)kmap_local_folio(folio, 0); + kaddr = start_addr; + if (*(__le32 *)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is hibernated on the volume. 
This is the system volume."); goto unm_iput_out; } kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); do { if (unlikely(*kaddr)) { - ntfs_debug("hiberfil.sys is larger than 4kiB " - "(0x%llx), does not contain the " - "\"hibr\" magic, and does not have a " - "zero header. Windows is hibernated " - "on the volume. This is not the " - "system volume.", i_size_read(vi)); + ntfs_debug("hiberfil.sys is larger than 4kiB (0x%llx), does not contain the \"hibr\" magic, and does not have a zero header. Windows is hibernated on the volume. This is not the system volume.", + i_size_read(vi)); goto unm_iput_out; } } while (++kaddr < kend); - ntfs_debug("hiberfil.sys contains a zero header. Windows is not " - "hibernated on the volume. This is the system " - "volume."); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not hibernated on the volume. This is the system volume."); ret = 0; unm_iput_out: - ntfs_unmap_page(page); + kunmap_local(start_addr); + folio_put(folio); iput_out: iput(vi); return ret; @@ -1354,17 +1157,17 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Return 'true' on success or 'false' on error. If $Quota is not present, we * leave vol->quota_ino as NULL and return success. */ -static bool load_and_init_quota(ntfs_volume *vol) +static bool load_and_init_quota(struct ntfs_volume *vol) { - MFT_REF mref; - struct inode *tmp_ino; - ntfs_name *name = NULL; - static const ntfschar Quota[7] = { cpu_to_le16('$'), + static const __le16 Quota[7] = { cpu_to_le16('$'), cpu_to_le16('Q'), cpu_to_le16('u'), cpu_to_le16('o'), cpu_to_le16('t'), cpu_to_le16('a'), 0 }; - static ntfschar Q[3] = { cpu_to_le16('$'), + static __le16 Q[3] = { cpu_to_le16('$'), cpu_to_le16('Q'), 0 }; + struct ntfs_name *name = NULL; + u64 mref; + struct inode *tmp_ino; ntfs_debug("Entering."); /* @@ -1375,14 +1178,14 @@ static bool load_and_init_quota(ntfs_volume *vol) mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); inode_unlock(vol->extend_ino); + kfree(name); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have * never been enabled on this volume, just return success. */ if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$Quota not present. Volume does not have " - "quotas enabled."); + ntfs_debug("$Quota not present. Volume does not have quotas enabled."); /* * No need to try to set quotas out of date if they are * not enabled. @@ -1394,11 +1197,9 @@ static bool load_and_init_quota(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to find inode number for $Quota."); return false; } - /* We do not care for the type of match that was found. */ - kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (IS_ERR(tmp_ino)) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $Quota."); @@ -1416,186 +1217,26 @@ static bool load_and_init_quota(ntfs_volume *vol) return true; } -/** - * load_and_init_usnjrnl - load and setup the transaction log if present - * @vol: ntfs super block describing device whose usnjrnl file to load - * - * Return 'true' on success or 'false' on error. - * - * If $UsnJrnl is not present or in the process of being disabled, we set - * NVolUsnJrnlStamped() and return success. - * - * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, - * i.e. 
transaction logging has only just been enabled or the journal has been - * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() - * and return success. - */ -static bool load_and_init_usnjrnl(ntfs_volume *vol) -{ - MFT_REF mref; - struct inode *tmp_ino; - ntfs_inode *tmp_ni; - struct page *page; - ntfs_name *name = NULL; - USN_HEADER *uh; - static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), - cpu_to_le16('U'), cpu_to_le16('s'), - cpu_to_le16('n'), cpu_to_le16('J'), - cpu_to_le16('r'), cpu_to_le16('n'), - cpu_to_le16('l'), 0 }; - static ntfschar Max[5] = { cpu_to_le16('$'), - cpu_to_le16('M'), cpu_to_le16('a'), - cpu_to_le16('x'), 0 }; - static ntfschar J[3] = { cpu_to_le16('$'), - cpu_to_le16('J'), 0 }; - - ntfs_debug("Entering."); - /* - * Find the inode number for the transaction log file by looking up the - * filename $UsnJrnl in the extended system files directory $Extend. - */ - inode_lock(vol->extend_ino); - mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, - &name); - inode_unlock(vol->extend_ino); - if (IS_ERR_MREF(mref)) { - /* - * If the file does not exist, transaction logging is disabled, - * just return success. - */ - if (MREF_ERR(mref) == -ENOENT) { - ntfs_debug("$UsnJrnl not present. Volume does not " - "have transaction logging enabled."); -not_enabled: - /* - * No need to try to stamp the transaction log if - * transaction logging is not enabled. - */ - NVolSetUsnJrnlStamped(vol); - return true; - } - /* A real error occurred. */ - ntfs_error(vol->sb, "Failed to find inode number for " - "$UsnJrnl."); - return false; - } - /* We do not care for the type of match that was found. */ - kfree(name); - /* Get the inode. */ - tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { - if (!IS_ERR(tmp_ino)) - iput(tmp_ino); - ntfs_error(vol->sb, "Failed to load $UsnJrnl."); - return false; - } - vol->usnjrnl_ino = tmp_ino; - /* - * If the transaction log is in the process of being deleted, we can - * ignore it. - */ - if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { - ntfs_debug("$UsnJrnl in the process of being disabled. " - "Volume does not have transaction logging " - "enabled."); - goto not_enabled; - } - /* Get the $DATA/$Max attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - vol->usnjrnl_max_ino = tmp_ino; - if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { - ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " - "attribute (size is 0x%llx but should be at " - "least 0x%zx bytes).", i_size_read(tmp_ino), - sizeof(USN_HEADER)); - return false; - } - /* Get the $DATA/$J attribute. */ - tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); - if (IS_ERR(tmp_ino)) { - ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " - "attribute."); - return false; - } - vol->usnjrnl_j_ino = tmp_ino; - /* Verify $J is non-resident and sparse. */ - tmp_ni = NTFS_I(vol->usnjrnl_j_ino); - if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { - ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " - "and/or not sparse."); - return false; - } - /* Read the USN_HEADER from $DATA/$Max. 
*/ - page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " - "attribute."); - return false; - } - uh = (USN_HEADER*)page_address(page); - /* Sanity check the $Max. */ - if (unlikely(sle64_to_cpu(uh->allocation_delta) > - sle64_to_cpu(uh->maximum_size))) { - ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " - "maximum size (0x%llx). $UsnJrnl is corrupt.", - (long long)sle64_to_cpu(uh->allocation_delta), - (long long)sle64_to_cpu(uh->maximum_size)); - ntfs_unmap_page(page); - return false; - } - /* - * If the transaction log has been stamped and nothing has been written - * to it since, we do not need to stamp it. - */ - if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= - i_size_read(vol->usnjrnl_j_ino))) { - if (likely(sle64_to_cpu(uh->lowest_valid_usn) == - i_size_read(vol->usnjrnl_j_ino))) { - ntfs_unmap_page(page); - ntfs_debug("$UsnJrnl is enabled but nothing has been " - "logged since it was last stamped. " - "Treating this as if the volume does " - "not have transaction logging " - "enabled."); - goto not_enabled; - } - ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " - "which is out of bounds (0x%llx). $UsnJrnl " - "is corrupt.", - (long long)sle64_to_cpu(uh->lowest_valid_usn), - i_size_read(vol->usnjrnl_j_ino)); - ntfs_unmap_page(page); - return false; - } - ntfs_unmap_page(page); - ntfs_debug("Done."); - return true; -} - /** * load_and_init_attrdef - load the attribute definitions table for a volume * @vol: ntfs super block describing device whose attrdef to load * * Return 'true' on success or 'false' on error. */ -static bool load_and_init_attrdef(ntfs_volume *vol) +static bool load_and_init_attrdef(struct ntfs_volume *vol) { loff_t i_size; struct super_block *sb = vol->sb; struct inode *ino; - struct page *page; + struct folio *folio; + u8 *addr; pgoff_t index, max_index; unsigned int size; ntfs_debug("Entering."); /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ ino = ntfs_iget(sb, FILE_AttrDef); - if (IS_ERR(ino) || is_bad_inode(ino)) { + if (IS_ERR(ino)) { if (!IS_ERR(ino)) iput(ino); goto failed; @@ -1605,7 +1246,7 @@ static bool load_and_init_attrdef(ntfs_volume *vol) i_size = i_size_read(ino); if (i_size <= 0 || i_size > 0x7fffffff) goto iput_failed; - vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + vol->attrdef = (struct attr_def *)ntfs_malloc_nofs(i_size); if (!vol->attrdef) goto iput_failed; index = 0; @@ -1614,12 +1255,14 @@ static bool load_and_init_attrdef(ntfs_volume *vol) while (index < max_index) { /* Read the attrdef table and copy it into the linear buffer. */ read_partial_attrdef_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) + folio = read_mapping_folio(ino->i_mapping, index, NULL); + if (IS_ERR(folio)) goto free_iput_failed; - memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); + addr = kmap_local_folio(folio, 0); + memcpy((u8 *)vol->attrdef + (index++ << PAGE_SHIFT), + addr, size); + kunmap_local(addr); + folio_put(folio); } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; @@ -1640,20 +1283,19 @@ static bool load_and_init_attrdef(ntfs_volume *vol) return false; } -#endif /* NTFS_RW */ - /** * load_and_init_upcase - load the upcase table for an ntfs volume * @vol: ntfs super block describing device whose upcase to load * * Return 'true' on success or 'false' on error. 
*/ -static bool load_and_init_upcase(ntfs_volume *vol) +static bool load_and_init_upcase(struct ntfs_volume *vol) { loff_t i_size; struct super_block *sb = vol->sb; struct inode *ino; - struct page *page; + struct folio *folio; + u8 *addr; pgoff_t index, max_index; unsigned int size; int i, max; @@ -1661,20 +1303,20 @@ static bool load_and_init_upcase(ntfs_volume *vol) ntfs_debug("Entering."); /* Read upcase table and setup vol->upcase and vol->upcase_len. */ ino = ntfs_iget(sb, FILE_UpCase); - if (IS_ERR(ino) || is_bad_inode(ino)) { + if (IS_ERR(ino)) { if (!IS_ERR(ino)) iput(ino); goto upcase_failed; } /* * The upcase size must not be above 64k Unicode characters, must not - * be zero and must be a multiple of sizeof(ntfschar). + * be zero and must be a multiple of sizeof(__le16). */ i_size = i_size_read(ino); - if (!i_size || i_size & (sizeof(ntfschar) - 1) || - i_size > 64ULL * 1024 * sizeof(ntfschar)) + if (!i_size || i_size & (sizeof(__le16) - 1) || + i_size > 64ULL * 1024 * sizeof(__le16)) goto iput_upcase_failed; - vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + vol->upcase = (__le16 *)ntfs_malloc_nofs(i_size); if (!vol->upcase) goto iput_upcase_failed; index = 0; @@ -1683,13 +1325,15 @@ static bool load_and_init_upcase(ntfs_volume *vol) while (index < max_index) { /* Read the upcase table and copy it into the linear buffer. */ read_partial_upcase_page: - page = ntfs_map_page(ino->i_mapping, index); - if (IS_ERR(page)) + folio = read_mapping_folio(ino->i_mapping, index, NULL); + if (IS_ERR(folio)) goto iput_upcase_failed; - memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), - page_address(page), size); - ntfs_unmap_page(page); - } + addr = kmap_local_folio(folio, 0); + memcpy((char *)vol->upcase + (index++ << PAGE_SHIFT), + addr, size); + kunmap_local(addr); + folio_put(folio); + }; if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) @@ -1697,12 +1341,11 @@ static bool load_and_init_upcase(ntfs_volume *vol) } vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", - i_size, 64 * 1024 * sizeof(ntfschar)); + i_size, 64 * 1024 * sizeof(__le16)); iput(ino); mutex_lock(&ntfs_lock); if (!default_upcase) { - ntfs_debug("Using volume specified $UpCase since default is " - "not present."); + ntfs_debug("Using volume specified $UpCase since default is not present."); mutex_unlock(&ntfs_lock); return true; } @@ -1718,13 +1361,11 @@ static bool load_and_init_upcase(ntfs_volume *vol) vol->upcase_len = max; ntfs_nr_upcase_users++; mutex_unlock(&ntfs_lock); - ntfs_debug("Volume specified $UpCase matches default. Using " - "default."); + ntfs_debug("Volume specified $UpCase matches default. Using default."); return true; } mutex_unlock(&ntfs_lock); - ntfs_debug("Using volume specified $UpCase since it does not match " - "the default."); + ntfs_debug("Using volume specified $UpCase since it does not match the default."); return true; iput_upcase_failed: iput(ino); @@ -1737,8 +1378,7 @@ static bool load_and_init_upcase(ntfs_volume *vol) vol->upcase_len = default_upcase_len; ntfs_nr_upcase_users++; mutex_unlock(&ntfs_lock); - ntfs_error(sb, "Failed to load $UpCase from the volume. Using " - "default."); + ntfs_error(sb, "Failed to load $UpCase from the volume. Using default."); return true; } mutex_unlock(&ntfs_lock); @@ -1763,47 +1403,30 @@ static struct lock_class_key * * Return 'true' on success or 'false' on error. 
*/ -static bool load_system_files(ntfs_volume *vol) +static bool load_system_files(struct ntfs_volume *vol) { struct super_block *sb = vol->sb; - MFT_RECORD *m; - VOLUME_INFORMATION *vi; - ntfs_attr_search_ctx *ctx; -#ifdef NTFS_RW - RESTART_PAGE_HEADER *rp; + struct mft_record *m; + struct volume_information *vi; + struct ntfs_attr_search_ctx *ctx; + struct restart_page_header *rp; int err; -#endif /* NTFS_RW */ ntfs_debug("Entering."); -#ifdef NTFS_RW /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { - static const char *es1 = "Failed to load $MFTMirr"; - static const char *es2 = "$MFTMirr does not match $MFT"; - static const char *es3 = ". Run ntfsfix and/or chkdsk."; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - !vol->mftmirr_ino ? es1 : es2, - es3); - goto iput_mirr_err_out; - } + if (!sb_rdonly(sb) && vol->on_errors == ON_ERRORS_REMOUNT_RO) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsck and/or chkdsk."; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", - !vol->mftmirr_ino ? es1 : es2, es3); - /* This will prevent a read-write remount. */ + } NVolSetErrors(vol); } -#endif /* NTFS_RW */ /* Get mft bitmap attribute inode. */ vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); if (IS_ERR(vol->mftbmp_ino)) { @@ -1817,21 +1440,19 @@ static bool load_system_files(ntfs_volume *vol) /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ if (!load_and_init_upcase(vol)) goto iput_mftbmp_err_out; -#ifdef NTFS_RW /* * Read attribute definitions table and setup @vol->attrdef and * @vol->attrdef_size. */ if (!load_and_init_attrdef(vol)) goto iput_upcase_err_out; -#endif /* NTFS_RW */ /* * Get the cluster allocation bitmap inode and verify the size, no * need for any locking at this stage as we are already running * exclusively as we are mount in progress task. */ vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); - if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (IS_ERR(vol->lcnbmp_ino)) { if (!IS_ERR(vol->lcnbmp_ino)) iput(vol->lcnbmp_ino); goto bitmap_failed; @@ -1853,7 +1474,7 @@ static bool load_system_files(ntfs_volume *vol) * version. 
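 *
 * Note that metadata failures on a read-write mount are handled with one
 * recurring pattern throughout this function (sketch, mirroring the
 * hunks above and below):
 *
 *	if (!sb_rdonly(sb) && vol->on_errors == ON_ERRORS_REMOUNT_RO) {
 *		sb->s_flags |= SB_RDONLY;
 *		ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
 *	}
 *	NVolSetErrors(vol);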
*/ vol->vol_ino = ntfs_iget(sb, FILE_Volume); - if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (IS_ERR(vol->vol_ino)) { if (!IS_ERR(vol->vol_ino)) iput(vol->vol_ino); volume_failed: @@ -1866,10 +1487,25 @@ static bool load_system_files(ntfs_volume *vol) iput(vol->vol_ino); goto volume_failed; } - if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + + ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m); + if (!ctx) { ntfs_error(sb, "Failed to get attribute search context."); goto get_ctx_vol_failed; } + + if (!ntfs_attr_lookup(AT_VOLUME_NAME, NULL, 0, 0, 0, NULL, 0, ctx) && + !ctx->attr->non_resident && + !(ctx->attr->flags & (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED)) && + le32_to_cpu(ctx->attr->data.resident.value_length) > 0) { + err = ntfs_ucstonls(vol, (__le16 *)((u8 *)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)), + le32_to_cpu(ctx->attr->data.resident.value_length) / 2, + &vol->volume_label, NTFS_MAX_LABEL_LEN); + if (err < 0) + vol->volume_label = NULL; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, ctx) || ctx->attr->non_resident || ctx->attr->flags) { err_put_vol: @@ -1878,28 +1514,22 @@ static bool load_system_files(ntfs_volume *vol) unmap_mft_record(NTFS_I(vol->vol_ino)); goto iput_volume_failed; } - vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + vi = (struct volume_information *)((char *)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Some bounds checks. */ - if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + if ((u8 *)vi < (u8 *)ctx->attr || (u8 *)vi + le32_to_cpu(ctx->attr->data.resident.value_length) > - (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + (u8 *)ctx->attr + le32_to_cpu(ctx->attr->length)) goto err_put_vol; - /* Copy the volume flags and version to the ntfs_volume structure. */ + /* Copy the volume flags and version to the struct ntfs_volume structure. */ vol->vol_flags = vi->flags; vol->major_ver = vi->major_ver; vol->minor_ver = vi->minor_ver; ntfs_attr_put_search_ctx(ctx); unmap_mft_record(NTFS_I(vol->vol_ino)); - pr_info("volume version %i.%i.\n", vol->major_ver, - vol->minor_ver); - if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { - ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " - "volume version %i.%i (need at least version " - "3.0).", vol->major_ver, vol->minor_ver); - NVolClearSparseEnabled(vol); - } -#ifdef NTFS_RW + pr_info("volume version %i.%i, dev %s, cluster size %d\n", + vol->major_ver, vol->minor_ver, sb->s_id, vol->cluster_size); + /* Make sure that no unsupported volume flags are set. */ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { static const char *es1a = "Volume is dirty"; @@ -1917,25 +1547,14 @@ static bool load_system_files(ntfs_volume *vol) es2 = es2b; } else { es1 = es1c; - ntfs_warning(sb, "Unsupported volume flags 0x%x " - "encountered.", - (unsigned)le16_to_cpu(vol->vol_flags)); + ntfs_warning(sb, "Unsupported volume flags 0x%x encountered.", + (unsigned int)le16_to_cpu(vol->vol_flags)); } /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_vol_err_out; - } + if (!sb_rdonly(sb) && vol->on_errors == ON_ERRORS_REMOUNT_RO) { sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. 
Will not be able to remount " - "read-write%s", es1, es2); + } /* * Do not set NVolErrors() because ntfs_remount() re-checks the * flags which we need to do in case any flags have changed. @@ -1946,47 +1565,25 @@ static bool load_system_files(ntfs_volume *vol) * was shutdown cleanly. */ rp = NULL; - if (!load_and_check_logfile(vol, &rp) || - !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { - static const char *es1a = "Failed to load $LogFile"; - static const char *es1b = "$LogFile is not clean"; - static const char *es2 = ". Mount in Windows."; - const char *es1; - - es1 = !vol->logfile_ino ? es1a : es1b; + err = load_and_check_logfile(vol, &rp); + if (err) { /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - if (vol->logfile_ino) { - BUG_ON(!rp); - ntfs_free(rp); - } - goto iput_logfile_err_out; - } + if (!sb_rdonly(sb) && vol->on_errors == ON_ERRORS_REMOUNT_RO) { sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ + ntfs_error(sb, "Failed to load LogFile. Mounting read-only."); + } NVolSetErrors(vol); } + ntfs_free(rp); -#endif /* NTFS_RW */ /* Get the root directory inode so we can do path lookups. */ vol->root_ino = ntfs_iget(sb, FILE_root); - if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (IS_ERR(vol->root_ino)) { if (!IS_ERR(vol->root_ino)) iput(vol->root_ino); ntfs_error(sb, "Failed to load root directory."); goto iput_logfile_err_out; } -#ifdef NTFS_RW /* * Check if Windows is suspended to disk on the target volume. If it * is hibernated, we must not write *anything* to the disk so set @@ -1996,223 +1593,74 @@ static bool load_system_files(ntfs_volume *vol) */ err = check_windows_hibernation_status(vol); if (unlikely(err)) { - static const char *es1a = "Failed to determine if Windows is " - "hibernated"; + static const char *es1a = "Failed to determine if Windows is hibernated"; static const char *es1b = "Windows is hibernated"; static const char *es2 = ". Run chkdsk."; const char *es1; es1 = err < 0 ? es1a : es1b; /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } + if (!sb_rdonly(sb) && vol->on_errors == ON_ERRORS_REMOUNT_RO) { sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. */ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the volume dirty. */ - if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { - static const char *es1 = "Failed to set dirty bit in volume " - "information flags"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; } - ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - /* - * Do not set NVolErrors() because ntfs_remount() might manage - * to set the dirty flag in which case all would be well. - */ - } -#if 0 - // TODO: Enable this code once we start modifying anything that is - // different between NTFS 1.2 and 3.x... - /* - * If (still) a read-write mount, set the NT4 compatibility flag on - * newer NTFS version volumes. - */ - if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && - ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { - static const char *es1 = "Failed to set NT4 compatibility flag"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } -#endif + /* If (still) a read-write mount, empty the logfile. */ - if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { - static const char *es1 = "Failed to empty $LogFile"; + if (!sb_rdonly(sb) && + vol->logfile_ino && !ntfs_empty_logfile(vol->logfile_ino) && + vol->on_errors == ON_ERRORS_REMOUNT_RO) { + static const char *es1 = "Failed to empty LogFile"; static const char *es2 = ". Mount in Windows."; /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_root_err_out; - } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } -#endif /* NTFS_RW */ /* If on NTFS versions before 3.0, we are done. */ if (unlikely(vol->major_ver < 3)) return true; /* NTFS 3.0+ specific initialization. */ /* Get the security descriptors inode. */ vol->secure_ino = ntfs_iget(sb, FILE_Secure); - if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (IS_ERR(vol->secure_ino)) { if (!IS_ERR(vol->secure_ino)) iput(vol->secure_ino); ntfs_error(sb, "Failed to load $Secure."); goto iput_root_err_out; } - // TODO: Initialize security. /* Get the extended system files' directory inode. */ vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + if (IS_ERR(vol->extend_ino) || !S_ISDIR(vol->extend_ino->i_mode)) { if (!IS_ERR(vol->extend_ino)) iput(vol->extend_ino); ntfs_error(sb, "Failed to load $Extend."); goto iput_sec_err_out; } -#ifdef NTFS_RW /* Find the quota file, load it if present, and set it up. */ - if (!load_and_init_quota(vol)) { + if (!load_and_init_quota(vol) && + vol->on_errors == ON_ERRORS_REMOUNT_RO) { static const char *es1 = "Failed to load $Quota"; static const char *es2 = ". Run chkdsk."; - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); - /* This will prevent a read-write remount. 
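 *
 * The prevention is indirect: once NVolErrors() is set,
 * ntfs_put_super() skips clearing VOLUME_IS_DIRTY at unmount, i.e. the
 * unmount-side check amounts to:
 *
 *	if (!sb_rdonly(sb) && !NVolErrors(vol))
 *		ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY);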
*/ - NVolSetErrors(vol); - } - /* If (still) a read-write mount, mark the quotas out of date. */ - if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { - static const char *es1 = "Failed to mark quotas out of date"; - static const char *es2 = ". Run chkdsk."; - - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_quota_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } - /* - * Find the transaction log file ($UsnJrnl), load it if present, check - * it, and set it up. - */ - if (!load_and_init_usnjrnl(vol)) { - static const char *es1 = "Failed to load $UsnJrnl"; - static const char *es2 = ". Run chkdsk."; - - /* If a read-write mount, convert it to a read-only mount. */ - if (!sb_rdonly(sb)) { - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=" - "continue nor on_errors=" - "remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - sb->s_flags |= SB_RDONLY; - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - } else - ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", es1, es2); + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); /* This will prevent a read-write remount. */ NVolSetErrors(vol); } - /* If (still) a read-write mount, stamp the transaction log. */ - if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { - static const char *es1 = "Failed to stamp transaction log " - "($UsnJrnl)"; - static const char *es2 = ". Run chkdsk."; - /* Convert to a read-only mount. */ - if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | - ON_ERRORS_CONTINUE))) { - ntfs_error(sb, "%s and neither on_errors=continue nor " - "on_errors=remount-ro was specified%s", - es1, es2); - goto iput_usnjrnl_err_out; - } - ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= SB_RDONLY; - NVolSetErrors(vol); - } -#endif /* NTFS_RW */ return true; -#ifdef NTFS_RW -iput_usnjrnl_err_out: - iput(vol->usnjrnl_j_ino); - iput(vol->usnjrnl_max_ino); - iput(vol->usnjrnl_ino); -iput_quota_err_out: - iput(vol->quota_q_ino); - iput(vol->quota_ino); - iput(vol->extend_ino); -#endif /* NTFS_RW */ + iput_sec_err_out: iput(vol->secure_ino); iput_root_err_out: iput(vol->root_ino); iput_logfile_err_out: -#ifdef NTFS_RW - iput(vol->logfile_ino); -iput_vol_err_out: -#endif /* NTFS_RW */ + if (vol->logfile_ino) + iput(vol->logfile_ino); iput(vol->vol_ino); iput_lcnbmp_err_out: iput(vol->lcnbmp_ino); @@ -2222,9 +1670,7 @@ static bool load_system_files(ntfs_volume *vol) ntfs_free(vol->attrdef); vol->attrdef = NULL; } -#ifdef NTFS_RW iput_upcase_err_out: -#endif /* NTFS_RW */ vol->upcase_len = 0; mutex_lock(&ntfs_lock); if (vol->upcase == default_upcase) { @@ -2239,28 +1685,62 @@ static bool load_system_files(ntfs_volume *vol) iput_mftbmp_err_out: iput(vol->mftbmp_ino); iput_mirr_err_out: -#ifdef NTFS_RW iput(vol->mftmirr_ino); -#endif /* NTFS_RW */ return false; } +static void ntfs_volume_free(struct ntfs_volume *vol) +{ + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. 
+ */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + + free_compression_buffers(); + + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + if (vol->lcn_empty_bits_per_page) + kvfree(vol->lcn_empty_bits_per_page); + kfree(vol->volume_label); + kfree(vol); +} + /** * ntfs_put_super - called by the vfs to unmount a volume * @sb: vfs superblock of volume to unmount - * - * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when - * the volume is being unmounted (umount system call has been invoked) and it - * releases all inodes and memory belonging to the NTFS specific part of the - * super block. */ static void ntfs_put_super(struct super_block *sb) { - ntfs_volume *vol = NTFS_SB(sb); + struct ntfs_volume *vol = NTFS_SB(sb); - ntfs_debug("Entering."); + pr_info("Entering %s, dev %s\n", __func__, sb->s_id); + + cancel_work_sync(&vol->precalc_work); -#ifdef NTFS_RW /* * Commit all inodes while they are still open in case some of them * cause others to be dirtied. @@ -2269,12 +1749,6 @@ static void ntfs_put_super(struct super_block *sb) /* NTFS 3.0+ specific. */ if (vol->major_ver >= 3) { - if (vol->usnjrnl_j_ino) - ntfs_commit_inode(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - ntfs_commit_inode(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - ntfs_commit_inode(vol->usnjrnl_ino); if (vol->quota_q_ino) ntfs_commit_inode(vol->quota_q_ino); if (vol->quota_ino) @@ -2287,13 +1761,13 @@ static void ntfs_put_super(struct super_block *sb) ntfs_commit_inode(vol->root_ino); - down_write(&vol->lcnbmp_lock); ntfs_commit_inode(vol->lcnbmp_ino); - up_write(&vol->lcnbmp_lock); - down_write(&vol->mftbmp_lock); + /* + * the GFP_NOFS scope is not needed because ntfs_commit_inode + * does nothing + */ ntfs_commit_inode(vol->mftbmp_ino); - up_write(&vol->mftbmp_lock); if (vol->logfile_ino) ntfs_commit_inode(vol->logfile_ino); @@ -2309,39 +1783,24 @@ static void ntfs_put_super(struct super_block *sb) if (!sb_rdonly(sb)) { if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) - ntfs_warning(sb, "Failed to clear dirty bit " - "in volume information " - "flags. Run chkdsk."); + ntfs_warning(sb, + "Failed to clear dirty bit in volume information flags. Run chkdsk."); ntfs_commit_inode(vol->vol_ino); ntfs_commit_inode(vol->root_ino); if (vol->mftmirr_ino) ntfs_commit_inode(vol->mftmirr_ino); ntfs_commit_inode(vol->mft_ino); } else { - ntfs_warning(sb, "Volume has errors. Leaving volume " - "marked dirty. Run chkdsk."); + ntfs_warning(sb, + "Volume has errors. Leaving volume marked dirty. Run chkdsk."); } } -#endif /* NTFS_RW */ iput(vol->vol_ino); vol->vol_ino = NULL; /* NTFS 3.0+ specific clean up. 
*/ if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } if (vol->quota_q_ino) { iput(vol->quota_q_ino); vol->quota_q_ino = NULL; @@ -2350,7 +1809,6 @@ static void ntfs_put_super(struct super_block *sb) iput(vol->quota_ino); vol->quota_ino = NULL; } -#endif /* NTFS_RW */ if (vol->extend_ino) { iput(vol->extend_ino); vol->extend_ino = NULL; @@ -2364,17 +1822,12 @@ static void ntfs_put_super(struct super_block *sb) iput(vol->root_ino); vol->root_ino = NULL; - down_write(&vol->lcnbmp_lock); iput(vol->lcnbmp_ino); vol->lcnbmp_ino = NULL; - up_write(&vol->lcnbmp_lock); - down_write(&vol->mftbmp_lock); iput(vol->mftbmp_ino); vol->mftbmp_ino = NULL; - up_write(&vol->mftbmp_lock); -#ifdef NTFS_RW if (vol->logfile_ino) { iput(vol->logfile_ino); vol->logfile_ino = NULL; @@ -2393,43 +1846,66 @@ static void ntfs_put_super(struct super_block *sb) */ ntfs_commit_inode(vol->mft_ino); write_inode_now(vol->mft_ino, 1); -#endif /* NTFS_RW */ iput(vol->mft_ino); vol->mft_ino = NULL; - /* Throw away the table of attribute definitions. */ - vol->attrdef_size = 0; - if (vol->attrdef) { - ntfs_free(vol->attrdef); - vol->attrdef = NULL; - } - vol->upcase_len = 0; - /* - * Destroy the global default upcase table if necessary. Also decrease - * the number of upcase users if we are a user. - */ - mutex_lock(&ntfs_lock); - if (vol->upcase == default_upcase) { - ntfs_nr_upcase_users--; - vol->upcase = NULL; - } - if (!ntfs_nr_upcase_users && default_upcase) { - ntfs_free(default_upcase); - default_upcase = NULL; - } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); - mutex_unlock(&ntfs_lock); - if (vol->upcase) { - ntfs_free(vol->upcase); - vol->upcase = NULL; + ntfs_volume_free(vol); +} + +int ntfs_force_shutdown(struct super_block *sb, u32 flags) +{ + struct ntfs_volume *vol = NTFS_SB(sb); + int ret; + + if (NVolShutdown(vol)) + return 0; + + switch (flags) { + case NTFS_GOING_DOWN_DEFAULT: + case NTFS_GOING_DOWN_FULLSYNC: + ret = bdev_freeze(sb->s_bdev); + if (ret) + return ret; + bdev_thaw(sb->s_bdev); + NVolSetShutdown(vol); + break; + case NTFS_GOING_DOWN_NOSYNC: + NVolSetShutdown(vol); + break; + default: + return -EINVAL; } - unload_nls(vol->nls_map); + return 0; +} - sb->s_fs_info = NULL; - kfree(vol); +static void ntfs_shutdown(struct super_block *sb) +{ + ntfs_force_shutdown(sb, NTFS_GOING_DOWN_NOSYNC); + +} + +static int ntfs_sync_fs(struct super_block *sb, int wait) +{ + struct ntfs_volume *vol = NTFS_SB(sb); + int err = 0; + + if (NVolShutdown(vol)) + return -EIO; + + if (!wait) + return 0; + + /* If there are some dirty buffers in the bdev inode */ + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_warning(sb, "Failed to clear dirty bit in volume information flags. Run chkdsk."); + err = -EIO; + } + sync_inodes_sb(sb); + sync_blockdev(sb->s_bdev); + blkdev_issue_flush(sb->s_bdev); + return err; } /** @@ -2451,16 +1927,27 @@ static void ntfs_put_super(struct super_block *sb) * in use. This means we return an underestimate on errors which is better than * an overestimate. 
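 *
 * The scan below uses the same lock-or-read folio pattern with explicit
 * readahead as __get_nr_free_mft_records(), roughly:
 *
 *	folio = filemap_lock_folio(mapping, index);
 *	if (IS_ERR(folio)) {
 *		page_cache_sync_readahead(mapping, ra, NULL, index,
 *					  max_index - index);
 *		folio = read_mapping_folio(mapping, index, NULL);
 *		if (!IS_ERR(folio))
 *			folio_lock(folio);
 *	}
 *	kaddr = kmap_local_folio(folio, 0);
 *	nr_free -= bitmap_weight(kaddr, PAGE_SIZE * BITS_PER_BYTE);
 *
 * and publishes the total via atomic64_set(&vol->free_clusters, nr_free)
 * followed by NVolSetFreeClusterKnown() and a wake_up_all() so that
 * waiters such as ntfs_statfs() can proceed.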
*/ -static s64 get_nr_free_clusters(ntfs_volume *vol) +s64 get_nr_free_clusters(struct ntfs_volume *vol) { s64 nr_free = vol->nr_clusters; + u32 nr_used; struct address_space *mapping = vol->lcnbmp_ino->i_mapping; - struct page *page; + struct folio *folio; pgoff_t index, max_index; + struct file_ra_state *ra; ntfs_debug("Entering."); /* Serialize accesses to the cluster bitmap. */ - down_read(&vol->lcnbmp_lock); + + if (NVolFreeClusterKnown(vol)) + return atomic64_read(&vol->free_clusters); + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return 0; + + file_ra_state_init(ra, mapping); + /* * Convert the number of bits into bytes rounded up, then convert into * multiples of PAGE_SIZE, rounding up so that if we have one @@ -2475,18 +1962,27 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) unsigned long *kaddr; /* - * Read the page from page cache, getting it from backing store + * Get folio from page cache, getting it from backing store * if necessary, and increment the use count. */ - page = read_mapping_page(mapping, index, NULL); + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, ra, NULL, + index, max_index - index); + folio = read_mapping_folio(mapping, index, NULL); + if (!IS_ERR(folio)) + folio_lock(folio); + } + /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); + if (IS_ERR(folio)) { + ntfs_debug("Skipping page (index 0x%lx).", index); nr_free -= PAGE_SIZE * 8; + vol->lcn_empty_bits_per_page[index] = 0; continue; } - kaddr = kmap_atomic(page); + + kaddr = kmap_local_folio(folio, 0); /* * Subtract the number of set bits. If this * is the last page and it is partial we don't really care as @@ -2494,10 +1990,12 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) * the result as all out of range bytes are set to zero by * ntfs_readpage(). */ - nr_free -= bitmap_weight(kaddr, - PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); + nr_used = bitmap_weight(kaddr, PAGE_SIZE * BITS_PER_BYTE); + nr_free -= nr_used; + vol->lcn_empty_bits_per_page[index] = PAGE_SIZE * BITS_PER_BYTE - nr_used; + kunmap_local(kaddr); + folio_unlock(folio); + folio_put(folio); } ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); /* @@ -2506,14 +2004,45 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) */ if (vol->nr_clusters & 63) nr_free += 64 - (vol->nr_clusters & 63); - up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. */ if (nr_free < 0) nr_free = 0; + else + atomic64_set(&vol->free_clusters, nr_free); + + kfree(ra); + NVolSetFreeClusterKnown(vol); + wake_up_all(&vol->free_waitq); ntfs_debug("Exiting."); return nr_free; } +/* + * @nr_clusters is the number of clusters requested for allocation. + * + * Return the number of clusters available for allocation within + * the range of @nr_clusters, which is counts that considered + * for delayed allocation. 
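+ *
+ * Put differently, the request is clamped to whatever remains once
+ * delayed-allocation (dirty) reservations are subtracted:
+ *
+ *	free = atomic64_read(&vol->free_clusters) -
+ *	       atomic64_read(&vol->dirty_clusters);
+ *	if (free <= 0)
+ *		return -ENOSPC;
+ *	return min(free, nr_clusters);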
+ */ +s64 ntfs_available_clusters_count(struct ntfs_volume *vol, s64 nr_clusters) +{ + s64 free_clusters; + + /* wait event */ + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + + free_clusters = atomic64_read(&vol->free_clusters) - + atomic64_read(&vol->dirty_clusters); + if (free_clusters <= 0) + return -ENOSPC; + else if (free_clusters < nr_clusters) + nr_clusters = free_clusters; + + return nr_clusters; +} + /** * __get_nr_free_mft_records - return the number of free inodes on a volume * @vol: ntfs volume for which to obtain free inode count @@ -2531,33 +2060,50 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) * * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. */ -static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, +static unsigned long __get_nr_free_mft_records(struct ntfs_volume *vol, s64 nr_free, const pgoff_t max_index) { struct address_space *mapping = vol->mftbmp_ino->i_mapping; - struct page *page; + struct folio *folio; pgoff_t index; + struct file_ra_state *ra; ntfs_debug("Entering."); + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return 0; + + file_ra_state_init(ra, mapping); + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ - ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%lx.", max_index, PAGE_SIZE / 4); + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_SIZE / 4); for (index = 0; index < max_index; index++) { unsigned long *kaddr; /* - * Read the page from page cache, getting it from backing store + * Get folio from page cache, getting it from backing store * if necessary, and increment the use count. */ - page = read_mapping_page(mapping, index, NULL); + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, ra, NULL, + index, max_index - index); + folio = read_mapping_folio(mapping, index, NULL); + if (!IS_ERR(folio)) + folio_lock(folio); + } + /* Ignore pages which errored synchronously. */ - if (IS_ERR(page)) { - ntfs_debug("read_mapping_page() error. Skipping " - "page (index 0x%lx).", index); + if (IS_ERR(folio)) { + ntfs_debug("read_mapping_page() error. Skipping page (index 0x%lx).", + index); nr_free -= PAGE_SIZE * 8; continue; } - kaddr = kmap_atomic(page); + + kaddr = kmap_local_folio(folio, 0); /* * Subtract the number of set bits. If this * is the last page and it is partial we don't really care as @@ -2567,14 +2113,19 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, */ nr_free -= bitmap_weight(kaddr, PAGE_SIZE * BITS_PER_BYTE); - kunmap_atomic(kaddr); - put_page(page); + kunmap_local(kaddr); + folio_unlock(folio); + folio_put(folio); } ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", index - 1); /* If errors occurred we may well have gone below zero, fix this. */ if (nr_free < 0) nr_free = 0; + else + atomic64_set(&vol->free_mft_records, nr_free); + + kfree(ra); ntfs_debug("Exiting."); return nr_free; } @@ -2601,47 +2152,46 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) { struct super_block *sb = dentry->d_sb; s64 size; - ntfs_volume *vol = NTFS_SB(sb); - ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); - pgoff_t max_index; + struct ntfs_volume *vol = NTFS_SB(sb); + struct ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); unsigned long flags; ntfs_debug("Entering."); /* Type of filesystem. */ sfs->f_type = NTFS_SB_MAGIC; /* Optimal transfer block size. 
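 * With this change the unit reported to userspace is the cluster rather
 * than PAGE_SIZE, and f_frsize is set to match, so the totals come out
 * in cluster units:
 *
 *	sfs->f_bsize = sfs->f_frsize = vol->cluster_size;
 *	sfs->f_blocks = vol->nr_clusters;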
*/ - sfs->f_bsize = PAGE_SIZE; + sfs->f_bsize = vol->cluster_size; + /* Fundamental file system block size, used as the unit. */ + sfs->f_frsize = vol->cluster_size; + /* * Total data blocks in filesystem in units of f_bsize and since * inodes are also stored in data blocs ($MFT is a file) this is just * the total clusters. */ - sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> - PAGE_SHIFT; + sfs->f_blocks = vol->nr_clusters; + + /* wait event */ + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + /* Free data blocks in filesystem in units of f_bsize. */ - size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> - PAGE_SHIFT; + size = atomic64_read(&vol->free_clusters) - + atomic64_read(&vol->dirty_clusters); if (size < 0LL) size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ sfs->f_bavail = sfs->f_bfree = size; - /* Serialize accesses to the inode bitmap. */ - down_read(&vol->mftbmp_lock); + + /* Number of inodes in filesystem (at this point in time). */ read_lock_irqsave(&mft_ni->size_lock, flags); - size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; - /* - * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_SIZE, rounding up so that if we - * have one full and one partial page max_index = 2. - */ - max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) - + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; + sfs->f_files = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; read_unlock_irqrestore(&mft_ni->size_lock, flags); - /* Number of inodes in filesystem (at this point in time). */ - sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ - sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); - up_read(&vol->mftbmp_lock); + sfs->f_ffree = atomic64_read(&vol->free_mft_records); + /* * File system id. This is extremely *nix flavour dependent and even * within Linux itself all fs do their own thing. I interpret this to @@ -2655,40 +2205,44 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) sfs->f_fsid = u64_to_fsid(vol->serial_no); /* Maximum length of filenames. */ sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; } -#ifdef NTFS_RW static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) { return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); } -#endif -/* +/** * The complete super operations. */ static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ -#ifdef NTFS_RW - .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to - disk. */ -#endif /* NTFS_RW */ + .drop_inode = ntfs_drop_big_inode, + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to disk. */ .put_super = ntfs_put_super, /* Syscall: umount. */ + .shutdown = ntfs_shutdown, + .sync_fs = ntfs_sync_fs, /* Syscall: sync. */ .statfs = ntfs_statfs, /* Syscall: statfs */ - .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ - .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is - removed from memory. */ - .show_options = ntfs_show_options, /* Show mount options in - proc. */ + .evict_inode = ntfs_evict_big_inode, + .show_options = ntfs_show_options, /* Show mount options in proc. 
*/ }; +static void precalc_free_clusters(struct work_struct *work) +{ + struct ntfs_volume *vol = container_of(work, struct ntfs_volume, precalc_work); + s64 nr_free; + + nr_free = get_nr_free_clusters(vol); + + ntfs_debug("pre-calculate free clusters(%lld) using workqueue", + nr_free); +} + /** * ntfs_fill_super - mount an ntfs filesystem - * @sb: super block of ntfs filesystem to mount - * @opt: string containing the mount options - * @silent: silence error output * * ntfs_fill_super() is called by the VFS to mount the device described by @sb * with the mount otions in @data with the NTFS filesystem. @@ -2699,15 +2253,19 @@ static const struct super_operations ntfs_sops = { * that all filesystems except the correct one will quite correctly and * expectedly return an error, but nobody wants to see error messages when in * fact this is what is supposed to happen. - * - * NOTE: @sb->s_flags contains the mount options flags. */ -static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +static struct lock_class_key ntfs_mft_inval_lock_key; + +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { - ntfs_volume *vol; - struct buffer_head *bh; + char *boot; struct inode *tmp_ino; int blocksize, result; + pgoff_t lcn_bit_pages; + struct ntfs_volume *vol = NTFS_SB(sb); + int silent = fc->sb_flags & SB_SILENT; + + vol->sb = sb; /* * We do a pretty difficult piece of bootstrap by reading the @@ -2721,52 +2279,29 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) */ lockdep_off(); ntfs_debug("Entering."); -#ifndef NTFS_RW - sb->s_flags |= SB_RDONLY; -#endif /* ! NTFS_RW */ - /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ - sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); - vol = NTFS_SB(sb); - if (!vol) { - if (!silent) - ntfs_error(sb, "Allocation of NTFS volume structure " - "failed. Aborting mount..."); - lockdep_on(); - return -ENOMEM; - } - /* Initialize ntfs_volume structure. */ - *vol = (ntfs_volume) { - .sb = sb, - /* - * Default is group and other don't have any access to files or - * directories while owner has full access. Further, files by - * default are not executable but directories are of course - * browseable. - */ - .fmask = 0177, - .dmask = 0077, - }; - init_rwsem(&vol->mftbmp_lock); - init_rwsem(&vol->lcnbmp_lock); - /* By default, enable sparse support. */ - NVolSetSparseEnabled(vol); + if (vol->nls_map && !strcmp(vol->nls_map->charset, "utf8")) + vol->nls_utf8 = true; + if (NVolDisableSparse(vol)) + vol->preallocated_size = 0; - /* Important to get the mount options dealt with now. */ - if (!parse_options(vol, (char*)opt)) - goto err_out_now; + if (NVolDiscard(vol) && !bdev_max_discard_sectors(sb->s_bdev)) { + ntfs_warning( + sb, + "Discard requested but device does not support discard. Discard disabled."); + NVolClearDiscard(vol); + } /* We support sector sizes up to the PAGE_SIZE. */ if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { if (!silent) - ntfs_error(sb, "Device has unsupported sector size " - "(%i). The maximum supported sector " - "size on this architecture is %lu " - "bytes.", - bdev_logical_block_size(sb->s_bdev), - PAGE_SIZE); + ntfs_error(sb, + "Device has unsupported sector size (%i). The maximum supported sector size on this architecture is %lu bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_SIZE); goto err_out_now; } + /* * Setup the device access block size to NTFS_BLOCK_SIZE or the hard * sector size, whichever is bigger. 
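 *
 * Once the block size is set, the device size is (re)derived in units
 * of that block size, as in the hunks below:
 *
 *	vol->nr_blocks = bdev_nr_bytes(sb->s_bdev) >> sb->s_blocksize_bits;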
@@ -2777,18 +2312,20 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_error(sb, "Unable to set device block size."); goto err_out_now; } - BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", blocksize, sb->s_blocksize_bits); /* Determine the size of the device in units of block_size bytes. */ - vol->nr_blocks = sb_bdev_nr_blocks(sb); - if (!vol->nr_blocks) { + if (!bdev_nr_bytes(sb->s_bdev)) { if (!silent) ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } + vol->nr_blocks = bdev_nr_bytes(sb->s_bdev) >> + sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ - if (!(bh = read_ntfs_boot_sector(sb, silent))) { + boot = read_ntfs_boot_sector(sb, silent); + if (!boot) { if (!silent) ntfs_error(sb, "Not an NTFS volume."); goto err_out_now; @@ -2797,36 +2334,26 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) * Extract the data from the boot sector and setup the ntfs volume * using it. */ - result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - brelse(bh); + result = parse_ntfs_boot_sector(vol, (struct ntfs_boot_sector *)boot); + kfree(boot); if (!result) { if (!silent) ntfs_error(sb, "Unsupported NTFS filesystem."); goto err_out_now; } - /* - * If the boot sector indicates a sector size bigger than the current - * device block size, switch the device block size to the sector size. - * TODO: It may be possible to support this case even when the set - * below fails, we would just be breaking up the i/o for each sector - * into multiple blocks for i/o purposes but otherwise it should just - * work. However it is safer to leave disabled until someone hits this - * error message and then we can get them to try it without the setting - * so we know for sure that it works. - */ + if (vol->sector_size > blocksize) { blocksize = sb_set_blocksize(sb, vol->sector_size); if (blocksize != vol->sector_size) { if (!silent) - ntfs_error(sb, "Unable to set device block " - "size to sector size (%i).", - vol->sector_size); + ntfs_error(sb, + "Unable to set device block size to sector size (%i).", + vol->sector_size); goto err_out_now; } - BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = sb_bdev_nr_blocks(sb); - ntfs_debug("Changed device block size to %i bytes (block size " - "bits %i) to match volume sector size.", + vol->nr_blocks = bdev_nr_bytes(sb->s_bdev) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size bits %i) to match volume sector size.", blocksize, sb->s_blocksize_bits); } /* Initialize the cluster and mft allocators. */ @@ -2844,6 +2371,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; /* Ntfs measures time in 100ns intervals. */ sb->s_time_gran = 100; + + sb->s_xattr = ntfsp_xattr_handlers; /* * Now load the metadata required for the page cache and our address * space operations to function. 
We do this by setting up a specialised @@ -2858,6 +2387,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_error(sb, "Failed to load essential metadata."); goto err_out_now; } + tmp_ino->i_ino = FILE_MFT; insert_inode_hash(tmp_ino); if (ntfs_read_inode_mount(tmp_ino) < 0) { @@ -2865,21 +2395,11 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_error(sb, "Failed to load essential metadata."); goto iput_tmp_ino_err_out_now; } + lockdep_set_class(&tmp_ino->i_mapping->invalidate_lock, + &ntfs_mft_inval_lock_key); + mutex_lock(&ntfs_lock); - /* - * The current mount is a compression user if the cluster size is - * less than or equal 4kiB. - */ - if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { - result = allocate_compression_buffers(); - if (result) { - ntfs_error(NULL, "Failed to allocate buffers " - "for compression engine."); - ntfs_nr_compression_users--; - mutex_unlock(&ntfs_lock); - goto iput_tmp_ino_err_out_now; - } - } + /* * Generate the global default upcase table if necessary. Also * temporarily increment the number of upcase users to avoid race @@ -2889,6 +2409,16 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) default_upcase = generate_default_upcase(); ntfs_nr_upcase_users++; mutex_unlock(&ntfs_lock); + + lcn_bit_pages = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; + vol->lcn_empty_bits_per_page = kvmalloc_array(lcn_bit_pages, sizeof(unsigned int), + GFP_KERNEL); + if (!vol->lcn_empty_bits_per_page) { + ntfs_error(sb, + "Unable to allocate pages for storing LCN empty bit counts\n"); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + /* * From now on, ignore @silent parameter. If we fail below this line, * it will be due to a corrupt fs or a system error, so we report it. @@ -2904,8 +2434,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) /* We grab a reference, simulating an ntfs_iget(). */ ihold(vol->root_ino); - if ((sb->s_root = d_make_root(vol->root_ino))) { + sb->s_root = d_make_root(vol->root_ino); + if (sb->s_root) { + s64 nr_records; + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ mutex_lock(&ntfs_lock); if (!--ntfs_nr_upcase_users && default_upcase) { @@ -2915,30 +2449,25 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) mutex_unlock(&ntfs_lock); sb->s_export_op = &ntfs_export_ops; lockdep_on(); + + nr_records = __get_nr_free_mft_records(vol, + i_size_read(vol->mft_ino) >> vol->mft_record_size_bits, + ((((NTFS_I(vol->mft_ino)->initialized_size >> + vol->mft_record_size_bits) + + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT); + ntfs_debug("Free mft records(%lld)", nr_records); + + init_waitqueue_head(&vol->free_waitq); + INIT_WORK(&vol->precalc_work, precalc_free_clusters); + queue_work(ntfs_wq, &vol->precalc_work); return 0; } ntfs_error(sb, "Failed to allocate root directory."); /* Clean up after the successful load_system_files() call from above. */ - // TODO: Use ntfs_put_super() instead of repeating all this code... - // FIXME: Should mark the volume clean as the error is most likely - // -ENOMEM. iput(vol->vol_ino); vol->vol_ino = NULL; /* NTFS 3.0+ specific clean up. 
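 *
 * Note: on the success path above, the initial free-cluster scan is
 * deferred to a workqueue instead of blocking the mount:
 *
 *	init_waitqueue_head(&vol->free_waitq);
 *	INIT_WORK(&vol->precalc_work, precalc_free_clusters);
 *	queue_work(ntfs_wq, &vol->precalc_work);
 *
 * Readers such as ntfs_statfs() then wait_event() on free_waitq until
 * NVolFreeClusterKnown(vol) becomes true.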
*/ if (vol->major_ver >= 3) { -#ifdef NTFS_RW - if (vol->usnjrnl_j_ino) { - iput(vol->usnjrnl_j_ino); - vol->usnjrnl_j_ino = NULL; - } - if (vol->usnjrnl_max_ino) { - iput(vol->usnjrnl_max_ino); - vol->usnjrnl_max_ino = NULL; - } - if (vol->usnjrnl_ino) { - iput(vol->usnjrnl_ino); - vol->usnjrnl_ino = NULL; - } if (vol->quota_q_ino) { iput(vol->quota_q_ino); vol->quota_q_ino = NULL; @@ -2947,7 +2476,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) iput(vol->quota_ino); vol->quota_ino = NULL; } -#endif /* NTFS_RW */ if (vol->extend_ino) { iput(vol->extend_ino); vol->extend_ino = NULL; @@ -2963,7 +2491,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) vol->lcnbmp_ino = NULL; iput(vol->mftbmp_ino); vol->mftbmp_ino = NULL; -#ifdef NTFS_RW if (vol->logfile_ino) { iput(vol->logfile_ino); vol->logfile_ino = NULL; @@ -2972,7 +2499,6 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) iput(vol->mftmirr_ino); vol->mftmirr_ino = NULL; } -#endif /* NTFS_RW */ /* Throw away the table of attribute definitions. */ vol->attrdef_size = 0; if (vol->attrdef) { @@ -2996,6 +2522,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) } /* Error exit code path. */ unl_upcase_iput_tmp_ino_err_out_now: + if (vol->lcn_empty_bits_per_page) + kvfree(vol->lcn_empty_bits_per_page); /* * Decrease the number of upcase users and destroy the global default * upcase table if necessary. @@ -3005,8 +2533,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_free(default_upcase); default_upcase = NULL; } - if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) - free_compression_buffers(); + mutex_unlock(&ntfs_lock); iput_tmp_ino_err_out_now: iput(tmp_ino); @@ -3036,7 +2563,7 @@ struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. */ static void ntfs_big_inode_init_once(void *foo) { - ntfs_inode *ni = (ntfs_inode *)foo; + struct ntfs_inode *ni = (struct ntfs_inode *)foo; inode_init_once(VFS_I(ni)); } @@ -3051,20 +2578,79 @@ struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide mutex. */ DEFINE_MUTEX(ntfs_lock); -static struct dentry *ntfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ntfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ntfs_fill_super); +} + +static void ntfs_free_fs_context(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + struct ntfs_volume *vol = fc->s_fs_info; + + if (vol) + ntfs_volume_free(vol); +} + +static const struct fs_context_operations ntfs_context_ops = { + .parse_param = ntfs_parse_param, + .get_tree = ntfs_get_tree, + .free = ntfs_free_fs_context, + .reconfigure = ntfs_reconfigure, +}; + +static int ntfs_init_fs_context(struct fs_context *fc) +{ + struct ntfs_volume *vol; + + /* Allocate a new struct ntfs_volume and place it in sb->s_fs_info. */ + vol = kmalloc(sizeof(struct ntfs_volume), GFP_NOFS); + if (!vol) + return -ENOMEM; + + /* Initialize struct ntfs_volume structure. 
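+ *
+ * With the fs_context conversion the filesystem can be driven by the
+ * new mount API as well as mount(2); e.g. from userspace (illustrative
+ * only, raw syscalls since glibc has no wrappers; device and mount
+ * point are placeholders):
+ *
+ *	fd = fsopen("ntfs", FSOPEN_CLOEXEC);
+ *	fsconfig(fd, FSCONFIG_SET_STRING, "source", "/dev/sdX", 0);
+ *	fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
+ *	mfd = fsmount(fd, FSMOUNT_CLOEXEC, 0);
+ *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);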
*/ + *vol = (struct ntfs_volume) { + .uid = INVALID_UID, + .gid = INVALID_GID, + .fmask = 0, + .dmask = 0, + .mft_zone_multiplier = 1, + .on_errors = ON_ERRORS_CONTINUE, + .nls_map = load_nls_default(), + .preallocated_size = NTFS_DEF_PREALLOC_SIZE, + }; + + NVolSetShowHiddenFiles(vol); + NVolSetCaseSensitive(vol); + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + fc->s_fs_info = vol; + fc->ops = &ntfs_context_ops; + return 0; } static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "ntfs", + .init_fs_context = ntfs_init_fs_context, + .parameters = ntfs_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; -MODULE_ALIAS_FS("ntfs"); + +static int ntfs_workqueue_init(void) +{ + ntfs_wq = alloc_workqueue("ntfs-bg-io", 0, 0); + if (!ntfs_wq) + return -ENOMEM; + return 0; +} + +static void ntfs_workqueue_destroy(void) +{ + destroy_workqueue(ntfs_wq); + ntfs_wq = NULL; +} /* Stable names for the slab caches. */ static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; @@ -3077,32 +2663,21 @@ static int __init init_ntfs_fs(void) { int err = 0; - /* This may be ugly but it results in pretty output so who cares. (-8 */ - pr_info("driver " NTFS_VERSION " [Flags: R/" -#ifdef NTFS_RW - "W" -#else - "O" -#endif -#ifdef DEBUG - " DEBUG" -#endif -#ifdef MODULE - " MODULE" -#endif - "].\n"); - - ntfs_debug("Debug messages are enabled."); + err = ntfs_workqueue_init(); + if (err) { + pr_crit("Failed to register workqueue!\n"); + return err; + } ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, - sizeof(ntfs_index_context), 0 /* offset */, + sizeof(struct ntfs_index_context), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); goto ictx_err_out; } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, - sizeof(ntfs_attr_search_ctx), 0 /* offset */, + sizeof(struct ntfs_attr_search_ctx), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { pr_crit("NTFS: Failed to create %s!\n", @@ -3111,7 +2686,7 @@ static int __init init_ntfs_fs(void) } ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, - (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + (NTFS_MAX_NAME_LEN+2) * sizeof(__le16), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { pr_crit("Failed to create %s!\n", ntfs_name_cache_name); @@ -3119,17 +2694,16 @@ static int __init init_ntfs_fs(void) } ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, - sizeof(ntfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + sizeof(struct ntfs_inode), 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!ntfs_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); goto inode_err_out; } ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, - sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ntfs_big_inode_init_once); + sizeof(struct big_ntfs_inode), 0, SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; @@ -3185,18 +2759,19 @@ static void __exit exit_ntfs_fs(void) kmem_cache_destroy(ntfs_name_cache); kmem_cache_destroy(ntfs_attr_ctx_cache); 
kmem_cache_destroy(ntfs_index_ctx_cache); + ntfs_workqueue_destroy(); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); } -MODULE_AUTHOR("Anton Altaparmakov "); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); -MODULE_VERSION(NTFS_VERSION); +module_init(init_ntfs_fs); +module_exit(exit_ntfs_fs); + +MODULE_AUTHOR("Anton Altaparmakov "); /* Original read-only NTFS driver */ +MODULE_AUTHOR("Namjae Jeon "); /* Add write, iomap and various features */ +MODULE_DESCRIPTION("NTFS read-write filesystem driver"); MODULE_LICENSE("GPL"); #ifdef DEBUG module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif - -module_init(init_ntfs_fs) -module_exit(exit_ntfs_fs) -- 2.25.1 This updates the implementation of directory operations. Signed-off-by: Namjae Jeon --- fs/ntfs/dir.c | 1639 +++++++++++++------------------- fs/ntfs/index.c | 2397 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 2700 insertions(+), 1336 deletions(-) diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 629723a8d712..26cae08a59a7 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1,26 +1,24 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. +/** + * NTFS kernel directory operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. */ -#include -#include #include #include "dir.h" -#include "aops.h" -#include "attrib.h" #include "mft.h" -#include "debug.h" #include "ntfs.h" +#include "index.h" +#include "reparse.h" -/* +/** * The little endian Unicode string $I30 as a global constant. */ -ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), +__le16 I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'), 0 }; /** @@ -61,30 +59,29 @@ ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it + * removes them again after the write is complete after which it * unlocks the page. */ -MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len, ntfs_name **res) +u64 ntfs_lookup_inode_by_name(struct ntfs_inode *dir_ni, const __le16 *uname, + const int uname_len, struct ntfs_name **res) { - ntfs_volume *vol = dir_ni->vol; + struct ntfs_volume *vol = dir_ni->vol; struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; + struct inode *ia_vi = NULL; + struct mft_record *m; + struct index_root *ir; + struct index_entry *ie; + struct index_block *ia; u8 *index_end; u64 mref; - ntfs_attr_search_ctx *ctx; + struct ntfs_attr_search_ctx *ctx; int err, rc; - VCN vcn, old_vcn; + s64 vcn, old_vcn; struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; - ntfs_name *name = NULL; + struct folio *folio; + u8 *kaddr = NULL; + struct ntfs_name *name = NULL; - BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); - BUG_ON(NInoAttr(dir_ni)); /* Get hold of the mft record for the directory. 
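 *
 * map_mft_record() pins and maps the base mft record and must always be
 * paired with unmap_mft_record(). Further down, both the attribute
 * search context and the mft record are released before any index
 * allocation page is read, since holding the record mapped would
 * deadlock against read_mapping_folio():
 *
 *	ntfs_attr_put_search_ctx(ctx);
 *	unmap_mft_record(dir_ni);
 *	m = NULL;
 *	ctx = NULL;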
*/ m = map_mft_record(dir_ni); if (IS_ERR(m)) { @@ -102,30 +99,30 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, 0, ctx); if (unlikely(err)) { if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); + ntfs_error(sb, + "Index root attribute missing in directory inode 0x%lx.", + dir_ni->mft_no); err = -EIO; } goto err_out; } /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + + ir = (struct index_root *)((u8 *)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + index_end = (u8 *)&ir->index + le32_to_cpu(ir->index.index_length); /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + + ie = (struct index_entry *)((u8 *)&ir->index + le32_to_cpu(ir->index.entries_offset)); /* * Loop until we exceed valid memory (corruption case) or until we * reach the last entry. */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + for (;; ie = (struct index_entry *)((u8 *)ie + le16_to_cpu(ie->length))) { /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) + if ((u8 *)ie < (u8 *)ctx->mrec || + (u8 *)ie + sizeof(struct index_entry_header) > index_end || + (u8 *)ie + sizeof(struct index_entry_header) + le16_to_cpu(ie->key_length) > + index_end || (u8 *)ie + le16_to_cpu(ie->length) > index_end) goto dir_err_out; /* * The last entry cannot contain a name. It can however contain @@ -133,6 +130,13 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, */ if (ie->flags & INDEX_ENTRY_END) break; + /* Key length should not be zero if it is not last entry. */ + if (!ie->key_length) + goto dir_err_out; + /* Check the consistency of an index entry */ + if (ntfs_index_entry_inconsistent(NULL, vol, ie, COLLATION_FILE_NAME, + dir_ni->mft_no)) + goto dir_err_out; /* * We perform a case sensitive comparison and if that matches * we are done and return the mft reference of the inode (i.e. @@ -141,7 +145,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * returning. */ if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { found_it: @@ -157,7 +161,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, */ if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { if (!name) { - name = kmalloc(sizeof(ntfs_name), + name = kmalloc(sizeof(struct ntfs_name), GFP_NOFS); if (!name) { err = -ENOMEM; @@ -188,30 +192,26 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * only cache the mft reference and the file name type (we set * the name length to zero for simplicity). 
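 *
 * Sketch of what gets cached for such a match (mref taken from the
 * index entry; indexed_file as in the on-disk index entry layout):
 *
 *	name->mref = le64_to_cpu(ie->data.dir.indexed_file);
 *	name->type = ie->key.file_name.file_name_type;
 *	if (name->type != FILE_NAME_DOS) {
 *		name->len = len;
 *		memcpy(name->name, ie->key.file_name.file_name,
 *		       len * sizeof(__le16));
 *	} else
 *		name->len = 0;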
*/ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); + if ((!NVolCaseSensitive(vol) || + ie->key.file_name.file_name_type == FILE_NAME_DOS) && + ntfs_are_names_equal(uname, uname_len, + (__le16 *)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, + vol->upcase_len)) { + int name_size = sizeof(struct ntfs_name); u8 type = ie->key.file_name.file_name_type; u8 len = ie->key.file_name.file_name_length; /* Only one case insensitive matching name allowed. */ if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 1. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); + ntfs_error(sb, + "Found already allocated name in phase 1. Please run chkdsk"); goto dir_err_out; } if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); + name_size += len * sizeof(__le16); name = kmalloc(name_size, GFP_NOFS); if (!name) { err = -ENOMEM; @@ -222,7 +222,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, if (type != FILE_NAME_DOS) { name->len = len; memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); + len * sizeof(__le16)); } else name->len = 0; *res = name; @@ -232,7 +232,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * know which way in the B+tree we have to go. */ rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, 1, IGNORE_CASE, vol->upcase, vol->upcase_len); /* @@ -251,7 +251,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * collation. */ rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, 1, CASE_SENSITIVE, vol->upcase, vol->upcase_len); if (rc == -1) @@ -281,109 +281,117 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, err = -ENOENT; goto err_out; } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; - } + /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(dir_ni)->i_mapping; + vcn = le64_to_cpup((__le64 *)((u8 *)ie + le16_to_cpu(ie->length) - 8)); + /* * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). + * otherwise we deadlock with read_mapping_folio(). */ ntfs_attr_put_search_ctx(ctx); unmap_mft_record(dir_ni); m = NULL; ctx = NULL; + + ia_vi = ntfs_index_iget(VFS_I(dir_ni), I30, 4); + if (IS_ERR(ia_vi)) { + err = PTR_ERR(ia_vi); + goto err_out; + } + + ia_mapping = ia_vi->i_mapping; descend_into_child_node: /* * Convert vcn to index into the index allocation attribute in units * of PAGE_SIZE and map the page cache page, reading it from * disk if necessary. 
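* As a worked example of the arithmetic below: with a VCN unit of 4096 bytes (vcn_size_bits == 12) and 4 KiB pages, VCN 3 maps to byte offset 3 << 12 == 0x3000, hence page index 0x3000 >> PAGE_SHIFT == 3, and the index block starts at offset (3 << 12) & ~PAGE_MASK == 0 within that page.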
*/ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { + folio = read_mapping_folio(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT, NULL); + if (IS_ERR(folio)) { ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); + -PTR_ERR(folio)); + err = PTR_ERR(folio); goto err_out; } - lock_page(page); - kaddr = (u8*)page_address(page); + + folio_lock(folio); + kaddr = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!kaddr) { + err = -ENOMEM; + folio_unlock(folio); + folio_put(folio); + goto unm_err_out; + } + + memcpy_from_folio(kaddr, folio, 0, PAGE_SIZE); + post_read_mst_fixup((struct ntfs_record *)kaddr, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); fast_descend_into_child_node: /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + ia = (struct index_block *)(kaddr + ((vcn << dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); + if ((u8 *)ia < kaddr || (u8 *)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, + "Out of bounds check failed. Corrupt directory inode 0x%lx or driver bug.", + dir_ni->mft_no); goto unm_err_out; } /* Catch multi sector transfer fixup errors. */ if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); + ntfs_error(sb, + "Directory index record with vcn 0x%llx is corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); goto unm_err_out; } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); + if (le64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, + "Actual VCN (0x%llx) of index buffer is different from expected VCN (0x%llx). Directory inode 0x%lx is corrupt or driver bug.", + (unsigned long long)le64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); goto unm_err_out; } if (le32_to_cpu(ia->index.allocated_size) + 0x18 != dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); + ntfs_error(sb, + "Index buffer (VCN 0x%llx) of directory inode 0x%lx has a size (%u) differing from the directory specified size (%u). Directory inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); goto unm_err_out; } - index_end = (u8*)ia + dir_ni->itype.index.block_size; + index_end = (u8 *)ia + dir_ni->itype.index.block_size; if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! 
This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); + ntfs_error(sb, + "Index buffer (VCN 0x%llx) of directory inode 0x%lx crosses page boundary. Impossible! Cannot access! This is probably a bug in the driver.", + (unsigned long long)vcn, dir_ni->mft_no); goto unm_err_out; } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); + index_end = (u8 *)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8 *)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, + "Size of index buffer (VCN 0x%llx) of directory inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); goto unm_err_out; } /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + + ie = (struct index_entry *)((u8 *)&ia->index + le32_to_cpu(ia->index.entries_offset)); /* * Iterate similar to above big loop but applied to index buffer, thus * loop until we exceed valid memory (corruption case) or until we * reach the last entry. */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", + for (;; ie = (struct index_entry *)((u8 *)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8 *)ie < (u8 *)ia || + (u8 *)ie + sizeof(struct index_entry_header) > index_end || + (u8 *)ie + sizeof(struct index_entry_header) + le16_to_cpu(ie->key_length) > + index_end || (u8 *)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(sb, "Index entry out of bounds in directory inode 0x%lx.", dir_ni->mft_no); goto unm_err_out; } @@ -393,6 +401,13 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, */ if (ie->flags & INDEX_ENTRY_END) break; + /* Key length should not be zero if it is not last entry. */ + if (!ie->key_length) + goto unm_err_out; + /* Check the consistency of an index entry */ + if (ntfs_index_entry_inconsistent(NULL, vol, ie, COLLATION_FILE_NAME, + dir_ni->mft_no)) + goto unm_err_out; /* * We perform a case sensitive comparison and if that matches * we are done and return the mft reference of the inode (i.e. @@ -401,7 +416,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * returning. 
*/ if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { found_it2: @@ -417,7 +432,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, */ if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { if (!name) { - name = kmalloc(sizeof(ntfs_name), + name = kmalloc(sizeof(struct ntfs_name), GFP_NOFS); if (!name) { err = -ENOMEM; @@ -434,8 +449,8 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, *res = NULL; } mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); + kfree(kaddr); + iput(ia_vi); return mref; } /* @@ -448,32 +463,27 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * only cache the mft reference and the file name type (we set * the name length to zero for simplicity). */ - if (!NVolCaseSensitive(vol) && - ie->key.file_name.file_name_type && - ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, - IGNORE_CASE, vol->upcase, vol->upcase_len)) { - int name_size = sizeof(ntfs_name); + if ((!NVolCaseSensitive(vol) || + ie->key.file_name.file_name_type == FILE_NAME_DOS) && + ntfs_are_names_equal(uname, uname_len, + (__le16 *)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, + vol->upcase_len)) { + int name_size = sizeof(struct ntfs_name); u8 type = ie->key.file_name.file_name_type; u8 len = ie->key.file_name.file_name_length; /* Only one case insensitive matching name allowed. */ if (name) { - ntfs_error(sb, "Found already allocated name " - "in phase 2. Please run chkdsk " - "and if that doesn't find any " - "errors please report you saw " - "this message to " - "linux-ntfs-dev@lists." - "sourceforge.net."); - unlock_page(page); - ntfs_unmap_page(page); + ntfs_error(sb, + "Found already allocated name in phase 2. Please run chkdsk"); + kfree(kaddr); goto dir_err_out; } if (type != FILE_NAME_DOS) - name_size += len * sizeof(ntfschar); + name_size += len * sizeof(__le16); name = kmalloc(name_size, GFP_NOFS); if (!name) { err = -ENOMEM; @@ -484,7 +494,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, if (type != FILE_NAME_DOS) { name->len = len; memcpy(name->name, ie->key.file_name.file_name, - len * sizeof(ntfschar)); + len * sizeof(__le16)); } else name->len = 0; *res = name; @@ -494,7 +504,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * know which way in the B+tree we have to go. */ rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, 1, IGNORE_CASE, vol->upcase, vol->upcase_len); /* @@ -513,7 +523,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * collation. 
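* For example, "bar.txt" and "BAR.TXT" collate equal under IGNORE_CASE, so only the CASE_SENSITIVE pass can decide which of the two sorts first in the B+tree.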
*/ rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, + (__le16 *)&ie->key.file_name.file_name, ie->key.file_name.file_name_length, 1, CASE_SENSITIVE, vol->upcase, vol->upcase_len); if (rc == -1) @@ -533,29 +543,29 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, */ if (ie->flags & INDEX_ENTRY_NODE) { if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); + ntfs_error(sb, + "Index entry with child node found in a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); goto unm_err_out; } /* Child node present, descend into it. */ old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + + vcn = le64_to_cpup((__le64 *)((u8 *)ie + le16_to_cpu(ie->length) - 8)); if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page. */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) + /* + * If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. + */ + if (NTFS_CLU_TO_PIDX(vol, old_vcn) == + NTFS_CLU_TO_PIDX(vol, vcn)) goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); + kfree(kaddr); + kaddr = NULL; goto descend_into_child_node; } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); + ntfs_error(sb, "Negative child node vcn in directory inode 0x%lx.", + dir_ni->mft_no); goto unm_err_out; } /* @@ -564,15 +574,14 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, * associated with it. */ if (name) { - unlock_page(page); - ntfs_unmap_page(page); + kfree(kaddr); + iput(ia_vi); return name->mref; } ntfs_debug("Entry not found."); err = -ENOENT; unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); + kfree(kaddr); err_out: if (!err) err = -EIO; @@ -580,858 +589,495 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, ntfs_attr_put_search_ctx(ctx); if (m) unmap_mft_record(dir_ni); - if (name) { - kfree(name); - *res = NULL; - } + kfree(name); + *res = NULL; + if (ia_vi && !IS_ERR(ia_vi)) + iput(ia_vi); return ERR_MREF(err); dir_err_out: ntfs_error(sb, "Corrupt directory. Aborting lookup."); goto err_out; } -#if 0 - -// TODO: (AIA) -// The algorithm embedded in this code will be required for the time when we -// want to support adding of entries to directories, where we require correct -// collation of file names in order not to cause corruption of the filesystem. - /** - * ntfs_lookup_inode_by_name - find an inode in a directory given its name - * @dir_ni: ntfs inode of the directory in which to search for the name - * @uname: Unicode name for which to search in the directory - * @uname_len: length of the name @uname in Unicode characters + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer containing @ie resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @actor: what to feed the entries to * - * Look for an inode with name @uname in the directory with inode @dir_ni. - * ntfs_lookup_inode_by_name() walks the contents of the directory looking for - * the Unicode name. If the name is found in the directory, the corresponding - * inode number (>= 0) is returned as a mft reference in cpu format, i.e.
it - * is a 64-bit number containing the sequence number. + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. * - * On error, a negative value is returned corresponding to the error code. In - * particular if the inode is not found -ENOENT is returned. Note that you - * can't just check the return value for being negative, you have to check the - * inode number for being negative which you can extract using MREC(return - * value). + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. * - * Note, @uname_len does not include the (optional) terminating NULL character. + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. */ -u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, - const int uname_len) +static inline int ntfs_filldir(struct ntfs_volume *vol, + struct ntfs_inode *ndir, struct page *ia_page, struct index_entry *ie, + u8 *name, struct dir_context *actor) { - ntfs_volume *vol = dir_ni->vol; - struct super_block *sb = vol->sb; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end; - u64 mref; - ntfs_attr_search_ctx *ctx; - int err, rc; - IGNORE_CASE_BOOL ic; - VCN vcn, old_vcn; - struct address_space *ia_mapping; - struct page *page; - u8 *kaddr; + unsigned long mref; + int name_len; + unsigned int dt_type; + u8 name_type; - /* Get hold of the mft record for the directory. */ - m = map_mft_record(dir_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return ERR_MREF(PTR_ERR(m)); + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; } - ctx = ntfs_attr_get_search_ctx(dir_ni, m); - if (!ctx) { - err = -ENOMEM; - goto err_out; + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in " - "directory inode 0x%lx.", - dir_ni->mft_no); - err = -EIO; - } - goto err_out; + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; } - /* Get to the index root value (it's been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. 
*/ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); + if (!NVolShowHiddenFiles(vol) && + (ie->key.file_name.file_attributes & FILE_ATTR_HIDDEN)) { + ntfs_debug("Skipping hidden file."); + return 0; + } + + name_len = ntfs_ucstonls(vol, (__le16 *)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + + mref = MREF_LE(ie->data.dir.indexed_file); + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else if (ie->key.file_name.file_attributes & FILE_ATTR_REPARSE_POINT) + dt_type = ntfs_reparse_tag_dt_types(vol, mref); + else + dt_type = DT_REG; + /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry. + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) - goto dir_err_out; - /* - * The last entry cannot contain a name. It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it: - mref = le64_to_cpu(ie->data.dir.indexed_file); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - return mref; + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode 0x%lx, DT_%s.", + name, name_len, actor->pos, mref, dt_type == DT_DIR ? "DIR" : "REG"); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; + /* Relock the page but not if we are aborting ->readdir. 
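+ * dir_emit() returns false as soon as the destination buffer cannot take + * another record, so returning 1 above makes the readdir loop stop and + * preserve its position for the next ->iterate_shared call.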
*/ + if (ia_page) + lock_page(ia_page); + return 0; +} + +struct ntfs_file_private { + void *key; + __le16 key_length; + bool end_in_iterate; + loff_t curr_pos; +}; + +struct ntfs_index_ra { + unsigned long start_index; + unsigned int count; + struct rb_node rb_node; +}; + +static void ntfs_insert_rb(struct ntfs_index_ra *nir, struct rb_root *root) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + struct ntfs_index_ra *cnir; + + while (*new) { + parent = *new; + cnir = rb_entry(parent, struct ntfs_index_ra, rb_node); + if (nir->start_index < cnir->start_index) + new = &parent->rb_left; + else if (nir->start_index >= cnir->start_index + cnir->count) + new = &parent->rb_right; + else { + pr_err("nir start_index : %lu, count : %u, cnir start_index : %lu, count : %u\n", + nir->start_index, nir->count, cnir->start_index, cnir->count); + return; } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; - /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. - */ - goto found_it; } - /* - * We have finished with this index without success. Check for the - * presence of a child node. - */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - /* No child node, return -ENOENT. */ - err = -ENOENT; - goto err_out; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(dir_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one.
Directory inode 0x%lx is " - "corrupt or driver bug.", dir_ni->mft_no); - goto err_out; + + rb_link_node(&nir->rb_node, parent, new); + rb_insert_color(&nir->rb_node, root); +} + +static int ntfs_ia_blocks_readahead(struct ntfs_inode *ia_ni, loff_t pos) +{ + unsigned long dir_start_index, dir_end_index; + struct inode *ia_vi = VFS_I(ia_ni); + struct file_ra_state *dir_ra; + + dir_end_index = (i_size_read(ia_vi) + PAGE_SIZE - 1) >> PAGE_SHIFT; + dir_start_index = (pos + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (dir_start_index >= dir_end_index) + return 0; + + dir_ra = kzalloc(sizeof(*dir_ra), GFP_NOFS); + if (!dir_ra) + return -ENOMEM; + + file_ra_state_init(dir_ra, ia_vi->i_mapping); + dir_ra->ra_pages = dir_end_index - dir_start_index; + page_cache_sync_readahead(ia_vi->i_mapping, dir_ra, NULL, + dir_start_index, dir_end_index - dir_start_index); + kfree(dir_ra); + + return 0; +} + +static int ntfs_readdir(struct file *file, struct dir_context *actor) +{ + struct inode *vdir = file_inode(file); + struct super_block *sb = vdir->i_sb; + struct ntfs_inode *ndir = NTFS_I(vdir); + struct ntfs_volume *vol = NTFS_SB(sb); + struct ntfs_attr_search_ctx *ctx = NULL; + struct ntfs_index_context *ictx = NULL; + u8 *name; + struct index_root *ir; + struct index_entry *next = NULL; + struct ntfs_file_private *private = NULL; + int err = 0; + loff_t ie_pos = 2; /* initialize it with dot and dotdot size */ + struct ntfs_index_ra *nir = NULL; + unsigned long index; + struct rb_root ra_root = RB_ROOT; + struct file_ra_state *ra; + + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, actor->pos); + + if (file->private_data) { + private = file->private_data; + + if (actor->pos != private->curr_pos) { + /* + * If actor->pos differs from the previously passed + * one, discard private->key and fill the dirent + * buffer using a linear lookup. + */ + kfree(private->key); + private->key = NULL; + private->end_in_iterate = false; + } else if (private->end_in_iterate) { + kfree(private->key); + kfree(file->private_data); + file->private_data = NULL; + return 0; + } } - /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - ia_mapping = VFS_I(dir_ni)->i_mapping; - /* - * We are done with the index root and the mft record. Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(dir_ni); - m = NULL; - ctx = NULL; -descend_into_child_node: + + /* Emulate . and .. for all directories. */ + if (!dir_emit_dots(file, actor)) + return 0; + + /* + * Allocate a buffer to store the current name being processed + * converted to format determined by current NLS.
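+ * The allocation below is sized for the worst case: an NTFS name is at + * most NTFS_MAX_NAME_LEN (255) Unicode characters, each of which may + * expand to up to NLS_MAX_CHARSET_SIZE bytes in the target charset, + * plus one byte for the NUL terminator.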
*/ - page = ntfs_map_page(ia_mapping, vcn << - dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map directory index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) + return -ENOMEM; + + mutex_lock_nested(&ndir->mrec_lock, NTFS_INODE_MUTEX_PARENT); + ictx = ntfs_index_ctx_get(ndir, I30, 4); + if (!ictx) { + kfree(name); + mutex_unlock(&ndir->mrec_lock); + return -ENOMEM; } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", dir_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug.", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - dir_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)vcn, dir_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - dir_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + dir_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible! " - "Cannot access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - dir_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)vcn, dir_ni->mft_no); - goto unm_err_out; + + ra = kzalloc(sizeof(struct file_ra_state), GFP_NOFS); + if (!ra) { + kfree(name); + ntfs_index_ctx_put(ictx); + mutex_unlock(&ndir->mrec_lock); + return -ENOMEM; } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds check. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end) { - ntfs_error(sb, "Index entry out of bounds in " - "directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a name. 
It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* - * If the current entry has a name type of POSIX, the name is - * case sensitive and not otherwise. This has the effect of us - * not being able to access any POSIX file names which collate - * after the non-POSIX one when they only differ in case, but - * anyone doing screwy stuff like that deserves to burn in - * hell... Doing that kind of stuff on NT4 actually causes - * corruption on the partition even when using SP6a and Linux - * is not involved at all. - */ - ic = ie->key.file_name.file_name_type ? IGNORE_CASE : - CASE_SENSITIVE; - /* - * If the names match perfectly, we are done and return the - * mft reference of the inode (i.e. the inode number together - * with the sequence number for consistency checking. We - * convert it to cpu format before returning. - */ - if (ntfs_are_names_equal(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, ic, - vol->upcase, vol->upcase_len)) { -found_it2: - mref = le64_to_cpu(ie->data.dir.indexed_file); - unlock_page(page); - ntfs_unmap_page(page); - return mref; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - IGNORE_CASE, vol->upcase, vol->upcase_len); - /* - * If uname collates before the name of the current entry, there - * is definitely no such name in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* The names are not equal, continue the search. */ - if (rc) - continue; - /* - * Names match with case insensitive comparison, now try the - * case sensitive comparison, which is required for proper - * collation. - */ - rc = ntfs_collate_names(uname, uname_len, - (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, 1, - CASE_SENSITIVE, vol->upcase, vol->upcase_len); - if (rc == -1) - break; - if (rc) - continue; + file_ra_state_init(ra, vol->mft_ino->i_mapping); + + if (private && private->key) { /* - * Perfect match, this will never happen as the - * ntfs_are_names_equal() call will have gotten a match but we - * still treat it correctly. + * Find index with private->key using ntfs_index_lookup() + * instead of linear index lookup. */ - goto found_it2; - } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node. - */ - if (ie->flags & INDEX_ENTRY_NODE) { - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in " - "a leaf node in directory inode 0x%lx.", - dir_ni->mft_no); - goto unm_err_out; + err = ntfs_index_lookup(private->key, + le16_to_cpu(private->key_length), + ictx); + if (!err) { + next = ictx->entry; + /* + * Update ie_pos from private->curr_pos + * so that the next dirent's d_off is correct. + */ + ie_pos = private->curr_pos; + + if (actor->pos > vol->mft_record_size && ictx->ia_ni) { + err = ntfs_ia_blocks_readahead(ictx->ia_ni, actor->pos); + if (err) + goto out; + } + + goto nextdir; + } else { + goto out; + } - /* Child node present, descend into it. */ - old_vcn = vcn; - vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); - if (vcn >= 0) { - /* If vcn is in the same page cache page as old_vcn we - * recycle the mapped page.
*/ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; + } else if (!private) { + private = kzalloc(sizeof(struct ntfs_file_private), GFP_KERNEL); + if (!private) { + err = -ENOMEM; + goto out; } - ntfs_error(sb, "Negative child node vcn in directory inode " - "0x%lx.", dir_ni->mft_no); - goto unm_err_out; + file->private_data = private; } - /* No child node, return -ENOENT. */ - ntfs_debug("Entry not found."); - err = -ENOENT; -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); -err_out: - if (!err) - err = -EIO; - if (ctx) + + ctx = ntfs_attr_get_search_ctx(ndir, NULL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + /* Find the index root attribute in the mft record. */ + if (ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0, + ctx)) { + ntfs_error(sb, "Index root attribute missing in directory inode %lu", + ndir->mft_no); ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(dir_ni); - return ERR_MREF(err); -dir_err_out: - ntfs_error(sb, "Corrupt directory. Aborting lookup."); - goto err_out; -} + err = -EIO; + goto out; + } -#endif + /* Get to the index root value. */ + ir = (struct index_root *)((u8 *)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); -/** - * ntfs_filldir - ntfs specific filldir method - * @vol: current ntfs volume - * @ndir: ntfs inode of current directory - * @ia_page: page in which the index allocation buffer @ie is in resides - * @ie: current index entry - * @name: buffer to use for the converted name - * @actor: what to feed the entries to - * - * Convert the Unicode @name to the loaded NLS and pass it to the @filldir - * callback. - * - * If @ia_page is not NULL it is the locked page containing the index - * allocation block containing the index entry @ie. - * - * Note, we drop (and then reacquire) the page lock on @ia_page across the - * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup - * since ntfs_lookup() will lock the same page. As an optimization, we do not - * retake the lock if we are returning a non-zero value as ntfs_readdir() - * would need to drop the lock immediately anyway.
- */ -static inline int ntfs_filldir(ntfs_volume *vol, - ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, struct dir_context *actor) -{ - unsigned long mref; - int name_len; - unsigned dt_type; - FILE_NAME_TYPE_FLAGS name_type; + ictx->ir = ir; + ictx->actx = ctx; + ictx->parent_vcn[ictx->pindex] = VCN_INDEX_ROOT_PARENT; + ictx->is_in_root = true; + ictx->parent_pos[ictx->pindex] = 0; - name_type = ie->key.file_name.file_name_type; - if (name_type == FILE_NAME_DOS) { - ntfs_debug("Skipping DOS name space entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { - ntfs_debug("Skipping root directory self reference entry."); - return 0; - } - if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && - !NVolShowSystemFiles(vol)) { - ntfs_debug("Skipping system file."); - return 0; - } - name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, - ie->key.file_name.file_name_length, &name, - NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); - if (name_len <= 0) { - ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", - (long long)MREF_LE(ie->data.dir.indexed_file)); - return 0; + ictx->block_size = le32_to_cpu(ir->index_block_size); + if (ictx->block_size < NTFS_BLOCK_SIZE) { + ntfs_error(sb, "Index block size (%d) is smaller than the sector size (%d)", + ictx->block_size, NTFS_BLOCK_SIZE); + err = -EIO; + goto out; } - if (ie->key.file_name.file_attributes & - FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) - dt_type = DT_DIR; - else - dt_type = DT_REG; - mref = MREF_LE(ie->data.dir.indexed_file); - /* - * Drop the page lock otherwise we deadlock with NFS when it calls - * ->lookup since ntfs_lookup() will lock the same page. - */ - if (ia_page) - unlock_page(ia_page); - ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, actor->pos, mref, - dt_type == DT_DIR ? "DIR" : "REG"); - if (!dir_emit(actor, name, name_len, mref, dt_type)) - return 1; - /* Relock the page but not if we are aborting ->readdir. */ - if (ia_page) - lock_page(ia_page); - return 0; -} -/* - * We use the same basic approach as the old NTFS driver, i.e. we parse the - * index root entries and then the index allocation entries that are marked - * as in use in the index bitmap. - * - * While this will return the names in random order this doesn't matter for - * ->readdir but OTOH results in a faster ->readdir. - * - * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS - * parts (e.g. ->f_pos and ->i_size, and it also protects against directory - * modifications). - * - * Locking: - Caller must hold i_mutex on the directory. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. 
- */ -static int ntfs_readdir(struct file *file, struct dir_context *actor) -{ - s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t i_size; - struct inode *bmp_vi, *vdir = file_inode(file); - struct super_block *sb = vdir->i_sb; - ntfs_inode *ndir = NTFS_I(vdir); - ntfs_volume *vol = NTFS_SB(sb); - MFT_RECORD *m; - INDEX_ROOT *ir = NULL; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *name = NULL; - int rc, err, ir_pos, cur_bmp_pos; - struct address_space *ia_mapping, *bmp_mapping; - struct page *bmp_page = NULL, *ia_page = NULL; - u8 *kaddr, *bmp, *index_end; - ntfs_attr_search_ctx *ctx; + if (vol->cluster_size <= ictx->block_size) + ictx->vcn_size_bits = vol->cluster_size_bits; + else + ictx->vcn_size_bits = NTFS_BLOCK_SIZE_BITS; - ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, actor->pos); - rc = err = 0; - /* Are we at end of dir yet? */ - i_size = i_size_read(vdir); - if (actor->pos >= i_size + vol->mft_record_size) - return 0; - /* Emulate . and .. for all directories. */ - if (!dir_emit_dots(file, actor)) - return 0; - m = NULL; - ctx = NULL; - /* - * Allocate a buffer to store the current name being processed - * converted to format determined by current NLS. - */ - name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); - if (unlikely(!name)) { - err = -ENOMEM; - goto err_out; - } - /* Are we jumping straight into the index allocation attribute? */ - if (actor->pos >= vol->mft_record_size) - goto skip_index_root; - /* Get hold of the mft record for the directory. */ - m = map_mft_record(ndir); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(ndir, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - /* Get the offset into the index root attribute. */ - ir_pos = (s64)actor->pos; - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, - 0, ctx); - if (unlikely(err)) { - ntfs_error(sb, "Index root attribute missing in directory " - "inode 0x%lx.", vdir->i_ino); - goto err_out; - } - /* - * Copy the index root attribute value to a buffer so that we can put - * the search context and unmap the mft record before calling the - * filldir() callback. We need to do this because of NFSd which calls - * ->lookup() from its filldir callback() and this causes NTFS to - * deadlock as ntfs_lookup() maps the mft record of the directory and - * we have got it mapped here already. The only solution is for us to - * unmap the mft record here so that a call to ntfs_lookup() is able to - * map the mft record without deadlocking. - */ - rc = le32_to_cpu(ctx->attr->data.resident.value_length); - ir = kmalloc(rc, GFP_NOFS); - if (unlikely(!ir)) { - err = -ENOMEM; - goto err_out; - } - /* Copy the index root value (it has been verified in read_inode). */ - memcpy(ir, (u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset), rc); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(ndir); - ctx = NULL; - m = NULL; - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + + next = (struct index_entry *)((u8 *)&ir->index + le32_to_cpu(ir->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). 
- */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Skip index root entry if continuing previous readdir. */ - if (ir_pos > (u8*)ie - (u8*)ir) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ir; - /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); - if (rc) { - kfree(ir); - goto abort; + + if (next->flags & INDEX_ENTRY_NODE) { + ictx->ia_ni = ntfs_ia_open(ictx, ictx->idx_ni); + if (!ictx->ia_ni) { + err = -EINVAL; + goto out; } + + err = ntfs_ia_blocks_readahead(ictx->ia_ni, actor->pos); + if (err) + goto out; } - /* We are done with the index root and can free the buffer. */ - kfree(ir); - ir = NULL; - /* If there is no index allocation attribute we are finished. */ - if (!NInoIndexAllocPresent(ndir)) - goto EOD; - /* Advance fpos to the beginning of the index allocation. */ - actor->pos = vol->mft_record_size; -skip_index_root: - kaddr = NULL; - prev_ia_pos = -1LL; - /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)actor->pos - vol->mft_record_size; - ia_mapping = vdir->i_mapping; - ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); - bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); - if (IS_ERR(bmp_vi)) { - ntfs_error(sb, "Failed to get bitmap attribute."); - err = PTR_ERR(bmp_vi); - goto err_out; - } - bmp_mapping = bmp_vi->i_mapping; - /* Get the starting bitmap bit position and sanity check it. */ - bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; - if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { - ntfs_error(sb, "Current index allocation position exceeds " - "index bitmap size."); - goto iput_err_out; - } - /* Get the starting bit position in the current bitmap page. */ - cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); - bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); -get_next_bmp_page: - ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", - (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), - (unsigned long long)bmp_pos & - (unsigned long long)((PAGE_SIZE * 8) - 1)); - bmp_page = ntfs_map_page(bmp_mapping, - bmp_pos >> (3 + PAGE_SHIFT)); - if (IS_ERR(bmp_page)) { - ntfs_error(sb, "Reading index bitmap failed."); - err = PTR_ERR(bmp_page); - bmp_page = NULL; - goto iput_err_out; - } - bmp = (u8*)page_address(bmp_page); - /* Find next index block in use. */ - while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { -find_next_index_buffer: - cur_bmp_pos++; - /* - * If we have reached the end of the bitmap page, get the next - * page, and put away the old one. - */ - if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { - ntfs_unmap_page(bmp_page); - bmp_pos += PAGE_SIZE * 8; - cur_bmp_pos = 0; - goto get_next_bmp_page; + + if (next->flags & INDEX_ENTRY_NODE) { + next = ntfs_index_walk_down(next, ictx); + if (!next) { + err = -EIO; + goto out; } - /* If we have reached the end of the bitmap, we are done. 
*/ - if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) - goto unm_EOD; - ia_pos = (bmp_pos + cur_bmp_pos) << - ndir->itype.index.block_size_bits; } - ntfs_debug("Handling index buffer 0x%llx.", - (unsigned long long)bmp_pos + cur_bmp_pos); - /* If the current index buffer is in the same page we reuse the page. */ - if ((prev_ia_pos & (s64)PAGE_MASK) != - (ia_pos & (s64)PAGE_MASK)) { - prev_ia_pos = ia_pos; - if (likely(ia_page != NULL)) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); + + if (next && !(next->flags & INDEX_ENTRY_END)) + goto nextdir; + + while ((next = ntfs_index_next(next, ictx)) != NULL) { +nextdir: + /* Check the consistency of an index entry */ + if (ntfs_index_entry_inconsistent(ictx, vol, next, COLLATION_FILE_NAME, + ndir->mft_no)) { + err = -EIO; + goto out; } - /* - * Map the page cache page containing the current ia_pos, - * reading it from disk if necessary. - */ - ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); - if (IS_ERR(ia_page)) { - ntfs_error(sb, "Reading index allocation data failed."); - err = PTR_ERR(ia_page); - ia_page = NULL; - goto err_out; + + if (ie_pos < actor->pos) { + ie_pos += le16_to_cpu(next->length); + continue; } - lock_page(ia_page); - kaddr = (u8*)page_address(ia_page); - } - /* Get the current index buffer. */ - ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & - ~(s64)(ndir->itype.index.block_size - 1))); - /* Bounds checks. */ - if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Out of bounds check failed. Corrupt directory " - "inode 0x%lx or driver bug.", vdir->i_ino); - goto err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Directory index record with vcn 0x%llx is " - "corrupt. Corrupt inode 0x%lx. Run chkdsk.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & - ~(s64)(ndir->itype.index.block_size - 1)) >> - ndir->itype.index.vcn_size_bits)) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). " - "Directory inode 0x%lx is corrupt or driver " - "bug. ", (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != - ndir->itype.index.block_size)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx has a size (%u) differing from the " - "directory specified size (%u). Directory " - "inode is corrupt or driver bug.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino, - le32_to_cpu(ia->index.allocated_size) + 0x18, - ndir->itype.index.block_size); - goto err_out; - } - index_end = (u8*)ia + ndir->itype.index.block_size; - if (unlikely(index_end > kaddr + PAGE_SIZE)) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " - "0x%lx crosses page boundary. Impossible!
This is probably a bug in the " - "driver.", (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " - "inode 0x%lx exceeds maximum size.", - (unsigned long long)ia_pos >> - ndir->itype.index.vcn_size_bits, vdir->i_ino); - goto err_out; - } - /* The first index entry in this index buffer. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Loop until we exceed valid memory (corruption case) or until we - * reach the last entry or until filldir tells us it has had enough - * or signals an error (both covered by the rc test). - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - ntfs_debug("In index allocation, offset 0x%llx.", - (unsigned long long)ia_start + - (unsigned long long)((u8*)ie - (u8*)ia)); - /* Bounds checks. */ - if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->key_length) > - index_end)) - goto err_out; - /* The last entry cannot contain a name. */ - if (ie->flags & INDEX_ENTRY_END) + + actor->pos = ie_pos; + + index = NTFS_MFT_NR_TO_PIDX(vol, MREF_LE(next->data.dir.indexed_file)); + if (nir) { + struct ntfs_index_ra *cnir; + struct rb_node *node = ra_root.rb_node; + + if (nir->start_index <= index && + index < nir->start_index + nir->count) { + /* Already covered by the current run. */ + goto filldir; + } + + while (node) { + cnir = rb_entry(node, struct ntfs_index_ra, rb_node); + if (cnir->start_index <= index && + index < cnir->start_index + cnir->count) { + goto filldir; + } else if (cnir->start_index + cnir->count == index) { + cnir->count++; + goto filldir; + } else if (cnir->start_index && cnir->start_index - 1 == index) { + cnir->start_index = index; + goto filldir; + } + + if (index < cnir->start_index) + node = node->rb_left; + else if (index >= cnir->start_index + cnir->count) + node = node->rb_right; + } + + if (nir->start_index + nir->count == index) { + nir->count++; + } else if (nir->start_index && nir->start_index - 1 == index) { + nir->start_index = index; + } else if (nir->count > 2) { + ntfs_insert_rb(nir, &ra_root); + nir = NULL; + } else { + nir->start_index = index; + nir->count = 1; + } + } + + if (!nir) { + nir = kzalloc(sizeof(struct ntfs_index_ra), GFP_KERNEL); + if (nir) { + nir->start_index = index; + nir->count = 1; + } + } + +filldir: + /* Submit the name to the filldir callback. */ + err = ntfs_filldir(vol, ndir, NULL, next, name, actor); + if (err) { + /* + * Store index key value to file private_data to start + * from current index offset on next round. + */ + private = file->private_data; + kfree(private->key); + private->key = kmalloc(le16_to_cpu(next->key_length), GFP_KERNEL); + if (!private->key) { + err = -ENOMEM; + goto out; + } + + memcpy(private->key, &next->key.file_name, le16_to_cpu(next->key_length)); + private->key_length = next->key_length; break; - /* Skip index block entry if continuing previous readdir. */ - if (ia_pos - ia_start > (u8*)ie - (u8*)ia) - continue; - /* Advance the position even if going to skip the entry. */ - actor->pos = (u8*)ie - (u8*)ia + - (sle64_to_cpu(ia->index_block_vcn) << - ndir->itype.index.vcn_size_bits) + - vol->mft_record_size; - /* - * Submit the name to the @filldir callback.
Note, - * ntfs_filldir() drops the lock on @ia_page but it retakes it - * before returning, unless a non-zero value is returned in - * which case the page is left unlocked. - */ - rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); - if (rc) { - /* @ia_page is already unlocked in this case. */ - ntfs_unmap_page(ia_page); - ntfs_unmap_page(bmp_page); - iput(bmp_vi); - goto abort; } + ie_pos += le16_to_cpu(next->length); } - goto find_next_index_buffer; -unm_EOD: - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); - } - ntfs_unmap_page(bmp_page); - iput(bmp_vi); -EOD: - /* We are finished, set fpos to EOD. */ - actor->pos = i_size + vol->mft_record_size; -abort: - kfree(name); - return 0; -err_out: - if (bmp_page) { - ntfs_unmap_page(bmp_page); -iput_err_out: - iput(bmp_vi); + + if (!err) + private->end_in_iterate = true; + else + err = 0; + + private->curr_pos = actor->pos = ie_pos; +out: + while (!RB_EMPTY_ROOT(&ra_root)) { + struct ntfs_index_ra *cnir; + struct rb_node *node; + + node = rb_first(&ra_root); + cnir = rb_entry(node, struct ntfs_index_ra, rb_node); + ra->ra_pages = cnir->count; + page_cache_sync_readahead(vol->mft_ino->i_mapping, ra, NULL, + cnir->start_index, cnir->count); + rb_erase(node, &ra_root); + kfree(cnir); } - if (ia_page) { - unlock_page(ia_page); - ntfs_unmap_page(ia_page); + + if (err && private) { + private->curr_pos = actor->pos; + private->end_in_iterate = true; + err = 0; } - kfree(ir); + ntfs_index_ctx_put(ictx); kfree(name); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(ndir); - if (!err) - err = -EIO; - ntfs_debug("Failed. Returning error code %i.", -err); + kfree(nir); + kfree(ra); + mutex_unlock(&ndir->mrec_lock); return err; } +int ntfs_check_empty_dir(struct ntfs_inode *ni, struct mft_record *ni_mrec) +{ + struct ntfs_attr_search_ctx *ctx; + int ret = 0; + + if (!(ni_mrec->flags & MFT_RECORD_IS_DIRECTORY)) + return 0; + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + ntfs_error(ni->vol->sb, "Failed to get search context"); + return -ENOMEM; + } + + /* Find the index root attribute in the mft record. */ + ret = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (ret) { + ntfs_error(ni->vol->sb, "Index root attribute missing in directory inode %llu", + (unsigned long long)ni->mft_no); + ntfs_attr_put_search_ctx(ctx); + return ret; + } + + /* Non-empty directory? */ + if (le32_to_cpu(ctx->attr->data.resident.value_length) != + sizeof(struct index_root) + sizeof(struct index_entry_header)) { + /* Both ENOTEMPTY and EEXIST are ok. We use the more common.
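+ * (An empty directory's index root value is exactly a struct index_root + * followed by a single end entry header, which is what the size + * comparison above tests; anything larger means at least one live index + * entry remains.)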
*/ + ret = -ENOTEMPTY; + ntfs_debug("Directory is not empty\n"); + } + + ntfs_attr_put_search_ctx(ctx); + + return ret; +} + /** * ntfs_dir_open - called when an inode is about to be opened * @vi: inode to be opened @@ -1457,13 +1103,21 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) return 0; } -#ifdef NTFS_RW +static int ntfs_dir_release(struct inode *vi, struct file *filp) +{ + if (filp->private_data) { + kfree(((struct ntfs_file_private *)filp->private_data)->key); + kfree(filp->private_data); + filp->private_data = NULL; + } + return 0; +} /** * ntfs_dir_fsync - sync a directory to disk - * @filp: directory to be synced - * @start: offset in bytes of the beginning of data range to sync - * @end: offset in bytes of the end of data range (inclusive) + * @filp: file describing the directory to be synced + * @start: start offset to be synced + * @end: end offset to be synced * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and @@ -1479,27 +1133,55 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * anyway. * * Locking: Caller must hold i_mutex on the inode. - * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. We do write the $BITMAP attribute if it is present - * which is the important one for a directory so things are not too bad. */ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bmp_vi, *vi = filp->f_mapping->host; + struct ntfs_volume *vol = NTFS_I(vi)->vol; + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_attr_search_ctx *ctx; + struct inode *parent_vi, *ia_vi; int err, ret; - ntfs_attr na; + struct ntfs_attr na; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + if (NVolShutdown(vol)) + return -EIO; + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) + return -ENOMEM; + + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL_2); + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, ctx))) { + struct file_name_attr *fn = (struct file_name_attr *)((u8 *)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + + parent_vi = ntfs_iget(vi->i_sb, MREF_LE(fn->parent_directory)); + if (IS_ERR(parent_vi)) + continue; + mutex_lock_nested(&NTFS_I(parent_vi)->mrec_lock, NTFS_INODE_MUTEX_PARENT_2); + ia_vi = ntfs_index_iget(parent_vi, I30, 4); + mutex_unlock(&NTFS_I(parent_vi)->mrec_lock); + if (IS_ERR(ia_vi)) { + iput(parent_vi); + continue; + } + write_inode_now(ia_vi, 1); + iput(ia_vi); + write_inode_now(parent_vi, 1); + iput(parent_vi); + } + mutex_unlock(&ni->mrec_lock); + ntfs_attr_put_search_ctx(ctx); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); - BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. 
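* ilookup5() below returns the $BITMAP attribute inode only if it is already instantiated in the inode cache; an attribute inode that was never brought into memory cannot have dirty state, so nothing is lost by not instantiating one here just to sync it.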
*/ na.mft_no = vi->i_ino; na.type = AT_BITMAP; @@ -1507,34 +1189,41 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, na.name_len = 4; bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); if (bmp_vi) { - write_inode_now(bmp_vi, !datasync); + write_inode_now(bmp_vi, !datasync); iput(bmp_vi); } ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + + write_inode_now(vol->mftbmp_ino, 1); + down_write(&vol->lcnbmp_lock); + write_inode_now(vol->lcnbmp_ino, 1); + up_write(&vol->lcnbmp_lock); + write_inode_now(vol->mft_ino, 1); + err = sync_blockdev(vi->i_sb->s_bdev); if (unlikely(err && !ret)) ret = err; if (likely(!ret)) ntfs_debug("Done."); else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? "data" : "", vi->i_ino, -ret); + ntfs_warning(vi->i_sb, + "Failed to f%ssync inode 0x%lx. Error %u.", + datasync ? "data" : "", vi->i_ino, -ret); inode_unlock(vi); return ret; } -#endif /* NTFS_RW */ - -WRAP_DIR_ITER(ntfs_readdir) // FIXME! const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ -#ifdef NTFS_RW + .iterate_shared = ntfs_readdir, /* Read directory contents. */ .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ -#endif /* NTFS_RW */ - /*.ioctl = ,*/ /* Perform function on the - mounted filesystem. */ .open = ntfs_dir_open, /* Open directory. */ + .release = ntfs_dir_release, + .unlocked_ioctl = ntfsp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfsp_compat_ioctl, +#endif }; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index d46c2c03a032..d22a44dd4b33 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -1,217 +1,607 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * NTFS kernel index handling. Part of the Linux-NTFS project. * * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2025 LG Electronics Co., Ltd. + * + * Part of this file is based on code from the NTFS-3G project. + * and is copyrighted by the respective authors below: + * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2004-2005 Richard Russon + * Copyright (c) 2005-2006 Yura Pakhuchiy + * Copyright (c) 2005-2008 Szabolcs Szakacsits + * Copyright (c) 2007-2021 Jean-Pierre Andre */ -#include - -#include "aops.h" #include "collate.h" -#include "debug.h" #include "index.h" #include "ntfs.h" +#include "malloc.h" +#include "attrlist.h" + +/* + * ntfs_index_entry_inconsistent - Check the consistency of an index entry + * + * Make sure data and key do not overflow from entry. + * As a side effect, an entry with zero length is rejected. + * This entry must be a full one (no INDEX_ENTRY_END flag), and its + * length must have been checked beforehand to not overflow from the + * index record. + */ +int ntfs_index_entry_inconsistent(struct ntfs_index_context *icx, + struct ntfs_volume *vol, const struct index_entry *ie, + __le32 collation_rule, u64 inum) +{ + if (icx) { + struct index_header *ih; + u8 *ie_start, *ie_end; + + if (icx->is_in_root) + ih = &icx->ir->index; + else + ih = &icx->ib->index; + + if ((le32_to_cpu(ih->index_length) > le32_to_cpu(ih->allocated_size)) || + (le32_to_cpu(ih->index_length) > icx->block_size)) { + ntfs_error(vol->sb, "%s Index entry(0x%p)'s length is too big.", + icx->is_in_root ? 
"Index root" : "Index block", + (u8 *)icx->entry); + return -EINVAL; + } + + ie_start = (u8 *)ih + le32_to_cpu(ih->entries_offset); + ie_end = (u8 *)ih + le32_to_cpu(ih->index_length); + + if (ie_start > (u8 *)ie || + ie_end <= ((u8 *)ie + ie->length) || + ie->length > le32_to_cpu(ih->allocated_size) || + ie->length > icx->block_size) { + ntfs_error(vol->sb, "Index entry(0x%p) is out of range from %s", + (u8 *)icx->entry, + icx->is_in_root ? "index root" : "index block"); + return -EIO; + } + } + + if (ie->key_length && + ((le16_to_cpu(ie->key_length) + offsetof(struct index_entry, key)) > + le16_to_cpu(ie->length))) { + ntfs_error(vol->sb, "Overflow from index entry in inode %lld\n", + (long long)inum); + return -EIO; + + } else { + if (collation_rule == COLLATION_FILE_NAME) { + if ((offsetof(struct index_entry, key.file_name.file_name) + + ie->key.file_name.file_name_length * sizeof(__le16)) > + le16_to_cpu(ie->length)) { + ntfs_error(vol->sb, + "File name overflow from index entry in inode %lld\n", + (long long)inum); + return -EIO; + } + } else { + if (ie->data.vi.data_length && + ((le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length)) > + le16_to_cpu(ie->length))) { + ntfs_error(vol->sb, + "Data overflow from index entry in inode %lld\n", + (long long)inum); + return -EIO; + } + } + } + + return 0; +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the inode + * containing the index root attribute dirty. This ensures the mftrecord, and + * hence the index root attribute, will be written out to disk later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, set ib_dirty to true, thus index block will be updated during + * ntfs_index_ctx_put. 
+ */ +void ntfs_index_entry_mark_dirty(struct ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else if (ictx->ib) + ictx->ib_dirty = true; +} + +static s64 ntfs_ib_vcn_to_pos(struct ntfs_index_context *icx, s64 vcn) +{ + return vcn << icx->vcn_size_bits; +} + +static s64 ntfs_ib_pos_to_vcn(struct ntfs_index_context *icx, s64 pos) +{ + return pos >> icx->vcn_size_bits; +} + +static int ntfs_ib_write(struct ntfs_index_context *icx, struct index_block *ib) +{ + s64 ret, vcn = le64_to_cpu(ib->index_block_vcn); + + ntfs_debug("vcn: %lld\n", vcn); + + ret = pre_write_mst_fixup((struct ntfs_record *)ib, icx->block_size); + if (ret) + return -EIO; + + ret = ntfs_inode_attr_pwrite(VFS_I(icx->ia_ni), + ntfs_ib_vcn_to_pos(icx, vcn), icx->block_size, + (u8 *)ib, icx->sync_write); + if (ret != icx->block_size) { + ntfs_debug("Failed to write index block %lld, inode %llu", + vcn, (unsigned long long)icx->idx_ni->mft_no); + return ret; + } + + return 0; +} + +static int ntfs_icx_ib_write(struct ntfs_index_context *icx) +{ + int err; + + err = ntfs_ib_write(icx, icx->ib); + if (err) + return err; + + icx->ib_dirty = false; + + return 0; +} + +int ntfs_icx_ib_sync_write(struct ntfs_index_context *icx) +{ + int ret; + + if (!icx->ib_dirty) + return 0; + + icx->sync_write = true; + + ret = ntfs_ib_write(icx, icx->ib); + if (!ret) { + ntfs_free(icx->ib); + icx->ib = NULL; + icx->ib_dirty = false; + } else { + post_write_mst_fixup((struct ntfs_record *)icx->ib); + icx->sync_write = false; + } + + return ret; +} /** * ntfs_index_ctx_get - allocate and initialize a new index context - * @idx_ni: ntfs index inode with which to initialize the context + * @ni: ntfs inode with which to initialize the context + * @name: name of the index which the context describes + * @name_len: length of the index name * - * Allocate a new index context, initialize it with @idx_ni and return it. + * Allocate a new index context, initialize it with @ni and return it. * Return NULL if allocation failed. - * - * Locking: Caller must hold i_mutex on the index inode. */ -ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +struct ntfs_index_context *ntfs_index_ctx_get(struct ntfs_inode *ni, + __le16 *name, u32 name_len) +{ + struct ntfs_index_context *icx; + + ntfs_debug("Entering\n"); + + if (!ni) + return NULL; + + if (ni->nr_extents == -1) + ni = ni->ext.base_ntfs_ino; + + icx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (icx) + *icx = (struct ntfs_index_context) { + .idx_ni = ni, + .name = name, + .name_len = name_len, + }; + return icx; +} + +static void ntfs_index_ctx_free(struct ntfs_index_context *icx) { - ntfs_index_context *ictx; + ntfs_debug("Entering\n"); + + if (icx->actx) { + ntfs_attr_put_search_ctx(icx->actx); + icx->actx = NULL; + } + + if (!icx->is_in_root) { + if (icx->ib_dirty) + ntfs_ib_write(icx, icx->ib); + ntfs_free(icx->ib); + icx->ib = NULL; + } - ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); - if (ictx) - *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; - return ictx; + if (icx->ia_ni) { + iput(VFS_I(icx->ia_ni)); + icx->ia_ni = NULL; + } } /** * ntfs_index_ctx_put - release an index context - * @ictx: index context to free + * @icx: index context to free * - * Release the index context @ictx, releasing all associated resources. + * Release the index context @icx, releasing all associated resources.
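/*
 * Worked example of the vcn <-> byte-offset mapping above, assuming 4 KiB
 * index blocks addressed in cluster-sized units (vcn_size_bits == 12):
 * vcn 0 -> 0x0000, vcn 1 -> 0x1000, vcn 3 -> 0x3000.
 */
#include <stdint.h>

static int64_t ex_vcn_to_pos(int64_t vcn, unsigned int vcn_size_bits)
{
        return vcn << vcn_size_bits;    /* mirrors ntfs_ib_vcn_to_pos() */
}

static int64_t ex_pos_to_vcn(int64_t pos, unsigned int vcn_size_bits)
{
        return pos >> vcn_size_bits;    /* mirrors ntfs_ib_pos_to_vcn() */
}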
+ */ +void ntfs_index_ctx_put(struct ntfs_index_context *icx) +{ + ntfs_index_ctx_free(icx); + kmem_cache_free(ntfs_index_ctx_cache, icx); +} + +/** + * ntfs_index_ctx_reinit - reinitialize an index context + * @icx: index context to reinitialize * - * Locking: Caller must hold i_mutex on the index inode. + * Reinitialize the index context @icx so it can be used for ntfs_index_lookup. */ -void ntfs_index_ctx_put(ntfs_index_context *ictx) +void ntfs_index_ctx_reinit(struct ntfs_index_context *icx) { - if (ictx->entry) { - if (ictx->is_in_root) { - if (ictx->actx) - ntfs_attr_put_search_ctx(ictx->actx); - if (ictx->base_ni) - unmap_mft_record(ictx->base_ni); - } else { - struct page *page = ictx->page; - if (page) { - BUG_ON(!PageLocked(page)); - unlock_page(page); - ntfs_unmap_page(page); - } - } + ntfs_debug("Entering\n"); + + ntfs_index_ctx_free(icx); + + *icx = (struct ntfs_index_context) { + .idx_ni = icx->idx_ni, + .name = icx->name, + .name_len = icx->name_len, + }; +} + +static __le64 *ntfs_ie_get_vcn_addr(struct index_entry *ie) +{ + return (__le64 *)((u8 *)ie + le16_to_cpu(ie->length) - sizeof(s64)); +} + +/** + * Get the subnode vcn to which the index entry refers. + */ +static s64 ntfs_ie_get_vcn(struct index_entry *ie) +{ + return le64_to_cpup(ntfs_ie_get_vcn_addr(ie)); +} + +static struct index_entry *ntfs_ie_get_first(struct index_header *ih) +{ + return (struct index_entry *)((u8 *)ih + le32_to_cpu(ih->entries_offset)); +} + +static struct index_entry *ntfs_ie_get_next(struct index_entry *ie) +{ + return (struct index_entry *)((char *)ie + le16_to_cpu(ie->length)); +} + +static u8 *ntfs_ie_get_end(struct index_header *ih) +{ + return (u8 *)ih + le32_to_cpu(ih->index_length); +} + +static int ntfs_ie_end(struct index_entry *ie) +{ + return ie->flags & INDEX_ENTRY_END || !ie->length; +} + +/** + * Find the last entry in the index block + */ +static struct index_entry *ntfs_ie_get_last(struct index_entry *ie, char *ies_end) +{ + ntfs_debug("Entering\n"); + + while ((char *)ie < ies_end && !ntfs_ie_end(ie)) + ie = ntfs_ie_get_next(ie); + + return ie; +} + +static struct index_entry *ntfs_ie_get_by_pos(struct index_header *ih, int pos) +{ + struct index_entry *ie; + + ntfs_debug("pos: %d\n", pos); + + ie = ntfs_ie_get_first(ih); + + while (pos-- > 0) + ie = ntfs_ie_get_next(ie); + + return ie; +} + +static struct index_entry *ntfs_ie_prev(struct index_header *ih, struct index_entry *ie) +{ + struct index_entry *ie_prev = NULL; + struct index_entry *tmp; + + ntfs_debug("Entering\n"); + + tmp = ntfs_ie_get_first(ih); + + while (tmp != ie) { + ie_prev = tmp; + tmp = ntfs_ie_get_next(tmp); } - kmem_cache_free(ntfs_index_ctx_cache, ictx); - return; + + return ie_prev; +} + +static int ntfs_ih_numof_entries(struct index_header *ih) +{ + int n; + struct index_entry *ie; + u8 *end; + + ntfs_debug("Entering\n"); + + end = ntfs_ie_get_end(ih); + ie = ntfs_ie_get_first(ih); + for (n = 0; !ntfs_ie_end(ie) && (u8 *)ie < end; n++) + ie = ntfs_ie_get_next(ie); + return n; +} + +static int ntfs_ih_one_entry(struct index_header *ih) +{ + return (ntfs_ih_numof_entries(ih) == 1); +} + +static int ntfs_ih_zero_entry(struct index_header *ih) +{ + return (ntfs_ih_numof_entries(ih) == 0); +} + +static void ntfs_ie_delete(struct index_header *ih, struct index_entry *ie) +{ + u32 new_size; + + ntfs_debug("Entering\n"); + + new_size = le32_to_cpu(ih->index_length) - le16_to_cpu(ie->length); + ih->index_length = cpu_to_le32(new_size); + memmove(ie, (u8 *)ie + le16_to_cpu(ie->length), + new_size - ((u8 *)ie 
- (u8 *)ih)); +} + +static void ntfs_ie_set_vcn(struct index_entry *ie, s64 vcn) +{ + *ntfs_ie_get_vcn_addr(ie) = cpu_to_le64(vcn); } /** - * ntfs_index_lookup - find a key in an index and return its index entry - * @key: [IN] key for which to search in the index - * @key_len: [IN] length of @key in bytes - * @ictx: [IN/OUT] context describing the index and the returned entry - * - * Before calling ntfs_index_lookup(), @ictx must have been obtained from a - * call to ntfs_index_ctx_get(). - * - * Look for the @key in the index specified by the index lookup context @ictx. - * ntfs_index_lookup() walks the contents of the index looking for the @key. - * - * If the @key is found in the index, 0 is returned and @ictx is setup to - * describe the index entry containing the matching @key. @ictx->entry is the - * index entry and @ictx->data and @ictx->data_len are the index entry data and - * its length in bytes, respectively. - * - * If the @key is not found in the index, -ENOENT is returned and @ictx is - * setup to describe the index entry whose key collates immediately after the - * search @key, i.e. this is the position in the index at which an index entry - * with a key of @key would need to be inserted. + * Insert @ie index entry at @pos entry. Used @ih values should be ok already. + */ +static void ntfs_ie_insert(struct index_header *ih, struct index_entry *ie, + struct index_entry *pos) +{ + int ie_size = le16_to_cpu(ie->length); + + ntfs_debug("Entering\n"); + + ih->index_length = cpu_to_le32(le32_to_cpu(ih->index_length) + ie_size); + memmove((u8 *)pos + ie_size, pos, + le32_to_cpu(ih->index_length) - ((u8 *)pos - (u8 *)ih) - ie_size); + memcpy(pos, ie, ie_size); +} + +static struct index_entry *ntfs_ie_dup(struct index_entry *ie) +{ + struct index_entry *dup; + + ntfs_debug("Entering\n"); + + dup = ntfs_malloc_nofs(le16_to_cpu(ie->length)); + if (dup) + memcpy(dup, ie, le16_to_cpu(ie->length)); + + return dup; +} + +static struct index_entry *ntfs_ie_dup_novcn(struct index_entry *ie) +{ + struct index_entry *dup; + int size = le16_to_cpu(ie->length); + + ntfs_debug("Entering\n"); + + if (ie->flags & INDEX_ENTRY_NODE) + size -= sizeof(s64); + + dup = ntfs_malloc_nofs(size); + if (dup) { + memcpy(dup, ie, size); + dup->flags &= ~INDEX_ENTRY_NODE; + dup->length = cpu_to_le16(size); + } + return dup; +} + +/* + * Check the consistency of an index block * - * If an error occurs return the negative error code and @ictx is left - * untouched. + * Make sure the index block does not overflow from the index record. + * The size of block is assumed to have been checked to be what is + * defined in the index root. * - * When finished with the entry and its data, call ntfs_index_ctx_put() to free - * the context and other associated resources. + * Returns 0 if no error was found -1 otherwise (with errno unchanged) * - * If the index entry was modified, call flush_dcache_index_entry_page() - * immediately after the modification and either ntfs_index_entry_mark_dirty() - * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to - * ensure that the changes are written to disk. 
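/*
 * Self-contained sketch of the packed-record insertion performed by
 * ntfs_ie_insert() above: later entries are shifted up by the new entry's
 * length before the entry is copied into the hole. The caller guarantees
 * the buffer has room, as ntfs_ie_add() does.
 */
#include <stdint.h>
#include <string.h>

static void ex_packed_insert(uint8_t *buf, uint32_t *used, uint8_t *pos,
                             const uint8_t *rec, uint16_t rec_len)
{
        memmove(pos + rec_len, pos, *used - (uint32_t)(pos - buf));
        memcpy(pos, rec, rec_len);
        *used += rec_len;
}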
+ * |<--->| offsetof(struct index_block, index) + * | |<--->| sizeof(struct index_header) + * | | | + * | | | seq index entries unused + * |=====|=====|=====|===========================|==============| + * | | | | | + * | |<--------->| entries_offset | | + * | |<---------------- index_length ------->| | + * | |<--------------------- allocated_size --------------->| + * |<--------------------------- block_size ------------------->| * - * Locking: - Caller must hold i_mutex on the index inode. - * - Each page cache page in the index allocation mapping must be - * locked whilst being accessed otherwise we may find a corrupt - * page due to it being under ->writepage at the moment which - * applies the mst protection fixups before writing out and then - * removes them again after the write is complete after which it - * unlocks the page. + * size(struct index_header) <= ent_offset < ind_length <= alloc_size < bk_size */ -int ntfs_index_lookup(const void *key, const int key_len, - ntfs_index_context *ictx) -{ - VCN vcn, old_vcn; - ntfs_inode *idx_ni = ictx->idx_ni; - ntfs_volume *vol = idx_ni->vol; - struct super_block *sb = vol->sb; - ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; - MFT_RECORD *m; - INDEX_ROOT *ir; - INDEX_ENTRY *ie; - INDEX_ALLOCATION *ia; - u8 *index_end, *kaddr; - ntfs_attr_search_ctx *actx; - struct address_space *ia_mapping; - struct page *page; - int rc, err = 0; - - ntfs_debug("Entering."); - BUG_ON(!NInoAttr(idx_ni)); - BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); - BUG_ON(idx_ni->nr_extents != -1); - BUG_ON(!base_ni); - BUG_ON(!key); - BUG_ON(key_len <= 0); - if (!ntfs_is_collation_rule_supported( - idx_ni->itype.index.collation_rule)) { - ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " - "Aborting lookup.", le32_to_cpu( - idx_ni->itype.index.collation_rule)); - return -EOPNOTSUPP; +static int ntfs_index_block_inconsistent(struct ntfs_index_context *icx, + struct index_block *ib, s64 vcn) +{ + u32 ib_size = (unsigned int)le32_to_cpu(ib->index.allocated_size) + + offsetof(struct index_block, index); + struct super_block *sb = icx->idx_ni->vol->sb; + unsigned long long inum = icx->idx_ni->mft_no; + + ntfs_debug("Entering\n"); + + if (!ntfs_is_indx_record(ib->magic)) { + + ntfs_error(sb, "Corrupt index block signature: vcn %lld inode %llu\n", + vcn, (unsigned long long)icx->idx_ni->mft_no); + return -1; + } + + if (le64_to_cpu(ib->index_block_vcn) != vcn) { + ntfs_error(sb, + "Corrupt index block: s64 (%lld) is different from expected s64 (%lld) in inode %llu\n", + (long long)le64_to_cpu(ib->index_block_vcn), + vcn, inum); + return -1; } - /* Get hold of the mft record for the index inode. 
*/ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - ntfs_error(sb, "map_mft_record() failed with error code %ld.", - -PTR_ERR(m)); - return PTR_ERR(m); + + if (ib_size != icx->block_size) { + ntfs_error(sb, + "Corrupt index block : s64 (%lld) of inode %llu has a size (%u) differing from the index specified size (%u)\n", + vcn, inum, ib_size, icx->block_size); + return -1; } - actx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!actx)) { - err = -ENOMEM; + + if (le32_to_cpu(ib->index.entries_offset) < sizeof(struct index_header)) { + ntfs_error(sb, "Invalid index entry offset in inode %lld\n", inum); + return -1; + } + if (le32_to_cpu(ib->index.index_length) <= + le32_to_cpu(ib->index.entries_offset)) { + ntfs_error(sb, "No space for index entries in inode %lld\n", inum); + return -1; + } + if (le32_to_cpu(ib->index.allocated_size) < + le32_to_cpu(ib->index.index_length)) { + ntfs_error(sb, "Index entries overflow in inode %lld\n", inum); + return -1; + } + + return 0; +} + +static struct index_root *ntfs_ir_lookup(struct ntfs_inode *ni, __le16 *name, + u32 name_len, struct ntfs_attr_search_ctx **ctx) +{ + struct attr_record *a; + struct index_root *ir = NULL; + + ntfs_debug("Entering\n"); + *ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!*ctx) { + ntfs_error(ni->vol->sb, "%s, Failed to get search context", __func__); + return NULL; + } + + if (ntfs_attr_lookup(AT_INDEX_ROOT, name, name_len, CASE_SENSITIVE, + 0, NULL, 0, *ctx)) { + ntfs_error(ni->vol->sb, "Failed to lookup $INDEX_ROOT"); goto err_out; } - /* Find the index root attribute in the mft record. */ - err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, actx); - if (unlikely(err)) { - if (err == -ENOENT) { - ntfs_error(sb, "Index root attribute missing in inode " - "0x%lx.", idx_ni->mft_no); - err = -EIO; - } + + a = (*ctx)->attr; + if (a->non_resident) { + ntfs_error(ni->vol->sb, "Non-resident $INDEX_ROOT detected"); goto err_out; } - /* Get to the index root value (it has been verified in read_inode). */ - ir = (INDEX_ROOT*)((u8*)actx->attr + - le16_to_cpu(actx->attr->data.resident.value_offset)); - index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ir->index + - le32_to_cpu(ir->index.entries_offset)); + + ir = (struct index_root *)((char *)a + le16_to_cpu(a->data.resident.value_offset)); +err_out: + if (!ir) { + ntfs_attr_put_search_ctx(*ctx); + *ctx = NULL; + } + return ir; +} + +static struct index_root *ntfs_ir_lookup2(struct ntfs_inode *ni, __le16 *name, u32 len) +{ + struct ntfs_attr_search_ctx *ctx; + struct index_root *ir; + + ir = ntfs_ir_lookup(ni, name, len, &ctx); + if (ir) + ntfs_attr_put_search_ctx(ctx); + return ir; +} + +/** + * Find a key in the index block. + */ +static int ntfs_ie_lookup(const void *key, const int key_len, + struct ntfs_index_context *icx, struct index_header *ih, + s64 *vcn, struct index_entry **ie_out) +{ + struct index_entry *ie; + u8 *index_end; + int rc, item = 0; + + ntfs_debug("Entering\n"); + + index_end = ntfs_ie_get_end(ih); + /* * Loop until we exceed valid memory (corruption case) or until we * reach the last entry. */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + for (ie = ntfs_ie_get_first(ih); ; ie = ntfs_ie_get_next(ie)) { /* Bounds checks. 
*/ - if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) - goto idx_err_out; + if ((u8 *)ie + sizeof(struct index_entry_header) > index_end || + (u8 *)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(icx->idx_ni->vol->sb, + "Index entry out of bounds in inode %llu.\n", + (unsigned long long)icx->idx_ni->mft_no); + return -ERANGE; + } + /* * The last entry cannot contain a key. It can however contain * a pointer to a child node in the B+tree so we just break out. */ - if (ie->flags & INDEX_ENTRY_END) + if (ntfs_ie_end(ie)) break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) - goto idx_err_out; - /* If the keys match perfectly, we setup @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ir_done: - ictx->is_in_root = true; - ictx->ir = ir; - ictx->actx = actx; - ictx->base_ni = base_ni; - ictx->ia = NULL; - ictx->page = NULL; -done: - ictx->entry = ie; - ictx->data = (u8*)ie + - le16_to_cpu(ie->data.vi.data_offset); - ictx->data_len = le16_to_cpu(ie->data.vi.data_length); - ntfs_debug("Done."); - return err; - } + /* * Not a perfect match, need to do full blown collation so we * know which way in the B+tree we have to go. */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); + rc = ntfs_collate(icx->idx_ni->vol, icx->cr, key, key_len, &ie->key, + le16_to_cpu(ie->key_length)); + if (rc == -2) { + ntfs_error(icx->idx_ni->vol->sb, + "Collation error. Perhaps a filename contains invalid characters?\n"); + return -ERANGE; + } /* * If @key collates before the key of the current entry, there * is definitely no such key in this index but we might need to @@ -219,222 +609,1507 @@ int ntfs_index_lookup(const void *key, const int key_len, */ if (rc == -1) break; - /* - * A match should never happen as the memcmp() call should have - * cought it, but we still treat it correctly. - */ - if (!rc) - goto ir_done; - /* The keys are not equal, continue the search. */ + + if (!rc) { + *ie_out = ie; + icx->parent_pos[icx->pindex] = item; + return 0; + } + + item++; } /* - * We have finished with this index without success. Check for the - * presence of a child node and if not present setup @ictx and return - * -ENOENT. + * We have finished with this index block without success. Check for the + * presence of a child node and if not present return with errno ENOENT, + * otherwise we will keep searching in another index block. */ if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ir_done; - } /* Child node present, descend into it. */ - /* Consistency check: Verify that an index allocation exists. */ - if (!NInoIndexAllocPresent(idx_ni)) { - ntfs_error(sb, "No index allocation attribute but index entry " - "requires one. Inode 0x%lx is corrupt or " - "driver bug.", idx_ni->mft_no); - goto err_out; + ntfs_debug("Index entry wasn't found.\n"); + *ie_out = ie; + return -ENOENT; } + /* Get the starting vcn of the index_block holding the child node. */ - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - ia_mapping = VFS_I(idx_ni)->i_mapping; - /* - * We are done with the index root and the mft record. 
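/*
 * The descent decision used by ntfs_ie_lookup() above, reduced to a
 * self-contained ordered scan: a negative comparison means the key would
 * sort before the current entry (descend or report ENOENT), zero is a
 * match, positive keeps scanning. strcmp() stands in for ntfs_collate().
 */
#include <string.h>

struct ex_entry {
        const char *key;        /* NULL marks the END entry */
};

static const struct ex_entry *ex_ordered_scan(const struct ex_entry *e,
                                              const char *key, int *found)
{
        for (; e->key; e++) {
                int rc = strcmp(key, e->key);

                if (!rc) {              /* exact match */
                        *found = 1;
                        return e;
                }
                if (rc < 0)             /* key collates before this entry */
                        break;
        }
        *found = 0;
        return e;       /* insertion point / sub-node to descend into */
}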
Release them, - * otherwise we deadlock with ntfs_map_page(). - */ - ntfs_attr_put_search_ctx(actx); - unmap_mft_record(base_ni); - m = NULL; - actx = NULL; -descend_into_child_node: - /* - * Convert vcn to index into the index allocation attribute in units - * of PAGE_SIZE and map the page cache page, reading it from - * disk if necessary. - */ - page = ntfs_map_page(ia_mapping, vcn << - idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); - if (IS_ERR(page)) { - ntfs_error(sb, "Failed to map index page, error %ld.", - -PTR_ERR(page)); - err = PTR_ERR(page); - goto err_out; + *vcn = ntfs_ie_get_vcn(ie); + if (*vcn < 0) { + ntfs_error(icx->idx_ni->vol->sb, "Negative vcn in inode %llu\n", + (unsigned long long)icx->idx_ni->mft_no); + return -EINVAL; } - lock_page(page); - kaddr = (u8*)page_address(page); -fast_descend_into_child_node: - /* Get to the index allocation block. */ - ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << - idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); - /* Bounds checks. */ - if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Out of bounds check failed. Corrupt inode " - "0x%lx or driver bug.", idx_ni->mft_no); - goto unm_err_out; - } - /* Catch multi sector transfer fixup errors. */ - if (unlikely(!ntfs_is_indx_record(ia->magic))) { - ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " - "Corrupt inode 0x%lx. Run chkdsk.", - (long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (sle64_to_cpu(ia->index_block_vcn) != vcn) { - ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " - "different from expected VCN (0x%llx). Inode " - "0x%lx is corrupt or driver bug.", - (unsigned long long) - sle64_to_cpu(ia->index_block_vcn), - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - if (le32_to_cpu(ia->index.allocated_size) + 0x18 != - idx_ni->itype.index.block_size) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " - "a size (%u) differing from the index " - "specified size (%u). Inode is corrupt or " - "driver bug.", (unsigned long long)vcn, - idx_ni->mft_no, - le32_to_cpu(ia->index.allocated_size) + 0x18, - idx_ni->itype.index.block_size); - goto unm_err_out; - } - index_end = (u8*)ia + idx_ni->itype.index.block_size; - if (index_end > kaddr + PAGE_SIZE) { - ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " - "crosses page boundary. Impossible! Cannot " - "access! This is probably a bug in the " - "driver.", (unsigned long long)vcn, - idx_ni->mft_no); - goto unm_err_out; - } - index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); - if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { - ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " - "0x%lx exceeds maximum size.", - (unsigned long long)vcn, idx_ni->mft_no); - goto unm_err_out; - } - /* The first index entry. */ - ie = (INDEX_ENTRY*)((u8*)&ia->index + - le32_to_cpu(ia->index.entries_offset)); - /* - * Iterate similar to above big loop but applied to index buffer, thus - * loop until we exceed valid memory (corruption case) or until we - * reach the last entry. - */ - for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { - /* Bounds checks. */ - if ((u8*)ie < (u8*)ia || (u8*)ie + - sizeof(INDEX_ENTRY_HEADER) > index_end || - (u8*)ie + le16_to_cpu(ie->length) > index_end) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* - * The last entry cannot contain a key. 
It can however contain - * a pointer to a child node in the B+tree so we just break out. - */ - if (ie->flags & INDEX_ENTRY_END) - break; - /* Further bounds checks. */ - if ((u32)sizeof(INDEX_ENTRY_HEADER) + - le16_to_cpu(ie->key_length) > - le16_to_cpu(ie->data.vi.data_offset) || - (u32)le16_to_cpu(ie->data.vi.data_offset) + - le16_to_cpu(ie->data.vi.data_length) > - le16_to_cpu(ie->length)) { - ntfs_error(sb, "Index entry out of bounds in inode " - "0x%lx.", idx_ni->mft_no); - goto unm_err_out; - } - /* If the keys match perfectly, we setup @ictx and return 0. */ - if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, - &ie->key, key_len)) { -ia_done: - ictx->is_in_root = false; - ictx->actx = NULL; - ictx->base_ni = NULL; - ictx->ia = ia; - ictx->page = page; - goto done; - } - /* - * Not a perfect match, need to do full blown collation so we - * know which way in the B+tree we have to go. - */ - rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, - key_len, &ie->key, le16_to_cpu(ie->key_length)); - /* - * If @key collates before the key of the current entry, there - * is definitely no such key in this index but we might need to - * descend into the B+tree so we just break out of the loop. - */ - if (rc == -1) - break; - /* - * A match should never happen as the memcmp() call should have - * cought it, but we still treat it correctly. - */ - if (!rc) - goto ia_done; - /* The keys are not equal, continue the search. */ + + ntfs_debug("Parent entry number %d\n", item); + icx->parent_pos[icx->pindex] = item; + + return -EAGAIN; +} + +struct ntfs_inode *ntfs_ia_open(struct ntfs_index_context *icx, struct ntfs_inode *ni) +{ + struct inode *ia_vi; + + ia_vi = ntfs_index_iget(VFS_I(ni), icx->name, icx->name_len); + if (IS_ERR(ia_vi)) { + ntfs_error(icx->idx_ni->vol->sb, + "Failed to open index allocation of inode %llu", + (unsigned long long)ni->mft_no); + return NULL; } - /* - * We have finished with this index buffer without success. Check for - * the presence of a child node and if not present return -ENOENT. 
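/*
 * A simplified model of the multi-sector transfer protection undone by
 * post_read_mst_fixup() in ntfs_ib_read() below: the last two bytes of
 * every 512-byte sector hold the update sequence number, and the bytes
 * they displaced live in the update sequence array. This sketch omits
 * endianness handling and the array layout details.
 */
#include <stdint.h>

static int ex_mst_post_read(uint8_t *rec, uint32_t size,
                            uint16_t usn, const uint16_t *usa)
{
        uint32_t i, sectors = size / 512;

        for (i = 0; i < sectors; i++) {
                uint16_t *tail = (uint16_t *)(rec + (i + 1) * 512 - 2);

                if (*tail != usn)
                        return -1;      /* torn multi-sector write */
                *tail = usa[i];         /* restore the displaced bytes */
        }
        return 0;
}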
- */ - if (!(ie->flags & INDEX_ENTRY_NODE)) { - ntfs_debug("Entry not found."); - err = -ENOENT; - goto ia_done; + + return NTFS_I(ia_vi); +} + +static int ntfs_ib_read(struct ntfs_index_context *icx, s64 vcn, struct index_block *dst) +{ + s64 pos, ret; + + ntfs_debug("vcn: %lld\n", vcn); + + pos = ntfs_ib_vcn_to_pos(icx, vcn); + + ret = ntfs_inode_attr_pread(VFS_I(icx->ia_ni), pos, icx->block_size, (u8 *)dst); + if (ret != icx->block_size) { + if (ret == -1) + ntfs_error(icx->idx_ni->vol->sb, "Failed to read index block"); + else + ntfs_error(icx->idx_ni->vol->sb, + "Failed to read full index block at %lld\n", pos); + return -1; + } + + post_read_mst_fixup((struct ntfs_record *)((u8 *)dst), icx->block_size); + if (ntfs_index_block_inconsistent(icx, dst, vcn)) + return -1; + + return 0; +} + +static int ntfs_icx_parent_inc(struct ntfs_index_context *icx) +{ + icx->pindex++; + if (icx->pindex >= MAX_PARENT_VCN) { + ntfs_error(icx->idx_ni->vol->sb, "Index is over %d level deep", MAX_PARENT_VCN); + return -EOPNOTSUPP; } - if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { - ntfs_error(sb, "Index entry with child node found in a leaf " - "node in inode 0x%lx.", idx_ni->mft_no); - goto unm_err_out; + return 0; +} + +static int ntfs_icx_parent_dec(struct ntfs_index_context *icx) +{ + icx->pindex--; + if (icx->pindex < 0) { + ntfs_error(icx->idx_ni->vol->sb, "Corrupt index pointer (%d)", icx->pindex); + return -EINVAL; } + return 0; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: key for which to search in the index + * @key_len: length of @key in bytes + * @icx: context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @icx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @icx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @icx is setup to + * describe the index entry containing the matching @key. @icx->entry is the + * index entry and @icx->data and @icx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and + * @icx is setup to describe the index entry whose key collates immediately + * after the search @key, i.e. this is the position in the index at which + * an index entry with a key of @key would need to be inserted. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call ntfs_index_entry_mark_dirty() before + * the call to ntfs_index_ctx_put() to ensure that the changes are written + * to disk. 
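/*
 * The descent bookkeeping guarded by ntfs_icx_parent_inc()/_dec() above is
 * a fixed-depth stack of (vcn, entry position) pairs. Minimal sketch; the
 * depth of 32 is an assumed stand-in for MAX_PARENT_VCN.
 */
#include <stdint.h>

#define EX_MAX_DEPTH 32

struct ex_descent {
        int     top;
        int64_t vcn[EX_MAX_DEPTH];
        int     pos[EX_MAX_DEPTH];
};

static int ex_descent_push(struct ex_descent *d, int64_t vcn, int pos)
{
        if (d->top + 1 >= EX_MAX_DEPTH)
                return -1;      /* tree is deeper than supported */
        d->top++;
        d->vcn[d->top] = vcn;
        d->pos[d->top] = pos;
        return 0;
}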
+ */ +int ntfs_index_lookup(const void *key, const int key_len, struct ntfs_index_context *icx) +{ + s64 old_vcn, vcn; + struct ntfs_inode *ni = icx->idx_ni; + struct super_block *sb = ni->vol->sb; + struct index_root *ir; + struct index_entry *ie; + struct index_block *ib = NULL; + int err = 0; + + ntfs_debug("Entering\n"); + + if (!key || key_len <= 0) { + ntfs_error(sb, "key: %p key_len: %d", key, key_len); + return -EINVAL; + } + + ir = ntfs_ir_lookup(ni, icx->name, icx->name_len, &icx->actx); + if (!ir) + return -EIO; + + icx->block_size = le32_to_cpu(ir->index_block_size); + if (icx->block_size < NTFS_BLOCK_SIZE) { + err = -EINVAL; + ntfs_error(sb, + "Index block size (%d) is smaller than the sector size (%d)", + icx->block_size, NTFS_BLOCK_SIZE); + goto err_out; + } + + if (ni->vol->cluster_size <= icx->block_size) + icx->vcn_size_bits = ni->vol->cluster_size_bits; + else + icx->vcn_size_bits = ni->vol->sector_size_bits; + + icx->cr = ir->collation_rule; + if (!ntfs_is_collation_rule_supported(icx->cr)) { + err = -EOPNOTSUPP; + ntfs_error(sb, "Unknown collation rule 0x%x", + (unsigned int)le32_to_cpu(icx->cr)); + goto err_out; + } + + old_vcn = VCN_INDEX_ROOT_PARENT; + err = ntfs_ie_lookup(key, key_len, icx, &ir->index, &vcn, &ie); + if (err == -ERANGE || err == -EINVAL) + goto err_out; + + icx->ir = ir; + if (err != -EAGAIN) { + icx->is_in_root = true; + icx->parent_vcn[icx->pindex] = old_vcn; + goto done; + } + /* Child node present, descend into it. */ + icx->ia_ni = ntfs_ia_open(icx, ni); + if (!icx->ia_ni) { + err = -ENOENT; + goto err_out; + } + + ib = ntfs_malloc_nofs(icx->block_size); + if (!ib) { + err = -ENOMEM; + goto err_out; + } + +descend_into_child_node: + icx->parent_vcn[icx->pindex] = old_vcn; + if (ntfs_icx_parent_inc(icx)) { + err = -EIO; + goto err_out; + } old_vcn = vcn; - vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); - if (vcn >= 0) { - /* - * If vcn is in the same page cache page as old_vcn we recycle - * the mapped page. - */ - if (old_vcn << vol->cluster_size_bits >> - PAGE_SHIFT == vcn << - vol->cluster_size_bits >> - PAGE_SHIFT) - goto fast_descend_into_child_node; - unlock_page(page); - ntfs_unmap_page(page); - goto descend_into_child_node; - } - ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", - idx_ni->mft_no); -unm_err_out: - unlock_page(page); - ntfs_unmap_page(page); + + ntfs_debug("Descend into node with s64 %lld.\n", vcn); + + if (ntfs_ib_read(icx, vcn, ib)) { + err = -EIO; + goto err_out; + } + err = ntfs_ie_lookup(key, key_len, icx, &ib->index, &vcn, &ie); + if (err != -EAGAIN) { + if (err == -EINVAL || err == -ERANGE) + goto err_out; + + icx->is_in_root = false; + icx->ib = ib; + icx->parent_vcn[icx->pindex] = vcn; + goto done; + } + + if ((ib->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(icx->idx_ni->vol->sb, + "Index entry with child node found in a leaf node in inode 0x%llx.\n", + (unsigned long long)ni->mft_no); + goto err_out; + } + + goto descend_into_child_node; err_out: + if (icx->actx) { + ntfs_attr_put_search_ctx(icx->actx); + icx->actx = NULL; + } + ntfs_free(ib); if (!err) err = -EIO; - if (actx) - ntfs_attr_put_search_ctx(actx); - if (m) - unmap_mft_record(base_ni); return err; -idx_err_out: - ntfs_error(sb, "Corrupt index. 
Aborting lookup."); +done: + icx->entry = ie; + icx->data = (u8 *)ie + offsetof(struct index_entry, key); + icx->data_len = le16_to_cpu(ie->key_length); + ntfs_debug("Done.\n"); + return err; + +} + +static struct index_block *ntfs_ib_alloc(s64 ib_vcn, u32 ib_size, + u8 node_type) +{ + struct index_block *ib; + int ih_size = sizeof(struct index_header); + + ntfs_debug("Entering ib_vcn = %lld ib_size = %u\n", ib_vcn, ib_size); + + ib = ntfs_malloc_nofs(ib_size); + if (!ib) + return NULL; + + ib->magic = magic_INDX; + ib->usa_ofs = cpu_to_le16(sizeof(struct index_block)); + ib->usa_count = cpu_to_le16(ib_size / NTFS_BLOCK_SIZE + 1); + /* Set USN to 1 */ + *(__le16 *)((char *)ib + le16_to_cpu(ib->usa_ofs)) = cpu_to_le16(1); + ib->lsn = 0; + ib->index_block_vcn = cpu_to_le64(ib_vcn); + ib->index.entries_offset = cpu_to_le32((ih_size + + le16_to_cpu(ib->usa_count) * 2 + 7) & ~7); + ib->index.index_length = 0; + ib->index.allocated_size = cpu_to_le32(ib_size - + (sizeof(struct index_block) - ih_size)); + ib->index.flags = node_type; + + return ib; +} + +/** + * Find the median by going through all the entries + */ +static struct index_entry *ntfs_ie_get_median(struct index_header *ih) +{ + struct index_entry *ie, *ie_start; + u8 *ie_end; + int i = 0, median; + + ntfs_debug("Entering\n"); + + ie = ie_start = ntfs_ie_get_first(ih); + ie_end = (u8 *)ntfs_ie_get_end(ih); + + while ((u8 *)ie < ie_end && !ntfs_ie_end(ie)) { + ie = ntfs_ie_get_next(ie); + i++; + } + /* + * NOTE: this could be also the entry at the half of the index block. + */ + median = i / 2 - 1; + + ntfs_debug("Entries: %d median: %d\n", i, median); + + for (i = 0, ie = ie_start; i <= median; i++) + ie = ntfs_ie_get_next(ie); + + return ie; +} + +static u64 ntfs_ibm_vcn_to_pos(struct ntfs_index_context *icx, s64 vcn) +{ + u64 pos = ntfs_ib_vcn_to_pos(icx, vcn); + + do_div(pos, icx->block_size); + return pos; +} + +static s64 ntfs_ibm_pos_to_vcn(struct ntfs_index_context *icx, s64 pos) +{ + return ntfs_ib_pos_to_vcn(icx, pos * icx->block_size); +} + +static int ntfs_ibm_add(struct ntfs_index_context *icx) +{ + u8 bmp[8]; + + ntfs_debug("Entering\n"); + + if (ntfs_attr_exist(icx->idx_ni, AT_BITMAP, icx->name, icx->name_len)) + return 0; + /* + * AT_BITMAP must be at least 8 bytes. + */ + memset(bmp, 0, sizeof(bmp)); + if (ntfs_attr_add(icx->idx_ni, AT_BITMAP, icx->name, icx->name_len, + bmp, sizeof(bmp))) { + ntfs_error(icx->idx_ni->vol->sb, "Failed to add AT_BITMAP"); + return -EINVAL; + } + + return 0; +} + +static int ntfs_ibm_modify(struct ntfs_index_context *icx, s64 vcn, int set) +{ + u8 byte; + u64 pos = ntfs_ibm_vcn_to_pos(icx, vcn); + u32 bpos = pos / 8; + u32 bit = 1 << (pos % 8); + struct ntfs_inode *bmp_ni; + struct inode *bmp_vi; + int ret = 0; + + ntfs_debug("%s vcn: %lld\n", set ? 
"set" : "clear", vcn); + + bmp_vi = ntfs_attr_iget(VFS_I(icx->idx_ni), AT_BITMAP, icx->name, icx->name_len); + if (IS_ERR(bmp_vi)) { + ntfs_error(icx->idx_ni->vol->sb, "Failed to open $BITMAP attribute"); + return PTR_ERR(bmp_vi); + } + + bmp_ni = NTFS_I(bmp_vi); + + if (set) { + if (bmp_ni->data_size < bpos + 1) { + ret = ntfs_attr_truncate(bmp_ni, (bmp_ni->data_size + 8) & ~7); + if (ret) { + ntfs_error(icx->idx_ni->vol->sb, "Failed to truncate AT_BITMAP"); + goto err; + } + i_size_write(bmp_vi, (loff_t)bmp_ni->data_size); + } + } + + if (ntfs_inode_attr_pread(bmp_vi, bpos, 1, &byte) != 1) { + ret = -EIO; + ntfs_error(icx->idx_ni->vol->sb, "Failed to read $BITMAP"); + goto err; + } + + if (set) + byte |= bit; + else + byte &= ~bit; + + if (ntfs_inode_attr_pwrite(bmp_vi, bpos, 1, &byte, false) != 1) { + ret = -EIO; + ntfs_error(icx->idx_ni->vol->sb, "Failed to write $Bitmap"); + goto err; + } + +err: + iput(bmp_vi); + return ret; +} + +static int ntfs_ibm_set(struct ntfs_index_context *icx, s64 vcn) +{ + return ntfs_ibm_modify(icx, vcn, 1); +} + +static int ntfs_ibm_clear(struct ntfs_index_context *icx, s64 vcn) +{ + return ntfs_ibm_modify(icx, vcn, 0); +} + +static s64 ntfs_ibm_get_free(struct ntfs_index_context *icx) +{ + u8 *bm; + int bit; + s64 vcn, byte, size; + + ntfs_debug("Entering\n"); + + bm = ntfs_attr_readall(icx->idx_ni, AT_BITMAP, icx->name, icx->name_len, + &size); + if (!bm) + return (s64)-1; + + for (byte = 0; byte < size; byte++) { + if (bm[byte] == 255) + continue; + + for (bit = 0; bit < 8; bit++) { + if (!(bm[byte] & (1 << bit))) { + vcn = ntfs_ibm_pos_to_vcn(icx, byte * 8 + bit); + goto out; + } + } + } + + vcn = ntfs_ibm_pos_to_vcn(icx, size * 8); +out: + ntfs_debug("allocated vcn: %lld\n", vcn); + + if (ntfs_ibm_set(icx, vcn)) + vcn = (s64)-1; + + ntfs_free(bm); + return vcn; +} + +static struct index_block *ntfs_ir_to_ib(struct index_root *ir, s64 ib_vcn) +{ + struct index_block *ib; + struct index_entry *ie_last; + char *ies_start, *ies_end; + int i; + + ntfs_debug("Entering\n"); + + ib = ntfs_ib_alloc(ib_vcn, le32_to_cpu(ir->index_block_size), LEAF_NODE); + if (!ib) + return NULL; + + ies_start = (char *)ntfs_ie_get_first(&ir->index); + ies_end = (char *)ntfs_ie_get_end(&ir->index); + ie_last = ntfs_ie_get_last((struct index_entry *)ies_start, ies_end); + /* + * Copy all entries, including the termination entry + * as well, which can never have any data. 
+ */ + i = (char *)ie_last - ies_start + le16_to_cpu(ie_last->length); + memcpy(ntfs_ie_get_first(&ib->index), ies_start, i); + + ib->index.flags = ir->index.flags; + ib->index.index_length = cpu_to_le32(i + + le32_to_cpu(ib->index.entries_offset)); + return ib; +} + +static void ntfs_ir_nill(struct index_root *ir) +{ + struct index_entry *ie_last; + char *ies_start, *ies_end; + + ntfs_debug("Entering\n"); + + ies_start = (char *)ntfs_ie_get_first(&ir->index); + ies_end = (char *)ntfs_ie_get_end(&ir->index); + ie_last = ntfs_ie_get_last((struct index_entry *)ies_start, ies_end); + /* + * Move the index root termination entry forward + */ + if ((char *)ie_last > ies_start) { + memmove((char *)ntfs_ie_get_first(&ir->index), + (char *)ie_last, le16_to_cpu(ie_last->length)); + ie_last = (struct index_entry *)ies_start; + } +} + +static int ntfs_ib_copy_tail(struct ntfs_index_context *icx, struct index_block *src, + struct index_entry *median, s64 new_vcn) +{ + u8 *ies_end; + struct index_entry *ie_head; /* first entry after the median */ + int tail_size, ret; + struct index_block *dst; + + ntfs_debug("Entering\n"); + + dst = ntfs_ib_alloc(new_vcn, icx->block_size, + src->index.flags & NODE_MASK); + if (!dst) + return -ENOMEM; + + ie_head = ntfs_ie_get_next(median); + + ies_end = (u8 *)ntfs_ie_get_end(&src->index); + tail_size = ies_end - (u8 *)ie_head; + memcpy(ntfs_ie_get_first(&dst->index), ie_head, tail_size); + + dst->index.index_length = cpu_to_le32(tail_size + + le32_to_cpu(dst->index.entries_offset)); + ret = ntfs_ib_write(icx, dst); + + ntfs_free(dst); + return ret; +} + +static int ntfs_ib_cut_tail(struct ntfs_index_context *icx, struct index_block *ib, + struct index_entry *ie) +{ + char *ies_start, *ies_end; + struct index_entry *ie_last; + int ret; + + ntfs_debug("Entering\n"); + + ies_start = (char *)ntfs_ie_get_first(&ib->index); + ies_end = (char *)ntfs_ie_get_end(&ib->index); + + ie_last = ntfs_ie_get_last((struct index_entry *)ies_start, ies_end); + if (ie_last->flags & INDEX_ENTRY_NODE) + ntfs_ie_set_vcn(ie_last, ntfs_ie_get_vcn(ie)); + + unsafe_memcpy(ie, ie_last, le16_to_cpu(ie_last->length), + /* alloc is larger than ie_last->length, see ntfs_ie_get_last() */); + + ib->index.index_length = cpu_to_le32(((char *)ie - ies_start) + + le16_to_cpu(ie->length) + le32_to_cpu(ib->index.entries_offset)); + + ret = ntfs_ib_write(icx, ib); + return ret; +} + +static int ntfs_ia_add(struct ntfs_index_context *icx) +{ + int ret; + + ntfs_debug("Entering\n"); + + ret = ntfs_ibm_add(icx); + if (ret) + return ret; + + if (!ntfs_attr_exist(icx->idx_ni, AT_INDEX_ALLOCATION, icx->name, icx->name_len)) { + ret = ntfs_attr_add(icx->idx_ni, AT_INDEX_ALLOCATION, icx->name, + icx->name_len, NULL, 0); + if (ret) { + ntfs_error(icx->idx_ni->vol->sb, "Failed to add AT_INDEX_ALLOCATION"); + return ret; + } + } + + icx->ia_ni = ntfs_ia_open(icx, icx->idx_ni); + if (!icx->ia_ni) + return -ENOENT; + + return 0; +} + +static int ntfs_ir_reparent(struct ntfs_index_context *icx) +{ + struct ntfs_attr_search_ctx *ctx = NULL; + struct index_root *ir; + struct index_entry *ie; + struct index_block *ib = NULL; + s64 new_ib_vcn; + int ix_root_size; + int ret = 0; + + ntfs_debug("Entering\n"); + + ir = ntfs_ir_lookup2(icx->idx_ni, icx->name, icx->name_len); + if (!ir) { + ret = -ENOENT; + goto out; + } + + if ((ir->index.flags & NODE_MASK) == SMALL_INDEX) { + ret = ntfs_ia_add(icx); + if (ret) + goto out; + } + + new_ib_vcn = ntfs_ibm_get_free(icx); + if (new_ib_vcn < 0) { + ret = -EINVAL; + goto out; + } + + ir 
= ntfs_ir_lookup2(icx->idx_ni, icx->name, icx->name_len); + if (!ir) { + ret = -ENOENT; + goto clear_bmp; + } + + ib = ntfs_ir_to_ib(ir, new_ib_vcn); + if (ib == NULL) { + ret = -EIO; + ntfs_error(icx->idx_ni->vol->sb, "Failed to move index root to index block"); + goto clear_bmp; + } + + ret = ntfs_ib_write(icx, ib); + if (ret) + goto clear_bmp; + +retry: + ir = ntfs_ir_lookup(icx->idx_ni, icx->name, icx->name_len, &ctx); + if (!ir) { + ret = -ENOENT; + goto clear_bmp; + } + + ntfs_ir_nill(ir); + + ie = ntfs_ie_get_first(&ir->index); + ie->flags |= INDEX_ENTRY_NODE; + ie->length = cpu_to_le16(sizeof(struct index_entry_header) + sizeof(s64)); + + ir->index.flags = LARGE_INDEX; + NInoSetIndexAllocPresent(icx->idx_ni); + ir->index.index_length = cpu_to_le32(le32_to_cpu(ir->index.entries_offset) + + le16_to_cpu(ie->length)); + ir->index.allocated_size = ir->index.index_length; + + ix_root_size = sizeof(struct index_root) - sizeof(struct index_header) + + le32_to_cpu(ir->index.allocated_size); + ret = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, ix_root_size); + if (ret) { + /* + * When there is no space to build a non-resident + * index, we may have to move the root to an extent + */ + if ((ret == -ENOSPC) && (ctx->al_entry || !ntfs_inode_add_attrlist(icx->idx_ni))) { + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + ir = ntfs_ir_lookup(icx->idx_ni, icx->name, icx->name_len, &ctx); + if (ir && !ntfs_attr_record_move_away(ctx, ix_root_size - + le32_to_cpu(ctx->attr->data.resident.value_length))) { + if (ntfs_attrlist_update(ctx->base_ntfs_ino ? + ctx->base_ntfs_ino : ctx->ntfs_ino)) + goto clear_bmp; + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + goto retry; + } + } + goto clear_bmp; + } else { + icx->idx_ni->data_size = icx->idx_ni->initialized_size = ix_root_size; + icx->idx_ni->allocated_size = (ix_root_size + 7) & ~7; + } + ntfs_ie_set_vcn(ie, new_ib_vcn); + +err_out: + ntfs_free(ib); + if (ctx) + ntfs_attr_put_search_ctx(ctx); +out: + return ret; +clear_bmp: + ntfs_ibm_clear(icx, new_ib_vcn); goto err_out; } + +/** + * ntfs_ir_truncate - Truncate index root attribute + */ +static int ntfs_ir_truncate(struct ntfs_index_context *icx, int data_size) +{ + int ret; + + ntfs_debug("Entering\n"); + + /* + * INDEX_ROOT must be resident and its entries can be moved to + * struct index_block, so ENOSPC isn't a real error. + */ + ret = ntfs_attr_truncate(icx->idx_ni, data_size + offsetof(struct index_root, index)); + if (!ret) { + i_size_write(VFS_I(icx->idx_ni), icx->idx_ni->initialized_size); + icx->ir = ntfs_ir_lookup2(icx->idx_ni, icx->name, icx->name_len); + if (!icx->ir) + return -ENOENT; + + icx->ir->index.allocated_size = cpu_to_le32(data_size); + } else if (ret != -ENOSPC) + ntfs_error(icx->idx_ni->vol->sb, "Failed to truncate INDEX_ROOT"); + + return ret; +} + +/** + * ntfs_ir_make_space - Make more space for the index root attribute + */ +static int ntfs_ir_make_space(struct ntfs_index_context *icx, int data_size) +{ + int ret; + + ntfs_debug("Entering\n"); + + ret = ntfs_ir_truncate(icx, data_size); + if (ret == -ENOSPC) { + ret = ntfs_ir_reparent(icx); + if (!ret) + ret = -EAGAIN; + else + ntfs_error(icx->idx_ni->vol->sb, "Failed to modify INDEX_ROOT"); + } + + return ret; +} + +/* + * NOTE: 'ie' must be a copy of a real index entry. 
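/*
 * What ntfs_ie_add_vcn() below does, combined with ntfs_ie_set_vcn(): a
 * leaf entry grows by the 8-byte sub-node vcn stored at its very end and
 * gains the NODE flag. Plain realloc() and native-endian fields stand in
 * for the kernel helpers.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct ex_ie { uint16_t length; uint16_t flags; /* key/data follow */ };
#define EX_IE_NODE 0x01

static struct ex_ie *ex_ie_add_vcn(struct ex_ie *old, int64_t vcn)
{
        uint16_t new_len = old->length + sizeof(int64_t);
        struct ex_ie *p = realloc(old, new_len);

        if (!p)
                return NULL;
        memcpy((uint8_t *)p + new_len - sizeof(int64_t), &vcn, sizeof(vcn));
        p->length = new_len;
        p->flags |= EX_IE_NODE;
        return p;
}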
+ */ +static int ntfs_ie_add_vcn(struct index_entry **ie) +{ + struct index_entry *p, *old = *ie; + + old->length = cpu_to_le16(le16_to_cpu(old->length) + sizeof(s64)); + p = ntfs_realloc_nofs(old, le16_to_cpu(old->length), + le16_to_cpu(old->length) - sizeof(s64)); + if (!p) + return -ENOMEM; + + p->flags |= INDEX_ENTRY_NODE; + *ie = p; + return 0; +} + +static int ntfs_ih_insert(struct index_header *ih, struct index_entry *orig_ie, s64 new_vcn, + int pos) +{ + struct index_entry *ie_node, *ie; + int ret = 0; + s64 old_vcn; + + ntfs_debug("Entering\n"); + ie = ntfs_ie_dup(orig_ie); + if (!ie) + return -ENOMEM; + + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ret = ntfs_ie_add_vcn(&ie); + if (ret) + goto out; + } + + ie_node = ntfs_ie_get_by_pos(ih, pos); + old_vcn = ntfs_ie_get_vcn(ie_node); + ntfs_ie_set_vcn(ie_node, new_vcn); + + ntfs_ie_insert(ih, ie, ie_node); + ntfs_ie_set_vcn(ie_node, old_vcn); +out: + ntfs_free(ie); + return ret; +} + +static s64 ntfs_icx_parent_vcn(struct ntfs_index_context *icx) +{ + return icx->parent_vcn[icx->pindex]; +} + +static s64 ntfs_icx_parent_pos(struct ntfs_index_context *icx) +{ + return icx->parent_pos[icx->pindex]; +} + +static int ntfs_ir_insert_median(struct ntfs_index_context *icx, struct index_entry *median, + s64 new_vcn) +{ + u32 new_size; + int ret; + + ntfs_debug("Entering\n"); + + icx->ir = ntfs_ir_lookup2(icx->idx_ni, icx->name, icx->name_len); + if (!icx->ir) + return -ENOENT; + + new_size = le32_to_cpu(icx->ir->index.index_length) + + le16_to_cpu(median->length); + if (!(median->flags & INDEX_ENTRY_NODE)) + new_size += sizeof(s64); + + ret = ntfs_ir_make_space(icx, new_size); + if (ret) + return ret; + + icx->ir = ntfs_ir_lookup2(icx->idx_ni, icx->name, icx->name_len); + if (!icx->ir) + return -ENOENT; + + return ntfs_ih_insert(&icx->ir->index, median, new_vcn, + ntfs_icx_parent_pos(icx)); +} + +static int ntfs_ib_split(struct ntfs_index_context *icx, struct index_block *ib); + +struct split_info { + struct list_head entry; + s64 new_vcn; + struct index_block *ib; +}; + +static int ntfs_ib_insert(struct ntfs_index_context *icx, struct index_entry *ie, s64 new_vcn, + struct split_info *si) +{ + struct index_block *ib; + u32 idx_size, allocated_size; + int err; + s64 old_vcn; + + ntfs_debug("Entering\n"); + + ib = ntfs_malloc_nofs(icx->block_size); + if (!ib) + return -ENOMEM; + + old_vcn = ntfs_icx_parent_vcn(icx); + + err = ntfs_ib_read(icx, old_vcn, ib); + if (err) + goto err_out; + + idx_size = le32_to_cpu(ib->index.index_length); + allocated_size = le32_to_cpu(ib->index.allocated_size); + if (idx_size + le16_to_cpu(ie->length) + sizeof(s64) > allocated_size) { + si->ib = ib; + si->new_vcn = new_vcn; + return -EAGAIN; + } + + err = ntfs_ih_insert(&ib->index, ie, new_vcn, ntfs_icx_parent_pos(icx)); + if (err) + goto err_out; + + err = ntfs_ib_write(icx, ib); + +err_out: + ntfs_free(ib); + return err; +} + +/** + * ntfs_ib_split - Split an index block + */ +static int ntfs_ib_split(struct ntfs_index_context *icx, struct index_block *ib) +{ + struct index_entry *median; + s64 new_vcn; + int ret; + struct split_info *si; + LIST_HEAD(ntfs_cut_tail_list); + + ntfs_debug("Entering\n"); + +resplit: + ret = ntfs_icx_parent_dec(icx); + if (ret) + goto out; + + median = ntfs_ie_get_median(&ib->index); + new_vcn = ntfs_ibm_get_free(icx); + if (new_vcn < 0) { + ret = -EINVAL; + goto out; + } + + ret = ntfs_ib_copy_tail(icx, ib, median, new_vcn); + if (ret) { + ntfs_ibm_clear(icx, new_vcn); + goto out; + } + + if (ntfs_icx_parent_vcn(icx) == 
VCN_INDEX_ROOT_PARENT) { + ret = ntfs_ir_insert_median(icx, median, new_vcn); + if (ret) { + ntfs_ibm_clear(icx, new_vcn); + goto out; + } + } else { + si = kzalloc(sizeof(struct split_info), GFP_NOFS); + if (!si) { + ntfs_ibm_clear(icx, new_vcn); + ret = -ENOMEM; + goto out; + } + + ret = ntfs_ib_insert(icx, median, new_vcn, si); + if (ret == -EAGAIN) { + list_add_tail(&si->entry, &ntfs_cut_tail_list); + ib = si->ib; + goto resplit; + } else if (ret) { + ntfs_free(si->ib); + kfree(si); + ntfs_ibm_clear(icx, new_vcn); + goto out; + } + kfree(si); + } + + ret = ntfs_ib_cut_tail(icx, ib, median); + +out: + while (!list_empty(&ntfs_cut_tail_list)) { + si = list_last_entry(&ntfs_cut_tail_list, struct split_info, entry); + ntfs_ibm_clear(icx, si->new_vcn); + ntfs_free(si->ib); + list_del(&si->entry); + kfree(si); + if (!ret) + ret = -EAGAIN; + } + + return ret; +} + +int ntfs_ie_add(struct ntfs_index_context *icx, struct index_entry *ie) +{ + struct index_header *ih; + int allocated_size, new_size; + int ret; + + while (1) { + ret = ntfs_index_lookup(&ie->key, le16_to_cpu(ie->key_length), icx); + if (!ret) { + ret = -EEXIST; + ntfs_error(icx->idx_ni->vol->sb, "Index already has such an entry"); + goto err_out; + } + if (ret != -ENOENT) { + ntfs_error(icx->idx_ni->vol->sb, "Failed to find a place for the new entry"); + goto err_out; + } + ret = 0; + + if (icx->is_in_root) + ih = &icx->ir->index; + else + ih = &icx->ib->index; + + allocated_size = le32_to_cpu(ih->allocated_size); + new_size = le32_to_cpu(ih->index_length) + le16_to_cpu(ie->length); + + if (new_size <= allocated_size) + break; + + ntfs_debug("index block sizes: allocated: %d needed: %d\n", + allocated_size, new_size); + + if (icx->is_in_root) + ret = ntfs_ir_make_space(icx, new_size); + else + ret = ntfs_ib_split(icx, icx->ib); + if (ret && ret != -EAGAIN) + goto err_out; + + mark_mft_record_dirty(icx->actx->ntfs_ino); + ntfs_index_ctx_reinit(icx); + } + + ntfs_ie_insert(ih, ie, icx->entry); + ntfs_index_entry_mark_dirty(icx); + +err_out: + ntfs_debug("%s\n", ret ?
"Failed" : "Done"); + return ret; +} + +/** + * ntfs_index_add_filename - add filename to directory index + * @ni: ntfs inode describing directory to which index add filename + * @fn: FILE_NAME attribute to add + * @mref: reference of the inode which @fn describes + */ +int ntfs_index_add_filename(struct ntfs_inode *ni, struct file_name_attr *fn, u64 mref) +{ + struct index_entry *ie; + struct ntfs_index_context *icx; + int fn_size, ie_size, err; + + ntfs_debug("Entering\n"); + + if (!ni || !fn) + return -EINVAL; + + fn_size = (fn->file_name_length * sizeof(__le16)) + + sizeof(struct file_name_attr); + ie_size = (sizeof(struct index_entry_header) + fn_size + 7) & ~7; + + ie = ntfs_malloc_nofs(ie_size); + if (!ie) + return -ENOMEM; + + ie->data.dir.indexed_file = cpu_to_le64(mref); + ie->length = cpu_to_le16(ie_size); + ie->key_length = cpu_to_le16(fn_size); + + unsafe_memcpy(&ie->key, fn, fn_size, + /* "fn_size" was correctly calculated above */); + + icx = ntfs_index_ctx_get(ni, I30, 4); + if (!icx) { + err = -ENOMEM; + goto out; + } + + err = ntfs_ie_add(icx, ie); + ntfs_index_ctx_put(icx); +out: + ntfs_free(ie); + return err; +} + +static int ntfs_ih_takeout(struct ntfs_index_context *icx, struct index_header *ih, + struct index_entry *ie, struct index_block *ib) +{ + struct index_entry *ie_roam; + int freed_space; + bool full; + int ret = 0; + + ntfs_debug("Entering\n"); + + full = ih->index_length == ih->allocated_size; + ie_roam = ntfs_ie_dup_novcn(ie); + if (!ie_roam) + return -ENOMEM; + + ntfs_ie_delete(ih, ie); + + if (ntfs_icx_parent_vcn(icx) == VCN_INDEX_ROOT_PARENT) { + /* + * Recover the space which may have been freed + * while deleting an entry from root index + */ + freed_space = le32_to_cpu(ih->allocated_size) - + le32_to_cpu(ih->index_length); + if (full && (freed_space > 0) && !(freed_space & 7)) { + ntfs_ir_truncate(icx, le32_to_cpu(ih->index_length)); + /* do nothing if truncation fails */ + } + + mark_mft_record_dirty(icx->actx->ntfs_ino); + } else { + ret = ntfs_ib_write(icx, ib); + if (ret) + goto out; + } + + ntfs_index_ctx_reinit(icx); + + ret = ntfs_ie_add(icx, ie_roam); +out: + ntfs_free(ie_roam); + return ret; +} + +/** + * Used if an empty index block to be deleted has END entry as the parent + * in the INDEX_ROOT which is the only one there. + */ +static void ntfs_ir_leafify(struct ntfs_index_context *icx, struct index_header *ih) +{ + struct index_entry *ie; + + ntfs_debug("Entering\n"); + + ie = ntfs_ie_get_first(ih); + ie->flags &= ~INDEX_ENTRY_NODE; + ie->length = cpu_to_le16(le16_to_cpu(ie->length) - sizeof(s64)); + + ih->index_length = cpu_to_le32(le32_to_cpu(ih->index_length) - sizeof(s64)); + ih->flags &= ~LARGE_INDEX; + NInoClearIndexAllocPresent(icx->idx_ni); + + /* Not fatal error */ + ntfs_ir_truncate(icx, le32_to_cpu(ih->index_length)); +} + +/** + * Used if an empty index block to be deleted has END entry as the parent + * in the INDEX_ROOT which is not the only one there. 
+ */ +static int ntfs_ih_reparent_end(struct ntfs_index_context *icx, struct index_header *ih, + struct index_block *ib) +{ + struct index_entry *ie, *ie_prev; + + ntfs_debug("Entering\n"); + + ie = ntfs_ie_get_by_pos(ih, ntfs_icx_parent_pos(icx)); + ie_prev = ntfs_ie_prev(ih, ie); + if (!ie_prev) + return -EIO; + ntfs_ie_set_vcn(ie, ntfs_ie_get_vcn(ie_prev)); + + return ntfs_ih_takeout(icx, ih, ie_prev, ib); +} + +static int ntfs_index_rm_leaf(struct ntfs_index_context *icx) +{ + struct index_block *ib = NULL; + struct index_header *parent_ih; + struct index_entry *ie; + int ret; + + ntfs_debug("pindex: %d\n", icx->pindex); + + ret = ntfs_icx_parent_dec(icx); + if (ret) + return ret; + + ret = ntfs_ibm_clear(icx, icx->parent_vcn[icx->pindex + 1]); + if (ret) + return ret; + + if (ntfs_icx_parent_vcn(icx) == VCN_INDEX_ROOT_PARENT) + parent_ih = &icx->ir->index; + else { + ib = ntfs_malloc_nofs(icx->block_size); + if (!ib) + return -ENOMEM; + + ret = ntfs_ib_read(icx, ntfs_icx_parent_vcn(icx), ib); + if (ret) + goto out; + + parent_ih = &ib->index; + } + + ie = ntfs_ie_get_by_pos(parent_ih, ntfs_icx_parent_pos(icx)); + if (!ntfs_ie_end(ie)) { + ret = ntfs_ih_takeout(icx, parent_ih, ie, ib); + goto out; + } + + if (ntfs_ih_zero_entry(parent_ih)) { + if (ntfs_icx_parent_vcn(icx) == VCN_INDEX_ROOT_PARENT) { + ntfs_ir_leafify(icx, parent_ih); + goto out; + } + + ret = ntfs_index_rm_leaf(icx); + goto out; + } + + ret = ntfs_ih_reparent_end(icx, parent_ih, ib); +out: + ntfs_free(ib); + return ret; +} + +static int ntfs_index_rm_node(struct ntfs_index_context *icx) +{ + int entry_pos, pindex; + s64 vcn; + struct index_block *ib = NULL; + struct index_entry *ie_succ, *ie, *entry = icx->entry; + struct index_header *ih; + u32 new_size; + int delta, ret; + + ntfs_debug("Entering\n"); + + if (!icx->ia_ni) { + icx->ia_ni = ntfs_ia_open(icx, icx->idx_ni); + if (!icx->ia_ni) + return -EINVAL; + } + + ib = ntfs_malloc_nofs(icx->block_size); + if (!ib) + return -ENOMEM; + + ie_succ = ntfs_ie_get_next(icx->entry); + entry_pos = icx->parent_pos[icx->pindex]++; + pindex = icx->pindex; +descend: + vcn = ntfs_ie_get_vcn(ie_succ); + ret = ntfs_ib_read(icx, vcn, ib); + if (ret) + goto out; + + ie_succ = ntfs_ie_get_first(&ib->index); + + ret = ntfs_icx_parent_inc(icx); + if (ret) + goto out; + + icx->parent_vcn[icx->pindex] = vcn; + icx->parent_pos[icx->pindex] = 0; + + if ((ib->index.flags & NODE_MASK) == INDEX_NODE) + goto descend; + + if (ntfs_ih_zero_entry(&ib->index)) { + ret = -EIO; + ntfs_error(icx->idx_ni->vol->sb, "Empty index block"); + goto out; + } + + ie = ntfs_ie_dup(ie_succ); + if (!ie) { + ret = -ENOMEM; + goto out; + } + + ret = ntfs_ie_add_vcn(&ie); + if (ret) + goto out2; + + ntfs_ie_set_vcn(ie, ntfs_ie_get_vcn(icx->entry)); + + if (icx->is_in_root) + ih = &icx->ir->index; + else + ih = &icx->ib->index; + + delta = le16_to_cpu(ie->length) - le16_to_cpu(icx->entry->length); + new_size = le32_to_cpu(ih->index_length) + delta; + if (delta > 0) { + if (icx->is_in_root) { + ret = ntfs_ir_make_space(icx, new_size); + if (ret != 0) + goto out2; + + ih = &icx->ir->index; + entry = ntfs_ie_get_by_pos(ih, entry_pos); + + } else if (new_size > le32_to_cpu(ih->allocated_size)) { + icx->pindex = pindex; + ret = ntfs_ib_split(icx, icx->ib); + if (!ret) + ret = -EAGAIN; + goto out2; + } + } + + ntfs_ie_delete(ih, entry); + ntfs_ie_insert(ih, ie, entry); + + if (icx->is_in_root) + ret = ntfs_ir_truncate(icx, new_size); + else + ret = ntfs_icx_ib_write(icx); + if (ret) + goto out2; + + 
ntfs_ie_delete(&ib->index, ie_succ); + + if (ntfs_ih_zero_entry(&ib->index)) + ret = ntfs_index_rm_leaf(icx); + else + ret = ntfs_ib_write(icx, ib); + +out2: + ntfs_free(ie); +out: + ntfs_free(ib); + return ret; +} + +/** + * ntfs_index_rm - remove entry from the index + * @icx: index context describing entry to delete + * + * Delete entry described by @icx from the index. Index context is always + * reinitialized after use of this function, so it can be used for index + * lookup once again. + */ +int ntfs_index_rm(struct ntfs_index_context *icx) +{ + struct index_header *ih; + int ret = 0; + + ntfs_debug("Entering\n"); + + if (!icx || (!icx->ib && !icx->ir) || ntfs_ie_end(icx->entry)) { + ret = -EINVAL; + goto err_out; + } + if (icx->is_in_root) + ih = &icx->ir->index; + else + ih = &icx->ib->index; + + if (icx->entry->flags & INDEX_ENTRY_NODE) { + ret = ntfs_index_rm_node(icx); + if (ret) + goto err_out; + } else if (icx->is_in_root || !ntfs_ih_one_entry(ih)) { + ntfs_ie_delete(ih, icx->entry); + + if (icx->is_in_root) + ret = ntfs_ir_truncate(icx, le32_to_cpu(ih->index_length)); + else + ret = ntfs_icx_ib_write(icx); + if (ret) + goto err_out; + } else { + ret = ntfs_index_rm_leaf(icx); + if (ret) + goto err_out; + } + + return 0; +err_out: + return ret; +} + +int ntfs_index_remove(struct ntfs_inode *dir_ni, const void *key, const int keylen) +{ + int ret = 0; + struct ntfs_index_context *icx; + + icx = ntfs_index_ctx_get(dir_ni, I30, 4); + if (!icx) + return -EINVAL; + + while (1) { + ret = ntfs_index_lookup(key, keylen, icx); + if (ret) + goto err_out; + + ret = ntfs_index_rm(icx); + if (ret && ret != -EAGAIN) + goto err_out; + else if (!ret) + break; + + mark_mft_record_dirty(icx->actx->ntfs_ino); + ntfs_index_ctx_reinit(icx); + } + + mark_mft_record_dirty(icx->actx->ntfs_ino); + + ntfs_index_ctx_put(icx); + return 0; +err_out: + ntfs_index_ctx_put(icx); + ntfs_error(dir_ni->vol->sb, "Delete failed"); + return ret; +} + +/* + * ntfs_index_walk_down - walk down the index tree (leaf bound) + * until there are no subnode in the first index entry returns + * the entry at the bottom left in subnode + */ +struct index_entry *ntfs_index_walk_down(struct index_entry *ie, struct ntfs_index_context *ictx) +{ + struct index_entry *entry; + s64 vcn; + + entry = ie; + do { + vcn = ntfs_ie_get_vcn(entry); + if (ictx->is_in_root) { + /* down from level zero */ + ictx->ir = NULL; + ictx->ib = (struct index_block *)ntfs_malloc_nofs(ictx->block_size); + ictx->pindex = 1; + ictx->is_in_root = false; + } else { + /* down from non-zero level */ + ictx->pindex++; + } + + ictx->parent_pos[ictx->pindex] = 0; + ictx->parent_vcn[ictx->pindex] = vcn; + if (!ntfs_ib_read(ictx, vcn, ictx->ib)) { + ictx->entry = ntfs_ie_get_first(&ictx->ib->index); + entry = ictx->entry; + } else + entry = NULL; + } while (entry && (entry->flags & INDEX_ENTRY_NODE)); + + return entry; +} + +/** + * ntfs_index_walk_up - walk up the index tree (root bound) until + * there is a valid data entry in parent returns the parent entry + * or NULL if no more parent. 
+ */ +static struct index_entry *ntfs_index_walk_up(struct index_entry *ie, + struct ntfs_index_context *ictx) +{ + struct index_entry *entry; + s64 vcn; + + entry = ie; + if (ictx->pindex > 0) { + do { + ictx->pindex--; + if (!ictx->pindex) { + /* we have reached the root */ + kfree(ictx->ib); + ictx->ib = NULL; + ictx->is_in_root = true; + /* a new search context is to be allocated */ + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + ictx->ir = ntfs_ir_lookup(ictx->idx_ni, ictx->name, + ictx->name_len, &ictx->actx); + if (ictx->ir) + entry = ntfs_ie_get_by_pos(&ictx->ir->index, + ictx->parent_pos[ictx->pindex]); + else + entry = NULL; + } else { + /* up into non-root node */ + vcn = ictx->parent_vcn[ictx->pindex]; + if (!ntfs_ib_read(ictx, vcn, ictx->ib)) { + entry = ntfs_ie_get_by_pos(&ictx->ib->index, + ictx->parent_pos[ictx->pindex]); + } else + entry = NULL; + } + ictx->entry = entry; + } while (entry && (ictx->pindex > 0) && + (entry->flags & INDEX_ENTRY_END)); + } else + entry = NULL; + + return entry; +} + +/** + * ntfs_index_next - get next entry in an index according to collating sequence. + * Returns next entry or NULL if none. + * + * Sample layout : + * + * +---+---+---+---+---+---+---+---+ n ptrs to subnodes + * | | | 10| 25| 33| | | | n-1 keys in between + * +---+---+---+---+---+---+---+---+ no key in last entry + * | A | A + * | | | +-------------------------------+ + * +--------------------------+ | +-----+ | + * | +--+ | | + * V | V | + * +---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+ + * | 11| 12| 13| 14| 15| 16| 17| | | | 26| 27| 28| 29| 30| 31| 32| | + * +---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+ + * | | + * +-----------------------+ | + * | | + * +---+---+---+---+---+---+---+---+ + * | 18| 19| 20| 21| 22| 23| 24| | + * +---+---+---+---+---+---+---+---+ + */ +struct index_entry *ntfs_index_next(struct index_entry *ie, struct ntfs_index_context *ictx) +{ + struct index_entry *next; + __le16 flags; + + /* + * lookup() may have returned an invalid node + * when searching for a partial key + * if this happens, walk up + */ + if (ie->flags & INDEX_ENTRY_END) + next = ntfs_index_walk_up(ie, ictx); + else { + /* + * get next entry in same node + * there is always one after any entry with data + */ + next = (struct index_entry *)((char *)ie + le16_to_cpu(ie->length)); + ++ictx->parent_pos[ictx->pindex]; + flags = next->flags; + + /* walk down if it has a subnode */ + if (flags & INDEX_ENTRY_NODE) { + if (!ictx->ia_ni) + ictx->ia_ni = ntfs_ia_open(ictx, ictx->idx_ni); + + next = ntfs_index_walk_down(next, ictx); + } else { + + /* walk up it has no subnode, nor data */ + if (flags & INDEX_ENTRY_END) + next = ntfs_index_walk_up(next, ictx); + } + } + + /* return NULL if stuck at end of a block */ + if (next && (next->flags & INDEX_ENTRY_END)) + next = NULL; + + return next; +} -- 2.25.1 This updates the implementation of file operations Signed-off-by: Hyunchul Lee Signed-off-by: Namjae Jeon --- fs/ntfs/file.c | 2855 +++++++++++++++++------------------------------- 1 file changed, 1000 insertions(+), 1855 deletions(-) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 297c0b9db621..245489f6558d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,32 +1,30 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * NTFS kernel file operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. 
+ * Copyright (c) 2025 LG Electronics Co., Ltd. */ +#include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include +#include +#include +#include +#include +#include -#include -#include - -#include "attrib.h" -#include "bitmap.h" -#include "inode.h" -#include "debug.h" #include "lcnalloc.h" -#include "malloc.h" -#include "mft.h" #include "ntfs.h" +#include "aops.h" +#include "reparse.h" +#include "ea.h" +#include "iomap.h" +#include "bitmap.h" +#include "malloc.h" /** * ntfs_file_open - called when an inode is about to be opened @@ -48,1948 +46,1095 @@ */ static int ntfs_file_open(struct inode *vi, struct file *filp) { + struct ntfs_inode *ni = NTFS_I(vi); + + if (NVolShutdown(ni->vol)) + return -EIO; + if (sizeof(unsigned long) < 8) { if (i_size_read(vi) > MAX_LFS_FILESIZE) return -EOVERFLOW; } + + if (filp->f_flags & O_TRUNC && NInoNonResident(ni)) { + int err; + + mutex_lock(&ni->mrec_lock); + down_read(&ni->runlist.lock); + if (!ni->runlist.rl) { + err = ntfs_attr_map_whole_runlist(ni); + if (err) { + up_read(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + return err; + } + } + ni->lcn_seek_trunc = ni->runlist.rl->lcn; + up_read(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + } + + filp->f_mode |= FMODE_NOWAIT; + return generic_file_open(vi, filp); } -#ifdef NTFS_RW +static int ntfs_file_release(struct inode *vi, struct file *filp) +{ + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + s64 aligned_data_size = round_up(ni->data_size, vol->cluster_size); + + if (NInoCompressed(ni)) + return 0; + + inode_lock(vi); + mutex_lock(&ni->mrec_lock); + down_write(&ni->runlist.lock); + if (aligned_data_size < ni->allocated_size) { + int err; + s64 vcn_ds = NTFS_B_TO_CLU(vol, aligned_data_size); + s64 vcn_tr = -1; + struct runlist_element *rl = ni->runlist.rl; + ssize_t rc = ni->runlist.count - 2; + + while (rc >= 0 && rl[rc].lcn == LCN_HOLE && vcn_ds <= rl[rc].vcn) { + vcn_tr = rl[rc].vcn; + rc--; + } + + if (vcn_tr >= 0) { + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, vcn_tr); + if (err) { + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + ntfs_error(vol->sb, "Preallocated block rollback failed"); + } else { + ni->allocated_size = NTFS_CLU_TO_B(vol, vcn_tr); + err = ntfs_attr_update_mapping_pairs(ni, 0); + if (err) + ntfs_error(vol->sb, + "Failed to rollback mapping pairs for prealloc"); + } + } + } + up_write(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + inode_unlock(vi); + + return 0; +} /** - * ntfs_attr_extend_initialized - extend the initialized size of an attribute - * @ni: ntfs inode of the attribute to extend - * @new_init_size: requested new initialized size in bytes - * - * Extend the initialized size of an attribute described by the ntfs inode @ni - * to @new_init_size bytes. This involves zeroing any non-sparse space between - * the old initialized size and @new_init_size both in the page cache and on - * disk (if relevant complete pages are already uptodate in the page cache then - * these are simply marked dirty). + * ntfs_file_fsync - sync a file to disk + * @filp: file to be synced + * @start: start offset to be synced + * @end: end offset to be synced + * @datasync: if non-zero only flush user data and not metadata * - * As a side-effect, the file size (vfs inode->i_size) may be incremented as, - * in the resident attribute case, it is tied to the initialized size and, in - * the non-resident attribute case, it may not fall below the initialized size. 
+ * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync + * system calls. This function is inspired by fs/buffer.c::file_fsync(). * - * Note that if the attribute is resident, we do not need to touch the page - * cache at all. This is because if the page cache page is not uptodate we - * bring it uptodate later, when doing the write to the mft record since we - * then already have the page mapped. And if the page is uptodate, the - * non-initialized region will already have been zeroed when the page was - * brought uptodate and the region may in fact already have been overwritten - * with new data via mmap() based writes, so we cannot just zero it. And since - * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped - * is unspecified, we choose not to do zeroing and thus we do not need to touch - * the page at all. For a more detailed explanation see ntfs_truncate() in - * fs/ntfs/inode.c. + * If @datasync is false, write the mft record and all associated extent mft + * records as well as the $DATA attribute and then sync the block device. * - * Return 0 on success and -errno on error. In the case that an error is - * encountered it is possible that the initialized size will already have been - * incremented some way towards @new_init_size but it is guaranteed that if - * this is the case, the necessary zeroing will also have happened and that all - * metadata is self-consistent. + * If @datasync is true and the attribute is non-resident, we skip the writing + * of the mft record and all associated extent mft records (this might still + * happen due to the write_inode_now() call). * - * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be - * held by the caller. + * Also, if @datasync is true, we do not wait on the inode to be written out + * but we always wait on the page cache pages to be written out. */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { - s64 old_init_size; - loff_t old_i_size; - pgoff_t index, end_index; - unsigned long flags; - struct inode *vi = VFS_I(ni); - ntfs_inode *base_ni; - MFT_RECORD *m = NULL; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx = NULL; - struct address_space *mapping; - struct page *page = NULL; - u8 *kattr; - int err; - u32 attr_len; + struct inode *vi = filp->f_mapping->host; + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + int err, ret = 0; + struct inode *parent_vi, *ia_vi; + struct ntfs_attr_search_ctx *ctx; - read_lock_irqsave(&ni->size_lock, flags); - old_init_size = ni->initialized_size; - old_i_size = i_size_read(vi); - BUG_ON(new_init_size > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_initialized_size 0x%llx, " - "new_initialized_size 0x%llx, i_size 0x%llx.", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - (unsigned long long)old_init_size, - (unsigned long long)new_init_size, old_i_size); - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Use goto to reduce indentation and we need the label below anyway. 
*/ - if (NInoNonResident(ni)) - goto do_non_resident_extend; - BUG_ON(old_init_size != old_i_size); - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - BUG_ON(old_i_size != (loff_t)attr_len); - /* - * Do the zeroing in the mft record and update the attribute size in - * the mft record. - */ - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - memset(kattr + attr_len, 0, new_init_size - attr_len); - a->data.resident.value_length = cpu_to_le32((u32)new_init_size); - /* Finally, update the sizes in the vfs and ntfs inodes. */ - write_lock_irqsave(&ni->size_lock, flags); - i_size_write(vi, new_init_size); - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - goto done; -do_non_resident_extend: - /* - * If the new initialized size @new_init_size exceeds the current file - * size (vfs inode->i_size), we need to extend the file size to the - * new initialized size. - */ - if (new_init_size > old_i_size) { - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - BUG_ON(old_i_size != (loff_t) - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_init_size); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* Update the file size in the vfs inode. */ - i_size_write(vi, new_init_size); - ntfs_attr_put_search_ctx(ctx); - ctx = NULL; - unmap_mft_record(base_ni); - m = NULL; - } - mapping = vi->i_mapping; - index = old_init_size >> PAGE_SHIFT; - end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - do { - /* - * Read the page. If the page is not present, this will zero - * the uninitialized regions for us. - */ - page = read_mapping_page(mapping, index, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto init_err_out; - } - /* - * Update the initialized size in the ntfs inode. This is - * enough to make ntfs_writepage() work. - */ - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; - if (ni->initialized_size > new_init_size) - ni->initialized_size = new_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); - /* Set the page dirty so it gets written out. */ - set_page_dirty(page); - put_page(page); - /* - * Play nice with the vm and the rest of the system. This is - * very much needed as we can potentially be modifying the - * initialised size from a very small value to a really huge - * value, e.g. 
- * f = open(somefile, O_TRUNC); - * truncate(f, 10GiB); - * seek(f, 10GiB); - * write(f, 1); - * And this would mean we would be marking dirty hundreds of - * thousands of pages or as in the above example more than - * two and a half million pages! - * - * TODO: For sparse pages could optimize this workload by using - * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This - * would be set in read_folio for sparse pages and here we would - * not need to mark dirty any pages which have this bit set. - * The only caveat is that we have to clear the bit everywhere - * where we allocate any clusters that lie in the page or that - * contain the page. - * - * TODO: An even greater optimization would be for us to only - * call read_folio() on pages which are not in sparse regions as - * determined from the runlist. This would greatly reduce the - * number of pages we read and make dirty in the case of sparse - * files. - */ - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } while (++index < end_index); - read_lock_irqsave(&ni->size_lock, flags); - BUG_ON(ni->initialized_size != new_init_size); - read_unlock_irqrestore(&ni->size_lock, flags); - /* Now bring in sync the initialized_size in the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - goto init_err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto init_err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto init_err_out; - } - m = ctx->mrec; - a = ctx->attr; - BUG_ON(!a->non_resident); - a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); -done: - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", - (unsigned long long)new_init_size, i_size_read(vi)); - return 0; -init_err_out: - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = old_init_size; - write_unlock_irqrestore(&ni->size_lock, flags); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_debug("Failed. Returning error code %i.", err); - return err; -} + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); -static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, - struct iov_iter *from) -{ - loff_t pos; - s64 end, ll; - ssize_t err; - unsigned long flags; - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%zx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)iocb->ki_pos, - iov_iter_count(from)); - err = generic_write_checks(iocb, from); - if (unlikely(err <= 0)) - goto out; - /* - * All checks have passed. Before we start doing any writing we want - * to abort any totally illegal writes. - */ - BUG_ON(NInoMstProtected(ni)); - BUG_ON(ni->type != AT_DATA); - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - /* Only $DATA attributes can be encrypted. */ - /* - * Reminder for later: Encrypted files are _always_ - * non-resident so that the content can always be encrypted. 
- */ - ntfs_debug("Denying write access to encrypted file."); - err = -EACCES; - goto out; - } - if (NInoCompressed(ni)) { - /* Only unnamed $DATA attribute can be compressed. */ - BUG_ON(ni->name_len); - /* - * Reminder for later: If resident, the data is not actually - * compressed. Only on the switch to non-resident does - * compression kick in. This is in contrast to encrypted files - * (see above). - */ - ntfs_error(vi->i_sb, "Writing to compressed files is not " - "implemented yet. Sorry."); - err = -EOPNOTSUPP; - goto out; - } - err = file_remove_privs(file); - if (unlikely(err)) - goto out; - /* - * Our ->update_time method always succeeds thus file_update_time() - * cannot fail either so there is no need to check the return code. - */ - file_update_time(file); - pos = iocb->ki_pos; - /* The first byte after the last cluster being written to. */ - end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & - ~(u64)vol->cluster_size_mask; - /* - * If the write goes beyond the allocated size, extend the allocation - * to cover the whole of the write, rounded up to the nearest cluster. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end > ll) { - /* - * Extend the allocation without changing the data size. - * - * Note we ensure the allocation is big enough to at least - * write some data but we do not require the allocation to be - * complete, i.e. it may be partial. - */ - ll = ntfs_attr_extend_allocation(ni, end, -1, pos); - if (likely(ll >= 0)) { - BUG_ON(pos >= ll); - /* If the extension was partial truncate the write. */ - if (end > ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "the allocation was only " - "partially extended.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - iov_iter_truncate(from, ll - pos); - } - } else { - err = ll; - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* Perform a partial write if possible or fail. 
*/ - if (pos < ll) { - ntfs_debug("Truncating write to inode 0x%lx " - "attribute type 0x%x, because " - "extending the allocation " - "failed (error %d).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (int)-err); - iov_iter_truncate(from, ll - pos); - } else { - if (err != -ENOSPC) - ntfs_error(vi->i_sb, "Cannot perform " - "write to inode " - "0x%lx, attribute " - "type 0x%x, because " - "extending the " - "allocation failed " - "(error %ld).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), - (long)-err); - else - ntfs_debug("Cannot perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because there is not " - "space left.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - goto out; + if (NVolShutdown(vol)) + return -EIO; + + err = file_write_and_wait_range(filp, start, end); + if (err) + return err; + + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) + return -ENOMEM; + + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL_2); + while (!(err = ntfs_attr_lookup(AT_UNUSED, NULL, 0, 0, 0, NULL, 0, ctx))) { + if (ctx->attr->type == AT_FILE_NAME) { + struct file_name_attr *fn = (struct file_name_attr *)((u8 *)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + + parent_vi = ntfs_iget(vi->i_sb, MREF_LE(fn->parent_directory)); + if (IS_ERR(parent_vi)) + continue; + mutex_lock_nested(&NTFS_I(parent_vi)->mrec_lock, NTFS_INODE_MUTEX_PARENT_2); + ia_vi = ntfs_index_iget(parent_vi, I30, 4); + mutex_unlock(&NTFS_I(parent_vi)->mrec_lock); + if (IS_ERR(ia_vi)) { + iput(parent_vi); + continue; } + write_inode_now(ia_vi, 1); + iput(ia_vi); + write_inode_now(parent_vi, 1); + iput(parent_vi); + } else if (ctx->attr->non_resident) { + struct inode *attr_vi; + __le16 *name; + + name = (__le16 *)((u8 *)ctx->attr + le16_to_cpu(ctx->attr->name_offset)); + if (ctx->attr->type == AT_DATA && ctx->attr->name_length == 0) + continue; + + attr_vi = ntfs_attr_iget(vi, ctx->attr->type, + name, ctx->attr->name_length); + if (IS_ERR(attr_vi)) + continue; + spin_lock(&attr_vi->i_lock); + if (inode_state_read_once(attr_vi) & I_DIRTY_PAGES) { + spin_unlock(&attr_vi->i_lock); + filemap_write_and_wait(attr_vi->i_mapping); + } else + spin_unlock(&attr_vi->i_lock); + iput(attr_vi); } } + mutex_unlock(&ni->mrec_lock); + ntfs_attr_put_search_ctx(ctx); + + write_inode_now(vol->mftbmp_ino, 1); + down_write(&vol->lcnbmp_lock); + write_inode_now(vol->lcnbmp_ino, 1); + up_write(&vol->lcnbmp_lock); + write_inode_now(vol->mft_ino, 1); + /* - * If the write starts beyond the initialized size, extend it up to the - * beginning of the write and initialize all non-sparse space between - * the old initialized size and the new one. This automatically also - * increments the vfs inode->i_size to keep it above or equal to the - * initialized_size. + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (pos > ll) { - /* - * Wait for ongoing direct i/o to complete before proceeding. - * New direct i/o cannot start as we hold i_mutex. 
- */ - inode_dio_wait(vi); - err = ntfs_attr_extend_initialized(ni, pos); - if (unlikely(err < 0)) - ntfs_error(vi->i_sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "extending the initialized size " - "failed (error %d).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (int)-err); - } -out: - return err; + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, + "Failed to f%ssync inode 0x%lx. Error %u.", + datasync ? "data" : "", vi->i_ino, -ret); + if (!ret) + blkdev_issue_flush(vi->i_sb->s_bdev); + return ret; } /** - * __ntfs_grab_cache_pages - obtain a number of locked pages - * @mapping: address space mapping from which to obtain page cache pages - * @index: starting index in @mapping at which to begin obtaining pages - * @nr_pages: number of page cache pages to obtain - * @pages: array of pages in which to return the obtained page cache pages - * @cached_page: allocated but as yet unused page + * ntfsp_setattr - called from notify_change() when an attribute is being changed + * @idmap: idmap of the mount the inode was found from + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes * - * Obtain @nr_pages locked page cache pages from the mapping @mapping and - * starting at index @index. + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. * - * If a page is newly created, add it to lru list - * - * Note, the page locks are obtained in ascending page index order. + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. */ -static inline int __ntfs_grab_cache_pages(struct address_space *mapping, - pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page) +int ntfsp_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { - int err, nr; - - BUG_ON(!nr_pages); - err = nr = 0; - do { - pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | - FGP_ACCESSED); - if (!pages[nr]) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (unlikely(!*cached_page)) { - err = -ENOMEM; - goto err_out; - } + struct inode *vi = d_inode(dentry); + int err; + unsigned int ia_valid = attr->ia_valid; + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + + if (NVolShutdown(vol)) + return -EIO; + + err = setattr_prepare(idmap, dentry, attr); + if (err) + goto out; + + if (!(vol->vol_flags & VOLUME_IS_DIRTY)) + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY); + + if (ia_valid & ATTR_SIZE) { + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, + "Changes in inode size are not supported yet for %s files, ignoring.", + NInoCompressed(ni) ? 
"compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else { + loff_t old_size = vi->i_size; + + err = inode_newsize_ok(vi, attr->ia_size); + if (err) + goto out; + + inode_dio_wait(vi); + /* Serialize against page faults */ + if (NInoNonResident(NTFS_I(vi)) && + attr->ia_size < old_size) { + err = iomap_truncate_page(vi, attr->ia_size, NULL, + &ntfs_read_iomap_ops, + &ntfs_iomap_folio_ops, NULL); + if (err) + goto out; + } + + truncate_setsize(vi, attr->ia_size); + err = ntfs_truncate_vfs(vi, attr->ia_size, old_size); + if (err) { + i_size_write(vi, old_size); + goto out; } - err = add_to_page_cache_lru(*cached_page, mapping, - index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(err)) { - if (err == -EEXIST) - continue; - goto err_out; + + if (NInoNonResident(ni) && attr->ia_size > old_size && + old_size % PAGE_SIZE != 0) { + loff_t len = min_t(loff_t, + round_up(old_size, PAGE_SIZE) - old_size, + attr->ia_size - old_size); + err = iomap_zero_range(vi, old_size, len, + NULL, &ntfs_read_iomap_ops, + &ntfs_iomap_folio_ops, NULL); } - pages[nr] = *cached_page; - *cached_page = NULL; } - index++; - nr++; - } while (nr < nr_pages); + if (ia_valid == ATTR_SIZE) + goto out; + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + setattr_copy(idmap, vi, attr); + + if (vol->sb->s_flags & SB_POSIXACL && !S_ISLNK(vi->i_mode)) { + err = posix_acl_chmod(idmap, dentry, vi->i_mode); + if (err) + goto out; + } + + if (0222 & vi->i_mode) + ni->flags &= ~FILE_ATTR_READONLY; + else + ni->flags |= FILE_ATTR_READONLY; + + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + unsigned int flags = 0; + + if (ia_valid & ATTR_UID) + flags |= NTFS_EA_UID; + if (ia_valid & ATTR_GID) + flags |= NTFS_EA_GID; + if (ia_valid & ATTR_MODE) + flags |= NTFS_EA_MODE; + + if (S_ISDIR(vi->i_mode)) + vi->i_mode &= ~vol->dmask; + else + vi->i_mode &= ~vol->fmask; + + mutex_lock(&ni->mrec_lock); + ntfs_ea_set_wsl_inode(vi, 0, NULL, flags); + mutex_unlock(&ni->mrec_lock); + } + + mark_inode_dirty(vi); out: return err; -err_out: - while (nr > 0) { - unlock_page(pages[--nr]); - put_page(pages[nr]); - } - goto out; } -static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) +int ntfsp_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags) { - lock_buffer(bh); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); + struct inode *inode = d_backing_inode(path->dentry); + + generic_fillattr(idmap, request_mask, inode, stat); + + stat->blksize = NTFS_SB(inode->i_sb)->cluster_size; + stat->blocks = (((u64)NTFS_I(inode)->i_dealloc_clusters << + NTFS_SB(inode->i_sb)->cluster_size_bits) >> 9) + inode->i_blocks; + stat->result_mask |= STATX_BTIME; + stat->btime = NTFS_I(inode)->i_crtime; + + return 0; } -/** - * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_mutex held on the inode (@pages[0]->mapping->host). There are - * @nr_pages pages in @pages which are locked but not kmap()ped. The source - * data has not yet been copied into the @pages. 
- * - * Need to fill any holes with actual clusters, allocate buffers if necessary, - * ensure all the buffers are mapped, and bring uptodate any buffers that are - * only partially being written to. - * - * If @nr_pages is greater than one, we are guaranteed that the cluster size is - * greater than PAGE_SIZE, that all pages in @pages are entirely inside - * the same cluster and that they are the entirety of that cluster, and that - * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. - * - * i_size is not to be modified yet. - * - * Return 0 on success or -errno on error. - */ -static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, - unsigned nr_pages, s64 pos, size_t bytes) +static loff_t ntfs_file_llseek(struct file *file, loff_t offset, int whence) { - VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; - LCN lcn; - s64 bh_pos, vcn_len, end, initialized_size; - sector_t lcn_block; - struct folio *folio; - struct inode *vi; - ntfs_inode *ni, *base_ni = NULL; - ntfs_volume *vol; - runlist_element *rl, *rl2; - struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; - ntfs_attr_search_ctx *ctx = NULL; - MFT_RECORD *m = NULL; - ATTR_RECORD *a = NULL; - unsigned long flags; - u32 attr_rec_len = 0; - unsigned blocksize, u; - int err, mp_size; - bool rl_write_locked, was_hole, is_retry; - unsigned char blocksize_bits; - struct { - u8 runlist_merged:1; - u8 mft_attr_mapped:1; - u8 mp_rebuilt:1; - u8 attr_switched:1; - } status = { 0, 0, 0, 0 }; - - BUG_ON(!nr_pages); - BUG_ON(!pages); - BUG_ON(!*pages); - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - vol = ni->vol; - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, pages[0]->index, nr_pages, - (long long)pos, bytes); - blocksize = vol->sb->s_blocksize; - blocksize_bits = vol->sb->s_blocksize_bits; - rl_write_locked = false; - rl = NULL; - err = 0; - vcn = lcn = -1; - vcn_len = 0; - lcn_block = -1; - was_hole = false; - cpos = pos >> vol->cluster_size_bits; - end = pos + bytes; - cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; - /* - * Loop over each buffer in each folio. Use goto to - * reduce indentation. - */ - u = 0; -do_next_folio: - folio = page_folio(pages[u]); - bh_pos = folio_pos(folio); - head = folio_buffers(folio); - if (!head) - /* - * create_empty_buffers() will create uptodate/dirty - * buffers if the folio is uptodate/dirty. - */ - head = create_empty_buffers(folio, blocksize, 0); - bh = head; - do { - VCN cdelta; - s64 bh_end; - unsigned bh_cofs; - - /* Clear buffer_new on all buffers to reinitialise state. */ - if (buffer_new(bh)) - clear_buffer_new(bh); - bh_end = bh_pos + blocksize; - bh_cpos = bh_pos >> vol->cluster_size_bits; - bh_cofs = bh_pos & vol->cluster_size_mask; - if (buffer_mapped(bh)) { - /* - * The buffer is already mapped. If it is uptodate, - * ignore it. - */ - if (buffer_uptodate(bh)) - continue; - /* - * The buffer is not uptodate. If the folio is uptodate - * set the buffer uptodate and otherwise ignore it. - */ - if (folio_test_uptodate(folio)) { - set_buffer_uptodate(bh); - continue; - } - /* - * Neither the folio nor the buffer are uptodate. If - * the buffer is only partially being written to, we - * need to read it in before the write, i.e. now. - */ - if ((bh_pos < pos && bh_end > pos) || - (bh_pos < end && bh_end > end)) { - /* - * If the buffer is fully or partially within - * the initialized size, do an actual read. 
- * Otherwise, simply zero the buffer. - */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* Unmapped buffer. Need to map it. */ - bh->b_bdev = vol->sb->s_bdev; - /* - * If the current buffer is in the same clusters as the map - * cache, there is no need to check the runlist again. The - * map cache is made up of @vcn, which is the first cached file - * cluster, @vcn_len which is the number of cached file - * clusters, @lcn is the device cluster corresponding to @vcn, - * and @lcn_block is the block number corresponding to @lcn. - */ - cdelta = bh_cpos - vcn; - if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { -map_buffer_cached: - BUG_ON(lcn < 0); - bh->b_blocknr = lcn_block + - (cdelta << (vol->cluster_size_bits - - blocksize_bits)) + - (bh_cofs >> blocksize_bits); - set_buffer_mapped(bh); - /* - * If the folio is uptodate so is the buffer. If the - * buffer is fully outside the write, we ignore it if - * it was already allocated and we mark it dirty so it - * gets written out if we allocated it. On the other - * hand, if we allocated the buffer but we are not - * marking it dirty we set buffer_new so we can do - * error recovery. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (unlikely(was_hole)) { - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - if (bh_end <= pos || bh_pos >= end) - mark_buffer_dirty(bh); - else - set_buffer_new(bh); - } - continue; - } - /* Page is _not_ uptodate. */ - if (likely(!was_hole)) { - /* - * Buffer was already allocated. If it is not - * uptodate and is only partially being written - * to, we need to read it in before the write, - * i.e. now. - */ - if (!buffer_uptodate(bh) && bh_pos < end && - bh_end > pos && - (bh_pos < pos || - bh_end > end)) { - /* - * If the buffer is fully or partially - * within the initialized size, do an - * actual read. Otherwise, simply zero - * the buffer. - */ - read_lock_irqsave(&ni->size_lock, - flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, - flags); - if (bh_pos < initialized_size) { - ntfs_submit_bh_for_read(bh); - *wait_bh++ = bh; - } else { - folio_zero_range(folio, - bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - continue; - } - /* We allocated the buffer. */ - clean_bdev_bh_alias(bh); - /* - * If the buffer is fully outside the write, zero it, - * set it uptodate, and mark it dirty so it gets - * written out. If it is partially being written to, - * zero region surrounding the write but leave it to - * commit write to do anything else. Finally, if the - * buffer is fully being overwritten, do nothing. 
- */ - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - mark_buffer_dirty(bh); - continue; - } - set_buffer_new(bh); - if (!buffer_uptodate(bh) && - (bh_pos < pos || bh_end > end)) { - u8 *kaddr; - unsigned pofs; - - kaddr = kmap_local_folio(folio, 0); - if (bh_pos < pos) { - pofs = bh_pos & ~PAGE_MASK; - memset(kaddr + pofs, 0, pos - bh_pos); - } - if (bh_end > end) { - pofs = end & ~PAGE_MASK; - memset(kaddr + pofs, 0, bh_end - end); - } - kunmap_local(kaddr); - flush_dcache_folio(folio); - } - continue; - } - /* - * Slow path: this is the first buffer in the cluster. If it - * is outside allocated size and is not uptodate, zero it and - * set it uptodate. - */ + struct inode *vi = file->f_mapping->host; + + if (whence == SEEK_DATA || whence == SEEK_HOLE) { + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + struct runlist_element *rl; + s64 vcn; + unsigned int vcn_off; + loff_t end_off; + unsigned long flags; + int i; + + inode_lock_shared(vi); + + if (NInoCompressed(ni) || NInoEncrypted(ni)) + goto error; + read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->allocated_size; + end_off = ni->data_size; read_unlock_irqrestore(&ni->size_lock, flags); - if (bh_pos > initialized_size) { - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - continue; - } - is_retry = false; - if (!rl) { - down_read(&ni->runlist.lock); -retry_remap: - rl = ni->runlist.rl; + + if (offset < 0 || offset >= end_off) + goto error; + + if (!NInoNonResident(ni)) { + if (whence == SEEK_HOLE) + offset = end_off; + goto found_no_runlist_lock; } - if (likely(rl != NULL)) { - /* Seek to element containing target cluster. */ - while (rl->length && rl[1].vcn <= bh_cpos) - rl++; - lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); - if (likely(lcn >= 0)) { - /* - * Successful remap, setup the map cache and - * use that to deal with the buffer. - */ - was_hole = false; - vcn = bh_cpos; - vcn_len = rl[1].vcn - vcn; - lcn_block = lcn << (vol->cluster_size_bits - - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters touched - * by the write is smaller or equal to the - * number of cached clusters, unlock the - * runlist as the map cache will be used from - * now on. - */ - if (likely(vcn + vcn_len >= cend)) { - if (rl_write_locked) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else - up_read(&ni->runlist.lock); - rl = NULL; - } - goto map_buffer_cached; - } - } else - lcn = LCN_RL_NOT_MAPPED; - /* - * If it is not a hole and not out of bounds, the runlist is - * probably unmapped so try to map it now. - */ - if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { - if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { - /* Attempt to map runlist. */ - if (!rl_write_locked) { - /* - * We need the runlist locked for - * writing, so if it is locked for - * reading relock it now and retry in - * case it changed whilst we dropped - * the lock. 
- */ + + vcn = NTFS_B_TO_CLU(vol, offset); + vcn_off = NTFS_B_TO_CLU_OFS(vol, offset); + + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + i = 0; + +#ifdef DEBUG + ntfs_debug("init:"); + ntfs_debug_dump_runlist(rl); +#endif + while (1) { + if (!rl || !NInoFullyMapped(ni) || rl[i].lcn == LCN_RL_NOT_MAPPED) { + int ret; + + up_read(&ni->runlist.lock); + ret = ntfs_map_runlist(ni, rl ? rl[i].vcn : 0); + if (ret) + goto error; + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; +#ifdef DEBUG + ntfs_debug("mapped:"); + ntfs_debug_dump_runlist(ni->runlist.rl); +#endif + continue; + } else if (rl[i].lcn == LCN_ENOENT) { + if (whence == SEEK_DATA) { up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - err = ntfs_map_runlist_nolock(ni, bh_cpos, - NULL); - if (likely(!err)) { - is_retry = true; - goto retry_remap; - } - /* - * If @vcn is out of bounds, pretend @lcn is - * LCN_ENOENT. As long as the buffer is out - * of bounds this will work fine. - */ - if (err == -ENOENT) { - lcn = LCN_ENOENT; - err = 0; - goto rl_not_mapped_enoent; - } - } else - err = -EIO; - /* Failed to map the buffer, even after retrying. */ - bh->b_blocknr = -1; - ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " - "attribute type 0x%x, vcn 0x%llx, " - "vcn offset 0x%x, because its " - "location on disk could not be " - "determined%s (error code %i).", - ni->mft_no, ni->type, - (unsigned long long)bh_cpos, - (unsigned)bh_pos & - vol->cluster_size_mask, - is_retry ? " even after retrying" : "", - err); - break; - } -rl_not_mapped_enoent: - /* - * The buffer is in a hole or out of bounds. We need to fill - * the hole, unless the buffer is in a cluster which is not - * touched by the write, in which case we just leave the buffer - * unmapped. This can only happen when the cluster size is - * less than the page cache size. - */ - if (unlikely(vol->cluster_size < PAGE_SIZE)) { - bh_cend = (bh_end + vol->cluster_size - 1) >> - vol->cluster_size_bits; - if ((bh_cend <= cpos || bh_cpos >= cend)) { - bh->b_blocknr = -1; - /* - * If the buffer is uptodate we skip it. If it - * is not but the folio is uptodate, we can set - * the buffer uptodate. If the folio is not - * uptodate, we can clear the buffer and set it - * uptodate. Whether this is worthwhile is - * debatable and this could be removed. - */ - if (folio_test_uptodate(folio)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - } else if (!buffer_uptodate(bh)) { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); + goto error; + } else { + offset = end_off; + goto found; } - continue; - } - } - /* - * Out of bounds buffer is invalid if it was not really out of - * bounds. - */ - BUG_ON(lcn != LCN_HOLE); - /* - * We need the runlist locked for writing, so if it is locked - * for reading relock it now and retry in case it changed - * whilst we dropped the lock. - */ - BUG_ON(!rl); - if (!rl_write_locked) { - up_read(&ni->runlist.lock); - down_write(&ni->runlist.lock); - rl_write_locked = true; - goto retry_remap; - } - /* Find the previous last allocated cluster. 
*/ - BUG_ON(rl->lcn != LCN_HOLE); - lcn = -1; - rl2 = rl; - while (--rl2 >= ni->runlist.rl) { - if (rl2->lcn >= 0) { - lcn = rl2->lcn + rl2->length; - break; - } - } - rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, - false); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - ntfs_debug("Failed to allocate cluster, error code %i.", - err); - break; - } - lcn = rl2->lcn; - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - break; - } - ni->runlist.rl = rl; - status.runlist_merged = 1; - ntfs_debug("Allocated cluster, lcn 0x%llx.", - (unsigned long long)lcn); - /* Map and lock the mft record and get the attribute record. */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - break; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - unmap_mft_record(base_ni); - break; - } - status.mft_attr_mapped = 1; - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - break; - } - m = ctx->mrec; - a = ctx->attr; - /* - * Find the runlist element with which the attribute extent - * starts. Note, we cannot use the _attr_ version because we - * have mapped the mft record. That is ok because we know the - * runlist fragment must be mapped already to have ever gotten - * here, so we can just use the _rl_ version. - */ - vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); - /* - * If @highest_vcn is zero, calculate the real highest_vcn - * (which can really be zero). - */ - if (!highest_vcn) - highest_vcn = (sle64_to_cpu( - a->data.non_resident.allocated_size) >> - vol->cluster_size_bits) - 1; - /* - * Determine the size of the mapping pairs array for the new - * extent, i.e. the old extent with the hole filled. - */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, - highest_vcn); - if (unlikely(mp_size <= 0)) { - if (!(err = mp_size)) - err = -EIO; - ntfs_debug("Failed to get size for mapping pairs " - "array, error code %i.", err); - break; - } - /* - * Resize the attribute record to fit the new mapping pairs - * array. - */ - attr_rec_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by using the current attribute - // and fill it with as much of the mapping pairs - // array as possible. Then loop over each attribute - // extent rewriting the mapping pairs arrays as we go - // along and if when we reach the end we have not - // enough space, try to resize the last attribute - // extent and if even that fails, add a new attribute - // extent. - // We could also try to resize at each step in the hope - // that we will not need to rewrite every single extent. - // Note, we may need to decompress some extents to fill - // the runlist as we are walking the extents... 
- ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - break ; - } - status.mp_rebuilt = 1; - /* - * Generate the mapping pairs array directly into the attribute - * record. - */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, vcn, highest_vcn, NULL); - if (unlikely(err)) { - ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " - "attribute type 0x%x, because building " - "the mapping pairs failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - break; - } - /* Update the highest_vcn but only if it was not set. */ - if (unlikely(!a->data.non_resident.highest_vcn)) - a->data.non_resident.highest_vcn = - cpu_to_sle64(highest_vcn); - /* - * If the attribute is sparse/compressed, update the compressed - * size in the ntfs_inode structure and the attribute record. - */ - if (likely(NInoSparse(ni) || NInoCompressed(ni))) { - /* - * If we are not in the first attribute extent, switch - * to it, but first ensure the changes will make it to - * disk later. - */ - if (a->data.non_resident.lowest_vcn) { - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, - ni->name_len, CASE_SENSITIVE, - 0, NULL, 0, ctx); - if (unlikely(err)) { - status.attr_switched = 1; - break; + } else if (rl[i + 1].vcn > vcn) { + if ((whence == SEEK_DATA && (rl[i].lcn >= 0 || + rl[i].lcn == LCN_DELALLOC)) || + (whence == SEEK_HOLE && rl[i].lcn == LCN_HOLE)) { + offset = NTFS_CLU_TO_B(vol, vcn) + vcn_off; + if (offset < ni->data_size) + goto found; } - /* @m is not used any more so do not set it. */ - a = ctx->attr; + vcn = rl[i + 1].vcn; + vcn_off = 0; } - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - write_unlock_irqrestore(&ni->size_lock, flags); + i++; } - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - /* Successfully filled the hole. */ - status.runlist_merged = 0; - status.mft_attr_mapped = 0; - status.mp_rebuilt = 0; - /* Setup the map cache and use that to deal with the buffer. */ - was_hole = true; - vcn = bh_cpos; - vcn_len = 1; - lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); - cdelta = 0; - /* - * If the number of remaining clusters in the @pages is smaller - * or equal to the number of cached clusters, unlock the - * runlist as the map cache will be used from now on. - */ - if (likely(vcn + vcn_len >= cend)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - rl = NULL; - } - goto map_buffer_cached; - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* If there are no errors, do the next page. */ - if (likely(!err && ++u < nr_pages)) - goto do_next_folio; - /* If there are no errors, release the runlist lock if we took it. */ - if (likely(!err)) { - if (unlikely(rl_write_locked)) { - up_write(&ni->runlist.lock); - rl_write_locked = false; - } else if (unlikely(rl)) - up_read(&ni->runlist.lock); - rl = NULL; - } - /* If we issued read requests, let them complete. 
*/ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - while (wait_bh > wait) { - bh = *--wait_bh; - wait_on_buffer(bh); - if (likely(buffer_uptodate(bh))) { - folio = bh->b_folio; - bh_pos = folio_pos(folio) + bh_offset(bh); - /* - * If the buffer overflows the initialized size, need - * to zero the overflowing region. - */ - if (unlikely(bh_pos + blocksize > initialized_size)) { - int ofs = 0; - - if (likely(bh_pos < initialized_size)) - ofs = initialized_size - bh_pos; - folio_zero_segment(folio, bh_offset(bh) + ofs, - blocksize); - } - } else /* if (unlikely(!buffer_uptodate(bh))) */ - err = -EIO; - } - if (likely(!err)) { - /* Clear buffer_new on all buffers. */ - u = 0; - do { - bh = head = page_buffers(pages[u]); - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u < nr_pages); - ntfs_debug("Done."); - return err; + up_read(&ni->runlist.lock); + inode_unlock_shared(vi); + return -EIO; +found: + up_read(&ni->runlist.lock); +found_no_runlist_lock: + inode_unlock_shared(vi); + return vfs_setpos(file, offset, vi->i_sb->s_maxbytes); +error: + inode_unlock_shared(vi); + return -ENXIO; + } else { + return generic_file_llseek_size(file, offset, whence, + vi->i_sb->s_maxbytes, + i_size_read(vi)); } - if (status.attr_switched) { - /* Get back to the attribute extent we modified. */ - ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { - ntfs_error(vol->sb, "Failed to find required " - "attribute extent of attribute in " - "error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->itype.compressed.size += vol->cluster_size; - write_unlock_irqrestore(&ni->size_lock, flags); - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - /* - * The only thing that is now wrong is the compressed - * size of the base attribute extent which chkdsk - * should be able to fix. - */ - NVolSetErrors(vol); - } else { - m = ctx->mrec; - a = ctx->attr; - status.attr_switched = 0; +} + +static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *vi = file_inode(iocb->ki_filp); + struct super_block *sb = vi->i_sb; + ssize_t ret; + + if (NVolShutdown(NTFS_SB(sb))) + return -EIO; + + if (NInoCompressed(NTFS_I(vi)) && iocb->ki_flags & IOCB_DIRECT) + return -EOPNOTSUPP; + + inode_lock_shared(vi); + + if (iocb->ki_flags & IOCB_DIRECT) { + size_t count = iov_iter_count(to); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; } + + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &ntfs_read_iomap_ops, NULL, IOMAP_DIO_PARTIAL, + NULL, 0); + } else { + ret = generic_file_read_iter(iocb, to); } - /* - * If the runlist has been modified, need to restore it by punching a - * hole into it and we then need to deallocate the on-disk cluster as - * well. Note, we only modify the runlist if we are able to generate a - * new mapping pairs array, i.e. only when the mapped attribute extent - * is not switched. - */ - if (status.runlist_merged && !status.attr_switched) { - BUG_ON(!rl_write_locked); - /* Make the file cluster we allocated sparse in the runlist. */ - if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { - ntfs_error(vol->sb, "Failed to punch hole into " - "attribute runlist in error code " - "path. 
Run chkdsk to recover the " - "lost cluster."); - NVolSetErrors(vol); - } else /* if (success) */ { - status.runlist_merged = 0; - /* - * Deallocate the on-disk cluster we allocated but only - * if we succeeded in punching its vcn out of the - * runlist. - */ - down_write(&vol->lcnbmp_lock); - if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { - ntfs_error(vol->sb, "Failed to release " - "allocated cluster in error " - "code path. Run chkdsk to " - "recover the lost cluster."); - NVolSetErrors(vol); - } - up_write(&vol->lcnbmp_lock); + +inode_unlock: + inode_unlock_shared(vi); + + return ret; +} + +static int ntfs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (error) + return error; + + if (size) { + if (i_size_read(inode) < iocb->ki_pos + size) { + i_size_write(inode, iocb->ki_pos + size); + mark_inode_dirty(inode); } } - /* - * Resize the attribute record to its old size and rebuild the mapping - * pairs array. Note, we only can do this if the runlist has been - * restored to its old state which also implies that the mapped - * attribute extent is not switched. - */ - if (status.mp_rebuilt && !status.runlist_merged) { - if (ntfs_attr_record_resize(m, a, attr_rec_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), attr_rec_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), ni->runlist.rl, - vcn, highest_vcn, NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); + + return 0; +} + +static const struct iomap_dio_ops ntfs_write_dio_ops = { + .end_io = ntfs_file_write_dio_end_io, +}; + +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *vi = file->f_mapping->host; + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + ssize_t ret; + ssize_t count; + loff_t pos; + int err; + loff_t old_data_size, old_init_size; + + if (NVolShutdown(vol)) + return -EIO; + + if (NInoEncrypted(ni)) { + ntfs_error(vi->i_sb, "Writing for %s files is not supported yet", + NInoCompressed(ni) ? 
"Compressed" : "Encrypted"); + return -EOPNOTSUPP; + } + + if (NInoCompressed(ni) && iocb->ki_flags & IOCB_DIRECT) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(vi)) + return -EAGAIN; + } else + inode_lock(vi); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_lock; + + if (NInoNonResident(ni) && (iocb->ki_flags & IOCB_DIRECT) && + ((iocb->ki_pos | ret) & (vi->i_sb->s_blocksize - 1))) { + ret = -EINVAL; + goto out_lock; + } + + err = file_modified(iocb->ki_filp); + if (err) { + ret = err; + goto out_lock; + } + + if (!(vol->vol_flags & VOLUME_IS_DIRTY)) + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY); + + pos = iocb->ki_pos; + count = ret; + + old_data_size = ni->data_size; + old_init_size = ni->initialized_size; + if (iocb->ki_pos + ret > old_data_size) { + mutex_lock(&ni->mrec_lock); + if (!NInoCompressed(ni) && iocb->ki_pos + ret > ni->allocated_size && + iocb->ki_pos + ret < ni->allocated_size + vol->preallocated_size) + ret = ntfs_attr_expand(ni, iocb->ki_pos + ret, + ni->allocated_size + vol->preallocated_size); + else if (NInoCompressed(ni) && iocb->ki_pos + ret > ni->allocated_size) + ret = ntfs_attr_expand(ni, iocb->ki_pos + ret, + round_up(iocb->ki_pos + ret, ni->itype.compressed.block_size)); + else + ret = ntfs_attr_expand(ni, iocb->ki_pos + ret, 0); + mutex_unlock(&ni->mrec_lock); + if (ret < 0) + goto out; + } + + if (NInoNonResident(ni) && iocb->ki_pos + count > old_init_size) { + ret = ntfs_extend_initialized_size(vi, iocb->ki_pos, + iocb->ki_pos + count); + if (ret < 0) + goto out; + } + + if (NInoNonResident(ni) && NInoCompressed(ni)) { + ret = ntfs_compress_write(ni, pos, count, from); + if (ret > 0) + iocb->ki_pos += ret; + goto out; + } + + if (NInoNonResident(ni) && iocb->ki_flags & IOCB_DIRECT) { + ret = iomap_dio_rw(iocb, from, &ntfs_dio_iomap_ops, + &ntfs_write_dio_ops, 0, NULL, 0); + if (ret == -ENOTBLK) + ret = 0; + else if (ret < 0) + goto out; + + if (iov_iter_count(from)) { + loff_t offset, end; + ssize_t written; + int ret2; + + offset = iocb->ki_pos; + iocb->ki_flags &= ~IOCB_DIRECT; + written = iomap_file_buffered_write(iocb, from, + &ntfs_write_iomap_ops, &ntfs_iomap_folio_ops, + NULL); + if (written < 0) { + err = written; + goto out; } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); + + ret += written; + end = iocb->ki_pos + written - 1; + ret2 = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, end); + if (ret2) + goto out_err; + if (!ret2) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + end >> PAGE_SHIFT); } + } else { + ret = iomap_file_buffered_write(iocb, from, &ntfs_write_iomap_ops, + &ntfs_iomap_folio_ops, NULL); } - /* Release the mft record and the attribute. */ - if (status.mft_attr_mapped) { - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); +out: + if (ret < 0 && ret != -EIOCBQUEUED) { +out_err: + if (ni->initialized_size != old_init_size) { + mutex_lock(&ni->mrec_lock); + ntfs_attr_set_initialized_size(ni, old_init_size); + mutex_unlock(&ni->mrec_lock); + } + if (ni->data_size != old_data_size) { + truncate_setsize(vi, old_data_size); + ntfs_attr_truncate(ni, old_data_size); + } } - /* Release the runlist lock. */ - if (rl_write_locked) - up_write(&ni->runlist.lock); - else if (rl) - up_read(&ni->runlist.lock); - /* - * Zero out any newly allocated blocks to avoid exposing stale data. 
- * If BH_New is set, we know that the block was newly allocated above - * and that it has not been fully zeroed and marked dirty yet. - */ - nr_pages = u; - u = 0; - end = bh_cpos << vol->cluster_size_bits; - do { - folio = page_folio(pages[u]); - bh = head = folio_buffers(folio); - do { - if (u == nr_pages && - folio_pos(folio) + bh_offset(bh) >= end) - break; - if (!buffer_new(bh)) - continue; - clear_buffer_new(bh); - if (!buffer_uptodate(bh)) { - if (folio_test_uptodate(folio)) - set_buffer_uptodate(bh); - else { - folio_zero_range(folio, bh_offset(bh), - blocksize); - set_buffer_uptodate(bh); - } - } - mark_buffer_dirty(bh); - } while ((bh = bh->b_this_page) != head); - } while (++u <= nr_pages); - ntfs_error(vol->sb, "Failed. Returning error code %i.", err); - return err; +out_lock: + inode_unlock(vi); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; } -static inline void ntfs_flush_dcache_pages(struct page **pages, - unsigned nr_pages) +static vm_fault_t ntfs_filemap_page_mkwrite(struct vm_fault *vmf) { - BUG_ON(!nr_pages); - /* - * Warning: Do not do the decrement at the same time as the call to - * flush_dcache_page() because it is a NULL macro on i386 and hence the - * decrement never happens so the loop never terminates. - */ - do { - --nr_pages; - flush_dcache_page(pages[nr_pages]); - } while (nr_pages > 0); + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + ret = iomap_page_mkwrite(vmf, &ntfs_page_mkwrite_iomap_ops, NULL); + sb_end_pagefault(inode->i_sb); + return ret; } -/** - * ntfs_commit_pages_after_non_resident_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * See description of ntfs_commit_pages_after_write(), below. - */ -static inline int ntfs_commit_pages_after_non_resident_write( - struct page **pages, const unsigned nr_pages, - s64 pos, size_t bytes) +static const struct vm_operations_struct ntfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ntfs_filemap_page_mkwrite, +}; + +static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) { - s64 end, initialized_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct buffer_head *bh, *head; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - unsigned long flags; - unsigned blocksize, u; - int err; + struct file *file = desc->file; + struct inode *inode = file_inode(file); - vi = pages[0]->mapping->host; - ni = NTFS_I(vi); - blocksize = vi->i_sb->s_blocksize; - end = pos + bytes; - u = 0; - do { - s64 bh_pos; - struct page *page; - bool partial; - - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); - partial = false; - do { - s64 bh_end; - - bh_end = bh_pos + blocksize; - if (bh_end <= pos || bh_pos >= end) { - if (!buffer_uptodate(bh)) - partial = true; - } else { - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - } - } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); - /* - * If all buffers are now uptodate but the page is not, set the - * page uptodate. - */ - if (!partial && !PageUptodate(page)) - SetPageUptodate(page); - } while (++u < nr_pages); - /* - * Finally, if we do not need to update initialized_size or i_size we - * are finished. 
- */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end <= initialized_size) { - ntfs_debug("Done."); - return 0; - } - /* - * Update initialized_size/i_size as appropriate, both in the inode and - * the mft record. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - BUG_ON(!NInoNonResident(ni)); - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(!a->non_resident); - write_lock_irqsave(&ni->size_lock, flags); - BUG_ON(end > ni->allocated_size); - ni->initialized_size = end; - a->data.non_resident.initialized_size = cpu_to_sle64(end); - if (end > i_size_read(vi)) { - i_size_write(vi, end); - a->data.non_resident.data_size = - a->data.non_resident.initialized_size; + if (NVolShutdown(NTFS_SB(file->f_mapping->host->i_sb))) + return -EIO; + + if (NInoCompressed(NTFS_I(inode))) + return -EOPNOTSUPP; + + if (desc->vm_flags & VM_WRITE) { + struct inode *inode = file_inode(file); + loff_t from, to; + int err; + + from = ((loff_t)desc->pgoff << PAGE_SHIFT); + to = min_t(loff_t, i_size_read(inode), + from + desc->end - desc->start); + + if (NTFS_I(inode)->initialized_size < to) { + err = ntfs_extend_initialized_size(inode, to, to); + if (err) + return err; + } } - write_unlock_irqrestore(&ni->size_lock, flags); - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); + + + file_accessed(file); + desc->vm_ops = &ntfs_file_vm_ops; return 0; -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " - "code %i).", err); - if (err != -ENOMEM) - NVolSetErrors(ni->vol); - return err; } -/** - * ntfs_commit_pages_after_write - commit the received data - * @pages: array of destination pages - * @nr_pages: number of pages in @pages - * @pos: byte position in file at which the write begins - * @bytes: number of bytes to be written - * - * This is called from ntfs_file_buffered_write() with i_mutex held on the inode - * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are - * locked but not kmap()ped. The source data has already been copied into the - * @page. ntfs_prepare_pages_for_non_resident_write() has been called before - * the data was copied (for non-resident attributes only) and it returned - * success. - * - * Need to set uptodate and mark dirty all buffers within the boundary of the - * write. If all buffers in a page are uptodate we set the page uptodate, too. - * - * Setting the buffers dirty ensures that they get written out later when - * ntfs_writepage() is invoked by the VM. - * - * Finally, we need to update i_size and initialized_size as appropriate both - * in the inode and the mft record. 
- * - * This is modelled after fs/buffer.c::generic_commit_write(), which marks - * buffers uptodate and dirty, sets the page uptodate if all buffers in the - * page are uptodate, and updates i_size if the end of io is beyond i_size. In - * that case, it also marks the inode dirty. - * - * If things have gone as outlined in - * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page - * content modifications here for non-resident attributes. For resident - * attributes we need to do the uptodate bringing here which we combine with - * the copying into the mft record which means we save one atomic kmap. - * - * Return 0 on success or -errno on error. - */ -static int ntfs_commit_pages_after_write(struct page **pages, - const unsigned nr_pages, s64 pos, size_t bytes) +static int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return iomap_fiemap(inode, fieinfo, start, len, &ntfs_read_iomap_ops); +} + +static const char *ntfs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + if (!NTFS_I(inode)->target) + return ERR_PTR(-EINVAL); + + return NTFS_I(inode)->target; +} + +static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) +{ + if (NVolShutdown(NTFS_SB(in->f_mapping->host->i_sb))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static int ntfs_ioctl_shutdown(struct super_block *sb, unsigned long arg) { - s64 end, initialized_size; - loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; - struct page *page; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; - ATTR_RECORD *a; - char *kattr, *kaddr; - unsigned long flags; - u32 attr_len; + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + return ntfs_force_shutdown(sb, flags); +} + +static int ntfs_ioctl_get_volume_label(struct file *filp, unsigned long arg) +{ + struct ntfs_volume *vol = NTFS_SB(file_inode(filp)->i_sb); + char __user *buf = (char __user *)arg; + + if (!vol->volume_label) { + if (copy_to_user(buf, "", 1)) + return -EFAULT; + } else if (copy_to_user(buf, vol->volume_label, + MIN(FSLABEL_MAX, strlen(vol->volume_label) + 1))) + return -EFAULT; + return 0; +} + +static int ntfs_ioctl_set_volume_label(struct file *filp, unsigned long arg) +{ + struct ntfs_volume *vol = NTFS_SB(file_inode(filp)->i_sb); + char *label; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + label = strndup_user((const char __user *)arg, FSLABEL_MAX); + if (IS_ERR(label)) + return PTR_ERR(label); + + ret = mnt_want_write_file(filp); + if (ret) + goto out; + + ret = ntfs_write_volume_label(vol, label); + mnt_drop_write_file(filp); +out: + kfree(label); + return ret; +} + +static int ntfs_ioctl_fitrim(struct ntfs_volume *vol, unsigned long arg) +{ + struct fstrim_range __user *user_range; + struct fstrim_range range; + struct block_device *dev; int err; - BUG_ON(!nr_pages); - BUG_ON(!pages); - page = pages[0]; - BUG_ON(!page); - vi = page->mapping->host; - ni = NTFS_I(vi); - ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " - "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", - vi->i_ino, ni->type, page->index, nr_pages, - (long long)pos, bytes); - if (NInoNonResident(ni)) - return ntfs_commit_pages_after_non_resident_write(pages, - nr_pages, pos, bytes); - BUG_ON(nr_pages > 1); - /* - * Attribute is resident, implying it is not 
compressed, encrypted, or - * sparse. - */ - if (!NInoAttr(ni)) - base_ni = ni; - else - base_ni = ni->ext.base_ntfs_ino; - BUG_ON(NInoNonResident(ni)); - /* Map, pin, and lock the mft record. */ - m = map_mft_record(base_ni); - if (IS_ERR(m)) { - err = PTR_ERR(m); - m = NULL; - ctx = NULL; - goto err_out; - } - ctx = ntfs_attr_get_search_ctx(base_ni, m); - if (unlikely(!ctx)) { - err = -ENOMEM; - goto err_out; - } - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) { - if (err == -ENOENT) - err = -EIO; - goto err_out; - } - a = ctx->attr; - BUG_ON(a->non_resident); - /* The total length of the attribute value. */ - attr_len = le32_to_cpu(a->data.resident.value_length); - i_size = i_size_read(vi); - BUG_ON(attr_len != i_size); - BUG_ON(pos > attr_len); - end = pos + bytes; - BUG_ON(end > le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset)); - kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); - kaddr = kmap_atomic(page); - /* Copy the received data from the page to the mft record. */ - memcpy(kattr + pos, kaddr + pos, bytes); - /* Update the attribute length if necessary. */ - if (end > attr_len) { - attr_len = end; - a->data.resident.value_length = cpu_to_le32(attr_len); - } - /* - * If the page is not uptodate, bring the out of bounds area(s) - * uptodate by copying data from the mft record to the page. - */ - if (!PageUptodate(page)) { - if (pos > 0) - memcpy(kaddr, kattr, pos); - if (end < attr_len) - memcpy(kaddr + end, kattr + end, attr_len - end); - /* Zero the region outside the end of the attribute value. */ - memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); - flush_dcache_page(page); - SetPageUptodate(page); - } - kunmap_atomic(kaddr); - /* Update initialized_size/i_size if necessary. */ - read_lock_irqsave(&ni->size_lock, flags); - initialized_size = ni->initialized_size; - BUG_ON(end > ni->allocated_size); - read_unlock_irqrestore(&ni->size_lock, flags); - BUG_ON(initialized_size != i_size); - if (end > initialized_size) { - write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = end; - i_size_write(vi, end); - write_unlock_irqrestore(&ni->size_lock, flags); - } - /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - ntfs_debug("Done."); + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dev = vol->sb->s_bdev; + if (!bdev_max_discard_sectors(dev)) + return -EOPNOTSUPP; + + user_range = (struct fstrim_range __user *)arg; + if (copy_from_user(&range, user_range, sizeof(range))) + return -EFAULT; + + if (range.len == 0) + return -EINVAL; + + if (range.len < vol->cluster_size) + return -EINVAL; + + range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev)); + + err = ntfsp_trim_fs(vol, &range); + if (err < 0) + return err; + + if (copy_to_user(user_range, &range, sizeof(range))) + return -EFAULT; + return 0; -err_out: - if (err == -ENOMEM) { - ntfs_warning(vi->i_sb, "Error allocating memory required to " - "commit the write."); - if (PageUptodate(page)) { - ntfs_warning(vi->i_sb, "Page is uptodate, setting " - "dirty so the write will be retried " - "later on by the VM."); - /* - * Put the page on mapping->dirty_pages, but leave its - * buffers' dirty state as-is. - */ - __set_page_dirty_nobuffers(page); - err = 0; - } else - ntfs_error(vi->i_sb, "Page is not uptodate. 
Written " - "data has been lost."); - } else { - ntfs_error(vi->i_sb, "Resident attribute commit write failed " - "with error %i.", err); - NVolSetErrors(ni->vol); +} + +long ntfsp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NTFS_IOC_SHUTDOWN: + return ntfs_ioctl_shutdown(file_inode(filp)->i_sb, arg); + case FS_IOC_GETFSLABEL: + return ntfs_ioctl_get_volume_label(filp, arg); + case FS_IOC_SETFSLABEL: + return ntfs_ioctl_set_volume_label(filp, arg); + case FITRIM: + return ntfs_ioctl_fitrim(NTFS_SB(file_inode(filp)->i_sb), arg); + default: + return -ENOTTY; } - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - return err; } -/* - * Copy as much as we can into the pages and return the number of bytes which - * were successfully copied. If a fault is encountered then clear the pages - * out to (ofs + bytes) and return the number of bytes which were copied. - */ -static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, - unsigned ofs, struct iov_iter *i, size_t bytes) +#ifdef CONFIG_COMPAT +long ntfsp_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { - struct page **last_page = pages + nr_pages; - size_t total = 0; - unsigned len, copied; - - do { - len = PAGE_SIZE - ofs; - if (len > bytes) - len = bytes; - copied = copy_page_from_iter_atomic(*pages, ofs, len, i); - total += copied; - bytes -= copied; - if (!bytes) - break; - if (copied < len) - goto err; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err: - /* Zero the rest of the target like __copy_from_user(). */ - len = PAGE_SIZE - copied; - do { - if (len > bytes) - len = bytes; - zero_user(*pages, copied, len); - bytes -= len; - copied = 0; - len = PAGE_SIZE; - } while (++pages < last_page); - goto out; + return ntfsp_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); } +#endif -/** - * ntfs_perform_write - perform buffered write to a file - * @file: file to write to - * @i: iov_iter with data to write - * @pos: byte offset in file at which to begin writing to - */ -static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, - loff_t pos) +static long ntfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct address_space *mapping = file->f_mapping; - struct inode *vi = mapping->host; - ntfs_inode *ni = NTFS_I(vi); - ntfs_volume *vol = ni->vol; - struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; - struct page *cached_page = NULL; - VCN last_vcn; - LCN lcn; - size_t bytes; - ssize_t status, written = 0; - unsigned nr_pages; - - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " - "0x%llx, count 0x%lx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)pos, - (unsigned long)iov_iter_count(i)); - /* - * If a previous ntfs_truncate() failed, repeat it and abort if it - * fails again. 
- */ - if (unlikely(NInoTruncateFailed(ni))) { - int err; + struct inode *vi = file_inode(file); + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; + int err = 0; + loff_t end_offset = offset + len; + loff_t old_size, new_size; + s64 start_vcn, end_vcn; + bool map_locked = false; + + if (!S_ISREG(vi->i_mode)) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)) + return -EOPNOTSUPP; + + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + + if ((ni->vol->mft_zone_end - ni->vol->mft_zone_start) == 0) + return -ENOSPC; - inode_dio_wait(vi); - err = ntfs_truncate(vi); - if (err || NInoTruncateFailed(ni)) { - if (!err) - err = -EIO; - ntfs_error(vol->sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "ntfs_truncate() failed (error code " - "%i).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); + if (NInoNonResident(ni) && !NInoFullyMapped(ni)) { + down_write(&ni->runlist.lock); + err = ntfs_attr_map_whole_runlist(ni); + up_write(&ni->runlist.lock); + if (err) return err; - } } - /* - * Determine the number of pages per cluster for non-resident - * attributes. - */ - nr_pages = 1; - if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) - nr_pages = vol->cluster_size >> PAGE_SHIFT; - last_vcn = -1; - do { - VCN vcn; - pgoff_t start_idx; - unsigned ofs, do_pages, u; - size_t copied; - - start_idx = pos >> PAGE_SHIFT; - ofs = pos & ~PAGE_MASK; - bytes = PAGE_SIZE - ofs; - do_pages = 1; - if (nr_pages > 1) { - vcn = pos >> vol->cluster_size_bits; - if (vcn != last_vcn) { - last_vcn = vcn; - /* - * Get the lcn of the vcn the write is in. If - * it is a hole, need to lock down all pages in - * the cluster. - */ - down_read(&ni->runlist.lock); - lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> - vol->cluster_size_bits, false); - up_read(&ni->runlist.lock); - if (unlikely(lcn < LCN_HOLE)) { - if (lcn == LCN_ENOMEM) - status = -ENOMEM; - else { - status = -EIO; - ntfs_error(vol->sb, "Cannot " - "perform write to " - "inode 0x%lx, " - "attribute type 0x%x, " - "because the attribute " - "is corrupt.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } - break; - } - if (lcn == LCN_HOLE) { - start_idx = (pos & ~(s64) - vol->cluster_size_mask) - >> PAGE_SHIFT; - bytes = vol->cluster_size - (pos & - vol->cluster_size_mask); - do_pages = nr_pages; - } - } + + if (!(vol->vol_flags & VOLUME_IS_DIRTY)) { + err = ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY); + if (err) + return err; + } + + old_size = i_size_read(vi); + new_size = max_t(loff_t, old_size, end_offset); + start_vcn = NTFS_B_TO_CLU(vol, offset); + end_vcn = (NTFS_B_TO_CLU(vol, end_offset - 1)) + 1; + + inode_lock(vi); + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + err = -EOPNOTSUPP; + goto out; + } + + inode_dio_wait(vi); + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE)) { + filemap_invalidate_lock(vi->i_mapping); + map_locked = true; + } + + if (mode & FALLOC_FL_INSERT_RANGE) { + loff_t offset_down = round_down(offset, + max_t(unsigned long, vol->cluster_size, PAGE_SIZE)); + loff_t alloc_size; + + if (NVolDisableSparse(vol)) { + err = -EOPNOTSUPP; + goto out; } - if (bytes > iov_iter_count(i)) - bytes = iov_iter_count(i); -again: - /* - * Bring in the user page(s) that we will copy from _first_. 
- * Otherwise there is a nasty deadlock on copying from the same - * page(s) as we are writing to, without it/them being marked - * up-to-date. Note, at present there is nothing to stop the - * pages being swapped out between us bringing them into memory - * and doing the actual copying. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { - status = -EFAULT; - break; + + if ((offset & vol->cluster_size_mask) || + (len & vol->cluster_size_mask) || + offset >= ni->allocated_size) { + err = -EINVAL; + goto out; } - /* Get and lock @do_pages starting at index @start_idx. */ - status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page); - if (unlikely(status)) - break; - /* - * For non-resident attributes, we need to fill any holes with - * actual clusters and ensure all bufferes are mapped. We also - * need to bring uptodate any buffers that are only partially - * being written to. - */ - if (NInoNonResident(ni)) { - status = ntfs_prepare_pages_for_non_resident_write( - pages, do_pages, pos, bytes); - if (unlikely(status)) { - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - break; - } + + new_size = old_size + + (NTFS_CLU_TO_B(vol, end_vcn - start_vcn)); + alloc_size = ni->allocated_size + + (NTFS_CLU_TO_B(vol, end_vcn - start_vcn)); + if (alloc_size < 0) { + err = -EFBIG; + goto out; } - u = (pos >> PAGE_SHIFT) - pages[0]->index; - copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, - i, bytes); - ntfs_flush_dcache_pages(pages + u, do_pages - u); - status = 0; - if (likely(copied == bytes)) { - status = ntfs_commit_pages_after_write(pages, do_pages, - pos, bytes); + err = inode_newsize_ok(vi, alloc_size); + if (err) + goto out; + + err = filemap_write_and_wait_range(vi->i_mapping, + offset_down, LLONG_MAX); + if (err) + goto out; + + truncate_pagecache(vi, offset_down); + + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + err = ntfs_non_resident_attr_insert_range(ni, start_vcn, + end_vcn - start_vcn); + mutex_unlock(&ni->mrec_lock); + if (err) + goto out; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + loff_t offset_down = round_down(offset, + max_t(unsigned long, vol->cluster_size, PAGE_SIZE)); + + if ((offset & vol->cluster_size_mask) || + (len & vol->cluster_size_mask) || + offset >= ni->allocated_size) { + err = -EINVAL; + goto out; } - do { - unlock_page(pages[--do_pages]); - put_page(pages[do_pages]); - } while (do_pages); - if (unlikely(status < 0)) { - iov_iter_revert(i, copied); - break; + + if (NTFS_CLU_TO_B(vol, end_vcn) > ni->allocated_size) + end_vcn = (round_up(ni->allocated_size - 1, vol->cluster_size) >> + vol->cluster_size_bits) + 1; + new_size = old_size - + (NTFS_CLU_TO_B(vol, end_vcn - start_vcn)); + if (new_size < 0) + new_size = 0; + err = filemap_write_and_wait_range(vi->i_mapping, + offset_down, LLONG_MAX); + if (err) + goto out; + + truncate_pagecache(vi, offset_down); + + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + err = ntfs_non_resident_attr_collapse_range(ni, start_vcn, + end_vcn - start_vcn); + mutex_unlock(&ni->mrec_lock); + if (err) + goto out; + } else if (mode & FALLOC_FL_PUNCH_HOLE) { + loff_t offset_down = round_down(offset, max_t(unsigned int, + vol->cluster_size, PAGE_SIZE)); + + if (NVolDisableSparse(vol)) { + err = -EOPNOTSUPP; + goto out; } - cond_resched(); - if (unlikely(copied < bytes)) { - iov_iter_revert(i, copied); - if (copied) - bytes = copied; - else if (bytes > PAGE_SIZE - ofs) - bytes = PAGE_SIZE - ofs; - goto again; + 
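/*
 * Illustrative walk-through, not part of the patch: punching a hole can
 * only deallocate whole clusters, so a misaligned range is zeroed at its
 * edges first. Assuming a 4 KiB cluster, offset = 6000 and len = 10000
 * (end_offset = 16000):
 *   start_vcn = 6000 >> 12 = 1, end_vcn = ((16000 - 1) >> 12) + 1 = 4;
 *   the leading edge [6000, 8192) is zeroed and start_vcn becomes 2;
 *   the trailing edge [12288, 16000) is zeroed and end_vcn becomes 3;
 *   only cluster 2, i.e. bytes [8192, 12288), is deallocated by
 *   ntfs_non_resident_attr_punch_hole(). A misaligned range confined to
 *   a single cluster is handled entirely by the edge zeroing, which is
 *   what the (end_vcn - start_vcn) == 1 checks below are for.
 */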
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) { + err = -EINVAL; + goto out; } - pos += copied; - written += copied; - balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; + + if (offset >= ni->data_size) + goto out; + + if (offset + len > ni->data_size) { + end_offset = ni->data_size; + end_vcn = (NTFS_B_TO_CLU(vol, end_offset - 1)) + 1; } - } while (iov_iter_count(i)); - if (cached_page) - put_page(cached_page); - ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", - written ? "written" : "status", (unsigned long)written, - (long)status); - return written ? written : status; -} -/** - * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() - * @iocb: IO state structure - * @from: iov_iter with data to write - * - * Basically the same as generic_file_write_iter() except that it ends up - * up calling ntfs_perform_write() instead of generic_perform_write() and that - * O_DIRECT is not implemented. - */ -static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *vi = file_inode(file); - ssize_t written = 0; - ssize_t err; + err = filemap_write_and_wait_range(vi->i_mapping, offset_down, LLONG_MAX); + if (err) + goto out; + truncate_pagecache(vi, offset_down); - inode_lock(vi); - /* We can write back this queue in page reclaim. */ - err = ntfs_prepare_file_for_write(iocb, from); - if (iov_iter_count(from) && !err) - written = ntfs_perform_write(file, from, iocb->ki_pos); - inode_unlock(vi); - iocb->ki_pos += written; - if (likely(written > 0)) - written = generic_write_sync(iocb, written); - return written ? written : err; -} + if (offset & vol->cluster_size_mask) { + loff_t to; -/** - * ntfs_file_fsync - sync a file to disk - * @filp: file to be synced - * @datasync: if non-zero only flush user data and not metadata - * - * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync - * system calls. This function is inspired by fs/buffer.c::file_fsync(). - * - * If @datasync is false, write the mft record and all associated extent mft - * records as well as the $DATA attribute and then sync the block device. - * - * If @datasync is true and the attribute is non-resident, we skip the writing - * of the mft record and all associated extent mft records (this might still - * happen due to the write_inode_now() call). - * - * Also, if @datasync is true, we do not wait on the inode to be written out - * but we always wait on the page cache pages to be written out. - * - * Locking: Caller must hold i_mutex on the inode. - * - * TODO: We should probably also write all attribute/index inodes associated - * with this inode but since we have no simple way of getting to them we ignore - * this problem for now. 
- */ -static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *vi = filp->f_mapping->host; - int err, ret = 0; + to = min_t(loff_t, NTFS_CLU_TO_B(vol, start_vcn + 1), + end_offset); + err = iomap_zero_range(vi, offset, to - offset, NULL, + &ntfs_read_iomap_ops, + &ntfs_iomap_folio_ops, NULL); + if (err < 0 || (end_vcn - start_vcn) == 1) + goto out; + start_vcn++; + } + if (end_offset & vol->cluster_size_mask) { + loff_t from; - ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + from = NTFS_CLU_TO_B(vol, end_vcn - 1); + err = iomap_zero_range(vi, from, end_offset - from, NULL, + &ntfs_read_iomap_ops, + &ntfs_iomap_folio_ops, NULL); + if (err < 0 || (end_vcn - start_vcn) == 1) + goto out; + end_vcn--; + } - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - inode_lock(vi); + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + err = ntfs_non_resident_attr_punch_hole(ni, start_vcn, + end_vcn - start_vcn); + mutex_unlock(&ni->mrec_lock); + if (err) + goto out; + } else if (mode == 0 || mode == FALLOC_FL_KEEP_SIZE) { + s64 need_space; + + err = inode_newsize_ok(vi, new_size); + if (err) + goto out; + + need_space = NTFS_B_TO_CLU(vol, ni->allocated_size); + if (need_space > start_vcn) + need_space = end_vcn - need_space; + else + need_space = end_vcn - start_vcn; + if (need_space > 0 && + need_space > (atomic64_read(&vol->free_clusters) - + atomic64_read(&vol->dirty_clusters))) { + err = -ENOSPC; + goto out; + } + + err = ntfs_attr_fallocate(ni, offset, len, + mode & FALLOC_FL_KEEP_SIZE ? true : false); + if (err) + goto out; + } + + /* inode->i_blocks is already updated in ntfs_attr_update_mapping_pairs */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size != old_size) + i_size_write(vi, ni->data_size); + +out: + if (map_locked) + filemap_invalidate_unlock(vi->i_mapping); + if (!err) { + if (mode == 0 && NInoNonResident(ni) && + offset > old_size && old_size % PAGE_SIZE != 0) { + loff_t len = min_t(loff_t, + round_up(old_size, PAGE_SIZE) - old_size, + offset - old_size); + err = iomap_zero_range(vi, old_size, len, NULL, + &ntfs_read_iomap_ops, + &ntfs_iomap_folio_ops, NULL); + } + NInoSetFileNameDirty(ni); + inode_set_mtime_to_ts(vi, inode_set_ctime_current(vi)); + mark_inode_dirty(vi); + } - BUG_ON(S_ISDIR(vi->i_mode)); - if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = __ntfs_write_inode(vi, 1); - write_inode_now(vi, !datasync); - /* - * NOTE: If we were to use mapping->private_list (see ext2 and - * fs/buffer.c) for dirty blocks then we could optimize the below to be - * sync_mapping_buffers(vi->i_mapping). - */ - err = sync_blockdev(vi->i_sb->s_bdev); - if (unlikely(err && !ret)) - ret = err; - if (likely(!ret)) - ntfs_debug("Done."); - else - ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " - "%u.", datasync ? 
"data" : "", vi->i_ino, -ret); inode_unlock(vi); - return ret; + return err; } -#endif /* NTFS_RW */ - const struct file_operations ntfs_file_ops = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, -#ifdef NTFS_RW + .llseek = ntfs_file_llseek, + .read_iter = ntfs_file_read_iter, .write_iter = ntfs_file_write_iter, .fsync = ntfs_file_fsync, -#endif /* NTFS_RW */ - .mmap = generic_file_mmap, + .mmap_prepare = ntfs_file_mmap_prepare, .open = ntfs_file_open, - .splice_read = filemap_splice_read, + .release = ntfs_file_release, + .splice_read = ntfs_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = ntfsp_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfsp_compat_ioctl, +#endif + .fallocate = ntfs_fallocate, }; const struct inode_operations ntfs_file_inode_ops = { -#ifdef NTFS_RW - .setattr = ntfs_setattr, -#endif /* NTFS_RW */ + .setattr = ntfsp_setattr, + .getattr = ntfsp_getattr, + .listxattr = ntfsp_listxattr, + .get_acl = ntfsp_get_acl, + .set_acl = ntfsp_set_acl, + .fiemap = ntfs_fiemap, +}; + +const struct inode_operations ntfs_symlink_inode_operations = { + .get_link = ntfs_get_link, + .setattr = ntfsp_setattr, + .listxattr = ntfsp_listxattr, +}; + +const struct inode_operations ntfsp_special_inode_operations = { + .setattr = ntfsp_setattr, + .getattr = ntfsp_getattr, + .listxattr = ntfsp_listxattr, + .get_acl = ntfsp_get_acl, + .set_acl = ntfsp_set_acl, }; const struct file_operations ntfs_empty_file_ops = {}; -- 2.25.1 This updates the implementation of attrib operations Signed-off-by: Hyunchul Lee Signed-off-by: Namjae Jeon --- fs/ntfs/attrib.c | 5406 +++++++++++++++++++++++++++++++++----------- fs/ntfs/attrlist.c | 285 +++ fs/ntfs/compress.c | 1023 +++++++-- 3 files changed, 5184 insertions(+), 1530 deletions(-) create mode 100644 fs/ntfs/attrlist.c diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index f79408f9127a..08a7f88153fa 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,25 +1,35 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. +/** + * NTFS attribute operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. + * + * Part of this file is based on code from the NTFS-3G project. + * and is copyrighted by the respective authors below: + * Copyright (c) 2000-2010 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + * Copyright (c) 2002-2008 Szabolcs Szakacsits + * Copyright (c) 2004-2007 Yura Pakhuchiy + * Copyright (c) 2007-2021 Jean-Pierre Andre + * Copyright (c) 2010 Erik Larsson */ -#include -#include -#include -#include #include +#include #include "attrib.h" -#include "debug.h" -#include "layout.h" +#include "attrlist.h" #include "lcnalloc.h" -#include "malloc.h" +#include "debug.h" #include "mft.h" #include "ntfs.h" -#include "types.h" +#include "aops.h" +#include "iomap.h" +#include "malloc.h" + +__le16 AT_UNNAMED[] = { cpu_to_le16('\0') }; /** * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode @@ -43,42 +53,22 @@ * ntfs_map_runlist_nolock(), you will probably want to do: * m = ctx->mrec; * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. 
This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Note the runlist can be NULL after this function returns if @vcn is zero and - * the attribute has zero allocated size, i.e. there simply is no runlist. - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist will be modified. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. + * Assuming you cache ctx->attr in a variable @a of type attr_record * and that + * you cache ctx->mrec in a variable @m of type struct mft_record *. */ -int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +int ntfs_map_runlist_nolock(struct ntfs_inode *ni, s64 vcn, struct ntfs_attr_search_ctx *ctx) { - VCN end_vcn; + s64 end_vcn; unsigned long flags; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - runlist_element *rl; - struct page *put_this_page = NULL; + struct ntfs_inode *base_ni; + struct mft_record *m; + struct attr_record *a; + struct runlist_element *rl; + struct folio *put_this_folio = NULL; int err = 0; - bool ctx_is_temporary, ctx_needs_reset; - ntfs_attr_search_ctx old_ctx = { NULL, }; + bool ctx_is_temporary = false, ctx_needs_reset; + struct ntfs_attr_search_ctx old_ctx = { NULL, }; + size_t new_rl_count; ntfs_debug("Mapping runlist part containing vcn 0x%llx.", (unsigned long long)vcn); @@ -97,16 +87,17 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) goto err_out; } } else { - VCN allocated_size_vcn; + s64 allocated_size_vcn; - BUG_ON(IS_ERR(ctx->mrec)); + WARN_ON(IS_ERR(ctx->mrec)); a = ctx->attr; - BUG_ON(!a->non_resident); - ctx_is_temporary = false; - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (!a->non_resident) { + err = -EIO; + goto err_out; + } + end_vcn = le64_to_cpu(a->data.non_resident.highest_vcn); read_lock_irqsave(&ni->size_lock, flags); - allocated_size_vcn = ni->allocated_size >> - ni->vol->cluster_size_bits; + allocated_size_vcn = NTFS_B_TO_CLU(ni->vol, ni->allocated_size); read_unlock_irqrestore(&ni->size_lock, flags); if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) end_vcn = allocated_size_vcn - 1; @@ -119,9 +110,9 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) */ if (vcn >= allocated_size_vcn || (a->type == ni->type && a->name_length == ni->name_len && - !memcmp((u8*)a + le16_to_cpu(a->name_offset), + !memcmp((u8 *)a + le16_to_cpu(a->name_offset), ni->name, ni->name_len) && - sle64_to_cpu(a->data.non_resident.lowest_vcn) + le64_to_cpu(a->data.non_resident.lowest_vcn) <= vcn && end_vcn >= vcn)) ctx_needs_reset = false; else { @@ -137,8 +128,8 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) */ if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != old_ctx.base_ntfs_ino) { - put_this_page = old_ctx.ntfs_ino->page; - get_page(put_this_page); + put_this_folio = old_ctx.ntfs_ino->folio; + 
folio_get(put_this_folio); } /* * Reinitialize the search context so we can lookup the @@ -156,7 +147,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) err = -EIO; goto err_out; } - BUG_ON(!ctx->attr->non_resident); + WARN_ON(!ctx->attr->non_resident); } a = ctx->attr; /* @@ -165,16 +156,18 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) * we then try to map the already mapped runlist fragment and * ntfs_mapping_pairs_decompress() fails. */ - end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + end_vcn = le64_to_cpu(a->data.non_resident.highest_vcn) + 1; if (unlikely(vcn && vcn >= end_vcn)) { err = -ENOENT; goto err_out; } - rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + rl = ntfs_mapping_pairs_decompress(ni->vol, a, &ni->runlist, &new_rl_count); if (IS_ERR(rl)) err = PTR_ERR(rl); - else + else { ni->runlist.rl = rl; + ni->runlist.count = new_rl_count; + } err_out: if (ctx_is_temporary) { if (likely(ctx)) @@ -203,18 +196,16 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) ctx->base_ntfs_ino) { unmap_extent_mft_record(ctx->ntfs_ino); ctx->mrec = ctx->base_mrec; - BUG_ON(!ctx->mrec); + WARN_ON(!ctx->mrec); } /* * If the old mapped inode is not the base * inode, map it. */ if (old_ctx.base_ntfs_ino && - old_ctx.ntfs_ino != - old_ctx.base_ntfs_ino) { + old_ctx.ntfs_ino != old_ctx.base_ntfs_ino) { retry_map: - ctx->mrec = map_mft_record( - old_ctx.ntfs_ino); + ctx->mrec = map_mft_record(old_ctx.ntfs_ino); /* * Something bad has happened. If out * of memory retry till it succeeds. @@ -226,24 +217,22 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) * search context safely. */ if (IS_ERR(ctx->mrec)) { - if (PTR_ERR(ctx->mrec) == - -ENOMEM) { + if (PTR_ERR(ctx->mrec) == -ENOMEM) { schedule(); goto retry_map; } else old_ctx.ntfs_ino = - old_ctx. - base_ntfs_ino; + old_ctx.base_ntfs_ino; } } } /* Update the changed pointers in the saved context. */ if (ctx->mrec != old_ctx.mrec) { if (!IS_ERR(ctx->mrec)) - old_ctx.attr = (ATTR_RECORD*)( - (u8*)ctx->mrec + - ((u8*)old_ctx.attr - - (u8*)old_ctx.mrec)); + old_ctx.attr = (struct attr_record *)( + (u8 *)ctx->mrec + + ((u8 *)old_ctx.attr - + (u8 *)old_ctx.mrec)); old_ctx.mrec = ctx->mrec; } } @@ -260,8 +249,8 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) * immediately and mark the volume dirty for chkdsk to pick up * the pieces anyway. */ - if (put_this_page) - put_page(put_this_page); + if (put_this_folio) + folio_put(put_this_folio); } return err; } @@ -272,16 +261,8 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) * @vcn: map runlist part containing this vcn * * Map the part of a runlist containing the @vcn of the ntfs inode @ni. - * - * Return 0 on success and -errno on error. There is one special error code - * which is not an error as such. This is -ENOENT. It means that @vcn is out - * of bounds of the runlist. - * - * Locking: - The runlist must be unlocked on entry and is unlocked on return. - * - This function takes the runlist lock for writing and may modify - * the runlist. 
*/ -int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +int ntfs_map_runlist(struct ntfs_inode *ni, s64 vcn) { int err = 0; @@ -294,6 +275,37 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) return err; } +struct runlist_element *ntfs_attr_vcn_to_rl(struct ntfs_inode *ni, s64 vcn, s64 *lcn) +{ + struct runlist_element *rl; + int err; + bool is_retry = false; + + rl = ni->runlist.rl; + if (!rl) { + err = ntfs_attr_map_whole_runlist(ni); + if (err) + return ERR_PTR(-ENOENT); + rl = ni->runlist.rl; + } + +remap_rl: + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + *lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + + if (*lcn <= LCN_RL_NOT_MAPPED && is_retry == false) { + is_retry = true; + if (!ntfs_map_runlist_nolock(ni, vcn, NULL)) { + rl = ni->runlist.rl; + goto remap_rl; + } + } + + return rl; +} + /** * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode * @ni: ntfs inode of the attribute whose runlist to search @@ -324,19 +336,16 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) * the lock may be dropped inside the function so you cannot rely on * the runlist still being the same when this function returns. */ -LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, +s64 ntfs_attr_vcn_to_lcn_nolock(struct ntfs_inode *ni, const s64 vcn, const bool write_locked) { - LCN lcn; + s64 lcn; unsigned long flags; bool is_retry = false; - BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", ni->mft_no, (unsigned long long)vcn, write_locked ? "write" : "read"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); if (!ni->runlist.rl) { read_lock_irqsave(&ni->size_lock, flags); if (!ni->allocated_size) { @@ -390,6 +399,61 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, return lcn; } +struct runlist_element *__ntfs_attr_find_vcn_nolock(struct runlist *runlist, const s64 vcn) +{ + size_t lower_idx, upper_idx, idx; + struct runlist_element *run; + int rh = runlist->rl_hint; + + if (runlist->count <= 1) + return ERR_PTR(-ENOENT); + + if (runlist->count - 1 > rh && runlist->rl[rh].vcn <= vcn) { + if (vcn < runlist->rl[rh].vcn + runlist->rl[rh].length) + return &runlist->rl[rh]; + if (runlist->count - 2 == rh) + return ERR_PTR(-ENOENT); + + lower_idx = rh + 1; + } else { + run = &runlist->rl[0]; + if (vcn < run->vcn) + return ERR_PTR(-ENOENT); + else if (vcn < run->vcn + run->length) { + runlist->rl_hint = 0; + return run; + } + + lower_idx = 1; + } + + run = &runlist->rl[runlist->count - 2]; + if (vcn >= run->vcn && vcn < run->vcn + run->length) { + runlist->rl_hint = runlist->count - 2; + return run; + } + if (vcn >= run->vcn + run->length) + return ERR_PTR(-ENOENT); + + upper_idx = runlist->count - 2; + + while (lower_idx <= upper_idx) { + idx = (lower_idx + upper_idx) >> 1; + run = &runlist->rl[idx]; + + if (vcn < run->vcn) + upper_idx = idx - 1; + else if (vcn >= run->vcn + run->length) + lower_idx = idx + 1; + else { + runlist->rl_hint = idx; + return run; + } + } + + return ERR_PTR(-ENOENT); +} + /** * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode * @ni: ntfs inode describing the runlist to search @@ -416,50 +480,22 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, * ntfs_attr_find_vcn_nolock(), you will probably want to do: * m = ctx->mrec; * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. 
+ * Assuming you cache ctx->attr in a variable @a of type attr_record * and that + * you cache ctx->mrec in a variable @m of type struct mft_record *. * Note you need to distinguish between the lcn of the returned runlist element * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on * read and allocate clusters on write. - * - * Return the runlist element containing the @vcn on success and - * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() - * to decide if the return is success or failure and PTR_ERR() to get to the - * error code if IS_ERR() is true. - * - * The possible error return codes are: - * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. - * -ENOMEM - Not enough memory to map runlist. - * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). - * - * WARNING: If @ctx is supplied, regardless of whether success or failure is - * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx - * is no longer valid, i.e. you need to either call - * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. - * In that case PTR_ERR(@ctx->mrec) will give you the error code for - * why the mapping of the old inode failed. - * - * Locking: - The runlist described by @ni must be locked for writing on entry - * and is locked on return. Note the runlist may be modified when - * needed runlist fragments need to be mapped. - * - If @ctx is NULL, the base mft record of @ni must not be mapped on - * entry and it will be left unmapped on return. - * - If @ctx is not NULL, the base mft record must be mapped on entry - * and it will be left mapped on return. */ -runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, - ntfs_attr_search_ctx *ctx) +struct runlist_element *ntfs_attr_find_vcn_nolock(struct ntfs_inode *ni, const s64 vcn, + struct ntfs_attr_search_ctx *ctx) { unsigned long flags; - runlist_element *rl; + struct runlist_element *rl; int err = 0; bool is_retry = false; - BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!NInoNonResident(ni)); - BUG_ON(vcn < 0); if (!ni->runlist.rl) { read_lock_irqsave(&ni->size_lock, flags); if (!ni->allocated_size) { @@ -468,32 +504,24 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, } read_unlock_irqrestore(&ni->size_lock, flags); } + retry_remap: rl = ni->runlist.rl; if (likely(rl && vcn >= rl[0].vcn)) { - while (likely(rl->length)) { - if (unlikely(vcn < rl[1].vcn)) { - if (likely(rl->lcn >= LCN_HOLE)) { - ntfs_debug("Done."); - return rl; - } - break; - } - rl++; - } - if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { - if (likely(rl->lcn == LCN_ENOENT)) - err = -ENOENT; - else - err = -EIO; - } + rl = __ntfs_attr_find_vcn_nolock(&ni->runlist, vcn); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else if (rl->lcn >= LCN_HOLE) + return rl; + else if (rl->lcn <= LCN_ENOENT) + err = -EIO; } if (!err && !is_retry) { /* * If the search context is invalid we cannot map the unmapped * region. */ - if (IS_ERR(ctx->mrec)) + if (ctx && IS_ERR(ctx->mrec)) err = PTR_ERR(ctx->mrec); else { /* @@ -572,14 +600,15 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, * Warning: Never use @val when looking for attribute types which can be * non-resident as this most likely will result in a crash! 
*/ -static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +static int ntfs_attr_find(const __le32 type, const __le16 *name, + const u32 name_len, const u32 ic, + const u8 *val, const u32 val_len, struct ntfs_attr_search_ctx *ctx) { - ATTR_RECORD *a; - ntfs_volume *vol = ctx->ntfs_ino->vol; - ntfschar *upcase = vol->upcase; + struct attr_record *a; + struct ntfs_volume *vol = ctx->ntfs_ino->vol; + __le16 *upcase = vol->upcase; u32 upcase_len = vol->upcase_len; + unsigned int space; /* * Iterate over attributes in mft record starting at @ctx->attr, or the @@ -589,80 +618,72 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = ctx->attr; ctx->is_first = false; } else - a = (ATTR_RECORD*)((u8*)ctx->attr + + a = (struct attr_record *)((u8 *)ctx->attr + le32_to_cpu(ctx->attr->length)); - for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - u8 *mrec_end = (u8 *)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end; - - /* check whether ATTR_RECORD wrap */ - if ((u8 *)a < (u8 *)ctx->mrec) - break; - - /* check whether Attribute Record Header is within bounds */ - if ((u8 *)a > mrec_end || - (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + for (;; a = (struct attr_record *)((u8 *)a + le32_to_cpu(a->length))) { + if ((u8 *)a < (u8 *)ctx->mrec || (u8 *)a > (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) break; - /* check whether ATTR_RECORD's name is within bounds */ - name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if (name_end > mrec_end) + space = le32_to_cpu(ctx->mrec->bytes_in_use) - ((u8 *)a - (u8 *)ctx->mrec); + if ((space < offsetof(struct attr_record, data.resident.reserved) + 1 || + space < le32_to_cpu(a->length)) && (space < 4 || a->type != AT_END)) break; ctx->attr = a; - if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || - a->type == AT_END)) + if (((type != AT_UNUSED) && (le32_to_cpu(a->type) > le32_to_cpu(type))) || + a->type == AT_END) return -ENOENT; if (unlikely(!a->length)) break; - - /* check whether ATTR_RECORD's length wrap */ - if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) - break; - /* check whether ATTR_RECORD's length is within bounds */ - if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) - break; - + if (type == AT_UNUSED) + return 0; if (a->type != type) continue; /* * If @name is present, compare the two names. If @name is * missing, assume we want an unnamed attribute. */ - if (!name) { + if (!name || name == AT_UNNAMED) { /* The search failed if the found attribute is named. */ if (a->name_length) return -ENOENT; - } else if (!ntfs_are_names_equal(name, name_len, - (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), - a->name_length, ic, upcase, upcase_len)) { - register int rc; + } else { + if (a->name_length && ((le16_to_cpu(a->name_offset) + + a->name_length * sizeof(__le16)) > + le32_to_cpu(a->length))) { + ntfs_error(vol->sb, "Corrupt attribute name in MFT record %lld\n", + (long long)ctx->ntfs_ino->mft_no); + break; + } - rc = ntfs_collate_names(name, name_len, - (ntfschar*)((u8*)a + - le16_to_cpu(a->name_offset)), - a->name_length, 1, IGNORE_CASE, - upcase, upcase_len); - /* - * If @name collates before a->name, there is no - * matching attribute. - */ - if (rc == -1) - return -ENOENT; - /* If the strings are not equal, continue search. 
*/
+ if (rc)
+ continue;
+ rc = ntfs_collate_names(name, name_len,
+ (__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)),
+ a->name_length, 1, CASE_SENSITIVE,
+ upcase, upcase_len);
+ if (rc == -1)
+ return -ENOENT;
+ if (rc)
+ continue;
+ }
 }
 /*
 * The names match or @name not present and attribute is
@@ -675,7 +696,7 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 else {
 register int rc;
 
- rc = memcmp(val, (u8*)a + le16_to_cpu(
+ rc = memcmp(val, (u8 *)a + le16_to_cpu(
 a->data.resident.value_offset),
 min_t(u32, val_len, le32_to_cpu(
 a->data.resident.value_length)));
@@ -686,8 +707,7 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 if (!rc) {
 register u32 avl;
 
- avl = le32_to_cpu(
- a->data.resident.value_length);
+ avl = le32_to_cpu(a->data.resident.value_length);
 if (val_len == avl)
 return 0;
 if (val_len < avl)
@@ -701,117 +721,80 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 return -EIO;
 }
 
-/**
- * load_attribute_list - load an attribute list into memory
- * @vol: ntfs volume from which to read
- * @runlist: runlist of the attribute list
- * @al_start: destination buffer
- * @size: size of the destination buffer in bytes
- * @initialized_size: initialized size of the attribute list
- *
- * Walk the runlist @runlist and load all clusters from it copying them into
- * the linear buffer @al. The maximum number of bytes copied to @al is @size
- * bytes. Note, @size does not need to be a multiple of the cluster size. If
- * @initialized_size is less than @size, the region in @al between
- * @initialized_size and @size will be zeroed and not read from disk.
- *
- * Return 0 on success or -errno on error.
- */
-int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
- const s64 size, const s64 initialized_size)
+void ntfs_attr_name_free(unsigned char **name)
 {
- LCN lcn;
- u8 *al = al_start;
- u8 *al_end = al + initialized_size;
- runlist_element *rl;
- struct buffer_head *bh;
- struct super_block *sb;
- unsigned long block_size;
- unsigned long block, max_block;
- int err = 0;
- unsigned char block_size_bits;
+ if (*name) {
+ ntfs_free(*name);
+ *name = NULL;
+ }
+}
 
- ntfs_debug("Entering.");
- if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
- initialized_size > size)
+char *ntfs_attr_name_get(const struct ntfs_volume *vol, const __le16 *uname,
+ const int uname_len)
+{
+ unsigned char *name = NULL;
+ int name_len;
+
+ name_len = ntfs_ucstonls(vol, uname, uname_len, &name, 0);
+ if (name_len < 0) {
+ ntfs_error(vol->sb, "ntfs_ucstonls error");
+ /* When this function returns an error, memory for @name may
+ * already have been allocated, so free it here.
+ */ + ntfs_attr_name_free(&name); + return NULL; + + } else if (name_len > 0) + return name; + + ntfs_attr_name_free(&name); + return NULL; +} + +int load_attribute_list(struct ntfs_inode *base_ni, u8 *al_start, const s64 size) +{ + struct inode *attr_vi = NULL; + u8 *al; + struct attr_list_entry *ale; + + if (!al_start || size <= 0) return -EINVAL; - if (!initialized_size) { - memset(al, 0, size); - return 0; + + attr_vi = ntfs_attr_iget(VFS_I(base_ni), AT_ATTRIBUTE_LIST, AT_UNNAMED, 0); + if (IS_ERR(attr_vi)) { + ntfs_error(base_ni->vol->sb, + "Failed to open an inode for Attribute list, mft = %ld", + base_ni->mft_no); + return PTR_ERR(attr_vi); } - sb = vol->sb; - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; - down_read(&runlist->lock); - rl = runlist->rl; - if (!rl) { - ntfs_error(sb, "Cannot read attribute list since runlist is " - "missing."); - goto err_out; - } - /* Read all clusters specified by the runlist one run at a time. */ - while (rl->length) { - lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); - ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", - (unsigned long long)rl->vcn, - (unsigned long long)lcn); - /* The attribute list cannot be sparse. */ - if (lcn < 0) { - ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " - "read attribute list."); - goto err_out; - } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the run from device in chunks of block_size bytes. */ - max_block = block + (rl->length << vol->cluster_size_bits >> - block_size_bits); - ntfs_debug("max_block = 0x%lx.", max_block); - do { - ntfs_debug("Reading block = 0x%lx.", block); - bh = sb_bread(sb, block); - if (!bh) { - ntfs_error(sb, "sb_bread() failed. Cannot " - "read attribute list."); - goto err_out; - } - if (al + block_size >= al_end) - goto do_final; - memcpy(al, bh->b_data, block_size); - brelse(bh); - al += block_size; - } while (++block < max_block); - rl++; + + if (ntfs_inode_attr_pread(attr_vi, 0, size, al_start) != size) { + iput(attr_vi); + ntfs_error(base_ni->vol->sb, + "Failed to read attribute list, mft = %ld", + base_ni->mft_no); + return -EIO; } - if (initialized_size < size) { -initialize: - memset(al_start + initialized_size, 0, size - initialized_size); + iput(attr_vi); + + for (al = al_start; al < al_start + size; al += le16_to_cpu(ale->length)) { + ale = (struct attr_list_entry *)al; + if (ale->name_offset != sizeof(struct attr_list_entry)) + break; + if (le16_to_cpu(ale->length) <= ale->name_offset + ale->name_length || + al + le16_to_cpu(ale->length) > al_start + size) + break; + if (ale->type == AT_UNUSED) + break; + if (MSEQNO_LE(ale->mft_reference) == 0) + break; } -done: - up_read(&runlist->lock); - return err; -do_final: - if (al < al_end) { - /* - * Partial block. - * - * Note: The attribute list can be smaller than its allocation - * by multiple clusters. This has been encountered by at least - * two people running Windows XP, thus we cannot do any - * truncation sanity checking here. (AIA) - */ - memcpy(al, bh->b_data, al_end - al); - brelse(bh); - if (initialized_size < size) - goto initialize; - goto done; - } - brelse(bh); - /* Real overflow! */ - ntfs_error(sb, "Attribute list buffer overflow. 
Read attribute list " - "is truncated."); -err_out: - err = -EIO; - goto done; + if (al != al_start + size) { + ntfs_error(base_ni->vol->sb, "Corrupt attribute list, mft = %ld", + base_ni->mft_no); + return -EIO; + } + return 0; } /** @@ -864,18 +847,19 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, * On actual error, ntfs_external_attr_find() returns -EIO. In this case * @ctx->attr is undefined and in particular do not rely on it not changing. */ -static int ntfs_external_attr_find(const ATTR_TYPE type, - const ntfschar *name, const u32 name_len, - const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, - const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +static int ntfs_external_attr_find(const __le32 type, + const __le16 *name, const u32 name_len, + const u32 ic, const s64 lowest_vcn, + const u8 *val, const u32 val_len, struct ntfs_attr_search_ctx *ctx) { - ntfs_inode *base_ni, *ni; - ntfs_volume *vol; - ATTR_LIST_ENTRY *al_entry, *next_al_entry; + struct ntfs_inode *base_ni, *ni; + struct ntfs_volume *vol; + struct attr_list_entry *al_entry, *next_al_entry; u8 *al_start, *al_end; - ATTR_RECORD *a; - ntfschar *al_name; + struct attr_record *a; + __le16 *al_name; u32 al_name_len; + bool is_first_search = false; int err = 0; static const char *es = " Unmount and run chkdsk."; @@ -886,6 +870,7 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, /* First call happens with the base mft record. */ base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; ctx->base_mrec = ctx->mrec; + ctx->mapped_base_mrec = ctx->mapped_mrec; } if (ni == base_ni) ctx->base_attr = ctx->attr; @@ -894,8 +879,10 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, vol = base_ni->vol; al_start = base_ni->attr_list; al_end = al_start + base_ni->attr_list_size; - if (!ctx->al_entry) - ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + if (!ctx->al_entry) { + ctx->al_entry = (struct attr_list_entry *)al_start; + is_first_search = true; + } /* * Iterate over entries in attribute list starting at @ctx->al_entry, * or the entry following that, if @ctx->is_first is 'true'. @@ -903,36 +890,128 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, if (ctx->is_first) { al_entry = ctx->al_entry; ctx->is_first = false; - } else - al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + /* + * If an enumeration and the first attribute is higher than + * the attribute list itself, need to return the attribute list + * attribute. + */ + if ((type == AT_UNUSED) && is_first_search && + le32_to_cpu(al_entry->type) > + le32_to_cpu(AT_ATTRIBUTE_LIST)) + goto find_attr_list_attr; + } else { + /* Check for small entry */ + if (((al_end - (u8 *)ctx->al_entry) < + (long)offsetof(struct attr_list_entry, name)) || + (le16_to_cpu(ctx->al_entry->length) & 7) || + (le16_to_cpu(ctx->al_entry->length) < offsetof(struct attr_list_entry, name))) + goto corrupt; + + al_entry = (struct attr_list_entry *)((u8 *)ctx->al_entry + le16_to_cpu(ctx->al_entry->length)); + + if ((u8 *)al_entry == al_end) + goto not_found; + + /* Preliminary check for small entry */ + if ((al_end - (u8 *)al_entry) < + (long)offsetof(struct attr_list_entry, name)) + goto corrupt; + + /* + * If this is an enumeration and the attribute list attribute + * is the next one in the enumeration sequence, just return the + * attribute list attribute from the base mft record as it is + * not listed in the attribute list itself. 
+ */ + if ((type == AT_UNUSED) && le32_to_cpu(ctx->al_entry->type) < + le32_to_cpu(AT_ATTRIBUTE_LIST) && + le32_to_cpu(al_entry->type) > + le32_to_cpu(AT_ATTRIBUTE_LIST)) { +find_attr_list_attr: + + /* Check for bogus calls. */ + if (name || name_len || val || val_len || lowest_vcn) + return -EINVAL; + + /* We want the base record. */ + if (ctx->ntfs_ino != base_ni) + unmap_mft_record(ctx->ntfs_ino); + ctx->ntfs_ino = base_ni; + ctx->mapped_mrec = ctx->mapped_base_mrec; + ctx->mrec = ctx->base_mrec; + ctx->is_first = true; + + /* Sanity checks are performed elsewhere. */ + ctx->attr = (struct attr_record *)((u8 *)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + + /* Find the attribute list attribute. */ + err = ntfs_attr_find(AT_ATTRIBUTE_LIST, NULL, 0, + IGNORE_CASE, NULL, 0, ctx); + + /* + * Setup the search context so the correct + * attribute is returned next time round. + */ + ctx->al_entry = al_entry; + ctx->is_first = true; + + /* Got it. Done. */ + if (!err) + return 0; + + /* Error! If other than not found return it. */ + if (err != -ENOENT) + return err; + + /* Not found?!? Absurd! */ + ntfs_error(ctx->ntfs_ino->vol->sb, "Attribute list wasn't found"); + return -EIO; + } + } for (;; al_entry = next_al_entry) { /* Out of bounds check. */ - if ((u8*)al_entry < base_ni->attr_list || - (u8*)al_entry > al_end) + if ((u8 *)al_entry < base_ni->attr_list || + (u8 *)al_entry > al_end) break; /* Inode is corrupt. */ ctx->al_entry = al_entry; /* Catch the end of the attribute list. */ - if ((u8*)al_entry == al_end) + if ((u8 *)al_entry == al_end) goto not_found; - if (!al_entry->length) - break; - if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + - le16_to_cpu(al_entry->length) > al_end) - break; - next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + + if ((((u8 *)al_entry + offsetof(struct attr_list_entry, name)) > al_end) || + ((u8 *)al_entry + le16_to_cpu(al_entry->length) > al_end) || + (le16_to_cpu(al_entry->length) & 7) || + (le16_to_cpu(al_entry->length) < + offsetof(struct attr_list_entry, name_length)) || + (al_entry->name_length && ((u8 *)al_entry + al_entry->name_offset + + al_entry->name_length * sizeof(__le16)) > al_end)) + break; /* corrupt */ + + next_al_entry = (struct attr_list_entry *)((u8 *)al_entry + le16_to_cpu(al_entry->length)); - if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) - goto not_found; - if (type != al_entry->type) - continue; + if (type != AT_UNUSED) { + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + } /* * If @name is present, compare the two names. If @name is * missing, assume we want an unnamed attribute. */ al_name_len = al_entry->name_length; - al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); - if (!name) { + al_name = (__le16 *)((u8 *)al_entry + al_entry->name_offset); + + /* + * If !@type we want the attribute represented by this + * attribute list entry. + */ + if (type == AT_UNUSED) + goto is_enumeration; + + if (!name || name == AT_UNNAMED) { if (al_name_len) goto not_found; } else if (!ntfs_are_names_equal(al_name, al_name_len, name, @@ -951,14 +1030,7 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, /* If the strings are not equal, continue search. */ if (rc) continue; - /* - * FIXME: Reverse engineering showed 0, IGNORE_CASE but - * that is inconsistent with ntfs_attr_find(). The - * subsequent rc checks were also different. Perhaps I - * made a mistake in one of the two. 
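With the AT_UNUSED special-casing above, the lookup machinery doubles as an attribute enumerator, and it also hands back the attribute list attribute itself, which is never listed in its own entries. A hedged sketch of the resulting calling convention (the walker function is invented; every call it makes is defined in this file):

/* Illustrative only: visit every attribute of @ni in collation order. */
static int walk_attrs(struct ntfs_inode *ni)
{
	struct ntfs_attr_search_ctx *ctx;
	int err;

	ctx = ntfs_attr_get_search_ctx(ni, NULL);
	if (!ctx)
		return -ENOMEM;
	/* With @type == AT_UNUSED each call yields the next attribute in
	 * @ctx->attr until -ENOENT marks the end of the enumeration. */
	while (!(err = ntfs_attr_lookup(AT_UNUSED, NULL, 0, IGNORE_CASE, 0,
			NULL, 0, ctx)))
		ntfs_debug("found attribute type 0x%x",
				(unsigned int)le32_to_cpu(ctx->attr->type));
	ntfs_attr_put_search_ctx(ctx);
	return err == -ENOENT ? 0 : err;
}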
Need to recheck - * which is correct or at least see what is going on... - * (AIA) - */ + rc = ntfs_collate_names(name, name_len, al_name, al_name_len, 1, CASE_SENSITIVE, vol->upcase, vol->upcase_len); @@ -973,27 +1045,28 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, * next attribute list entry still fits @lowest_vcn. Otherwise * we have reached the right one or the search has failed. */ - if (lowest_vcn && (u8*)next_al_entry >= al_start && - (u8*)next_al_entry + 6 < al_end && - (u8*)next_al_entry + le16_to_cpu( - next_al_entry->length) <= al_end && - sle64_to_cpu(next_al_entry->lowest_vcn) <= - lowest_vcn && - next_al_entry->type == al_entry->type && - next_al_entry->name_length == al_name_len && - ntfs_are_names_equal((ntfschar*)((u8*) + if (lowest_vcn && (u8 *)next_al_entry >= al_start && + (u8 *)next_al_entry + 6 < al_end && + (u8 *)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + le64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((__le16 *)((u8 *) next_al_entry + next_al_entry->name_offset), next_al_entry->name_length, al_name, al_name_len, CASE_SENSITIVE, vol->upcase, vol->upcase_len)) continue; + +is_enumeration: if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { - ntfs_error(vol->sb, "Found stale mft " - "reference in attribute list " - "of base inode 0x%lx.%s", - base_ni->mft_no, es); + ntfs_error(vol->sb, + "Found stale mft reference in attribute list of base inode 0x%lx.%s", + base_ni->mft_no, es); err = -EIO; break; } @@ -1006,18 +1079,16 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, base_ni->mft_no) { ni = ctx->ntfs_ino = base_ni; ctx->mrec = ctx->base_mrec; + ctx->mapped_mrec = ctx->mapped_base_mrec; } else { /* We want an extent record. */ ctx->mrec = map_extent_mft_record(base_ni, le64_to_cpu( al_entry->mft_reference), &ni); if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to map " - "extent mft record " - "0x%lx of base inode " - "0x%lx.%s", - MREF_LE(al_entry-> - mft_reference), + ntfs_error(vol->sb, + "Failed to map extent mft record 0x%lx of base inode 0x%lx.%s", + MREF_LE(al_entry->mft_reference), base_ni->mft_no, es); err = PTR_ERR(ctx->mrec); if (err == -ENOENT) @@ -1027,10 +1098,12 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, break; } ctx->ntfs_ino = ni; + ctx->mapped_mrec = true; + } - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + - le16_to_cpu(ctx->mrec->attrs_offset)); } + a = ctx->attr = (struct attr_record *)((u8 *)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); /* * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the * mft record containing the attribute represented by the @@ -1046,17 +1119,16 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, * entry above, the comparison can now be optimized. So it is * worth re-implementing a simplified ntfs_attr_find() here. */ - a = ctx->attr; /* * Use a manual loop so we can still use break and continue * with the same meanings as above. 
*/ do_next_attr_loop: - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + if ((u8 *)a < (u8 *)ctx->mrec || (u8 *)a > (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated)) break; if (a->type == AT_END) - break; + continue; if (!a->length) break; if (al_entry->instance != a->instance) @@ -1068,7 +1140,7 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, */ if (al_entry->type != a->type) break; - if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + if (!ntfs_are_names_equal((__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)), a->name_length, al_name, al_name_len, CASE_SENSITIVE, vol->upcase, vol->upcase_len)) @@ -1078,9 +1150,9 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, * If no @val specified or @val specified and it matches, we * have found it! */ - if (!val || (!a->non_resident && le32_to_cpu( + if ((type == AT_UNUSED) || !val || (!a->non_resident && le32_to_cpu( a->data.resident.value_length) == val_len && - !memcmp((u8*)a + + !memcmp((u8 *)a + le16_to_cpu(a->data.resident.value_offset), val, val_len))) { ntfs_debug("Done, found."); @@ -1088,22 +1160,27 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, } do_next_attr: /* Proceed to the next attribute in the current mft record. */ - a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + a = (struct attr_record *)((u8 *)a + le32_to_cpu(a->length)); goto do_next_attr_loop; } - if (!err) { - ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " - "attribute list attribute.%s", base_ni->mft_no, - es); - err = -EIO; - } + +corrupt: if (ni != base_ni) { if (ni) unmap_extent_mft_record(ni); ctx->ntfs_ino = base_ni; ctx->mrec = ctx->base_mrec; ctx->attr = ctx->base_attr; + ctx->mapped_mrec = ctx->mapped_base_mrec; + } + + if (!err) { + ntfs_error(vol->sb, + "Base inode 0x%lx contains corrupt attribute list attribute.%s", + base_ni->mft_no, es); + err = -EIO; } + if (err != -ENOMEM) NVolSetErrors(vol); return err; @@ -1112,7 +1189,7 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, * If we were looking for AT_END, we reset the search context @ctx and * use ntfs_attr_find() to seek to the end of the base mft record. */ - if (type == AT_END) { + if (type == AT_UNUSED || type == AT_END) { ntfs_attr_reinit_search_ctx(ctx); return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, ctx); @@ -1133,13 +1210,14 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, if (ni != base_ni) unmap_extent_mft_record(ni); ctx->mrec = ctx->base_mrec; - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + ctx->attr = (struct attr_record *)((u8 *)ctx->mrec + le16_to_cpu(ctx->mrec->attrs_offset)); ctx->is_first = true; ctx->ntfs_ino = base_ni; ctx->base_ntfs_ino = NULL; ctx->base_mrec = NULL; ctx->base_attr = NULL; + ctx->mapped_mrec = ctx->mapped_base_mrec; /* * In case there are multiple matches in the base mft record, need to * keep enumerating until we get an attribute not found response (or @@ -1190,26 +1268,21 @@ static int ntfs_external_attr_find(const ATTR_TYPE type, * collates just after the attribute list entry of the attribute being searched * for, i.e. if one wants to add the attribute to the mft record this is the * correct place to insert its attribute list entry into. - * - * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is - * then undefined and in particular you should not rely on it not changing. 
*/ -int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, - const u32 name_len, const IGNORE_CASE_BOOL ic, - const VCN lowest_vcn, const u8 *val, const u32 val_len, - ntfs_attr_search_ctx *ctx) +int ntfs_attr_lookup(const __le32 type, const __le16 *name, + const u32 name_len, const u32 ic, + const s64 lowest_vcn, const u8 *val, const u32 val_len, + struct ntfs_attr_search_ctx *ctx) { - ntfs_inode *base_ni; + struct ntfs_inode *base_ni; ntfs_debug("Entering."); - BUG_ON(IS_ERR(ctx->mrec)); if (ctx->base_ntfs_ino) base_ni = ctx->base_ntfs_ino; else base_ni = ctx->ntfs_ino; /* Sanity check, just for debugging really. */ - BUG_ON(!base_ni); - if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + if (!base_ni || !NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) return ntfs_attr_find(type, name, name_len, ic, val, val_len, ctx); return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, @@ -1218,23 +1291,35 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, /** * ntfs_attr_init_search_ctx - initialize an attribute search context - * @ctx: attribute search context to initialize - * @ni: ntfs inode with which to initialize the search context - * @mrec: mft record with which to initialize the search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context * * Initialize the attribute search context @ctx with @ni and @mrec. */ -static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, - ntfs_inode *ni, MFT_RECORD *mrec) +static bool ntfs_attr_init_search_ctx(struct ntfs_attr_search_ctx *ctx, + struct ntfs_inode *ni, struct mft_record *mrec) { - *ctx = (ntfs_attr_search_ctx) { - .mrec = mrec, - /* Sanity checks are performed elsewhere. */ - .attr = (ATTR_RECORD*)((u8*)mrec + - le16_to_cpu(mrec->attrs_offset)), - .is_first = true, - .ntfs_ino = ni, - }; + if (!mrec) { + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return false; + ctx->mapped_mrec = true; + } else { + ctx->mapped_mrec = false; + } + + ctx->mrec = mrec; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (struct attr_record *)((u8 *)mrec + le16_to_cpu(mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = ni; + ctx->al_entry = NULL; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + ctx->mapped_base_mrec = false; + return true; } /** @@ -1247,13 +1332,15 @@ static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, * This is used when a search for a new attribute is being started to reset * the search context to the beginning. */ -void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +void ntfs_attr_reinit_search_ctx(struct ntfs_attr_search_ctx *ctx) { + bool mapped_mrec; + if (likely(!ctx->base_ntfs_ino)) { /* No attribute list. */ ctx->is_first = true; /* Sanity checks are performed elsewhere. */ - ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + ctx->attr = (struct attr_record *)((u8 *)ctx->mrec + le16_to_cpu(ctx->mrec->attrs_offset)); /* * This needs resetting due to ntfs_external_attr_find() which @@ -1262,10 +1349,12 @@ void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) ctx->al_entry = NULL; return; } /* Attribute list. 
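As a usage sketch of the lookup entry point together with the reworked context initialization (the helper is invented; note that passing a NULL mft record now makes the context map the record itself and unmap it on release):

/* Illustrative only: locate the unnamed $DATA attribute of @ni. */
static int find_unnamed_data(struct ntfs_inode *ni)
{
	struct ntfs_attr_search_ctx *ctx;
	int err;

	ctx = ntfs_attr_get_search_ctx(ni, NULL);	/* maps the mft record */
	if (!ctx)
		return -ENOMEM;
	err = ntfs_attr_lookup(AT_DATA, AT_UNNAMED, 0, CASE_SENSITIVE, 0,
			NULL, 0, ctx);
	if (!err)
		ntfs_debug("$DATA is %sresident",
				ctx->attr->non_resident ? "non-" : "");
	ntfs_attr_put_search_ctx(ctx);	/* unmaps what it mapped */
	return err;
}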
*/ - if (ctx->ntfs_ino != ctx->base_ntfs_ino) + if (ctx->ntfs_ino != ctx->base_ntfs_ino && ctx->ntfs_ino) unmap_extent_mft_record(ctx->ntfs_ino); + + mapped_mrec = ctx->mapped_base_mrec; ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); - return; + ctx->mapped_mrec = mapped_mrec; } /** @@ -1276,13 +1365,21 @@ void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) * Allocate a new attribute search context, initialize it with @ni and @mrec, * and return it. Return NULL if allocation failed. */ -ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +struct ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(struct ntfs_inode *ni, + struct mft_record *mrec) { - ntfs_attr_search_ctx *ctx; + struct ntfs_attr_search_ctx *ctx; + bool init; ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); - if (ctx) - ntfs_attr_init_search_ctx(ctx, ni, mrec); + if (ctx) { + init = ntfs_attr_init_search_ctx(ctx, ni, mrec); + if (init == false) { + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + ctx = NULL; + } + } + return ctx; } @@ -1293,16 +1390,17 @@ ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) * Release the attribute search context @ctx, unmapping an associated extent * mft record if present. */ -void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +void ntfs_attr_put_search_ctx(struct ntfs_attr_search_ctx *ctx) { - if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) - unmap_extent_mft_record(ctx->ntfs_ino); + if (ctx->mapped_mrec) + unmap_mft_record(ctx->ntfs_ino); + + if (ctx->mapped_base_mrec && ctx->base_ntfs_ino && + ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->base_ntfs_ino); kmem_cache_free(ntfs_attr_ctx_cache, ctx); - return; } -#ifdef NTFS_RW - /** * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file * @vol: ntfs volume to which the attribute belongs @@ -1313,14 +1411,13 @@ void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) * * Return the attribute type definition record if found and NULL if not found. */ -static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, - const ATTR_TYPE type) +static struct attr_def *ntfs_attr_find_in_attrdef(const struct ntfs_volume *vol, + const __le32 type) { - ATTR_DEF *ad; + struct attr_def *ad; - BUG_ON(!vol->attrdef); - BUG_ON(!type); - for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + WARN_ON(!type); + for (ad = vol->attrdef; (u8 *)ad - (u8 *)vol->attrdef < vol->attrdef_size && ad->type; ++ad) { /* We have not found it yet, carry on searching. */ if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) @@ -1345,16 +1442,15 @@ static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, * * Check whether the @size in bytes is valid for an attribute of @type on the * ntfs volume @vol. This information is obtained from $AttrDef system file. - * - * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not - * listed in $AttrDef. */ -int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, +int ntfs_attr_size_bounds_check(const struct ntfs_volume *vol, const __le32 type, const s64 size) { - ATTR_DEF *ad; + struct attr_def *ad; + + if (size < 0) + return -EINVAL; - BUG_ON(size < 0); /* * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not * listed in $AttrDef. @@ -1366,10 +1462,10 @@ int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, if (unlikely(!ad)) return -ENOENT; /* Do the bounds check. 
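A short sketch of how the $AttrDef bounds check above is meant to be consumed (hypothetical wrapper; the error distinction mirrors the function's -ERANGE/-ENOENT contract):

/* Illustrative only: reject a proposed size before any allocation. */
static int check_attr_size(struct ntfs_volume *vol, struct ntfs_inode *ni,
		s64 new_size)
{
	int err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);

	if (err == -ERANGE)
		ntfs_debug("size 0x%llx outside $AttrDef bounds for type 0x%x",
				(unsigned long long)new_size,
				(unsigned int)le32_to_cpu(ni->type));
	else if (err == -ENOENT)
		ntfs_debug("attribute type 0x%x not defined in $AttrDef",
				(unsigned int)le32_to_cpu(ni->type));
	return err;
}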
*/ - if (((sle64_to_cpu(ad->min_size) > 0) && - size < sle64_to_cpu(ad->min_size)) || - ((sle64_to_cpu(ad->max_size) > 0) && size > - sle64_to_cpu(ad->max_size))) + if (((le64_to_cpu(ad->min_size) > 0) && + size < le64_to_cpu(ad->min_size)) || + ((le64_to_cpu(ad->max_size) > 0) && size > + le64_to_cpu(ad->max_size))) return -ERANGE; return 0; } @@ -1381,13 +1477,11 @@ int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, * * Check whether the attribute of @type on the ntfs volume @vol is allowed to * be non-resident. This information is obtained from $AttrDef system file. - * - * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and - * -ENOENT if the attribute is not listed in $AttrDef. */ -int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +static int ntfs_attr_can_be_non_resident(const struct ntfs_volume *vol, + const __le32 type) { - ATTR_DEF *ad; + struct attr_def *ad; /* Find the attribute definition record in $AttrDef. */ ad = ntfs_attr_find_in_attrdef(vol, type); @@ -1417,7 +1511,7 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) * check for this here as we do not know which inode's $Bitmap is * being asked about so the caller needs to special case this. */ -int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +int ntfs_attr_can_be_resident(const struct ntfs_volume *vol, const __le32 type) { if (type == AT_INDEX_ALLOCATION) return -EPERM; @@ -1432,37 +1526,45 @@ int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) * * Resize the attribute record @a, i.e. the resident part of the attribute, in * the mft record @m to @new_size bytes. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. */ -int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +int ntfs_attr_record_resize(struct mft_record *m, struct attr_record *a, u32 new_size) { - ntfs_debug("Entering for new_size %u.", new_size); + u32 old_size, alloc_size, attr_size; + + old_size = le32_to_cpu(m->bytes_in_use); + alloc_size = le32_to_cpu(m->bytes_allocated); + attr_size = le32_to_cpu(a->length); + + ntfs_debug("Sizes: old=%u alloc=%u attr=%u new=%u\n", + (unsigned int)old_size, (unsigned int)alloc_size, + (unsigned int)attr_size, (unsigned int)new_size); + /* Align to 8 bytes if it is not already done. */ if (new_size & 7) new_size = (new_size + 7) & ~7; /* If the actual attribute length has changed, move things around. */ - if (new_size != le32_to_cpu(a->length)) { + if (new_size != attr_size) { u32 new_muse = le32_to_cpu(m->bytes_in_use) - - le32_to_cpu(a->length) + new_size; + attr_size + new_size; /* Not enough space in this mft record. */ if (new_muse > le32_to_cpu(m->bytes_allocated)) return -ENOSPC; + + if (a->type == AT_INDEX_ROOT && new_size > attr_size && + new_muse + 120 > alloc_size && old_size + 120 <= alloc_size) { + ntfs_debug("Too big struct index_root (%u > %u)\n", + new_muse, alloc_size); + return -ENOSPC; + } + /* Move attributes following @a to their new location. 
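The resize arithmetic above is easiest to sanity-check with concrete numbers; here is a stand-alone model (user-space C, invented values, illustration only):

#include <stdint.h>
#include <stdio.h>

/* Model of the ntfs_attr_record_resize() bookkeeping: the requested
 * size is rounded up to 8 bytes and bytes_in_use moves by the
 * difference from the record's old length. */
static uint32_t resized_bytes_in_use(uint32_t bytes_in_use,
		uint32_t old_attr_len, uint32_t new_size)
{
	new_size = (new_size + 7) & ~7u;	/* 8-byte alignment */
	return bytes_in_use - old_attr_len + new_size;
}

int main(void)
{
	/* Shrinking a 0x68-byte record to 0x52 (rounds up to 0x58)
	 * frees 0x10 bytes: 0x1f8 - 0x68 + 0x58 = 0x1e8. */
	printf("0x%x\n", resized_bytes_in_use(0x1f8, 0x68, 0x52));
	return 0;
}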
*/ - memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), - le32_to_cpu(m->bytes_in_use) - ((u8*)a - - (u8*)m) - le32_to_cpu(a->length)); + memmove((u8 *)a + new_size, (u8 *)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8 *)a - + (u8 *)m) - attr_size); /* Adjust @m to reflect the change in used space. */ m->bytes_in_use = cpu_to_le32(new_muse); /* Adjust @a to reflect the new size. */ - if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + if (new_size >= offsetof(struct attr_record, length) + sizeof(a->length)) a->length = cpu_to_le32(new_size); } return 0; @@ -1476,17 +1578,8 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) * * Resize the value of the attribute @a in the mft record @m to @new_size bytes. * If the value is made bigger, the newly allocated space is cleared. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -ENOSPC - Not enough space in the mft record @m to perform the resize. - * - * Note: On error, no modifications have been performed whatsoever. - * - * Warning: If you make a record smaller without having copied all the data you - * are interested in the data may be overwritten. */ -int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, +int ntfs_resident_attr_value_resize(struct mft_record *m, struct attr_record *a, const u32 new_size) { u32 old_size; @@ -1501,7 +1594,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, */ old_size = le32_to_cpu(a->data.resident.value_length); if (new_size > old_size) - memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + memset((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + old_size, 0, new_size - old_size); /* Finally update the length of the attribute value. */ a->data.resident.value_length = cpu_to_le32(new_size); @@ -1521,100 +1614,43 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, * always know it. The reason we cannot simply read the size from the vfs * inode i_size is that this is not necessarily uptodate. This happens when * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). - * - * Return 0 on success and -errno on error. The following error return codes - * are defined: - * -EPERM - The attribute is not allowed to be non-resident. - * -ENOMEM - Not enough memory. - * -ENOSPC - Not enough disk space. - * -EINVAL - Attribute not defined on the volume. - * -EIO - I/o error or other error. - * Note that -ENOSPC is also returned in the case that there is not enough - * space in the mft record to do the conversion. This can happen when the mft - * record is already very full. The caller is responsible for trying to make - * space in the mft record and trying again. FIXME: Do we need a separate - * error return code for this kind of -ENOSPC or is it always worth trying - * again in case the attribute may then fit in a resident state so no need to - * make it non-resident at all? Ho-hum... (AIA) - * - * NOTE to self: No changes in the attribute list are required to move from - * a resident to a non-resident attribute. - * - * Locking: - The caller must hold i_mutex on the inode. 
*/ -int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +int ntfs_attr_make_non_resident(struct ntfs_inode *ni, const u32 data_size) { s64 new_size; struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - struct page *page; - runlist_element *rl; + struct ntfs_volume *vol = ni->vol; + struct ntfs_inode *base_ni; + struct mft_record *m; + struct attr_record *a; + struct ntfs_attr_search_ctx *ctx; + struct folio *folio; + struct runlist_element *rl; u8 *kaddr; unsigned long flags; int mp_size, mp_ofs, name_ofs, arec_size, err, err2; u32 attr_size; u8 old_res_attr_flags; + if (NInoNonResident(ni)) { + ntfs_warning(vol->sb, + "Trying to make non-resident attribute non-resident. Aborting...\n"); + return -EINVAL; + } + /* Check that the attribute is allowed to be non-resident. */ err = ntfs_attr_can_be_non_resident(vol, ni->type); if (unlikely(err)) { if (err == -EPERM) - ntfs_debug("Attribute is not allowed to be " - "non-resident."); + ntfs_debug("Attribute is not allowed to be non-resident."); else - ntfs_debug("Attribute not defined on the NTFS " - "volume!"); + ntfs_debug("Attribute not defined on the NTFS volume!"); return err; } - /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. - */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - /* - * The size needs to be aligned to a cluster boundary for allocation - * purposes. - */ - new_size = (data_size + vol->cluster_size - 1) & - ~(vol->cluster_size - 1); - if (new_size > 0) { - /* - * Will need the page later and since the page lock nests - * outside all ntfs locks, we need to get the page now. - */ - page = find_or_create_page(vi->i_mapping, 0, - mapping_gfp_mask(vi->i_mapping)); - if (unlikely(!page)) - return -ENOMEM; - /* Start by allocating clusters to hold the attribute value. */ - rl = ntfs_cluster_alloc(vol, 0, new_size >> - vol->cluster_size_bits, -1, DATA_ZONE, true); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - ntfs_debug("Failed to allocate cluster%s, error code " - "%i.", (new_size >> - vol->cluster_size_bits) > 1 ? "s" : "", - err); - goto page_err_out; - } - } else { - rl = NULL; - page = NULL; - } - /* Determine the size of the mapping pairs array. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); - if (unlikely(mp_size < 0)) { - err = mp_size; - ntfs_debug("Failed to get size for mapping pairs array, error " - "code %i.", err); - goto rl_err_out; - } - down_write(&ni->runlist.lock); + + if (NInoEncrypted(ni)) + return -EIO; + if (!NInoAttr(ni)) base_ni = ni; else @@ -1640,47 +1676,105 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) } m = ctx->mrec; a = ctx->attr; - BUG_ON(NInoNonResident(ni)); - BUG_ON(a->non_resident); + + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + if ((a->flags & ATTR_COMPRESSION_MASK) == ATTR_IS_COMPRESSED) { + /* must allocate full compression blocks */ + new_size = + ((new_size - 1) | + ((1L << (STANDARD_COMPRESSION_UNIT + + vol->cluster_size_bits)) - 1)) + 1; + } + + /* + * Will need folio later and since folio lock nests + * outside all ntfs locks, we need to get the folio now. 
+ */ + folio = __filemap_get_folio(vi->i_mapping, 0, + FGP_CREAT | FGP_LOCK, + mapping_gfp_mask(vi->i_mapping)); + if (IS_ERR(folio)) { + err = -ENOMEM; + goto err_out; + } + + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, NTFS_B_TO_CLU(vol, new_size), + -1, DATA_ZONE, true, + false, false); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code %i.", + (NTFS_B_TO_CLU(vol, new_size)) > 1 ? "s" : "", + err); + goto folio_err_out; + } + } else { + rl = NULL; + folio = NULL; + } + + down_write(&ni->runlist.lock); + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error code %i.\n", err); + goto rl_err_out; + } + + if (NInoNonResident(ni) || a->non_resident) { + err = -EIO; + goto rl_err_out; + } + /* * Calculate new offsets for the name and the mapping pairs array. */ if (NInoSparse(ni) || NInoCompressed(ni)) - name_ofs = (offsetof(ATTR_REC, + name_ofs = (offsetof(struct attr_record, data.non_resident.compressed_size) + sizeof(a->data.non_resident.compressed_size) + 7) & ~7; else - name_ofs = (offsetof(ATTR_REC, + name_ofs = (offsetof(struct attr_record, data.non_resident.compressed_size) + 7) & ~7; - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(__le16) + 7) & ~7; /* * Determine the size of the resident part of the now non-resident * attribute record. */ arec_size = (mp_ofs + mp_size + 7) & ~7; /* - * If the page is not uptodate bring it uptodate by copying from the + * If the folio is not uptodate bring it uptodate by copying from the * attribute value. */ attr_size = le32_to_cpu(a->data.resident.value_length); - BUG_ON(attr_size != data_size); - if (page && !PageUptodate(page)) { - kaddr = kmap_atomic(page); - memcpy(kaddr, (u8*)a + + WARN_ON(attr_size != data_size); + if (folio && !folio_test_uptodate(folio)) { + kaddr = kmap_local_folio(folio, 0); + memcpy(kaddr, (u8 *)a + le16_to_cpu(a->data.resident.value_offset), attr_size); memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); - kunmap_atomic(kaddr); - flush_dcache_page(page); - SetPageUptodate(page); + kunmap_local(kaddr); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } + /* Backup the attribute flag. */ old_res_attr_flags = a->data.resident.flags; /* Resize the resident part of the attribute record. */ err = ntfs_attr_record_resize(m, a, arec_size); if (unlikely(err)) - goto err_out; + goto rl_err_out; + /* * Convert the resident part of the attribute record to describe a * non-resident attribute. @@ -1688,20 +1782,19 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) a->non_resident = 1; /* Move the attribute name if it exists and update the offset. */ if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); + memmove((u8 *)a + name_ofs, (u8 *)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(__le16)); a->name_offset = cpu_to_le16(name_ofs); /* Setup the fields specific to non-resident attributes. 
*/ a->data.non_resident.lowest_vcn = 0; - a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> - vol->cluster_size_bits); + a->data.non_resident.highest_vcn = cpu_to_le64(NTFS_B_TO_CLU(vol, new_size - 1)); a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); memset(&a->data.non_resident.reserved, 0, sizeof(a->data.non_resident.reserved)); - a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.allocated_size = cpu_to_le64(new_size); a->data.non_resident.data_size = a->data.non_resident.initialized_size = - cpu_to_sle64(attr_size); + cpu_to_le64(attr_size); if (NInoSparse(ni) || NInoCompressed(ni)) { a->data.non_resident.compression_unit = 0; if (NInoCompressed(ni) || vol->major_ver < 3) @@ -1711,23 +1804,29 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) } else a->data.non_resident.compression_unit = 0; /* Generate the mapping pairs array into the attribute record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, - arec_size - mp_ofs, rl, 0, -1, NULL); + err = ntfs_mapping_pairs_build(vol, (u8 *)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL, NULL, NULL); if (unlikely(err)) { - ntfs_debug("Failed to build mapping pairs, error code %i.", + ntfs_error(vol->sb, "Failed to build mapping pairs, error code %i.", err); goto undo_err_out; } + /* Setup the in-memory attribute structure to be non-resident. */ ni->runlist.rl = rl; + if (rl) { + for (ni->runlist.count = 1; rl->length != 0; rl++) + ni->runlist.count++; + } else + ni->runlist.count = 0; write_lock_irqsave(&ni->size_lock, flags); ni->allocated_size = new_size; if (NInoSparse(ni) || NInoCompressed(ni)) { ni->itype.compressed.size = ni->allocated_size; if (a->data.non_resident.compression_unit) { - ni->itype.compressed.block_size = 1U << (a->data. - non_resident.compression_unit + - vol->cluster_size_bits); + ni->itype.compressed.block_size = 1U << + (a->data.non_resident.compression_unit + + vol->cluster_size_bits); ni->itype.compressed.block_size_bits = ffs(ni->itype.compressed.block_size) - 1; @@ -1749,16 +1848,16 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) * this switch, which is another reason to do this last. */ NInoSetNonResident(ni); + NInoSetFullyMapped(ni); /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); unmap_mft_record(base_ni); up_write(&ni->runlist.lock); - if (page) { - set_page_dirty(page); - unlock_page(page); - put_page(page); + if (folio) { + iomap_dirty_folio(vi->i_mapping, folio); + folio_unlock(folio); + folio_put(folio); } ntfs_debug("Done."); return 0; @@ -1766,12 +1865,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) /* Convert the attribute back into a resident attribute. */ a->non_resident = 0; /* Move the attribute name if it exists and update the offset. 
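The cluster rounding performed during the conversion is worth a worked example (invented geometry; NTFS_B_TO_CLU above is the byte-to-cluster conversion this models by division):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* With 4096-byte clusters, a 5000-byte resident value rounds up
	 * to 8192 allocated bytes, i.e. two clusters, so the highest VCN
	 * of the attribute is (8192 - 1) / 4096 = 1. */
	uint64_t cluster_size = 4096;
	uint64_t data_size = 5000;
	uint64_t new_size = (data_size + cluster_size - 1) &
			~(cluster_size - 1);
	uint64_t highest_vcn = (new_size - 1) / cluster_size;

	printf("new_size=%llu highest_vcn=%llu\n",
			(unsigned long long)new_size,
			(unsigned long long)highest_vcn);
	return 0;
}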
*/ - name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + name_ofs = (offsetof(struct attr_record, data.resident.reserved) + sizeof(a->data.resident.reserved) + 7) & ~7; if (a->name_length) - memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), - a->name_length * sizeof(ntfschar)); - mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + memmove((u8 *)a + name_ofs, (u8 *)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(__le16)); + mp_ofs = (name_ofs + a->name_length * sizeof(__le16) + 7) & ~7; a->name_offset = cpu_to_le16(name_ofs); arec_size = (mp_ofs + attr_size + 7) & ~7; /* Resize the resident part of the attribute record. */ @@ -1782,25 +1881,18 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) * could happen in theory), but deal with it as well as we can. * If the old size is too small, truncate the attribute, * otherwise simply give it a larger allocated size. - * FIXME: Should check whether chkdsk complains when the - * allocated size is much bigger than the resident value size. */ arec_size = le32_to_cpu(a->length); if ((mp_ofs + attr_size) > arec_size) { err2 = attr_size; attr_size = arec_size - mp_ofs; - ntfs_error(vol->sb, "Failed to undo partial resident " - "to non-resident attribute " - "conversion. Truncating inode 0x%lx, " - "attribute type 0x%x from %i bytes to " - "%i bytes to maintain metadata " - "consistency. THIS MEANS YOU ARE " - "LOSING %i BYTES DATA FROM THIS %s.", + ntfs_error(vol->sb, + "Failed to undo partial resident to non-resident attribute conversion. Truncating inode 0x%lx, attribute type 0x%x from %i bytes to %i bytes to maintain metadata consistency. THIS MEANS YOU ARE LOSING %i BYTES DATA FROM THIS %s.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), + (unsigned int)le32_to_cpu(ni->type), err2, attr_size, err2 - attr_size, ((ni->type == AT_DATA) && - !ni->name_len) ? "FILE": "ATTRIBUTE"); + !ni->name_len) ? "FILE" : "ATTRIBUTE"); write_lock_irqsave(&ni->size_lock, flags); ni->initialized_size = attr_size; i_size_write(vi, attr_size); @@ -1813,812 +1905,3480 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) a->data.resident.flags = old_res_attr_flags; memset(&a->data.resident.reserved, 0, sizeof(a->data.resident.reserved)); - /* Copy the data from the page back to the attribute value. */ - if (page) { - kaddr = kmap_atomic(page); - memcpy((u8*)a + mp_ofs, kaddr, attr_size); - kunmap_atomic(kaddr); - } + /* Copy the data from folio back to the attribute value. */ + if (folio) + memcpy_from_folio((u8 *)a + mp_ofs, folio, 0, attr_size); /* Setup the allocated size in the ntfs inode in case it changed. */ write_lock_irqsave(&ni->size_lock, flags); ni->allocated_size = arec_size - mp_ofs; write_unlock_irqrestore(&ni->size_lock, flags); /* Mark the mft record dirty, so it gets written back. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - ni->runlist.rl = NULL; - up_write(&ni->runlist.lock); rl_err_out: + up_write(&ni->runlist.lock); if (rl) { if (ntfs_cluster_free_from_rl(vol, rl) < 0) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); + ntfs_error(vol->sb, + "Failed to release allocated cluster(s) in error code path. 
Run chkdsk to recover the lost cluster(s)."); NVolSetErrors(vol); } ntfs_free(rl); -page_err_out: - unlock_page(page); - put_page(page); +folio_err_out: + folio_unlock(folio); + folio_put(folio); } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + if (err == -EINVAL) err = -EIO; return err; } /** - * ntfs_attr_extend_allocation - extend the allocated space of an attribute - * @ni: ntfs inode of the attribute whose allocation to extend - * @new_alloc_size: new size in bytes to which to extend the allocation to - * @new_data_size: new size in bytes to which to extend the data to - * @data_start: beginning of region which is required to be non-sparse - * - * Extend the allocated space of an attribute described by the ntfs inode @ni - * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be - * implemented as a hole in the file (as long as both the volume and the ntfs - * inode @ni have sparse support enabled). If @data_start is >= 0, then the - * region between the old allocated size and @data_start - 1 may be made sparse - * but the regions between @data_start and @new_alloc_size must be backed by - * actual clusters. - * - * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size - * of the attribute is extended to @new_data_size. Note that the i_size of the - * vfs inode is not updated. Only the data size in the base attribute record - * is updated. The caller has to update i_size separately if this is required. - * WARNING: It is a BUG() for @new_data_size to be smaller than the old data - * size as well as for @new_data_size to be greater than @new_alloc_size. - * - * For resident attributes this involves resizing the attribute record and if - * necessary moving it and/or other attributes into extent mft records and/or - * converting the attribute to a non-resident attribute which in turn involves - * extending the allocation of a non-resident attribute as described below. - * - * For non-resident attributes this involves allocating clusters in the data - * zone on the volume (except for regions that are being made sparse) and - * extending the run list to describe the allocated clusters as well as - * updating the mapping pairs array of the attribute. This in turn involves - * resizing the attribute record and if necessary moving it and/or other - * attributes into extent mft records and/or splitting the attribute record - * into multiple extent attribute records. - * - * Also, the attribute list attribute is updated if present and in some of the - * above cases (the ones where extent mft records/attributes come into play), - * an attribute list attribute is created if not already present. - * - * Return the new allocated size on success and -errno on error. In the case - * that an error is encountered but a partial extension at least up to - * @data_start (if present) is possible, the allocation is partially extended - * and this is returned. This means the caller must check the returned size to - * determine if the extension was partial. If @data_start is -1 then partial - * allocations are not performed. - * - * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. - * - * Locking: This function takes the runlist lock of @ni for writing as well as - * locking the mft record of the base ntfs inode. These locks are maintained - * throughout execution of the function. 
These locks are required so that the - * attribute can be resized safely and so that it can for example be converted - * from resident to non-resident safely. - * - * TODO: At present attribute list attribute handling is not implemented. - * - * TODO: At present it is not safe to call this function for anything other - * than the $DATA attribute(s) of an uncompressed and unencrypted file. + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note this function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. */ -s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, - const s64 new_data_size, const s64 data_start) +int ntfs_attr_set(struct ntfs_inode *ni, s64 ofs, s64 cnt, const u8 val) { - VCN vcn; - s64 ll, allocated_size, start = data_start; - struct inode *vi = VFS_I(ni); - ntfs_volume *vol = ni->vol; - ntfs_inode *base_ni; - MFT_RECORD *m; - ATTR_RECORD *a; - ntfs_attr_search_ctx *ctx; - runlist_element *rl, *rl2; - unsigned long flags; - int err, mp_size; - u32 attr_len = 0; /* Silence stupid gcc warning. */ - bool mp_rebuilt; - -#ifdef DEBUG - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "old_allocated_size 0x%llx, " - "new_allocated_size 0x%llx, new_data_size 0x%llx, " - "data_start 0x%llx.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - (unsigned long long)allocated_size, - (unsigned long long)new_alloc_size, - (unsigned long long)new_data_size, - (unsigned long long)start); -#endif -retry_extend: - /* - * For non-resident attributes, @start and @new_size need to be aligned - * to cluster boundaries for allocation purposes. - */ - if (NInoNonResident(ni)) { - if (start > 0) - start &= ~(s64)vol->cluster_size_mask; - new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & - ~(s64)vol->cluster_size_mask; - } - BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); - /* Check if new size is allowed in $AttrDef. */ - err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); - if (unlikely(err)) { - /* Only emit errors when the write will fail completely. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ERANGE) { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the new " - "allocation would exceed the " - "maximum allowed size for " - "this attribute type.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } else { - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because this " - "attribute type is not " - "defined on the NTFS volume. " - "Possible corruption! 
You " - "should run chkdsk!", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - } + struct address_space *mapping = VFS_I(ni)->i_mapping; + struct folio *folio; + pgoff_t index; + u8 *addr; + unsigned long offset; + size_t attr_len; + int ret = 0; + + index = ofs >> PAGE_SHIFT; + while (cnt) { + folio = read_mapping_folio(mapping, index, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + ntfs_error(VFS_I(ni)->i_sb, "Failed to read a page %lu for attr %#x: %ld", + index, ni->type, PTR_ERR(folio)); + break; } - /* Translate error code to be POSIX conformant for write(2). */ - if (err == -ERANGE) - err = -EFBIG; + + offset = offset_in_folio(folio, ofs); + attr_len = min_t(size_t, (size_t)cnt, folio_size(folio) - offset); + + folio_lock(folio); + addr = kmap_local_folio(folio, offset); + memset(addr, val, attr_len); + kunmap_local(addr); + + flush_dcache_folio(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); + + ofs += attr_len; + cnt -= attr_len; + index++; + cond_resched(); + } + + return ret; +} + +int ntfs_attr_set_initialized_size(struct ntfs_inode *ni, loff_t new_size) +{ + struct ntfs_attr_search_ctx *ctx; + int err = 0; + + if (!NInoNonResident(ni)) + return -EINVAL; + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) + return -ENOMEM; + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (err) + goto out_ctx; + + ctx->attr->data.non_resident.initialized_size = cpu_to_le64(new_size); + ni->initialized_size = new_size; + mark_mft_record_dirty(ctx->ntfs_ino); +out_ctx: + ntfs_attr_put_search_ctx(ctx); + return err; +} + +/** + * ntfs_make_room_for_attr - make room for an attribute inside an mft record + * @m: mft record + * @pos: position at which to make space + * @size: byte size to make available at this position + * + * @pos points to the attribute in front of which we want to make space. + */ +static int ntfs_make_room_for_attr(struct mft_record *m, u8 *pos, u32 size) +{ + u32 biu; + + ntfs_debug("Entering for pos 0x%x, size %u.\n", + (int)(pos - (u8 *)m), (unsigned int) size); + + /* Make size 8-byte alignment. */ + size = (size + 7) & ~7; + + /* Rigorous consistency checks. */ + if (!m || !pos || pos < (u8 *)m) { + pr_err("%s: pos=%p m=%p", __func__, pos, m); + return -EINVAL; + } + + /* The -8 is for the attribute terminator. */ + if (pos - (u8 *)m > (int)le32_to_cpu(m->bytes_in_use) - 8) + return -EINVAL; + /* Nothing to do. */ + if (!size) + return 0; + + biu = le32_to_cpu(m->bytes_in_use); + /* Do we have enough space? */ + if (biu + size > le32_to_cpu(m->bytes_allocated) || + pos + size > (u8 *)m + le32_to_cpu(m->bytes_allocated)) { + ntfs_debug("No enough space in the MFT record\n"); + return -ENOSPC; + } + /* Move everything after pos to pos + size. */ + memmove(pos + size, pos, biu - (pos - (u8 *)m)); + /* Update mft record. 
*/ + m->bytes_in_use = cpu_to_le32(biu + size); + return 0; +} + +/** + * ntfs_resident_attr_record_add - add resident attribute to inode + * @ni: opened ntfs inode to whose mft record to add the attribute + * @type: type of the new attribute + * @name: name of the new attribute + * @name_len: name length of the new attribute + * @val: value of the new attribute + * @size: size of new attribute (length of @val, if @val != NULL) + * @flags: flags of the new attribute + */ +int ntfs_resident_attr_record_add(struct ntfs_inode *ni, __le32 type, + __le16 *name, u8 name_len, u8 *val, u32 size, + __le16 flags) +{ + struct ntfs_attr_search_ctx *ctx; + u32 length; + struct attr_record *a; + struct mft_record *m; + int err, offset; + struct ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, flags 0x%x.\n", + (long long) ni->mft_no, (unsigned int) le32_to_cpu(type), + (unsigned int) le16_to_cpu(flags)); + + if (!ni || (!name && name_len)) + return -EINVAL; + + err = ntfs_attr_can_be_resident(ni->vol, type); + if (err) { + if (err == -EPERM) + ntfs_debug("Attribute can't be resident.\n"); else - ntfs_debug("ntfs_attr_can_be_resident failed.\n"); + return err; + } + + /* Locate the place where the record should be. */ + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + ntfs_error(ni->vol->sb, "%s: Failed to get search context", + __func__); + return -ENOMEM; + } + + /* + * Use ntfs_attr_find instead of ntfs_attr_lookup to find place for + * attribute in @ni->mrec, not in any extent inode, in case @ni is a base + * file record. + */ + err = ntfs_attr_find(type, name, name_len, CASE_SENSITIVE, val, size, ctx); + if (!err) { + err = -EEXIST; + ntfs_debug("Attribute already present.\n"); + goto put_err_out; + } + if (err != -ENOENT) { + err = -EIO; + goto put_err_out; + } + a = ctx->attr; - /* Use goto to reduce indentation. */ - if (a->non_resident) - goto do_non_resident_extend; - BUG_ON(NInoNonResident(ni)); - /* The total length of the attribute value. 
*/ - attr_len = le32_to_cpu(a->data.resident.value_length); - /* - * Extend the attribute record to be able to store the new attribute - * size. ntfs_attr_record_resize() will not do anything if the size is - * not changing. - */ - if (new_alloc_size < vol->mft_record_size && - !ntfs_attr_record_resize(m, a, - le16_to_cpu(a->data.resident.value_offset) + - new_alloc_size)) { - /* The resize succeeded! */ - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = le32_to_cpu(a->length) - - le16_to_cpu(a->data.resident.value_offset); - write_unlock_irqrestore(&ni->size_lock, flags); - if (new_data_size >= 0) { - BUG_ON(new_data_size < attr_len); - a->data.resident.value_length = - cpu_to_le32((u32)new_data_size); - } - goto flush_done; + m = ctx->mrec; + + /* Make room for attribute. */ + length = offsetof(struct attr_record, data.resident.reserved) + + sizeof(a->data.resident.reserved) + + ((name_len * sizeof(__le16) + 7) & ~7) + + ((size + 7) & ~7); + err = ntfs_make_room_for_attr(ctx->mrec, (u8 *) ctx->attr, length); + if (err) { + ntfs_debug("Failed to make room for attribute.\n"); + goto put_err_out; } - /* - * We have to drop all the locks so we can call - * ntfs_attr_make_non_resident(). This could be optimised by try- - * locking the first page cache page and only if that fails dropping - * the locks, locking the page, and redoing all the locking and - * lookups. While this would be a huge optimisation, it is not worth - * it as this is definitely a slow code path. - */ + + /* Setup record fields. */ + offset = ((u8 *)a - (u8 *)m); + a->type = type; + a->length = cpu_to_le32(length); + a->non_resident = 0; + a->name_length = name_len; + a->name_offset = + name_len ? cpu_to_le16((offsetof(struct attr_record, data.resident.reserved) + + sizeof(a->data.resident.reserved))) : cpu_to_le16(0); + + a->flags = flags; + a->instance = m->next_attr_instance; + a->data.resident.value_length = cpu_to_le32(size); + a->data.resident.value_offset = cpu_to_le16(length - ((size + 7) & ~7)); + if (val) + memcpy((u8 *)a + le16_to_cpu(a->data.resident.value_offset), val, size); + else + memset((u8 *)a + le16_to_cpu(a->data.resident.value_offset), 0, size); + if (type == AT_FILE_NAME) + a->data.resident.flags = RESIDENT_ATTR_IS_INDEXED; + else + a->data.resident.flags = 0; + if (name_len) + memcpy((u8 *)a + le16_to_cpu(a->name_offset), + name, sizeof(__le16) * name_len); + m->next_attr_instance = + cpu_to_le16((le16_to_cpu(m->next_attr_instance) + 1) & 0xffff); + if (ni->nr_extents == -1) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + if (type != AT_ATTRIBUTE_LIST && NInoAttrList(base_ni)) { + err = ntfs_attrlist_entry_add(ni, a); + if (err) { + ntfs_attr_record_resize(m, a, 0); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_debug("Failed to add attribute entry to ATTRIBUTE_LIST.\n"); + goto put_err_out; + } + } + mark_mft_record_dirty(ni); ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * Not enough space in the mft record, try to make the attribute - * non-resident and if successful restart the extension process. - */ - err = ntfs_attr_make_non_resident(ni, attr_len); - if (likely(!err)) - goto retry_extend; - /* - * Could not make non-resident. If this is due to this not being - * permitted for this attribute type or there not being enough space, - * try to make other attributes non-resident. Otherwise fail. - */ - if (unlikely(err != -EPERM && err != -ENOSPC)) { - /* Only emit errors when the write will fail completely. 
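A usage sketch for the record-add helper above (invented caller; a negative return is an errno, a non-negative one is the byte offset of the new record inside the mft record):

/* Illustrative only: add a small, unnamed, zero-filled resident
 * attribute to @ni. */
static int add_small_resident(struct ntfs_inode *ni, __le32 type, u32 size)
{
	int offset = ntfs_resident_attr_record_add(ni, type, NULL, 0,
			NULL, size, cpu_to_le16(0));

	if (offset < 0)
		ntfs_debug("adding attribute failed: %d", offset);
	return offset;
}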
*/ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the conversion from resident " - "to non-resident attribute failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - goto conv_err_out; - } - /* TODO: Not implemented from here, abort. */ - read_lock_irqsave(&ni->size_lock, flags); - allocated_size = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (start < 0 || start >= allocated_size) { - if (err == -ENOSPC) - ntfs_error(vol->sb, "Not enough space in the mft " - "record/on disk for the non-resident " - "attribute value. This case is not " - "implemented yet."); - else /* if (err == -EPERM) */ - ntfs_error(vol->sb, "This attribute type may not be " - "non-resident. This case is not " - "implemented yet."); - } - err = -EOPNOTSUPP; - goto conv_err_out; -#if 0 - // TODO: Attempt to make other attributes non-resident. - if (!err) - goto do_resident_extend; - /* - * Both the attribute list attribute and the standard information - * attribute must remain in the base inode. Thus, if this is one of - * these attributes, we have to try to move other attributes out into - * extent mft records instead. - */ - if (ni->type == AT_ATTRIBUTE_LIST || - ni->type == AT_STANDARD_INFORMATION) { - // TODO: Attempt to move other attributes into extent mft - // records. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - goto err_out; + return offset; +put_err_out: + ntfs_attr_put_search_ctx(ctx); + return -EIO; +} + +/** + * ntfs_non_resident_attr_record_add - add extent of non-resident attribute + * @ni: opened ntfs inode to which MFT record add attribute + * @type: type of the new attribute extent + * @name: name of the new attribute extent + * @name_len: name length of the new attribute extent + * @lowest_vcn: lowest vcn of the new attribute extent + * @dataruns_size: dataruns size of the new attribute extent + * @flags: flags of the new attribute extent + */ +static int ntfs_non_resident_attr_record_add(struct ntfs_inode *ni, __le32 type, + __le16 *name, u8 name_len, s64 lowest_vcn, int dataruns_size, + __le16 flags) +{ + struct ntfs_attr_search_ctx *ctx; + u32 length; + struct attr_record *a; + struct mft_record *m; + struct ntfs_inode *base_ni; + int err, offset; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, lowest_vcn %lld, dataruns_size %d, flags 0x%x.\n", + (long long) ni->mft_no, (unsigned int) le32_to_cpu(type), + (long long) lowest_vcn, dataruns_size, + (unsigned int) le16_to_cpu(flags)); + + if (!ni || dataruns_size <= 0 || (!name && name_len)) + return -EINVAL; + + err = ntfs_attr_can_be_non_resident(ni->vol, type); + if (err) { + if (err == -EPERM) + pr_err("Attribute can't be non resident"); + else + pr_err("ntfs_attr_can_be_non_resident failed"); + return err; } - // TODO: Attempt to move this attribute to an extent mft record, but - // only if it is not already the only attribute in an mft record in - // which case there would be nothing to gain. - err = -EOPNOTSUPP; - if (!err) - goto do_resident_extend; - /* There is nothing we can do to make enough space. 
)-: */ - goto err_out; -#endif -do_non_resident_extend: - BUG_ON(!NInoNonResident(ni)); - if (new_alloc_size == allocated_size) { - BUG_ON(vcn); - goto alloc_done; + + /* Locate place where record should be. */ + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + pr_err("%s: Failed to get search context", __func__); + return -ENOMEM; } /* - * If the data starts after the end of the old allocation, this is a - * $DATA attribute and sparse attributes are enabled on the volume and - * for this inode, then create a sparse region between the old - * allocated size and the start of the data. Otherwise simply proceed - * with filling the whole space between the old allocated size and the - * new allocated size with clusters. + * Use ntfs_attr_find instead of ntfs_attr_lookup to find place for + * attribute in @ni->mrec, not any extent inode in case if @ni is base + * file record. */ - if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || - !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) - goto skip_sparse; - // TODO: This is not implemented yet. We just fill in with real - // clusters for now... - ntfs_debug("Inserting holes is not-implemented yet. Falling back to " - "allocating real clusters instead."); -skip_sparse: - rl = ni->runlist.rl; - if (likely(rl)) { - /* Seek to the end of the runlist. */ - while (rl->length) - rl++; - } - /* If this attribute extent is not mapped, map it now. */ - if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || - (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && - (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { - if (!rl && !allocated_size) - goto first_alloc; - rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation " - "of inode 0x%lx, attribute " - "type 0x%x, because the " - "mapping of a runlist " - "fragment failed with error " - "code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), - err); - if (err != -ENOMEM) - err = -EIO; - goto err_out; - } - ni->runlist.rl = rl; - /* Seek to the end of the runlist. */ - while (rl->length) - rl++; + err = ntfs_attr_find(type, name, name_len, CASE_SENSITIVE, NULL, 0, ctx); + if (!err) { + err = -EEXIST; + pr_err("Attribute 0x%x already present", type); + goto put_err_out; } - /* - * We now know the runlist of the last extent is mapped and @rl is at - * the end of the runlist. We want to begin allocating clusters - * starting at the last allocated cluster to reduce fragmentation. If - * there are no valid LCNs in the attribute we let the cluster - * allocator choose the starting cluster. - */ - /* If the last LCN is a hole or simillar seek back to last real LCN. */ - while (rl->lcn < 0 && rl > ni->runlist.rl) - rl--; -first_alloc: - // FIXME: Need to implement partial allocations so at least part of the - // write can be performed when start >= 0. (Needed for POSIX write(2) - // conformance.) - rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, - (new_alloc_size - allocated_size) >> - vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? 
- rl->lcn + rl->length : -1, DATA_ZONE, true); - if (IS_ERR(rl2)) { - err = PTR_ERR(rl2); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the allocation of clusters " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM && err != -ENOSPC) - err = -EIO; - goto err_out; + if (err != -ENOENT) { + pr_err("ntfs_attr_find failed"); + err = -EIO; + goto put_err_out; } - rl = ntfs_runlists_merge(ni->runlist.rl, rl2); - if (IS_ERR(rl)) { - err = PTR_ERR(rl); - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because the runlist merge failed " - "with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err != -ENOMEM) - err = -EIO; - if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to release allocated " - "cluster(s) in error code path. Run " - "chkdsk to recover the lost " - "cluster(s)."); - NVolSetErrors(vol); - } - ntfs_free(rl2); - goto err_out; + a = ctx->attr; + m = ctx->mrec; + + /* Make room for attribute. */ + dataruns_size = (dataruns_size + 7) & ~7; + length = offsetof(struct attr_record, data.non_resident.compressed_size) + + ((sizeof(__le16) * name_len + 7) & ~7) + dataruns_size + + ((flags & (ATTR_IS_COMPRESSED | ATTR_IS_SPARSE)) ? + sizeof(a->data.non_resident.compressed_size) : 0); + err = ntfs_make_room_for_attr(ctx->mrec, (u8 *) ctx->attr, length); + if (err) { + pr_err("Failed to make room for attribute"); + goto put_err_out; } - ni->runlist.rl = rl; - ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - - allocated_size) >> vol->cluster_size_bits); - /* Find the runlist element with which the attribute extent starts. */ - ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); - rl2 = ntfs_rl_find_vcn_nolock(rl, ll); - BUG_ON(!rl2); - BUG_ON(!rl2->length); - BUG_ON(rl2->lcn < LCN_HOLE); - mp_rebuilt = false; - /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); - if (unlikely(mp_size <= 0)) { - err = mp_size; - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because determining the size for the " - "mapping pairs failed with error code " - "%i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; + + /* Setup record fields. */ + a->type = type; + a->length = cpu_to_le32(length); + a->non_resident = 1; + a->name_length = name_len; + a->name_offset = cpu_to_le16(offsetof(struct attr_record, + data.non_resident.compressed_size) + + ((flags & (ATTR_IS_COMPRESSED | ATTR_IS_SPARSE)) ? + sizeof(a->data.non_resident.compressed_size) : 0)); + a->flags = flags; + a->instance = m->next_attr_instance; + a->data.non_resident.lowest_vcn = cpu_to_le64(lowest_vcn); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(length - dataruns_size); + a->data.non_resident.compression_unit = + (flags & ATTR_IS_COMPRESSED) ? STANDARD_COMPRESSION_UNIT : 0; + /* If @lowest_vcn == 0, than setup empty attribute. */ + if (!lowest_vcn) { + a->data.non_resident.highest_vcn = cpu_to_le64(-1); + a->data.non_resident.allocated_size = 0; + a->data.non_resident.data_size = 0; + a->data.non_resident.initialized_size = 0; + /* Set empty mapping pairs. 
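/*
 * Aside (not part of the patch): how the optional compressed_size field
 * changes the non-resident layout computed above. The 0x40 constant is an
 * illustrative stand-in for offsetof(struct attr_record,
 * data.non_resident.compressed_size); for compressed or sparse attributes
 * the 8-byte compressed_size field is present, shifting the name and the
 * mapping pairs (dataruns) up by 8 bytes.
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN8(x) (((x) + 7u) & ~7u)

static uint32_t nonres_len(uint8_t name_len, uint32_t dataruns, int cs)
{
	return 0x40 + (cs ? 8u : 0u) + ALIGN8(name_len * 2u) + ALIGN8(dataruns);
}

int main(void)
{
	/* Unnamed, 8 bytes of mapping pairs: 0x48 plain, 0x50 compressed. */
	printf("plain 0x%x, compressed/sparse 0x%x\n",
	       (unsigned)nonres_len(0, 8, 0), (unsigned)nonres_len(0, 8, 1));
	return 0;
}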
*/ + *((u8 *)a + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)) = 0; } - /* Extend the attribute record to fit the bigger mapping pairs array. */ - attr_len = le32_to_cpu(a->length); - err = ntfs_attr_record_resize(m, a, mp_size + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); - if (unlikely(err)) { - BUG_ON(err != -ENOSPC); - // TODO: Deal with this by moving this extent to a new mft - // record or by starting a new extent in a new mft record, - // possibly by extending this extent partially and filling it - // and creating a new extent for the remainder, or by making - // other attributes non-resident and/or by moving other - // attributes out of this mft record. - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Not enough space in the mft " - "record for the extended attribute " - "record. This case is not " - "implemented yet."); - err = -EOPNOTSUPP; - goto undo_alloc; - } - mp_rebuilt = true; - /* Generate the mapping pairs array directly into the attr record. */ - err = ntfs_mapping_pairs_build(vol, (u8*)a + - le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, -1, NULL); - if (unlikely(err)) { - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot extend allocation of " - "inode 0x%lx, attribute type 0x%x, " - "because building the mapping pairs " - "failed with error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - err = -EIO; - goto undo_alloc; + if (name_len) + memcpy((u8 *)a + le16_to_cpu(a->name_offset), + name, sizeof(__le16) * name_len); + m->next_attr_instance = + cpu_to_le16((le16_to_cpu(m->next_attr_instance) + 1) & 0xffff); + if (ni->nr_extents == -1) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + if (type != AT_ATTRIBUTE_LIST && NInoAttrList(base_ni)) { + err = ntfs_attrlist_entry_add(ni, a); + if (err) { + pr_err("Failed add attr entry to attrlist"); + ntfs_attr_record_resize(m, a, 0); + goto put_err_out; + } } - /* Update the highest_vcn. */ - a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> - vol->cluster_size_bits) - 1); + mark_mft_record_dirty(ni); /* - * We now have extended the allocated size of the attribute. Reflect - * this in the ntfs_inode structure and the attribute record. + * Locate offset from start of the MFT record where new attribute is + * placed. We need relookup it, because record maybe moved during + * update of attribute list. */ - if (a->data.non_resident.lowest_vcn) { - /* - * We are not in the first attribute extent, switch to it, but - * first ensure the changes will make it to disk later. - */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - ntfs_attr_reinit_search_ctx(ctx); - err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, - CASE_SENSITIVE, 0, NULL, 0, ctx); - if (unlikely(err)) - goto restore_undo_alloc; - /* @m is not used any more so no need to set it. */ - a = ctx->attr; - } - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); - /* - * FIXME: This would fail if @ni is a directory, $MFT, or an index, - * since those can have sparse/compressed set. For example can be - * set compressed even though it is not compressed itself and in that - * case the bit means that files are to be created compressed in the - * directory... At present this is ok as this code is only called for - * regular files, and only for their $DATA attribute(s). 
- * FIXME: The calculation is wrong if we created a hole above. For now - * it does not matter as we never create holes. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - allocated_size; - a->data.non_resident.compressed_size = - cpu_to_sle64(ni->itype.compressed.size); - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); -alloc_done: - if (new_data_size >= 0) { - BUG_ON(new_data_size < - sle64_to_cpu(a->data.non_resident.data_size)); - a->data.non_resident.data_size = cpu_to_sle64(new_data_size); - } -flush_done: - /* Ensure the changes make it to disk. */ - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); -done: - ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - ntfs_debug("Done, new_allocated_size 0x%llx.", - (unsigned long long)new_alloc_size); - return new_alloc_size; -restore_undo_alloc: - if (start < 0 || start >= allocated_size) - ntfs_error(vol->sb, "Cannot complete extension of allocation " - "of inode 0x%lx, attribute type 0x%x, because " - "lookup of first attribute extent failed with " - "error code %i.", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - if (err == -ENOENT) - err = -EIO; ntfs_attr_reinit_search_ctx(ctx); - if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, - allocated_size >> vol->cluster_size_bits, NULL, 0, - ctx)) { - ntfs_error(vol->sb, "Failed to find last attribute extent of " - "attribute in error code path. Run chkdsk to " - "recover."); - write_lock_irqsave(&ni->size_lock, flags); - ni->allocated_size = new_alloc_size; - /* - * FIXME: This would fail if @ni is a directory... See above. - * FIXME: The calculation is wrong if we created a hole above. - * For now it does not matter as we never create holes. - */ - if (NInoSparse(ni) || NInoCompressed(ni)) { - ni->itype.compressed.size += new_alloc_size - - allocated_size; - vi->i_blocks = ni->itype.compressed.size >> 9; - } else - vi->i_blocks = new_alloc_size >> 9; - write_unlock_irqrestore(&ni->size_lock, flags); + err = ntfs_attr_lookup(type, name, name_len, CASE_SENSITIVE, + lowest_vcn, NULL, 0, ctx); + if (err) { + pr_err("%s: attribute lookup failed", __func__); ntfs_attr_put_search_ctx(ctx); - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); - /* - * The only thing that is now wrong is the allocated size of the - * base attribute extent which chkdsk should be able to fix. - */ - NVolSetErrors(vol); return err; + } - ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( - (allocated_size >> vol->cluster_size_bits) - 1); -undo_alloc: - ll = allocated_size >> vol->cluster_size_bits; - if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { - ntfs_error(vol->sb, "Failed to release allocated cluster(s) " - "in error code path. Run chkdsk to recover " - "the lost cluster(s)."); - NVolSetErrors(vol); + offset = (u8 *)ctx->attr - (u8 *)ctx->mrec; + ntfs_attr_put_search_ctx(ctx); + return offset; +put_err_out: + ntfs_attr_put_search_ctx(ctx); + return -1; +} + +/** + * ntfs_attr_record_rm - remove attribute extent + * @ctx: search context describing the attribute which should be removed + * + * If this function succeed, user should reinit search context if he/she wants + * use it anymore. 
+ */ +int ntfs_attr_record_rm(struct ntfs_attr_search_ctx *ctx) +{ + struct ntfs_inode *base_ni, *ni; + __le32 type; + int err; + + if (!ctx || !ctx->ntfs_ino || !ctx->mrec || !ctx->attr) + return -EINVAL; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n", + (long long) ctx->ntfs_ino->mft_no, + (unsigned int) le32_to_cpu(ctx->attr->type)); + type = ctx->attr->type; + ni = ctx->ntfs_ino; + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + + /* Remove attribute itself. */ + if (ntfs_attr_record_resize(ctx->mrec, ctx->attr, 0)) { + ntfs_debug("Couldn't remove attribute record. Bug or damaged MFT record.\n"); + return -EIO; } - m = ctx->mrec; - a = ctx->attr; + mark_mft_record_dirty(ni); + /* - * If the runlist truncation fails and/or the search context is no - * longer valid, we cannot resize the attribute record or build the - * mapping pairs array thus we mark the inode bad so that no access to - * the freed clusters can happen. + * Remove record from $ATTRIBUTE_LIST if present and we don't want + * delete $ATTRIBUTE_LIST itself. */ - if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { - ntfs_error(vol->sb, "Failed to %s in error code path. Run " - "chkdsk to recover.", IS_ERR(m) ? - "restore attribute search context" : - "truncate attribute runlist"); - NVolSetErrors(vol); - } else if (mp_rebuilt) { - if (ntfs_attr_record_resize(m, a, attr_len)) { - ntfs_error(vol->sb, "Failed to restore attribute " - "record in error code path. Run " - "chkdsk to recover."); - NVolSetErrors(vol); - } else /* if (success) */ { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( - a->data.non_resident. - mapping_pairs_offset), attr_len - - le16_to_cpu(a->data.non_resident. - mapping_pairs_offset), rl2, ll, -1, - NULL)) { - ntfs_error(vol->sb, "Failed to restore " - "mapping pairs array in error " - "code path. Run chkdsk to " - "recover."); - NVolSetErrors(vol); + if (NInoAttrList(base_ni) && type != AT_ATTRIBUTE_LIST) { + err = ntfs_attrlist_entry_rm(ctx); + if (err) { + ntfs_debug("Couldn't delete record from $ATTRIBUTE_LIST.\n"); + return err; + } + } + + /* Post $ATTRIBUTE_LIST delete setup. */ + if (type == AT_ATTRIBUTE_LIST) { + if (NInoAttrList(base_ni) && base_ni->attr_list) + ntfs_free(base_ni->attr_list); + base_ni->attr_list = NULL; + NInoClearAttrList(base_ni); + } + + /* Free MFT record, if it doesn't contain attributes. */ + if (le32_to_cpu(ctx->mrec->bytes_in_use) - + le16_to_cpu(ctx->mrec->attrs_offset) == 8) { + if (ntfs_mft_record_free(ni->vol, ni)) { + ntfs_debug("Couldn't free MFT record.\n"); + return -EIO; + } + /* Remove done if we freed base inode. */ + if (ni == base_ni) + return 0; + ntfs_inode_close(ni); + ctx->ntfs_ino = ni = NULL; + } + + if (type == AT_ATTRIBUTE_LIST || !NInoAttrList(base_ni)) + return 0; + + /* Remove attribute list if we don't need it any more. */ + if (!ntfs_attrlist_need(base_ni)) { + struct ntfs_attr na; + struct inode *attr_vi; + + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, CASE_SENSITIVE, + 0, NULL, 0, ctx)) { + ntfs_debug("Couldn't find attribute list. Succeed anyway.\n"); + return 0; + } + /* Deallocate clusters. */ + if (ctx->attr->non_resident) { + struct runlist_element *al_rl; + size_t new_rl_count; + + al_rl = ntfs_mapping_pairs_decompress(base_ni->vol, + ctx->attr, NULL, &new_rl_count); + if (IS_ERR(al_rl)) { + ntfs_debug("Couldn't decompress attribute list runlist. 
Succeeding anyway.\n"); + return 0; + } + if (ntfs_cluster_free_from_rl(base_ni->vol, al_rl)) + ntfs_debug("Leaking clusters! Run chkdsk. Couldn't free clusters from attribute list runlist.\n"); + ntfs_free(al_rl); + } + /* Remove attribute record itself. */ + if (ntfs_attr_record_rm(ctx)) { + ntfs_debug("Couldn't remove attribute list. Succeeding anyway.\n"); + return 0; + } + + na.mft_no = VFS_I(base_ni)->i_ino; + na.type = AT_ATTRIBUTE_LIST; + na.name = NULL; + na.name_len = 0; + + attr_vi = ilookup5(VFS_I(base_ni)->i_sb, VFS_I(base_ni)->i_ino, + ntfs_test_inode, &na); + if (attr_vi) { + clear_nlink(attr_vi); + iput(attr_vi); + } + } -err_out: - if (ctx) - ntfs_attr_put_search_ctx(ctx); - if (m) - unmap_mft_record(base_ni); - up_write(&ni->runlist.lock); -conv_err_out: - ntfs_debug("Failed. Returning error code %i.", err); - return err; + return 0; } /** - * ntfs_attr_set - fill (a part of) an attribute with a byte - * @ni: ntfs inode describing the attribute to fill - * @ofs: offset inside the attribute at which to start to fill - * @cnt: number of bytes to fill - * @val: the unsigned 8-bit value with which to fill the attribute + * ntfs_attr_add - add attribute to inode + * @ni: opened ntfs inode to which to add the attribute + * @type: type of the new attribute + * @name: name in unicode of the new attribute + * @name_len: name length in unicode characters of the new attribute + * @val: value of new attribute + * @size: size of the new attribute / length of @val (if specified) * - * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at - * byte offset @ofs inside the attribute with the constant byte @val. + * @val should always be specified for always resident attributes (e.g. the FILE_NAME + * attribute); for attributes that can become non-resident @val can be NULL + * (e.g. the DATA attribute). @size can be specified even if @val is NULL, in which + * case the data size will be equal to @size and the initialized size will be + * 0. * - * This function is effectively like memset() applied to an ntfs attribute. - * Note this function actually only operates on the page cache pages belonging - * to the ntfs attribute and it marks them dirty after doing the memset(). - * Thus it relies on the vm dirty page write code paths to cause the modified - * pages to be written to the mft record/disk. + * If the base inode does not have enough space for the attribute, add it to + * one of its extents; if no extents are present, or none of them has enough + * space, then allocate a new extent and add the attribute to it. + * + * If at any of these steps an attribute list is needed but not yet present, it + * is added transparently to the caller. Hence, this function must not be called + * with @type == AT_ATTRIBUTE_LIST; if you really need to add an attribute list, + * call ntfs_inode_add_attrlist() instead. * - * Return 0 on success and -errno on error. An error code of -ESPIPE means - * that @ofs + @cnt were outside the end of the attribute and no write was - * performed. + * Return 0 on success and the negative error code on error.
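/*
 * Aside (not part of the patch): the "MFT record is empty" test used by
 * ntfs_attr_record_rm() above. Once the last attribute has been resized
 * away, the attribute area holds only the 4-byte end-of-attributes marker
 * (0xffffffff) padded to 8 bytes, so bytes_in_use - attrs_offset == 8 means
 * the record can be freed. struct fake_mft_record is an illustrative subset
 * of the real on-disk header.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_mft_record {
	uint32_t bytes_in_use;	/* bytes used in this record */
	uint16_t attrs_offset;	/* offset of the first attribute */
};

static int record_is_empty(const struct fake_mft_record *m)
{
	return m->bytes_in_use - m->attrs_offset == 8;
}

int main(void)
{
	struct fake_mft_record m = { .bytes_in_use = 0x40, .attrs_offset = 0x38 };

	printf("empty: %d\n", record_is_empty(&m));	/* prints: empty: 1 */
	return 0;
}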
*/ -int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +int ntfs_attr_add(struct ntfs_inode *ni, __le32 type, + __le16 *name, u8 name_len, u8 *val, s64 size) { - ntfs_volume *vol = ni->vol; - struct address_space *mapping; - struct page *page; - u8 *kaddr; - pgoff_t idx, end; - unsigned start_ofs, end_ofs, size; - - ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", - (long long)ofs, (long long)cnt, val); - BUG_ON(ofs < 0); - BUG_ON(cnt < 0); - if (!cnt) - goto done; + struct super_block *sb; + u32 attr_rec_size; + int err, i, offset; + bool is_resident; + bool can_be_non_resident = false; + struct ntfs_inode *attr_ni; + struct inode *attr_vi; + struct mft_record *ni_mrec; + + if (!ni || size < 0 || type == AT_ATTRIBUTE_LIST) + return -EINVAL; + + ntfs_debug("Entering for inode 0x%llx, attr %x, size %lld.\n", + (long long) ni->mft_no, type, size); + + if (ni->nr_extents == -1) + ni = ni->ext.base_ntfs_ino; + + /* Check the attribute type and the size. */ + err = ntfs_attr_size_bounds_check(ni->vol, type, size); + if (err) { + if (err == -ENOENT) + err = -EIO; + return err; + } + + sb = ni->vol->sb; + /* Sanity checks for always resident attributes. */ + err = ntfs_attr_can_be_non_resident(ni->vol, type); + if (err) { + if (err != -EPERM) { + ntfs_error(sb, "ntfs_attr_can_be_non_resident failed"); + goto err_out; + } + /* @val is mandatory. */ + if (!val) { + ntfs_error(sb, + "val is mandatory for always resident attributes"); + return -EINVAL; + } + if (size > ni->vol->mft_record_size) { + ntfs_error(sb, "Attribute is too big"); + return -ERANGE; + } + } else + can_be_non_resident = true; + /* - * FIXME: Compressed and encrypted attributes are not supported when - * writing and we should never have gotten here for them. + * Determine resident or not will be new attribute. We add 8 to size in + * non resident case for mapping pairs. */ - BUG_ON(NInoCompressed(ni)); - BUG_ON(NInoEncrypted(ni)); - mapping = VFS_I(ni)->i_mapping; - /* Work out the starting index and page offset. */ - idx = ofs >> PAGE_SHIFT; - start_ofs = ofs & ~PAGE_MASK; - /* Work out the ending index and page offset. */ - end = ofs + cnt; - end_ofs = end & ~PAGE_MASK; - /* If the end is outside the inode size return -ESPIPE. */ - if (unlikely(end > i_size_read(VFS_I(ni)))) { - ntfs_error(vol->sb, "Request exceeds end of attribute."); - return -ESPIPE; - } - end >>= PAGE_SHIFT; - /* If there is a first partial page, need to do it the slow way. */ - if (start_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read first partial " - "page (error, index 0x%lx).", idx); - return PTR_ERR(page); + err = ntfs_attr_can_be_resident(ni->vol, type); + if (!err) { + is_resident = true; + } else { + if (err != -EPERM) { + ntfs_error(sb, "ntfs_attr_can_be_resident failed"); + goto err_out; } - /* - * If the last page is the same as the first page, need to - * limit the write to the end offset. - */ - size = PAGE_SIZE; - if (idx == end) - size = end_ofs; - kaddr = kmap_atomic(page); - memset(kaddr + start_ofs, val, size - start_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - if (idx == end) - goto done; - idx++; - } - /* Do the whole pages the fast way. */ - for (; idx < end; idx++) { - /* Find or create the current page. (The page is locked.) 
*/ - page = grab_cache_page(mapping, idx); - if (unlikely(!page)) { - ntfs_error(vol->sb, "Insufficient memory to grab " - "page (index 0x%lx).", idx); - return -ENOMEM; + is_resident = false; + } + + /* Calculate attribute record size. */ + if (is_resident) + attr_rec_size = offsetof(struct attr_record, data.resident.reserved) + + 1 + + ((name_len * sizeof(__le16) + 7) & ~7) + + ((size + 7) & ~7); + else + attr_rec_size = offsetof(struct attr_record, data.non_resident.compressed_size) + + ((name_len * sizeof(__le16) + 7) & ~7) + 8; + + /* + * If we have enough free space for the new attribute in the base MFT + * record, then add attribute to it. + */ +retry: + ni_mrec = map_mft_record(ni); + if (IS_ERR(ni_mrec)) { + err = -EIO; + goto err_out; + } + + if (le32_to_cpu(ni_mrec->bytes_allocated) - + le32_to_cpu(ni_mrec->bytes_in_use) >= attr_rec_size) { + attr_ni = ni; + unmap_mft_record(ni); + goto add_attr_record; + } + unmap_mft_record(ni); + + /* Try to add to extent inodes. */ + err = ntfs_inode_attach_all_extents(ni); + if (err) { + ntfs_error(sb, "Failed to attach all extents to inode"); + goto err_out; + } + + for (i = 0; i < ni->nr_extents; i++) { + attr_ni = ni->ext.extent_ntfs_inos[i]; + ni_mrec = map_mft_record(attr_ni); + if (IS_ERR(ni_mrec)) { + err = -EIO; + goto err_out; } - kaddr = kmap_atomic(page); - memset(kaddr, val, PAGE_SIZE); - flush_dcache_page(page); - kunmap_atomic(kaddr); - /* - * If the page has buffers, mark them uptodate since buffer - * state and not page state is definitive in 2.6 kernels. - */ - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - bh = head = page_buffers(page); - do { - set_buffer_uptodate(bh); - } while ((bh = bh->b_this_page) != head); + if (le32_to_cpu(ni_mrec->bytes_allocated) - + le32_to_cpu(ni_mrec->bytes_in_use) >= + attr_rec_size) { + unmap_mft_record(attr_ni); + goto add_attr_record; + } + unmap_mft_record(attr_ni); + } + + /* There is no extent that contain enough space for new attribute. */ + if (!NInoAttrList(ni)) { + /* Add attribute list not present, add it and retry. */ + err = ntfs_inode_add_attrlist(ni); + if (err) { + ntfs_error(sb, "Failed to add attribute list"); + goto err_out; + } + goto retry; + } + + attr_ni = NULL; + /* Allocate new extent. */ + err = ntfs_mft_record_alloc(ni->vol, 0, &attr_ni, ni, NULL); + if (err) { + ntfs_error(sb, "Failed to allocate extent record"); + goto err_out; + } + unmap_mft_record(attr_ni); + +add_attr_record: + if (is_resident) { + /* Add resident attribute. */ + offset = ntfs_resident_attr_record_add(attr_ni, type, name, + name_len, val, size, 0); + if (offset < 0) { + if (offset == -ENOSPC && can_be_non_resident) + goto add_non_resident; + err = offset; + ntfs_error(sb, "Failed to add resident attribute"); + goto free_err_out; + } + return 0; + } + +add_non_resident: + /* Add non resident attribute. */ + offset = ntfs_non_resident_attr_record_add(attr_ni, type, name, + name_len, 0, 8, 0); + if (offset < 0) { + err = offset; + ntfs_error(sb, "Failed to add non resident attribute"); + goto free_err_out; + } + + /* If @size == 0, we are done. */ + if (!size) + return 0; + + /* Open new attribute and resize it. */ + attr_vi = ntfs_attr_iget(VFS_I(ni), type, name, name_len); + if (IS_ERR(attr_vi)) { + ntfs_error(sb, "Failed to open just added attribute"); + goto rm_attr_err_out; + } + attr_ni = NTFS_I(attr_vi); + + /* Resize and set attribute value. 
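/*
 * Aside (not part of the patch): the probe order ntfs_attr_add() uses
 * above. Every candidate MFT record offers bytes_allocated - bytes_in_use
 * free bytes; the base record is tried first, then each attached extent
 * record, and only then is a fresh extent allocated (after transparently
 * adding an attribute list if one is missing). struct fake_rec and
 * pick_record() are hypothetical names for illustration only.
 */
#include <stdint.h>
#include <stddef.h>

struct fake_rec { uint32_t bytes_allocated, bytes_in_use; };

/* Return the first record with room for rec_size bytes, or NULL. */
static struct fake_rec *pick_record(struct fake_rec *base,
				    struct fake_rec *ext, size_t n,
				    uint32_t rec_size)
{
	size_t i;

	if (base->bytes_allocated - base->bytes_in_use >= rec_size)
		return base;
	for (i = 0; i < n; i++)
		if (ext[i].bytes_allocated - ext[i].bytes_in_use >= rec_size)
			return &ext[i];
	return NULL;	/* caller must allocate a new extent record */
}

int main(void)
{
	struct fake_rec base = { 1024, 1000 }, exts[1] = { { 1024, 200 } };

	/* Base has only 24 bytes free, so the extent is chosen. */
	return pick_record(&base, exts, 1, 100) == &exts[0] ? 0 : 1;
}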
*/ + if (ntfs_attr_truncate(attr_ni, size) || + (val && (ntfs_inode_attr_pwrite(attr_vi, 0, size, val, false) != size))) { + err = -EIO; + ntfs_error(sb, "Failed to initialize just added attribute"); + if (ntfs_attr_rm(attr_ni)) + ntfs_error(sb, "Failed to remove just added attribute"); + iput(attr_vi); + goto err_out; + } + iput(attr_vi); + return 0; + +rm_attr_err_out: + /* Remove the just added attribute. */ + ni_mrec = map_mft_record(attr_ni); + if (!IS_ERR(ni_mrec)) { + if (ntfs_attr_record_resize(ni_mrec, + (struct attr_record *)((u8 *)ni_mrec + offset), 0)) + ntfs_error(sb, "Failed to remove just added attribute #2"); + unmap_mft_record(attr_ni); + } else + pr_err("EIO when trying to remove newly added attr\n"); + +free_err_out: + /* Free the MFT record if it doesn't contain any attributes. */ + ni_mrec = map_mft_record(attr_ni); + if (!IS_ERR(ni_mrec)) { + int attr_size; + + attr_size = le32_to_cpu(ni_mrec->bytes_in_use) - + le16_to_cpu(ni_mrec->attrs_offset); + unmap_mft_record(attr_ni); + if (attr_size == 8) { + if (ntfs_mft_record_free(attr_ni->vol, attr_ni)) + ntfs_error(sb, "Failed to free MFT record"); + if (attr_ni->nr_extents < 0) + ntfs_inode_close(attr_ni); + } + } else + pr_err("EIO when testing whether mft record is freeable\n"); + +err_out: + return err; +} + +/** + * __ntfs_attr_init - primary initialization of an ntfs attribute structure + * @ni: ntfs attribute inode to initialize + * @type: attribute type + * @name: attribute name in little endian Unicode or NULL + * @name_len: length of attribute @name in Unicode characters (if @name given) + * + * Initialize the ntfs attribute inode @ni with @type, @name, and @name_len. + */ +static void __ntfs_attr_init(struct ntfs_inode *ni, + const __le32 type, __le16 *name, const u32 name_len) +{ + ni->runlist.rl = NULL; + ni->type = type; + ni->name = name; + if (name) + ni->name_len = name_len; + else + ni->name_len = 0; +} + +/** + * ntfs_attr_init - initialize an ntfs attribute inode with data sizes and status + * + * Final initialization for an ntfs attribute.
+ */ +static void ntfs_attr_init(struct ntfs_inode *ni, const bool non_resident, + const bool compressed, const bool encrypted, const bool sparse, + const s64 allocated_size, const s64 data_size, + const s64 initialized_size, const s64 compressed_size, + const u8 compression_unit) +{ + if (non_resident) + NInoSetNonResident(ni); + if (compressed) { + NInoSetCompressed(ni); + ni->flags |= FILE_ATTR_COMPRESSED; + } + if (encrypted) { + NInoSetEncrypted(ni); + ni->flags |= FILE_ATTR_ENCRYPTED; + } + if (sparse) { + NInoSetSparse(ni); + ni->flags |= FILE_ATTR_SPARSE_FILE; + } + ni->allocated_size = allocated_size; + ni->data_size = data_size; + ni->initialized_size = initialized_size; + if (compressed || sparse) { + struct ntfs_volume *vol = ni->vol; + + ni->itype.compressed.size = compressed_size; + ni->itype.compressed.block_clusters = 1 << compression_unit; + ni->itype.compressed.block_size = 1 << (compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = ffs( + ni->itype.compressed.block_size) - 1; + } +} + +/** + * ntfs_attr_open - open an ntfs attribute for access + * @ni: open ntfs inode in which the ntfs attribute resides + * @type: attribute type + * @name: attribute name in little endian Unicode or AT_UNNAMED or NULL + * @name_len: length of attribute @name in Unicode characters (if @name given) + */ +int ntfs_attr_open(struct ntfs_inode *ni, const __le32 type, + __le16 *name, u32 name_len) +{ + struct ntfs_attr_search_ctx *ctx; + __le16 *newname = NULL; + struct attr_record *a; + bool cs; + struct ntfs_inode *base_ni; + int err; + + ntfs_debug("Entering for inode %lld, attr 0x%x.\n", + (unsigned long long)ni->mft_no, type); + + if (!ni || !ni->vol) + return -EINVAL; + + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + if (name && name != AT_UNNAMED && name != I30) { + name = ntfs_ucsndup(name, name_len); + if (!name) { + err = -ENOMEM; + goto err_out; + } + newname = name; + } + + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + err = -ENOMEM; + pr_err("%s: Failed to get search context", __func__); + goto err_out; + } + + err = ntfs_attr_lookup(type, name, name_len, 0, 0, NULL, 0, ctx); + if (err) + goto put_err_out; + + a = ctx->attr; + + if (!name) { + if (a->name_length) { + name = ntfs_ucsndup((__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)), + a->name_length); + if (!name) + goto put_err_out; + newname = name; + name_len = a->name_length; + } else { + name = AT_UNNAMED; + name_len = 0; } - /* Now that buffers are uptodate, set the page uptodate, too. */ - SetPageUptodate(page); + } + + __ntfs_attr_init(ni, type, name, name_len); + + /* + * Wipe the flags in case they are not zero for an attribute list + * attribute. Windows does not complain about invalid flags and chkdsk + * does not detect or fix them so we need to cope with it, too. + */ + if (type == AT_ATTRIBUTE_LIST) + a->flags = 0; + + if ((type == AT_DATA) && + (a->non_resident ? !a->data.non_resident.initialized_size : + !a->data.resident.value_length)) { /* - * Set the page and all its buffers dirty and mark the inode - * dirty, too. The VM will write the page later on. + * Define/redefine the compression state if stream is + * empty, based on the compression mark on parent + * directory (for unnamed data streams) or on current + * inode (for named data streams). The compression mark + * may change any time, the compression state can only + * change when stream is wiped out. 
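/*
 * Aside (not part of the patch): the compression-block geometry derived in
 * ntfs_attr_init() above. With the standard compression unit of 4 and 4KiB
 * clusters, a compression block spans 16 clusters, i.e. 64KiB. The constant
 * values below are illustrative.
 */
#include <stdio.h>

int main(void)
{
	unsigned compression_unit = 4;		/* STANDARD_COMPRESSION_UNIT */
	unsigned cluster_size_bits = 12;	/* 4KiB clusters */
	unsigned block_clusters = 1u << compression_unit;
	unsigned block_size = 1u << (compression_unit + cluster_size_bits);

	/* Prints: 16 clusters, 65536 bytes per compression block. */
	printf("%u clusters, %u bytes per compression block\n",
	       block_clusters, block_size);
	return 0;
}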
+ * + * Also prevent compression on NTFS versions < 3.0, + * when the cluster size is > 4K, or when compression is disabled. */ + a->flags &= ~ATTR_COMPRESSION_MASK; + if (NInoCompressed(ni) + && (ni->vol->major_ver >= 3) + && NVolCompression(ni->vol) + && (ni->vol->cluster_size <= MAX_COMPRESSION_CLUSTER_SIZE)) + a->flags |= ATTR_IS_COMPRESSED; + } - /* If there is a last partial page, need to do it the slow way. */ - if (end_ofs) { - page = read_mapping_page(mapping, idx, NULL); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Failed to read last partial page " - "(error, index 0x%lx).", idx); - return PTR_ERR(page); - } - kaddr = kmap_atomic(page); - memset(kaddr, val, end_ofs); - flush_dcache_page(page); - kunmap_atomic(kaddr); - set_page_dirty(page); - put_page(page); - balance_dirty_pages_ratelimited(mapping); - cond_resched(); + + cs = a->flags & (ATTR_IS_COMPRESSED | ATTR_IS_SPARSE); + + if (ni->type == AT_DATA && ni->name == AT_UNNAMED && + ((!(a->flags & ATTR_IS_COMPRESSED) != !NInoCompressed(ni)) || + (!(a->flags & ATTR_IS_SPARSE) != !NInoSparse(ni)) || + (!(a->flags & ATTR_IS_ENCRYPTED) != !NInoEncrypted(ni)))) { + err = -EIO; + pr_err("Inode %lld has corrupt attribute flags (0x%x <> 0x%x)\n", + (unsigned long long)ni->mft_no, + a->flags, ni->flags); + goto put_err_out; + } -done: - ntfs_debug("Done."); - return 0; + + if (a->non_resident) { + if (((a->flags & ATTR_COMPRESSION_MASK) || a->data.non_resident.compression_unit) && + (ni->vol->major_ver < 3)) { + err = -EIO; + pr_err("Compressed inode %lld not allowed on NTFS %d.%d\n", + (unsigned long long)ni->mft_no, + ni->vol->major_ver, + ni->vol->minor_ver); + goto put_err_out; + } + + if ((a->flags & ATTR_IS_COMPRESSED) && !a->data.non_resident.compression_unit) { + err = -EIO; + pr_err("Compressed inode %lld attr 0x%x has no compression unit\n", + (unsigned long long)ni->mft_no, type); + goto put_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) && + (a->data.non_resident.compression_unit != STANDARD_COMPRESSION_UNIT)) { + err = -EIO; + pr_err("Compressed inode %lld attr 0x%lx has an unsupported compression unit %d\n", + (unsigned long long)ni->mft_no, + (long)le32_to_cpu(type), + (int)a->data.non_resident.compression_unit); + goto put_err_out; + } + ntfs_attr_init(ni, true, a->flags & ATTR_IS_COMPRESSED, + a->flags & ATTR_IS_ENCRYPTED, + a->flags & ATTR_IS_SPARSE, + le64_to_cpu(a->data.non_resident.allocated_size), + le64_to_cpu(a->data.non_resident.data_size), + le64_to_cpu(a->data.non_resident.initialized_size), + cs ? le64_to_cpu(a->data.non_resident.compressed_size) : 0, + cs ? a->data.non_resident.compression_unit : 0); + } else { + s64 l = le32_to_cpu(a->data.resident.value_length); + + ntfs_attr_init(ni, false, a->flags & ATTR_IS_COMPRESSED, + a->flags & ATTR_IS_ENCRYPTED, + a->flags & ATTR_IS_SPARSE, (l + 7) & ~7, l, l, + cs ? (l + 7) & ~7 : 0, 0); + } + ntfs_attr_put_search_ctx(ctx); +out: + ntfs_debug("\n"); + return err; + +put_err_out: + ntfs_attr_put_search_ctx(ctx); +err_out: + ntfs_free(newname); + goto out; +} + +/** + * ntfs_attr_close - free the attribute state of an ntfs attribute inode + * @ni: ntfs attribute inode whose attribute state to free + * + * Release the memory associated with the ntfs attribute of @ni, i.e. the + * cached runlist and, unless it is one of the internal constants, the name.
+ */ +void ntfs_attr_close(struct ntfs_inode *ni) +{ + if (NInoNonResident(ni) && ni->runlist.rl) + ntfs_free(ni->runlist.rl); + /* Don't release if using an internal constant. */ + if (ni->name != AT_UNNAMED && ni->name != I30) + ntfs_free(ni->name); } -#endif /* NTFS_RW */ +/** + * ntfs_attr_map_whole_runlist - map the whole runlist of an ntfs attribute + * @ni: ntfs inode for which to map the runlist + * + * Map the whole runlist of the ntfs attribute @na. For an attribute made up + * of only one attribute extent this is the same as calling + * ntfs_map_runlist(ni, 0) but for an attribute with multiple extents this + * will map the runlist fragments from each of the extents thus giving access + * to the entirety of the disk allocation of an attribute. + */ +int ntfs_attr_map_whole_runlist(struct ntfs_inode *ni) +{ + s64 next_vcn, last_vcn, highest_vcn; + struct ntfs_attr_search_ctx *ctx; + struct ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + struct attr_record *a; + int err; + struct ntfs_inode *base_ni; + int not_mapped; + size_t new_rl_count; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n", + (unsigned long long)ni->mft_no, ni->type); + + if (NInoFullyMapped(ni) && ni->runlist.rl) + return 0; + + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + ntfs_error(sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + /* Map all attribute extents one by one. */ + next_vcn = last_vcn = highest_vcn = 0; + a = NULL; + while (1) { + struct runlist_element *rl; + + not_mapped = 0; + if (ntfs_rl_vcn_to_lcn(ni->runlist.rl, next_vcn) == LCN_RL_NOT_MAPPED) + not_mapped = 1; + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, next_vcn, NULL, 0, ctx); + if (err) + break; + + a = ctx->attr; + + if (not_mapped) { + /* Decode the runlist. */ + rl = ntfs_mapping_pairs_decompress(ni->vol, a, &ni->runlist, + &new_rl_count); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + goto err_out; + } + ni->runlist.rl = rl; + ni->runlist.count = new_rl_count; + } + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + err = -EIO; + ntfs_error(sb, + "First extent of inode %llu attribute has non-zero lowest_vcn", + (unsigned long long)ni->mft_no); + goto err_out; + } + /* Get the last vcn in the attribute. */ + last_vcn = NTFS_B_TO_CLU(vol, + le64_to_cpu(a->data.non_resident.allocated_size)); + } + + /* Get the lowest vcn for the next extent. */ + highest_vcn = le64_to_cpu(a->data.non_resident.highest_vcn); + next_vcn = highest_vcn + 1; + + /* Only one extent or error, which we catch below. */ + if (next_vcn <= 0) { + err = -ENOENT; + break; + } + + /* Avoid endless loops due to corruption. 
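/*
 * Aside (not part of the patch): the extent-walk invariant used by
 * ntfs_attr_map_whole_runlist() above. Each attribute extent covers
 * [lowest_vcn, highest_vcn]; the key for the next lookup is
 * highest_vcn + 1, and the walk ends when the lookup runs past the last
 * extent (-ENOENT) or a corrupt chain would loop. struct fake_extent and
 * the table below are hypothetical, for illustration only.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_extent { int64_t lowest_vcn, highest_vcn; };

int main(void)
{
	struct fake_extent ext[] = { { 0, 99 }, { 100, 255 }, { 256, 511 } };
	int64_t next_vcn = 0;
	unsigned i;

	for (i = 0; i < 3; i++) {
		if (ext[i].lowest_vcn != next_vcn)	/* corruption check */
			return 1;
		next_vcn = ext[i].highest_vcn + 1;	/* key for next lookup */
	}
	printf("mapped up to vcn %lld\n", (long long)next_vcn);	/* 512 */
	return 0;
}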
*/ + if (next_vcn < le64_to_cpu(a->data.non_resident.lowest_vcn)) { + err = -EIO; + ntfs_error(sb, "Inode %llu has corrupt attribute list", + (unsigned long long)ni->mft_no); + goto err_out; + } + } + if (!a) { + ntfs_error(sb, "Couldn't find attribute for runlist mapping"); + goto err_out; + } + if (not_mapped && highest_vcn && highest_vcn != last_vcn - 1) { + err = -EIO; + ntfs_error(sb, + "Failed to load full runlist: inode: %llu highest_vcn: 0x%llx last_vcn: 0x%llx", + (unsigned long long)ni->mft_no, + (long long)highest_vcn, (long long)last_vcn); + goto err_out; + } + ntfs_attr_put_search_ctx(ctx); + if (err == -ENOENT) { + NInoSetFullyMapped(ni); + return 0; + } + + return err; + +err_out: + ntfs_attr_put_search_ctx(ctx); + return err; +} + +/** + * ntfs_attr_record_move_to - move attribute record to target inode + * @ctx: attribute search context describing the attribute record + * @ni: opened ntfs inode to which move attribute record + */ +int ntfs_attr_record_move_to(struct ntfs_attr_search_ctx *ctx, struct ntfs_inode *ni) +{ + struct ntfs_attr_search_ctx *nctx; + struct attr_record *a; + int err; + struct mft_record *ni_mrec; + struct super_block *sb; + + if (!ctx || !ctx->attr || !ctx->ntfs_ino || !ni) { + ntfs_debug("Invalid arguments passed.\n"); + return -EINVAL; + } + + sb = ni->vol->sb; + ntfs_debug("Entering for ctx->attr->type 0x%x, ctx->ntfs_ino->mft_no 0x%llx, ni->mft_no 0x%llx.\n", + (unsigned int) le32_to_cpu(ctx->attr->type), + (long long) ctx->ntfs_ino->mft_no, + (long long) ni->mft_no); + + if (ctx->ntfs_ino == ni) + return 0; + + if (!ctx->al_entry) { + ntfs_debug("Inode should contain attribute list to use this function.\n"); + return -EINVAL; + } + + /* Find place in MFT record where attribute will be moved. */ + a = ctx->attr; + nctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!nctx) { + ntfs_error(sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + /* + * Use ntfs_attr_find instead of ntfs_attr_lookup to find place for + * attribute in @ni->mrec, not any extent inode in case if @ni is base + * file record. + */ + err = ntfs_attr_find(a->type, (__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)), + a->name_length, CASE_SENSITIVE, NULL, + 0, nctx); + if (!err) { + ntfs_debug("Attribute of such type, with same name already present in this MFT record.\n"); + err = -EEXIST; + goto put_err_out; + } + if (err != -ENOENT) { + ntfs_debug("Attribute lookup failed.\n"); + goto put_err_out; + } + + /* Make space and move attribute. */ + ni_mrec = map_mft_record(ni); + if (IS_ERR(ni_mrec)) { + err = -EIO; + goto put_err_out; + } + + err = ntfs_make_room_for_attr(ni_mrec, (u8 *) nctx->attr, + le32_to_cpu(a->length)); + if (err) { + ntfs_debug("Couldn't make space for attribute.\n"); + unmap_mft_record(ni); + goto put_err_out; + } + memcpy(nctx->attr, a, le32_to_cpu(a->length)); + nctx->attr->instance = nctx->mrec->next_attr_instance; + nctx->mrec->next_attr_instance = + cpu_to_le16((le16_to_cpu(nctx->mrec->next_attr_instance) + 1) & 0xffff); + ntfs_attr_record_resize(ctx->mrec, a, 0); + mark_mft_record_dirty(ctx->ntfs_ino); + mark_mft_record_dirty(ni); + + /* Update attribute list. 
*/ + ctx->al_entry->mft_reference = + MK_LE_MREF(ni->mft_no, le16_to_cpu(ni_mrec->sequence_number)); + ctx->al_entry->instance = nctx->attr->instance; + unmap_mft_record(ni); +put_err_out: + ntfs_attr_put_search_ctx(nctx); + return err; +} + +/** + * ntfs_attr_record_move_away - move away attribute record from its mft record + * @ctx: attribute search context describing the attribute record + * @extra: minimum amount of free space in the new holder of the record + */ +int ntfs_attr_record_move_away(struct ntfs_attr_search_ctx *ctx, int extra) +{ + struct ntfs_inode *base_ni, *ni = NULL; + struct mft_record *m; + int i, err; + struct super_block *sb; + + if (!ctx || !ctx->attr || !ctx->ntfs_ino || extra < 0) + return -EINVAL; + + ntfs_debug("Entering for attr 0x%x, inode %llu\n", + (unsigned int) le32_to_cpu(ctx->attr->type), + (unsigned long long)ctx->ntfs_ino->mft_no); + + if (ctx->ntfs_ino->nr_extents == -1) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + + sb = ctx->ntfs_ino->vol->sb; + if (!NInoAttrList(base_ni)) { + ntfs_error(sb, "Inode %llu has no attrlist", + (unsigned long long)base_ni->mft_no); + return -EINVAL; + } + + err = ntfs_inode_attach_all_extents(ctx->ntfs_ino); + if (err) { + ntfs_error(sb, "Couldn't attach extents, inode=%llu", + (unsigned long long)base_ni->mft_no); + return err; + } + + mutex_lock(&base_ni->extent_lock); + /* Walk through all extents and try to move the attribute to one of them. */ + for (i = 0; i < base_ni->nr_extents; i++) { + ni = base_ni->ext.extent_ntfs_inos[i]; + + if (ctx->ntfs_ino->mft_no == ni->mft_no) + continue; + m = map_mft_record(ni); + if (IS_ERR(m)) { + ntfs_error(sb, "Cannot map mft record for mft_no %lld", + (unsigned long long)ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EIO; + } + if (le32_to_cpu(m->bytes_allocated) - + le32_to_cpu(m->bytes_in_use) < le32_to_cpu(ctx->attr->length) + extra) { + unmap_mft_record(ni); + continue; + } + unmap_mft_record(ni); + + /* + * ntfs_attr_record_move_to() can fail if an extent with another + * lowest vcn is already present in the inode we are trying to + * move the record to, so do not return an error here. + */ + if (!ntfs_attr_record_move_to(ctx, ni)) { + mutex_unlock(&base_ni->extent_lock); + return 0; + } + } + mutex_unlock(&base_ni->extent_lock); + + /* + * Failed to move the attribute to one of the current extents, so + * allocate a new extent and move the attribute to it. + */ + ni = NULL; + err = ntfs_mft_record_alloc(base_ni->vol, 0, &ni, base_ni, NULL); + if (err) { + ntfs_error(sb, "Couldn't allocate MFT record, err : %d", err); + return err; + } + unmap_mft_record(ni); + + err = ntfs_attr_record_move_to(ctx, ni); + if (err) + ntfs_error(sb, "Couldn't move attribute to MFT record"); + + return err; +} + +/* + * If we are in the first extent, then set/clear the sparse bit and update + * the allocated and compressed sizes. + */ +static int ntfs_attr_update_meta(struct attr_record *a, struct ntfs_inode *ni, + struct mft_record *m, struct ntfs_attr_search_ctx *ctx) +{ + int sparse, err = 0; + struct ntfs_inode *base_ni; + struct super_block *sb = ni->vol->sb; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x\n", + (unsigned long long)ni->mft_no, ni->type); + + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + if (a->data.non_resident.lowest_vcn) + goto out; + + a->data.non_resident.allocated_size = cpu_to_le64(ni->allocated_size); + + sparse = ntfs_rl_sparse(ni->runlist.rl); + if (sparse < 0) { + err = -EIO; + goto out; + } + + /* The attribute has become sparse.
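/*
 * Aside (not part of the patch): what the MK_LE_MREF() update above is
 * assumed to encode. An NTFS MFT reference packs the 48-bit record number
 * together with the record's 16-bit sequence number in the top bits, so a
 * stale attribute-list entry can be detected when the MFT record is later
 * reused under a new sequence number. mk_mref() is a hypothetical helper
 * mirroring that packing.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t mk_mref(uint64_t mft_no, uint16_t seq)
{
	return ((uint64_t)seq << 48) | (mft_no & 0xffffffffffffULL);
}

int main(void)
{
	uint64_t ref = mk_mref(0x2a, 3);	/* record 42, sequence 3 */

	/* Prints: mref 0x000300000000002a. */
	printf("mref 0x%016llx\n", (unsigned long long)ref);
	return 0;
}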
*/ + if (sparse && !(a->flags & (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED))) { + /* + * Move the attribute to another mft record if the attribute is + * too small to add the compressed_size field to it and there is + * no free space in the current mft record. + */ + if ((le32_to_cpu(a->length) - + le16_to_cpu(a->data.non_resident.mapping_pairs_offset) == 8) && + !(le32_to_cpu(m->bytes_allocated) - le32_to_cpu(m->bytes_in_use))) { + + if (!NInoAttrList(base_ni)) { + err = ntfs_inode_add_attrlist(base_ni); + if (err) + goto out; + err = -EAGAIN; + goto out; + } + err = ntfs_attr_record_move_away(ctx, 8); + if (err) { + ntfs_error(sb, "Failed to move attribute"); + goto out; + } + + err = ntfs_attrlist_update(base_ni); + if (err) + goto out; + err = -EAGAIN; + goto out; + } + if (!(le32_to_cpu(a->length) - + le16_to_cpu(a->data.non_resident.mapping_pairs_offset))) { + err = -EIO; + ntfs_error(sb, "Mapping pairs space is 0"); + goto out; + } + + NInoSetSparse(ni); + ni->flags |= FILE_ATTR_SPARSE_FILE; + a->flags |= ATTR_IS_SPARSE; + a->data.non_resident.compression_unit = 0; + + memmove((u8 *)a + le16_to_cpu(a->name_offset) + 8, + (u8 *)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(__le16)); + + a->name_offset = cpu_to_le16(le16_to_cpu(a->name_offset) + 8); + + a->data.non_resident.mapping_pairs_offset = + cpu_to_le16(le16_to_cpu(a->data.non_resident.mapping_pairs_offset) + 8); + } + + /* The attribute is no longer sparse. */ + if (!sparse && (a->flags & ATTR_IS_SPARSE) && + !(a->flags & ATTR_IS_COMPRESSED)) { + NInoClearSparse(ni); + ni->flags &= ~FILE_ATTR_SPARSE_FILE; + a->flags &= ~ATTR_IS_SPARSE; + a->data.non_resident.compression_unit = 0; + + memmove((u8 *)a + le16_to_cpu(a->name_offset) - 8, + (u8 *)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(__le16)); + + if (le16_to_cpu(a->name_offset) >= 8) + a->name_offset = cpu_to_le16(le16_to_cpu(a->name_offset) - 8); + + a->data.non_resident.mapping_pairs_offset = + cpu_to_le16(le16_to_cpu(a->data.non_resident.mapping_pairs_offset) - 8); + } + + /* Update compressed size if required. */ + if (NInoFullyMapped(ni) && (sparse || NInoCompressed(ni))) { + s64 new_compr_size; + + new_compr_size = ntfs_rl_get_compressed_size(ni->vol, ni->runlist.rl); + if (new_compr_size < 0) { + err = new_compr_size; + goto out; + } + + ni->itype.compressed.size = new_compr_size; + a->data.non_resident.compressed_size = cpu_to_le64(new_compr_size); + } + + if (NInoSparse(ni) || NInoCompressed(ni)) + VFS_I(base_ni)->i_blocks = ni->itype.compressed.size >> 9; + else + VFS_I(base_ni)->i_blocks = ni->allocated_size >> 9; + /* + * Set the FILE_NAME dirty flag to update the sparse bit and + * allocated size in the index. + */ + if (ni->type == AT_DATA && ni->name == AT_UNNAMED) + NInoSetFileNameDirty(ni); +out: + return err; +} + +#define NTFS_VCN_DELETE_MARK -2 +/** + * ntfs_attr_update_mapping_pairs - update mapping pairs for ntfs attribute + * @ni: non-resident ntfs inode to update + * @from_vcn: update the runlist starting at this VCN + * + * Build mapping pairs from @ni->runlist.rl and write them to the disk. Also, + * this function updates the sparse bit and the allocated and compressed sizes + * (allocating/freeing space for the compressed_size field as required). + * + * @ni->allocated_size must be set to the correct value for the new runlist + * before calling this function, whereas @ni->itype.compressed.size is + * calculated and set to the correct value by this function itself.
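/*
 * Aside (not part of the patch): the 8-byte shuffle performed above when an
 * attribute gains the sparse flag. The name is moved up by 8 bytes to open
 * a hole for the compressed_size field, and both name_offset and
 * mapping_pairs_offset grow by 8 (the reverse happens when the attribute
 * stops being sparse). The buffer and offsets below are illustrative.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	uint8_t rec[96] = { 0 };
	uint16_t name_offset = 0x40, mp_offset = 0x48;
	uint8_t name_bytes = 8;			/* 4 UTF-16 code units */

	memcpy(rec + name_offset, "n\0a\0m\0e\0", name_bytes);
	/* Grow: move the name up 8 bytes, then shift both offsets. */
	memmove(rec + name_offset + 8, rec + name_offset, name_bytes);
	name_offset += 8;
	mp_offset += 8;
	/* Prints: name_offset 0x48, mp_offset 0x50. */
	printf("name_offset 0x%x, mp_offset 0x%x\n", name_offset, mp_offset);
	return 0;
}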
+ */ +int ntfs_attr_update_mapping_pairs(struct ntfs_inode *ni, s64 from_vcn) +{ + struct ntfs_attr_search_ctx *ctx; + struct ntfs_inode *base_ni; + struct mft_record *m; + struct attr_record *a; + s64 stop_vcn; + int err = 0, mp_size, cur_max_mp_size, exp_max_mp_size; + bool finished_build; + bool first_updated = false; + struct super_block *sb; + struct runlist_element *start_rl; + unsigned int de_cluster_count = 0; + +retry: + if (!ni || !ni->runlist.rl) + return -EINVAL; + + ntfs_debug("Entering for inode %llu, attr 0x%x\n", + (unsigned long long)ni->mft_no, ni->type); + + sb = ni->vol->sb; + if (!NInoNonResident(ni)) { + ntfs_error(sb, "%s: resident attribute", __func__); + return -EINVAL; + } + + if (ni->nr_extents == -1) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + ntfs_error(sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + /* Fill attribute records with new mapping pairs. */ + stop_vcn = 0; + finished_build = false; + start_rl = ni->runlist.rl; + while (!(err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, from_vcn, NULL, 0, ctx))) { + unsigned int de_cnt = 0; + + a = ctx->attr; + m = ctx->mrec; + if (!a->data.non_resident.lowest_vcn) + first_updated = true; + + /* + * If runlist is updating not from the beginning, then set + * @stop_vcn properly, i.e. to the lowest vcn of record that + * contain @from_vcn. Also we do not need @from_vcn anymore, + * set it to 0 to make ntfs_attr_lookup enumerate attributes. + */ + if (from_vcn) { + s64 first_lcn; + + stop_vcn = le64_to_cpu(a->data.non_resident.lowest_vcn); + from_vcn = 0; + /* + * Check whether the first run we need to update is + * the last run in runlist, if so, then deallocate + * all attrubute extents starting this one. + */ + first_lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, stop_vcn); + if (first_lcn == LCN_EINVAL) { + err = -EIO; + ntfs_error(sb, "Bad runlist"); + goto put_err_out; + } + if (first_lcn == LCN_ENOENT || + first_lcn == LCN_RL_NOT_MAPPED) + finished_build = true; + } + + /* + * Check whether we finished mapping pairs build, if so mark + * extent as need to delete (by setting highest vcn to + * NTFS_VCN_DELETE_MARK (-2), we shall check it later and + * delete extent) and continue search. + */ + if (finished_build) { + ntfs_debug("Mark attr 0x%x for delete in inode 0x%lx.\n", + (unsigned int)le32_to_cpu(a->type), ctx->ntfs_ino->mft_no); + a->data.non_resident.highest_vcn = cpu_to_le64(NTFS_VCN_DELETE_MARK); + mark_mft_record_dirty(ctx->ntfs_ino); + continue; + } + + err = ntfs_attr_update_meta(a, ni, m, ctx); + if (err < 0) { + if (err == -EAGAIN) { + ntfs_attr_put_search_ctx(ctx); + goto retry; + } + goto put_err_out; + } + + /* + * Determine maximum possible length of mapping pairs, + * if we shall *not* expand space for mapping pairs. + */ + cur_max_mp_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.non_resident.mapping_pairs_offset); + /* + * Determine maximum possible length of mapping pairs in the + * current mft record, if we shall expand space for mapping + * pairs. + */ + exp_max_mp_size = le32_to_cpu(m->bytes_allocated) - + le32_to_cpu(m->bytes_in_use) + cur_max_mp_size; + + /* Get the size for the rest of mapping pairs array. 
*/ + mp_size = ntfs_get_size_for_mapping_pairs(ni->vol, start_rl, + stop_vcn, -1, exp_max_mp_size); + if (mp_size <= 0) { + err = mp_size; + ntfs_error(sb, "%s: get MP size failed", __func__); + goto put_err_out; + } + /* Test mapping pairs for fitting in the current mft record. */ + if (mp_size > exp_max_mp_size) { + /* + * Mapping pairs of $ATTRIBUTE_LIST attribute must fit + * in the base mft record. Try to move out other + * attributes and try again. + */ + if (ni->type == AT_ATTRIBUTE_LIST) { + ntfs_attr_put_search_ctx(ctx); + if (ntfs_inode_free_space(base_ni, mp_size - + cur_max_mp_size)) { + ntfs_debug("Attribute list is too big. Defragment the volume\n"); + return -ENOSPC; + } + if (ntfs_attrlist_update(base_ni)) + return -EIO; + goto retry; + } + + /* Add attribute list if it isn't present, and retry. */ + if (!NInoAttrList(base_ni)) { + ntfs_attr_put_search_ctx(ctx); + if (ntfs_inode_add_attrlist(base_ni)) { + ntfs_error(sb, "Can not add attrlist"); + return -EIO; + } + goto retry; + } + + /* + * Set mapping pairs size to maximum possible for this + * mft record. We shall write the rest of mapping pairs + * to another MFT records. + */ + mp_size = exp_max_mp_size; + } + + /* Change space for mapping pairs if we need it. */ + if (((mp_size + 7) & ~7) != cur_max_mp_size) { + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.non_resident.mapping_pairs_offset) + + mp_size)) { + err = -EIO; + ntfs_error(sb, "Failed to resize attribute"); + goto put_err_out; + } + } + + /* Update lowest vcn. */ + a->data.non_resident.lowest_vcn = cpu_to_le64(stop_vcn); + mark_mft_record_dirty(ctx->ntfs_ino); + if ((ctx->ntfs_ino->nr_extents == -1 || NInoAttrList(ctx->ntfs_ino)) && + ctx->attr->type != AT_ATTRIBUTE_LIST) { + ctx->al_entry->lowest_vcn = cpu_to_le64(stop_vcn); + err = ntfs_attrlist_update(base_ni); + if (err) + goto put_err_out; + } + + /* + * Generate the new mapping pairs array directly into the + * correct destination, i.e. the attribute record itself. + */ + err = ntfs_mapping_pairs_build(ni->vol, + (u8 *)a + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, start_rl, stop_vcn, -1, &stop_vcn, &start_rl, &de_cnt); + if (!err) + finished_build = true; + if (!finished_build && err != -ENOSPC) { + ntfs_error(sb, "Failed to build mapping pairs"); + goto put_err_out; + } + a->data.non_resident.highest_vcn = cpu_to_le64(stop_vcn - 1); + mark_mft_record_dirty(ctx->ntfs_ino); + de_cluster_count += de_cnt; + } + + /* Check whether error occurred. */ + if (err && err != -ENOENT) { + ntfs_error(sb, "%s: Attribute lookup failed", __func__); + goto put_err_out; + } + + /* + * If the base extent was skipped in the above process, + * we still may have to update the sizes. + */ + if (!first_updated) { + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (!err) { + a = ctx->attr; + a->data.non_resident.allocated_size = cpu_to_le64(ni->allocated_size); + if (NInoCompressed(ni) || NInoSparse(ni)) + a->data.non_resident.compressed_size = + cpu_to_le64(ni->itype.compressed.size); + /* Updating sizes taints the extent holding the attr */ + if (ni->type == AT_DATA && ni->name == AT_UNNAMED) + NInoSetFileNameDirty(ni); + mark_mft_record_dirty(ctx->ntfs_ino); + } else { + ntfs_error(sb, "Failed to update sizes in base extent\n"); + goto put_err_out; + } + } + + /* Deallocate not used attribute extents and return with success. 
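/*
 * Aside (not part of the patch): why the mapping-pairs size computed above
 * varies per run. In the NTFS mapping-pairs (dataruns) format, each run is
 * encoded as a header byte whose low and high nibbles give the byte counts
 * of the run length and of the signed LCN delta, followed by those two
 * little-endian fields; sparse runs omit the delta. bytes_for() is a
 * hypothetical helper computing the minimal signed byte count.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned bytes_for(int64_t v)
{
	unsigned n = 1;

	while (v > 0x7f || v < -0x80) {
		v >>= 8;
		n++;
	}
	return n;
}

int main(void)
{
	/* Run of 0x500 clusters with LCN delta 0x12345: 1 + 2 + 3 = 6 bytes. */
	printf("%u bytes\n", 1 + bytes_for(0x500) + bytes_for(0x12345));
	return 0;
}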
*/ + if (finished_build) { + ntfs_attr_reinit_search_ctx(ctx); + ntfs_debug("Deallocate marked extents.\n"); + while (!(err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx))) { + if (le64_to_cpu(ctx->attr->data.non_resident.highest_vcn) != + NTFS_VCN_DELETE_MARK) + continue; + /* Remove unused attribute record. */ + err = ntfs_attr_record_rm(ctx); + if (err) { + ntfs_error(sb, "Could not remove unused attr"); + goto put_err_out; + } + ntfs_attr_reinit_search_ctx(ctx); + } + if (err && err != -ENOENT) { + ntfs_error(sb, "%s: Attr lookup failed", __func__); + goto put_err_out; + } + ntfs_debug("Deallocate done.\n"); + ntfs_attr_put_search_ctx(ctx); + goto out; + } + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + + /* Allocate new MFT records for the rest of mapping pairs. */ + while (1) { + struct ntfs_inode *ext_ni = NULL; + unsigned int de_cnt = 0; + + /* Allocate new mft record. */ + err = ntfs_mft_record_alloc(ni->vol, 0, &ext_ni, base_ni, NULL); + if (err) { + ntfs_error(sb, "Failed to allocate extent record"); + goto put_err_out; + } + unmap_mft_record(ext_ni); + + m = map_mft_record(ext_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "Could not map new MFT record"); + if (ntfs_mft_record_free(ni->vol, ext_ni)) + ntfs_error(sb, "Could not free MFT record"); + ntfs_inode_close(ext_ni); + err = -ENOMEM; + ext_ni = NULL; + goto put_err_out; + } + /* + * If mapping size exceed available space, set them to + * possible maximum. + */ + cur_max_mp_size = le32_to_cpu(m->bytes_allocated) - + le32_to_cpu(m->bytes_in_use) - + (sizeof(struct attr_record) + + ((NInoCompressed(ni) || NInoSparse(ni)) ? + sizeof(a->data.non_resident.compressed_size) : 0)) - + ((sizeof(__le16) * ni->name_len + 7) & ~7); + + /* Calculate size of rest mapping pairs. */ + mp_size = ntfs_get_size_for_mapping_pairs(ni->vol, + start_rl, stop_vcn, -1, cur_max_mp_size); + if (mp_size <= 0) { + unmap_mft_record(ext_ni); + ntfs_inode_close(ext_ni); + err = mp_size; + ntfs_error(sb, "%s: get mp size failed", __func__); + goto put_err_out; + } + + if (mp_size > cur_max_mp_size) + mp_size = cur_max_mp_size; + /* Add attribute extent to new record. */ + err = ntfs_non_resident_attr_record_add(ext_ni, ni->type, + ni->name, ni->name_len, stop_vcn, mp_size, 0); + if (err < 0) { + ntfs_error(sb, "Could not add attribute extent"); + unmap_mft_record(ext_ni); + if (ntfs_mft_record_free(ni->vol, ext_ni)) + ntfs_error(sb, "Could not free MFT record"); + ntfs_inode_close(ext_ni); + goto put_err_out; + } + a = (struct attr_record *)((u8 *)m + err); + + err = ntfs_mapping_pairs_build(ni->vol, (u8 *)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, start_rl, stop_vcn, -1, &stop_vcn, &start_rl, + &de_cnt); + if (err < 0 && err != -ENOSPC) { + ntfs_error(sb, "Failed to build MP"); + unmap_mft_record(ext_ni); + if (ntfs_mft_record_free(ni->vol, ext_ni)) + ntfs_error(sb, "Couldn't free MFT record"); + goto put_err_out; + } + a->data.non_resident.highest_vcn = cpu_to_le64(stop_vcn - 1); + mark_mft_record_dirty(ext_ni); + unmap_mft_record(ext_ni); + + de_cluster_count += de_cnt; + /* All mapping pairs has been written. 
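+		 *
+		 * Restated as a sketch (illustrative only), the contract of
+		 * the loop above is:
+		 *
+		 *	do {
+		 *		allocate extent mft record;
+		 *		err = ntfs_mapping_pairs_build(...,
+		 *				&stop_vcn, &start_rl, ...);
+		 *	} while (err == -ENOSPC);
+		 *
+		 * ntfs_mapping_pairs_build() returns 0 once every remaining
+		 * run fits, and -ENOSPC after filling the current record,
+		 * having advanced @stop_vcn and @start_rl past what was
+		 * already written.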
*/ + if (!err) + break; + } +out: + if (from_vcn == 0) + ni->i_dealloc_clusters = de_cluster_count; + return 0; + +put_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + return err; +} + +/** + * ntfs_attr_make_resident - convert a non-resident to a resident attribute + * @ni: open ntfs attribute to make resident + * @ctx: ntfs search context describing the attribute + * + * Convert a non-resident ntfs attribute to a resident one. + */ +static int ntfs_attr_make_resident(struct ntfs_inode *ni, struct ntfs_attr_search_ctx *ctx) +{ + struct ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + struct attr_record *a = ctx->attr; + int name_ofs, val_ofs, err; + s64 arec_size; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n", + (unsigned long long)ni->mft_no, ni->type); + + /* Should be called for the first extent of the attribute. */ + if (le64_to_cpu(a->data.non_resident.lowest_vcn)) { + ntfs_debug("Eeek! Should be called for the first extent of the attribute. Aborting...\n"); + return -EINVAL; + } + + /* Some preliminary sanity checking. */ + if (!NInoNonResident(ni)) { + ntfs_debug("Eeek! Trying to make resident attribute resident. Aborting...\n"); + return -EINVAL; + } + + /* Make sure this is not $MFT/$BITMAP or Windows will not boot! */ + if (ni->type == AT_BITMAP && ni->mft_no == FILE_MFT) + return -EPERM; + + /* Check that the attribute is allowed to be resident. */ + err = ntfs_attr_can_be_resident(vol, ni->type); + if (err) + return err; + + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_debug("Making compressed or encrypted files resident is not implemented yet.\n"); + return -EOPNOTSUPP; + } + + /* Work out offsets into and size of the resident attribute. */ + name_ofs = 24; /* = sizeof(resident_struct attr_record); */ + val_ofs = (name_ofs + a->name_length * sizeof(__le16) + 7) & ~7; + arec_size = (val_ofs + ni->data_size + 7) & ~7; + + /* Sanity check the size before we start modifying the attribute. */ + if (le32_to_cpu(ctx->mrec->bytes_in_use) - le32_to_cpu(a->length) + + arec_size > le32_to_cpu(ctx->mrec->bytes_allocated)) { + ntfs_debug("Not enough space to make attribute resident\n"); + return -ENOSPC; + } + + /* Read and cache the whole runlist if not already done. */ + err = ntfs_attr_map_whole_runlist(ni); + if (err) + return err; + + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) { + memmove((u8 *)a + name_ofs, (u8 *)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(__le16)); + } + a->name_offset = cpu_to_le16(name_ofs); + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(ctx->mrec, a, arec_size) < 0) { + /* + * Bug, because ntfs_attr_record_resize should not fail (we + * already checked that attribute fits MFT record). + */ + ntfs_error(ctx->ntfs_ino->vol->sb, "BUG! Failed to resize attribute record. "); + return -EIO; + } + + /* Convert the attribute record to describe a resident attribute. */ + a->non_resident = 0; + a->flags = 0; + a->data.resident.value_length = cpu_to_le32(ni->data_size); + a->data.resident.value_offset = cpu_to_le16(val_ofs); + /* + * File names cannot be non-resident so we would never see this here + * but at least it serves as a reminder that there may be attributes + * for which we do need to set this flag. (AIA) + */ + if (a->type == AT_FILE_NAME) + a->data.resident.flags = RESIDENT_ATTR_IS_INDEXED; + else + a->data.resident.flags = 0; + a->data.resident.reserved = 0; + + /* + * Deallocate clusters from the runlist. 
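+	 *
+	 * (Worked example of the offset arithmetic above; the numbers are
+	 * illustrative only. For a four-character name and 100 bytes of
+	 * data: name_ofs = 24, val_ofs = (24 + 4 * 2 + 7) & ~7 = 32 and
+	 * arec_size = (32 + 100 + 7) & ~7 = 136, i.e. both the value
+	 * offset and the record size are rounded up to 8-byte multiples.)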
+ * + * NOTE: We can use ntfs_cluster_free() because we have already mapped + * the whole run list and thus it doesn't matter that the attribute + * record is in a transiently corrupted state at this moment in time. + */ + err = ntfs_cluster_free(ni, 0, -1, ctx); + if (err) { + ntfs_error(sb, "Eeek! Failed to release allocated clusters"); + ntfs_debug("Ignoring error and leaving behind wasted clusters.\n"); + } + + /* Throw away the now unused runlist. */ + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + ni->runlist.count = 0; + /* Update in-memory struct ntfs_attr. */ + NInoClearNonResident(ni); + NInoClearCompressed(ni); + ni->flags &= ~FILE_ATTR_COMPRESSED; + NInoClearSparse(ni); + ni->flags &= ~FILE_ATTR_SPARSE_FILE; + NInoClearEncrypted(ni); + ni->flags &= ~FILE_ATTR_ENCRYPTED; + ni->initialized_size = ni->data_size; + ni->allocated_size = ni->itype.compressed.size = (ni->data_size + 7) & ~7; + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = ni->itype.compressed.block_clusters = 0; + return 0; +} + +/** + * ntfs_non_resident_attr_shrink - shrink a non-resident, open ntfs attribute + * @ni: non-resident ntfs attribute to shrink + * @newsize: new size (in bytes) to which to shrink the attribute + * + * Reduce the size of a non-resident, open ntfs attribute @na to @newsize bytes. + */ +static int ntfs_non_resident_attr_shrink(struct ntfs_inode *ni, const s64 newsize) +{ + struct ntfs_volume *vol; + struct ntfs_attr_search_ctx *ctx; + s64 first_free_vcn; + s64 nr_freed_clusters; + int err; + struct ntfs_inode *base_ni; + + ntfs_debug("Inode 0x%llx attr 0x%x new size %lld\n", + (unsigned long long)ni->mft_no, ni->type, (long long)newsize); + + vol = ni->vol; + + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + /* + * Check the attribute type and the corresponding minimum size + * against @newsize and fail if @newsize is too small. + */ + err = ntfs_attr_size_bounds_check(vol, ni->type, newsize); + if (err) { + if (err == -ERANGE) + ntfs_debug("Eeek! Size bounds check failed. Aborting...\n"); + else if (err == -ENOENT) + err = -EIO; + return err; + } + + /* The first cluster outside the new allocation. */ + if (NInoCompressed(ni)) + /* + * For compressed files we must keep full compressions blocks, + * but currently we do not decompress/recompress the last + * block to truncate the data, so we may leave more allocated + * clusters than really needed. + */ + first_free_vcn = NTFS_B_TO_CLU(vol, + ((newsize - 1) | (ni->itype.compressed.block_size - 1)) + 1); + else + first_free_vcn = NTFS_B_TO_CLU(vol, newsize + vol->cluster_size - 1); + + if (first_free_vcn < 0) + return -EINVAL; + /* + * Compare the new allocation with the old one and only deallocate + * clusters if there is a change. + */ + if (NTFS_B_TO_CLU(vol, ni->allocated_size) != first_free_vcn) { + struct ntfs_attr_search_ctx *ctx; + + err = ntfs_attr_map_whole_runlist(ni); + if (err) { + ntfs_debug("Eeek! ntfs_attr_map_whole_runlist failed.\n"); + return err; + } + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + ntfs_error(vol->sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + /* Deallocate all clusters starting with the first free one. */ + nr_freed_clusters = ntfs_cluster_free(ni, first_free_vcn, -1, ctx); + if (nr_freed_clusters < 0) { + ntfs_debug("Eeek! Freeing of clusters failed. 
Aborting...\n"); + ntfs_attr_put_search_ctx(ctx); + return (int)nr_freed_clusters; + } + ntfs_attr_put_search_ctx(ctx); + + /* Truncate the runlist itself. */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, first_free_vcn)) { + /* + * Failed to truncate the runlist, so just throw it + * away, it will be mapped afresh on next use. + */ + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + ntfs_error(vol->sb, "Eeek! Run list truncation failed.\n"); + return -EIO; + } + + /* Prepare to mapping pairs update. */ + ni->allocated_size = NTFS_CLU_TO_B(vol, first_free_vcn); + + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed_clusters) { + ni->itype.compressed.size -= + NTFS_CLU_TO_B(vol, nr_freed_clusters); + VFS_I(base_ni)->i_blocks = ni->itype.compressed.size >> 9; + } + } else + VFS_I(base_ni)->i_blocks = ni->allocated_size >> 9; + + /* Write mapping pairs for new runlist. */ + err = ntfs_attr_update_mapping_pairs(ni, 0 /*first_free_vcn*/); + if (err) { + ntfs_debug("Eeek! Mapping pairs update failed. Leaving inconstant metadata. Run chkdsk.\n"); + return err; + } + } + + /* Get the first attribute record. */ + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + ntfs_error(vol->sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (err) { + if (err == -ENOENT) + err = -EIO; + ntfs_debug("Eeek! Lookup of first attribute extent failed. Leaving inconstant metadata.\n"); + goto put_err_out; + } + + /* Update data and initialized size. */ + ni->data_size = newsize; + ctx->attr->data.non_resident.data_size = cpu_to_le64(newsize); + if (newsize < ni->initialized_size) { + ni->initialized_size = newsize; + ctx->attr->data.non_resident.initialized_size = cpu_to_le64(newsize); + } + /* Update data size in the index. */ + if (ni->type == AT_DATA && ni->name == AT_UNNAMED) + NInoSetFileNameDirty(ni); + + /* If the attribute now has zero size, make it resident. */ + if (!newsize && !NInoEncrypted(ni) && !NInoCompressed(ni)) { + err = ntfs_attr_make_resident(ni, ctx); + if (err) { + /* If couldn't make resident, just continue. */ + if (err != -EPERM) + ntfs_error(ni->vol->sb, + "Failed to make attribute resident. Leaving as is...\n"); + } + } + + /* Set the inode dirty so it is written out later. */ + mark_mft_record_dirty(ctx->ntfs_ino); + /* Done! */ + ntfs_attr_put_search_ctx(ctx); + return 0; +put_err_out: + ntfs_attr_put_search_ctx(ctx); + return err; +} + +/** + * ntfs_non_resident_attr_expand - expand a non-resident, open ntfs attribute + * @ni: non-resident ntfs attribute to expand + * @prealloc_size: preallocation size (in bytes) to which to expand the attribute + * @newsize: new size (in bytes) to which to expand the attribute + * + * Expand the size of a non-resident, open ntfs attribute @na to @newsize bytes, + * by allocating new clusters. 
+ */ +static int ntfs_non_resident_attr_expand(struct ntfs_inode *ni, const s64 newsize, + const s64 prealloc_size, unsigned int holes) +{ + s64 lcn_seek_from; + s64 first_free_vcn; + struct ntfs_volume *vol; + struct ntfs_attr_search_ctx *ctx = NULL; + struct runlist_element *rl, *rln; + s64 org_alloc_size, org_compressed_size; + int err, err2; + struct ntfs_inode *base_ni; + struct super_block *sb = ni->vol->sb; + size_t new_rl_count; + + ntfs_debug("Inode 0x%llx, attr 0x%x, new size %lld old size %lld\n", + (unsigned long long)ni->mft_no, ni->type, + (long long)newsize, (long long)ni->data_size); + + vol = ni->vol; + + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + else + base_ni = ni; + + /* + * Check the attribute type and the corresponding maximum size + * against @newsize and fail if @newsize is too big. + */ + err = ntfs_attr_size_bounds_check(vol, ni->type, newsize); + if (err < 0) { + ntfs_error(sb, "%s: bounds check failed", __func__); + return err; + } + + /* Save for future use. */ + org_alloc_size = ni->allocated_size; + org_compressed_size = ni->itype.compressed.size; + + /* The first cluster outside the new allocation. */ + if (prealloc_size) + first_free_vcn = NTFS_B_TO_CLU(vol, prealloc_size + vol->cluster_size - 1); + else + first_free_vcn = NTFS_B_TO_CLU(vol, newsize + vol->cluster_size - 1); + if (first_free_vcn < 0) + return -EFBIG; + + /* + * Compare the new allocation with the old one and only allocate + * clusters if there is a change. + */ + if (NTFS_B_TO_CLU(vol, ni->allocated_size) < first_free_vcn) { + err = ntfs_attr_map_whole_runlist(ni); + if (err) { + ntfs_error(sb, "ntfs_attr_map_whole_runlist failed"); + return err; + } + + /* + * If we extend $DATA attribute on NTFS 3+ volume, we can add + * sparse runs instead of real allocation of clusters. + */ + if ((ni->type == AT_DATA && (vol->major_ver >= 3 || !NInoSparseDisabled(ni))) && + (holes != HOLES_NO)) { + if (NInoCompressed(ni)) { + int last = 0, i = 0; + s64 alloc_size; + u64 more_entries = round_up(first_free_vcn - + NTFS_B_TO_CLU(vol, ni->allocated_size), + ni->itype.compressed.block_clusters); + + do_div(more_entries, ni->itype.compressed.block_clusters); + + while (ni->runlist.rl[last].length) + last++; + + rl = ntfs_rl_realloc(ni->runlist.rl, last + 1, + last + more_entries + 1); + if (IS_ERR(rl)) { + err = -ENOMEM; + goto put_err_out; + } + + alloc_size = ni->allocated_size; + while (i++ < more_entries) { + rl[last].vcn = NTFS_B_TO_CLU(vol, + round_up(alloc_size, vol->cluster_size)); + rl[last].length = ni->itype.compressed.block_clusters - + (rl[last].vcn & + (ni->itype.compressed.block_clusters - 1)); + rl[last].lcn = LCN_HOLE; + last++; + alloc_size += ni->itype.compressed.block_size; + } + + rl[last].vcn = first_free_vcn; + rl[last].lcn = LCN_ENOENT; + rl[last].length = 0; + + ni->runlist.rl = rl; + ni->runlist.count += more_entries; + } else { + rl = ntfs_malloc_nofs(sizeof(struct runlist_element) * 2); + if (!rl) { + err = -ENOMEM; + goto put_err_out; + } + + rl[0].vcn = NTFS_B_TO_CLU(vol, ni->allocated_size); + rl[0].lcn = LCN_HOLE; + rl[0].length = first_free_vcn - + NTFS_B_TO_CLU(vol, ni->allocated_size); + rl[1].vcn = first_free_vcn; + rl[1].lcn = LCN_ENOENT; + rl[1].length = 0; + } + } else { + /* + * Determine first after last LCN of attribute. + * We will start seek clusters from this LCN to avoid + * fragmentation. If there are no valid LCNs in the + * attribute let the cluster allocator choose the + * starting LCN. 
+ */ + lcn_seek_from = -1; + if (ni->runlist.rl->length) { + /* Seek to the last run list element. */ + for (rl = ni->runlist.rl; (rl + 1)->length; rl++) + ; + /* + * If the last LCN is a hole or similar seek + * back to last valid LCN. + */ + while (rl->lcn < 0 && rl != ni->runlist.rl) + rl--; + /* + * Only set lcn_seek_from it the LCN is valid. + */ + if (rl->lcn >= 0) + lcn_seek_from = rl->lcn + rl->length; + } + + rl = ntfs_cluster_alloc(vol, NTFS_B_TO_CLU(vol, ni->allocated_size), + first_free_vcn - NTFS_B_TO_CLU(vol, ni->allocated_size), + lcn_seek_from, DATA_ZONE, false, false, false); + if (IS_ERR(rl)) { + ntfs_debug("Cluster allocation failed (%lld)", + (long long)first_free_vcn - + NTFS_B_TO_CLU(vol, ni->allocated_size)); + return PTR_ERR(rl); + } + } + + if (!NInoCompressed(ni)) { + /* Append new clusters to attribute runlist. */ + rln = ntfs_runlists_merge(&ni->runlist, rl, 0, &new_rl_count); + if (IS_ERR(rln)) { + /* Failed, free just allocated clusters. */ + ntfs_error(sb, "Run list merge failed"); + ntfs_cluster_free_from_rl(vol, rl); + ntfs_free(rl); + return -EIO; + } + ni->runlist.rl = rln; + ni->runlist.count = new_rl_count; + } + + /* Prepare to mapping pairs update. */ + ni->allocated_size = NTFS_CLU_TO_B(vol, first_free_vcn); + err = ntfs_attr_update_mapping_pairs(ni, 0); + if (err) { + ntfs_debug("Mapping pairs update failed"); + goto rollback; + } + } + + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + err = -ENOMEM; + if (ni->allocated_size == org_alloc_size) + return err; + goto rollback; + } + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (err) { + if (err == -ENOENT) + err = -EIO; + if (ni->allocated_size != org_alloc_size) + goto rollback; + goto put_err_out; + } + + /* Update data size. */ + ni->data_size = newsize; + ctx->attr->data.non_resident.data_size = cpu_to_le64(newsize); + /* Update data size in the index. */ + if (ni->type == AT_DATA && ni->name == AT_UNNAMED) + NInoSetFileNameDirty(ni); + /* Set the inode dirty so it is written out later. */ + mark_mft_record_dirty(ctx->ntfs_ino); + /* Done! */ + ntfs_attr_put_search_ctx(ctx); + return 0; +rollback: + /* Free allocated clusters. */ + err2 = ntfs_cluster_free(ni, NTFS_B_TO_CLU(vol, org_alloc_size), + -1, ctx); + if (err2) + ntfs_debug("Leaking clusters"); + + /* Now, truncate the runlist itself. */ + down_write(&ni->runlist.lock); + err2 = ntfs_rl_truncate_nolock(vol, &ni->runlist, NTFS_B_TO_CLU(vol, org_alloc_size)); + up_write(&ni->runlist.lock); + if (err2) { + /* + * Failed to truncate the runlist, so just throw it away, it + * will be mapped afresh on next use. + */ + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + ntfs_error(sb, "Couldn't truncate runlist. Rollback failed"); + } else { + /* Prepare to mapping pairs update. */ + ni->allocated_size = org_alloc_size; + /* Restore mapping pairs. 
*/ + down_read(&ni->runlist.lock); + if (ntfs_attr_update_mapping_pairs(ni, 0)) + ntfs_error(sb, "Failed to restore old mapping pairs"); + up_read(&ni->runlist.lock); + + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = org_compressed_size; + VFS_I(base_ni)->i_blocks = ni->itype.compressed.size >> 9; + } else + VFS_I(base_ni)->i_blocks = ni->allocated_size >> 9; + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + return err; +put_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + return err; +} + +/** + * ntfs_resident_attr_resize - resize a resident, open ntfs attribute + * @attr_ni: resident ntfs inode to resize + * @prealloc_size: preallocation size (in bytes) to which to resize the attribute + * @newsize: new size (in bytes) to which to resize the attribute + * + * Change the size of a resident, open ntfs attribute @na to @newsize bytes. + */ +static int ntfs_resident_attr_resize(struct ntfs_inode *attr_ni, const s64 newsize, + const s64 prealloc_size, unsigned int holes) +{ + struct ntfs_attr_search_ctx *ctx; + struct ntfs_volume *vol = attr_ni->vol; + struct super_block *sb = vol->sb; + int err = -EIO; + struct ntfs_inode *base_ni, *ext_ni = NULL; + +attr_resize_again: + ntfs_debug("Inode 0x%llx attr 0x%x new size %lld\n", + (unsigned long long)attr_ni->mft_no, attr_ni->type, + (long long)newsize); + + if (NInoAttr(attr_ni)) + base_ni = attr_ni->ext.base_ntfs_ino; + else + base_ni = attr_ni; + + /* Get the attribute record that needs modification. */ + ctx = ntfs_attr_get_search_ctx(base_ni, NULL); + if (!ctx) { + ntfs_error(sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + err = ntfs_attr_lookup(attr_ni->type, attr_ni->name, attr_ni->name_len, + 0, 0, NULL, 0, ctx); + if (err) { + ntfs_error(sb, "ntfs_attr_lookup failed"); + goto put_err_out; + } + + /* + * Check the attribute type and the corresponding minimum and maximum + * sizes against @newsize and fail if @newsize is out of bounds. + */ + err = ntfs_attr_size_bounds_check(vol, attr_ni->type, newsize); + if (err) { + if (err == -ENOENT) + err = -EIO; + ntfs_debug("%s: bounds check failed", __func__); + goto put_err_out; + } + /* + * If @newsize is bigger than the mft record we need to make the + * attribute non-resident if the attribute type supports it. If it is + * smaller we can go ahead and attempt the resize. + */ + if (newsize < vol->mft_record_size) { + /* Perform the resize of the attribute record. */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + newsize); + if (!err) { + /* Update attribute size everywhere. */ + attr_ni->data_size = attr_ni->initialized_size = newsize; + attr_ni->allocated_size = (newsize + 7) & ~7; + if (NInoCompressed(attr_ni) || NInoSparse(attr_ni)) + attr_ni->itype.compressed.size = attr_ni->allocated_size; + if (attr_ni->type == AT_DATA && attr_ni->name == AT_UNNAMED) + NInoSetFileNameDirty(attr_ni); + goto resize_done; + } + + /* Prefer AT_INDEX_ALLOCATION instead of AT_ATTRIBUTE_LIST */ + if (err == -ENOSPC && ctx->attr->type == AT_INDEX_ROOT) + goto put_err_out; + + } + /* There is not enough space in the mft record to perform the resize. */ + + /* Make the attribute non-resident if possible. 
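+	 *
+	 * The retry strategy from here on, in sketch form (illustrative
+	 * only): first try to make this attribute non-resident; failing
+	 * that, make some other resident attribute non-resident and retry
+	 * the resize; failing that, move the attribute to a freshly
+	 * allocated extent mft record (adding an attribute list first if
+	 * one is not present) and retry.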
*/ + err = ntfs_attr_make_non_resident(attr_ni, + le32_to_cpu(ctx->attr->data.resident.value_length)); + if (!err) { + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + /* Resize non-resident attribute */ + return ntfs_non_resident_attr_expand(attr_ni, newsize, prealloc_size, holes); + } else if (err != -ENOSPC && err != -EPERM) { + ntfs_error(sb, "Failed to make attribute non-resident"); + goto put_err_out; + } + + /* Try to make other attributes non-resident and retry each time. */ + ntfs_attr_reinit_search_ctx(ctx); + while (!(err = ntfs_attr_lookup(AT_UNUSED, NULL, 0, 0, 0, NULL, 0, ctx))) { + struct inode *tvi; + struct attr_record *a; + + a = ctx->attr; + if (a->non_resident || a->type == AT_ATTRIBUTE_LIST) + continue; + + if (ntfs_attr_can_be_non_resident(vol, a->type)) + continue; + + /* + * Check out whether convert is reasonable. Assume that mapping + * pairs will take 8 bytes. + */ + if (le32_to_cpu(a->length) <= (sizeof(struct attr_record) - sizeof(s64)) + + ((a->name_length * sizeof(__le16) + 7) & ~7) + 8) + continue; + + if (a->type == AT_DATA) + tvi = ntfs_iget(sb, base_ni->mft_no); + else + tvi = ntfs_attr_iget(VFS_I(base_ni), a->type, + (__le16 *)((u8 *)a + le16_to_cpu(a->name_offset)), + a->name_length); + if (IS_ERR(tvi)) { + ntfs_error(sb, "Couldn't open attribute"); + continue; + } + + if (ntfs_attr_make_non_resident(NTFS_I(tvi), + le32_to_cpu(ctx->attr->data.resident.value_length))) { + iput(tvi); + continue; + } + + mark_mft_record_dirty(ctx->ntfs_ino); + iput(tvi); + ntfs_attr_put_search_ctx(ctx); + goto attr_resize_again; + } + + /* Check whether error occurred. */ + if (err != -ENOENT) { + ntfs_error(sb, "%s: Attribute lookup failed 1", __func__); + goto put_err_out; + } + + /* + * The standard information and attribute list attributes can't be + * moved out from the base MFT record, so try to move out others. + */ + if (attr_ni->type == AT_STANDARD_INFORMATION || + attr_ni->type == AT_ATTRIBUTE_LIST) { + ntfs_attr_put_search_ctx(ctx); + + if (!NInoAttrList(base_ni)) { + err = ntfs_inode_add_attrlist(base_ni); + if (err) + return err; + } + + err = ntfs_inode_free_space(base_ni, sizeof(struct attr_record)); + if (err) { + err = -ENOSPC; + ntfs_error(sb, + "Couldn't free space in the MFT record to make attribute list non resident"); + return err; + } + err = ntfs_attrlist_update(base_ni); + if (err) + return err; + goto attr_resize_again; + } + + /* + * Move the attribute to a new mft record, creating an attribute list + * attribute or modifying it if it is already present. + */ + + /* Point search context back to attribute which we need resize. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(attr_ni->type, attr_ni->name, attr_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (err) { + ntfs_error(sb, "%s: Attribute lookup failed 2", __func__); + goto put_err_out; + } + + /* + * Check whether attribute is already single in this MFT record. + * 8 added for the attribute terminator. + */ + if (le32_to_cpu(ctx->mrec->bytes_in_use) == + le16_to_cpu(ctx->mrec->attrs_offset) + le32_to_cpu(ctx->attr->length) + 8) { + err = -ENOSPC; + ntfs_debug("MFT record is filled with one attribute\n"); + goto put_err_out; + } + + /* Add attribute list if not present. */ + if (!NInoAttrList(base_ni)) { + ntfs_attr_put_search_ctx(ctx); + err = ntfs_inode_add_attrlist(base_ni); + if (err) + return err; + goto attr_resize_again; + } + + /* Allocate new mft record. 
*/ + err = ntfs_mft_record_alloc(base_ni->vol, 0, &ext_ni, base_ni, NULL); + if (err) { + ntfs_error(sb, "Couldn't allocate MFT record"); + goto put_err_out; + } + unmap_mft_record(ext_ni); + + /* Move attribute to it. */ + err = ntfs_attr_record_move_to(ctx, ext_ni); + if (err) { + ntfs_error(sb, "Couldn't move attribute to new MFT record"); + err = -ENOMEM; + goto put_err_out; + } + + err = ntfs_attrlist_update(base_ni); + if (err < 0) + goto put_err_out; + + ntfs_attr_put_search_ctx(ctx); + /* Try to perform resize once again. */ + goto attr_resize_again; + +resize_done: + /* + * Set the inode (and its base inode if it exists) dirty so it is + * written out later. + */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + return 0; + +put_err_out: + ntfs_attr_put_search_ctx(ctx); + return err; +} + +int __ntfs_attr_truncate_vfs(struct ntfs_inode *ni, const s64 newsize, + const s64 i_size) +{ + int err = 0; + + if (newsize < 0 || + (ni->mft_no == FILE_MFT && ni->type == AT_DATA)) { + ntfs_debug("Invalid arguments passed.\n"); + return -EINVAL; + } + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, size %lld\n", + (unsigned long long)ni->mft_no, ni->type, newsize); + + if (NInoNonResident(ni)) { + if (newsize > i_size) { + down_write(&ni->runlist.lock); + err = ntfs_non_resident_attr_expand(ni, newsize, 0, + NVolDisableSparse(ni->vol) ? + HOLES_NO : HOLES_OK); + up_write(&ni->runlist.lock); + } else + err = ntfs_non_resident_attr_shrink(ni, newsize); + } else + err = ntfs_resident_attr_resize(ni, newsize, 0, + NVolDisableSparse(ni->vol) ? + HOLES_NO : HOLES_OK); + ntfs_debug("Return status %d\n", err); + return err; +} + +int ntfs_attr_expand(struct ntfs_inode *ni, const s64 newsize, const s64 prealloc_size) +{ + int err = 0; + + if (newsize < 0 || + (ni->mft_no == FILE_MFT && ni->type == AT_DATA)) { + ntfs_debug("Invalid arguments passed.\n"); + return -EINVAL; + } + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, size %lld\n", + (unsigned long long)ni->mft_no, ni->type, newsize); + + if (ni->data_size == newsize) { + ntfs_debug("Size is already ok\n"); + return 0; + } + + /* + * Encrypted attributes are not supported. We return access denied, + * which is what Windows NT4 does, too. + */ + if (NInoEncrypted(ni)) { + pr_err("Failed to truncate encrypted attribute"); + return -EACCES; + } + + if (NInoNonResident(ni)) { + if (newsize > ni->data_size) + err = ntfs_non_resident_attr_expand(ni, newsize, prealloc_size, + NVolDisableSparse(ni->vol) ? + HOLES_NO : HOLES_OK); + } else + err = ntfs_resident_attr_resize(ni, newsize, prealloc_size, + NVolDisableSparse(ni->vol) ? + HOLES_NO : HOLES_OK); + if (!err) + i_size_write(VFS_I(ni), newsize); + ntfs_debug("Return status %d\n", err); + return err; +} + +/** + * ntfs_attr_truncate_i - resize an ntfs attribute + * @ni: open ntfs inode to resize + * @newsize: new size (in bytes) to which to resize the attribute + * + * Change the size of an open ntfs attribute @na to @newsize bytes. If the + * attribute is made bigger and the attribute is resident the newly + * "allocated" space is cleared and if the attribute is non-resident the + * newly allocated space is marked as not initialised and no real allocation + * on disk is performed. 
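+ *
+ * A minimal usage sketch (illustrative only; error handling elided):
+ *
+ *	err = ntfs_attr_truncate_i(ni, newsize,
+ *			NVolDisableSparse(ni->vol) ? HOLES_NO : HOLES_OK);
+ *
+ * which is exactly what the ntfs_attr_truncate() wrapper below does.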
+ */ +int ntfs_attr_truncate_i(struct ntfs_inode *ni, const s64 newsize, unsigned int holes) +{ + int err; + + if (newsize < 0 || + (ni->mft_no == FILE_MFT && ni->type == AT_DATA)) { + ntfs_debug("Invalid arguments passed.\n"); + return -EINVAL; + } + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, size %lld\n", + (unsigned long long)ni->mft_no, ni->type, newsize); + + if (ni->data_size == newsize) { + ntfs_debug("Size is already ok\n"); + return 0; + } + + /* + * Encrypted attributes are not supported. We return access denied, + * which is what Windows NT4 does, too. + */ + if (NInoEncrypted(ni)) { + pr_err("Failed to truncate encrypted attribute"); + return -EACCES; + } + + if (NInoCompressed(ni)) { + pr_err("Failed to truncate compressed attribute"); + return -EOPNOTSUPP; + } + + if (NInoNonResident(ni)) { + if (newsize > ni->data_size) + err = ntfs_non_resident_attr_expand(ni, newsize, 0, holes); + else + err = ntfs_non_resident_attr_shrink(ni, newsize); + } else + err = ntfs_resident_attr_resize(ni, newsize, 0, holes); + ntfs_debug("Return status %d\n", err); + return err; +} + +/* + * Resize an attribute, creating a hole if relevant + */ +int ntfs_attr_truncate(struct ntfs_inode *ni, const s64 newsize) +{ + return ntfs_attr_truncate_i(ni, newsize, + NVolDisableSparse(ni->vol) ? + HOLES_NO : HOLES_OK); +} + +int ntfs_attr_map_cluster(struct ntfs_inode *ni, s64 vcn_start, s64 *lcn_start, + s64 *lcn_count, s64 max_clu_count, bool *balloc, bool update_mp, + bool skip_holes) +{ + struct ntfs_volume *vol = ni->vol; + struct ntfs_attr_search_ctx *ctx; + struct runlist_element *rl, *rlc; + s64 vcn = vcn_start, lcn, clu_count; + s64 lcn_seek_from = -1; + int err = 0; + size_t new_rl_count; + + err = ntfs_attr_map_whole_runlist(ni); + if (err) + return err; + + if (NInoAttr(ni)) + ctx = ntfs_attr_get_search_ctx(ni->ext.base_ntfs_ino, NULL); + else + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + ntfs_error(vol->sb, "%s: Failed to get search context", __func__); + return -ENOMEM; + } + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (err) { + ntfs_error(vol->sb, + "ntfs_attr_lookup failed, ntfs inode(mft_no : %ld) type : 0x%x, err : %d", + ni->mft_no, ni->type, err); + goto out; + } + + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to find run after mapping runlist."); + err = PTR_ERR(rl); + goto out; + } + + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + clu_count = min(max_clu_count, rl->length - (vcn - rl->vcn)); + if (lcn >= LCN_HOLE) { + if (lcn > LCN_DELALLOC || + (lcn == LCN_HOLE && skip_holes)) { + *lcn_start = lcn; + *lcn_count = clu_count; + *balloc = false; + goto out; + } + } else { + WARN_ON(lcn == LCN_RL_NOT_MAPPED); + if (lcn == LCN_ENOENT) + err = -ENOENT; + else + err = -EIO; + goto out; + } + + /* Search backwards to find the best lcn to start seek from. */ + rlc = rl; + while (rlc->vcn) { + rlc--; + if (rlc->lcn >= 0) { + /* + * avoid fragmenting a compressed file + * Windows does not do that, and that may + * not be desirable for files which can + * be updated + */ + if (NInoCompressed(ni)) + lcn_seek_from = rlc->lcn + rlc->length; + else + lcn_seek_from = rlc->lcn + (vcn - rlc->vcn); + break; + } + } + + if (lcn_seek_from == -1) { + /* Backwards search failed, search forwards. 
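+		 *
+		 * Worked example (numbers illustrative only): if the next
+		 * mapped run starts at vcn 100 / lcn 5000 and we are
+		 * allocating for vcn 90, the hint becomes
+		 * 5000 - (100 - 90) = 4990, so a successful allocation
+		 * lands immediately in front of the existing run and keeps
+		 * the attribute contiguous.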
*/
+		rlc = rl;
+		while (rlc->length) {
+			rlc++;
+			if (rlc->lcn >= 0) {
+				lcn_seek_from = rlc->lcn - (rlc->vcn - vcn);
+				if (lcn_seek_from < -1)
+					lcn_seek_from = -1;
+				break;
+			}
+		}
+	}
+
+	if (lcn_seek_from == -1 && ni->lcn_seek_trunc != LCN_RL_NOT_MAPPED) {
+		lcn_seek_from = ni->lcn_seek_trunc;
+		ni->lcn_seek_trunc = LCN_RL_NOT_MAPPED;
+	}
+
+	rlc = ntfs_cluster_alloc(vol, vcn, clu_count, lcn_seek_from, DATA_ZONE,
+			false, true, true);
+	if (IS_ERR(rlc)) {
+		err = PTR_ERR(rlc);
+		goto out;
+	}
+
+	WARN_ON(rlc->vcn != vcn);
+	lcn = rlc->lcn;
+	clu_count = rlc->length;
+
+	rl = ntfs_runlists_merge(&ni->runlist, rlc, 0, &new_rl_count);
+	if (IS_ERR(rl)) {
+		ntfs_error(vol->sb, "Failed to merge runlists");
+		err = PTR_ERR(rl);
+		if (ntfs_cluster_free_from_rl(vol, rlc))
+			ntfs_error(vol->sb, "Failed to free hot clusters.");
+		ntfs_free(rlc);
+		goto out;
+	}
+	ni->runlist.rl = rl;
+	ni->runlist.count = new_rl_count;
+
+	if (!update_mp) {
+		u64 free = atomic64_read(&vol->free_clusters) * 100;
+
+		do_div(free, vol->nr_clusters);
+		if (free <= 5)
+			update_mp = true;
+	}
+
+	if (update_mp) {
+		ntfs_attr_reinit_search_ctx(ctx);
+		err = ntfs_attr_update_mapping_pairs(ni, 0);
+		if (err) {
+			int err2;
+
+			err2 = ntfs_cluster_free(ni, vcn, clu_count, ctx);
+			if (err2 < 0)
+				ntfs_error(vol->sb,
+					"Failed to free cluster allocation. Leaving inconsistent metadata.\n");
+			goto out;
+		}
+	} else {
+		VFS_I(ni)->i_blocks += clu_count << (vol->cluster_size_bits - 9);
+		NInoSetRunlistDirty(ni);
+		mark_mft_record_dirty(ni);
+	}
+
+	*lcn_start = lcn;
+	*lcn_count = clu_count;
+	*balloc = true;
+out:
+	ntfs_attr_put_search_ctx(ctx);
+	return err;
+}
+
+/**
+ * ntfs_attr_rm - remove attribute from ntfs inode
+ * @ni:		opened ntfs attribute to delete
+ *
+ * Remove the attribute and all its extents from the ntfs inode. If the
+ * attribute was non-resident, also free all clusters allocated to it.
+ */
+int ntfs_attr_rm(struct ntfs_inode *ni)
+{
+	struct ntfs_attr_search_ctx *ctx;
+	int err = 0, ret = 0;
+	struct ntfs_inode *base_ni;
+	struct super_block *sb = ni->vol->sb;
+
+	if (NInoAttr(ni))
+		base_ni = ni->ext.base_ntfs_ino;
+	else
+		base_ni = ni;
+
+	ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n",
+			(long long)ni->mft_no, ni->type);
+
+	/* Free cluster allocation. */
+	if (NInoNonResident(ni)) {
+		struct ntfs_attr_search_ctx *ctx;
+
+		err = ntfs_attr_map_whole_runlist(ni);
+		if (err)
+			return err;
+		ctx = ntfs_attr_get_search_ctx(ni, NULL);
+		if (!ctx) {
+			ntfs_error(sb, "%s: Failed to get search context", __func__);
+			return -ENOMEM;
+		}
+
+		ret = ntfs_cluster_free(ni, 0, -1, ctx);
+		if (ret < 0)
+			ntfs_error(sb,
+				"Failed to free cluster allocation. Leaving inconsistent metadata.\n");
+		ntfs_attr_put_search_ctx(ctx);
+	}
+
+	/* Search for attribute extents and remove them all. */
+	ctx = ntfs_attr_get_search_ctx(base_ni, NULL);
+	if (!ctx) {
+		ntfs_error(sb, "%s: Failed to get search context", __func__);
+		return -ENOMEM;
+	}
+	while (!(err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx))) {
+		err = ntfs_attr_record_rm(ctx);
+		if (err) {
+			ntfs_error(sb,
+				"Failed to remove attribute extent. Leaving inconsistent metadata.\n");
+			ret = err;
+		}
+		ntfs_attr_reinit_search_ctx(ctx);
+	}
+	ntfs_attr_put_search_ctx(ctx);
+	if (err != -ENOENT) {
+		ntfs_error(sb, "Attribute lookup failed. 
Probably leaving inconsistent metadata.\n");
+		ret = err;
+	}
+
+	return ret;
+}
+
+int ntfs_attr_exist(struct ntfs_inode *ni, const __le32 type, __le16 *name,
+		u32 name_len)
+{
+	struct ntfs_attr_search_ctx *ctx;
+	int ret;
+
+	ntfs_debug("Entering\n");
+
+	ctx = ntfs_attr_get_search_ctx(ni, NULL);
+	if (!ctx) {
+		ntfs_error(ni->vol->sb, "%s: Failed to get search context",
+				__func__);
+		return 0;
+	}
+
+	ret = ntfs_attr_lookup(type, name, name_len, CASE_SENSITIVE,
+			0, NULL, 0, ctx);
+	ntfs_attr_put_search_ctx(ctx);
+
+	return !ret;
+}
+
+int ntfs_attr_remove(struct ntfs_inode *ni, const __le32 type, __le16 *name,
+		u32 name_len)
+{
+	struct super_block *sb;
+	int err;
+	struct inode *attr_vi;
+	struct ntfs_inode *attr_ni;
+
+	ntfs_debug("Entering\n");
+
+	if (!ni) {
+		pr_err("NULL inode pointer\n");
+		return -EINVAL;
+	}
+	sb = ni->vol->sb;
+
+	attr_vi = ntfs_attr_iget(VFS_I(ni), type, name, name_len);
+	if (IS_ERR(attr_vi)) {
+		err = PTR_ERR(attr_vi);
+		ntfs_error(sb, "Failed to open attribute 0x%02x of inode 0x%llx",
+				type, (unsigned long long)ni->mft_no);
+		return err;
+	}
+	attr_ni = NTFS_I(attr_vi);
+
+	err = ntfs_attr_rm(attr_ni);
+	if (err)
+		ntfs_error(sb, "Failed to remove attribute 0x%02x of inode 0x%llx",
+				type, (unsigned long long)ni->mft_no);
+	iput(attr_vi);
+	return err;
+}
+
+/**
+ * ntfs_attr_readall - read the entire data from an ntfs attribute
+ * @ni:		open ntfs inode in which the ntfs attribute resides
+ * @type:	attribute type
+ * @name:	attribute name in little endian Unicode or AT_UNNAMED or NULL
+ * @name_len:	length of attribute @name in Unicode characters (if @name given)
+ * @data_size:	if non-NULL then store here the data size
+ *
+ * This function will read the entire content of an ntfs attribute.
+ * If @name is AT_UNNAMED then look specifically for an unnamed attribute.
+ * If @name is NULL then the attribute could be either named or not.
+ * In both those cases @name_len is not used at all.
+ *
+ * On success a buffer is allocated with the content of the attribute
+ * and which needs to be freed when it's not needed anymore. If the
+ * @data_size parameter is non-NULL then the data size is set there.
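+ *
+ * A minimal usage sketch (illustrative only; the caller owns the
+ * returned buffer and must release it with ntfs_free()):
+ *
+ *	s64 size;
+ *	u8 *buf = ntfs_attr_readall(ni, AT_DATA, AT_UNNAMED, 0, &size);
+ *
+ *	if (buf) {
+ *		... consume size bytes at buf ...
+ *		ntfs_free(buf);
+ *	}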
*/
+void *ntfs_attr_readall(struct ntfs_inode *ni, const __le32 type,
+		__le16 *name, u32 name_len, s64 *data_size)
+{
+	struct ntfs_inode *attr_ni;
+	struct inode *attr_vi;
+	void *data, *ret = NULL;
+	s64 size;
+	struct super_block *sb = ni->vol->sb;
+
+	ntfs_debug("Entering\n");
+
+	attr_vi = ntfs_attr_iget(VFS_I(ni), type, name, name_len);
+	if (IS_ERR(attr_vi)) {
+		ntfs_debug("ntfs_attr_iget failed");
+		goto err_exit;
+	}
+	attr_ni = NTFS_I(attr_vi);
+
+	data = ntfs_malloc_nofs(attr_ni->data_size);
+	if (!data) {
+		ntfs_error(sb, "ntfs_malloc_nofs failed");
+		goto out;
+	}
+
+	size = ntfs_inode_attr_pread(VFS_I(attr_ni), 0, attr_ni->data_size,
+			(u8 *)data);
+	if (size != attr_ni->data_size) {
+		ntfs_error(sb, "ntfs_attr_pread failed");
+		ntfs_free(data);
+		goto out;
+	}
+	ret = data;
+	if (data_size)
+		*data_size = size;
+out:
+	iput(attr_vi);
+err_exit:
+	ntfs_debug("\n");
+	return ret;
+}
+
+int ntfs_non_resident_attr_insert_range(struct ntfs_inode *ni, s64 start_vcn, s64 len)
+{
+	struct ntfs_volume *vol = ni->vol;
+	struct runlist_element *hole_rl, *rl;
+	struct ntfs_attr_search_ctx *ctx;
+	int ret;
+	size_t new_rl_count;
+
+	if (NInoAttr(ni) || ni->type != AT_DATA)
+		return -EOPNOTSUPP;
+	if (start_vcn > NTFS_B_TO_CLU(vol, ni->allocated_size))
+		return -EINVAL;
+
+	hole_rl = ntfs_malloc_nofs(sizeof(*hole_rl) * 2);
+	if (!hole_rl)
+		return -ENOMEM;
+	hole_rl[0].vcn = start_vcn;
+	hole_rl[0].lcn = LCN_HOLE;
+	hole_rl[0].length = len;
+	hole_rl[1].vcn = start_vcn + len;
+	hole_rl[1].lcn = LCN_ENOENT;
+	hole_rl[1].length = 0;
+
+	down_write(&ni->runlist.lock);
+	ret = ntfs_attr_map_whole_runlist(ni);
+	if (ret) {
+		up_write(&ni->runlist.lock);
+		ntfs_free(hole_rl);
+		return ret;
+	}
+
+	rl = ntfs_rl_find_vcn_nolock(ni->runlist.rl, start_vcn);
+	if (!rl) {
+		up_write(&ni->runlist.lock);
+		ntfs_free(hole_rl);
+		return -EIO;
+	}
+
+	rl = ntfs_rl_insert_range(ni->runlist.rl, (int)ni->runlist.count,
+			hole_rl, 1, &new_rl_count);
+	if (IS_ERR(rl)) {
+		up_write(&ni->runlist.lock);
+		ntfs_free(hole_rl);
+		return PTR_ERR(rl);
+	}
+	ni->runlist.rl = rl;
+	ni->runlist.count = new_rl_count;
+
+	ni->allocated_size += NTFS_CLU_TO_B(vol, len);
+	ni->data_size += NTFS_CLU_TO_B(vol, len);
+	if (NTFS_CLU_TO_B(vol, start_vcn) < ni->initialized_size)
+		ni->initialized_size += NTFS_CLU_TO_B(vol, len);
+	ret = ntfs_attr_update_mapping_pairs(ni, 0);
+	up_write(&ni->runlist.lock);
+	if (ret)
+		return ret;
+
+	ctx = ntfs_attr_get_search_ctx(ni, NULL);
+	if (!ctx) {
+		ret = -ENOMEM;
+		return ret;
+	}
+
+	ret = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
+			0, NULL, 0, ctx);
+	if (ret) {
+		ntfs_attr_put_search_ctx(ctx);
+		return ret;
+	}
+
+	ctx->attr->data.non_resident.data_size = cpu_to_le64(ni->data_size);
+	ctx->attr->data.non_resident.initialized_size = cpu_to_le64(ni->initialized_size);
+	if (ni->type == AT_DATA && ni->name == AT_UNNAMED)
+		NInoSetFileNameDirty(ni);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	return ret;
+}
+
+int ntfs_non_resident_attr_collapse_range(struct ntfs_inode *ni, s64 start_vcn, s64 len)
+{
+	struct ntfs_volume *vol = ni->vol;
+	struct runlist_element *punch_rl, *rl;
+	struct ntfs_attr_search_ctx *ctx = NULL;
+	s64 end_vcn;
+	int dst_cnt;
+	int ret;
+	size_t new_rl_cnt;
+
+	if (NInoAttr(ni) || ni->type != AT_DATA)
+		return -EOPNOTSUPP;
+
+	end_vcn = NTFS_B_TO_CLU(vol, ni->allocated_size);
+	if (start_vcn >= end_vcn)
+		return -EINVAL;
+
+	down_write(&ni->runlist.lock);
+	ret = ntfs_attr_map_whole_runlist(ni);
+	if (ret) {
+		up_write(&ni->runlist.lock);
+		return ret;
+	}
+
+	len = min(len, 
end_vcn - start_vcn); + for (rl = ni->runlist.rl, dst_cnt = 0; rl && rl->length; rl++) + dst_cnt++; + rl = ntfs_rl_find_vcn_nolock(ni->runlist.rl, start_vcn); + if (!rl) { + up_write(&ni->runlist.lock); + return -EIO; + } + + rl = ntfs_rl_collapse_range(ni->runlist.rl, dst_cnt + 1, + start_vcn, len, &punch_rl, &new_rl_cnt); + if (IS_ERR(rl)) { + up_write(&ni->runlist.lock); + return PTR_ERR(rl); + } + ni->runlist.rl = rl; + ni->runlist.count = new_rl_cnt; + + ni->allocated_size -= NTFS_CLU_TO_B(vol, len); + if (ni->data_size > NTFS_CLU_TO_B(vol, start_vcn)) { + if (ni->data_size > NTFS_CLU_TO_B(vol, (start_vcn + len))) + ni->data_size -= NTFS_CLU_TO_B(vol, len); + else + ni->data_size = NTFS_CLU_TO_B(vol, start_vcn); + } + if (ni->initialized_size > NTFS_CLU_TO_B(vol, start_vcn)) { + if (ni->initialized_size > + NTFS_CLU_TO_B(vol, start_vcn + len)) + ni->initialized_size -= NTFS_CLU_TO_B(vol, len); + else + ni->initialized_size = NTFS_CLU_TO_B(vol, start_vcn); + } + + if (ni->allocated_size > 0) { + ret = ntfs_attr_update_mapping_pairs(ni, 0); + if (ret) { + up_write(&ni->runlist.lock); + goto out_rl; + } + } + up_write(&ni->runlist.lock); + + ctx = ntfs_attr_get_search_ctx(ni, NULL); + if (!ctx) { + ret = -ENOMEM; + goto out_rl; + } + + ret = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (ret) + goto out_ctx; + + ctx->attr->data.non_resident.data_size = cpu_to_le64(ni->data_size); + ctx->attr->data.non_resident.initialized_size = cpu_to_le64(ni->initialized_size); + if (ni->allocated_size == 0) + ntfs_attr_make_resident(ni, ctx); + mark_mft_record_dirty(ctx->ntfs_ino); + + ret = ntfs_cluster_free_from_rl(vol, punch_rl); + if (ret) + ntfs_error(vol->sb, "Freeing of clusters failed"); +out_ctx: + if (ctx) + ntfs_attr_put_search_ctx(ctx); +out_rl: + ntfs_free(punch_rl); + mark_mft_record_dirty(ni); + return ret; +} + +int ntfs_non_resident_attr_punch_hole(struct ntfs_inode *ni, s64 start_vcn, s64 len) +{ + struct ntfs_volume *vol = ni->vol; + struct runlist_element *punch_rl, *rl; + s64 end_vcn; + int dst_cnt; + int ret; + size_t new_rl_count; + + if (NInoAttr(ni) || ni->type != AT_DATA) + return -EOPNOTSUPP; + + end_vcn = NTFS_B_TO_CLU(vol, ni->allocated_size); + if (start_vcn >= end_vcn) + return -EINVAL; + + down_write(&ni->runlist.lock); + ret = ntfs_attr_map_whole_runlist(ni); + if (ret) { + up_write(&ni->runlist.lock); + return ret; + } + + len = min(len, end_vcn - start_vcn + 1); + for (rl = ni->runlist.rl, dst_cnt = 0; rl && rl->length; rl++) + dst_cnt++; + rl = ntfs_rl_find_vcn_nolock(ni->runlist.rl, start_vcn); + if (!rl) { + up_write(&ni->runlist.lock); + return -EIO; + } + + rl = ntfs_rl_punch_hole(ni->runlist.rl, dst_cnt + 1, + start_vcn, len, &punch_rl, &new_rl_count); + if (IS_ERR(rl)) { + up_write(&ni->runlist.lock); + return PTR_ERR(rl); + } + ni->runlist.rl = rl; + ni->runlist.count = new_rl_count; + + ret = ntfs_attr_update_mapping_pairs(ni, 0); + up_write(&ni->runlist.lock); + if (ret) { + ntfs_free(punch_rl); + return ret; + } + + ret = ntfs_cluster_free_from_rl(vol, punch_rl); + if (ret) + ntfs_error(vol->sb, "Freeing of clusters failed"); + + ntfs_free(punch_rl); + mark_mft_record_dirty(ni); + return ret; +} + +int ntfs_attr_fallocate(struct ntfs_inode *ni, loff_t start, loff_t byte_len, bool keep_size) +{ + struct ntfs_volume *vol = ni->vol; + struct mft_record *mrec; + struct ntfs_attr_search_ctx *ctx; + s64 old_data_size; + s64 vcn_start, vcn_end, vcn_uninit, vcn, try_alloc_cnt; + s64 lcn, alloc_cnt; + int err = 0; + 
struct runlist_element *rl;
+	bool balloc;
+
+	if (NInoAttr(ni) || ni->type != AT_DATA)
+		return -EINVAL;
+
+	if (NInoNonResident(ni) && !NInoFullyMapped(ni)) {
+		down_write(&ni->runlist.lock);
+		err = ntfs_attr_map_whole_runlist(ni);
+		up_write(&ni->runlist.lock);
+		if (err)
+			return err;
+	}
+
+	mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL);
+	mrec = map_mft_record(ni);
+	if (IS_ERR(mrec)) {
+		mutex_unlock(&ni->mrec_lock);
+		return PTR_ERR(mrec);
+	}
+
+	ctx = ntfs_attr_get_search_ctx(ni, mrec);
+	if (!ctx) {
+		err = -ENOMEM;
+		goto out_unmap;
+	}
+
+	err = ntfs_attr_lookup(AT_DATA, AT_UNNAMED, 0, 0, 0, NULL, 0, ctx);
+	if (err) {
+		err = -EIO;
+		goto out_unmap;
+	}
+
+	old_data_size = ni->data_size;
+	if (start + byte_len > ni->data_size) {
+		err = ntfs_attr_truncate(ni, start + byte_len);
+		if (err)
+			goto out_unmap;
+		if (keep_size) {
+			ntfs_attr_reinit_search_ctx(ctx);
+			err = ntfs_attr_lookup(AT_DATA, AT_UNNAMED, 0, 0, 0, NULL, 0, ctx);
+			if (err) {
+				err = -EIO;
+				goto out_unmap;
+			}
+			ni->data_size = old_data_size;
+			if (NInoNonResident(ni))
+				ctx->attr->data.non_resident.data_size =
+					cpu_to_le64(old_data_size);
+			else
+				ctx->attr->data.resident.value_length =
+					cpu_to_le32(old_data_size);
+			mark_mft_record_dirty(ni);
+		}
+	}
+
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(ni);
+	mutex_unlock(&ni->mrec_lock);
+
+	if (!NInoNonResident(ni))
+		goto out;
+
+	vcn_start = (s64)NTFS_B_TO_CLU(vol, start);
+	vcn_end = (s64)NTFS_B_TO_CLU(vol, round_up(start + byte_len, vol->cluster_size));
+	vcn_uninit = (s64)NTFS_B_TO_CLU(vol, round_up(ni->initialized_size, vol->cluster_size));
+	vcn_uninit = min_t(s64, vcn_uninit, vcn_end);
+
+	/*
+	 * We have to allocate clusters for holes and delayed-allocation
+	 * ranges within initialized_size, and zero out the newly allocated
+	 * clusters only for the holes.
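+	 *
+	 * (Worked example of the range computation above, with a 4096-byte
+	 * cluster; numbers illustrative only: start = 10000 and
+	 * byte_len = 20000 give vcn_start = 2, vcn_end =
+	 * round_up(30000, 4096) / 4096 = 8 and clusters 2..7 as the
+	 * candidates.)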
+ */ + vcn = vcn_start; + while (vcn < vcn_uninit) { + down_read(&ni->runlist.lock); + rl = ntfs_attr_find_vcn_nolock(ni, vcn, NULL); + up_read(&ni->runlist.lock); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + goto out; + } + + if (rl->lcn > 0) { + vcn += rl->length - (vcn - rl->vcn); + } else if (rl->lcn == LCN_DELALLOC || rl->lcn == LCN_HOLE) { + try_alloc_cnt = min(rl->length - (vcn - rl->vcn), + vcn_uninit - vcn); + + if (rl->lcn == LCN_DELALLOC) { + vcn += try_alloc_cnt; + continue; + } + + while (try_alloc_cnt > 0) { + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + down_write(&ni->runlist.lock); + err = ntfs_attr_map_cluster(ni, vcn, &lcn, &alloc_cnt, + try_alloc_cnt, &balloc, false, false); + up_write(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + if (err) + goto out; + + err = ntfs_zero_range(VFS_I(ni), + lcn << vol->cluster_size_bits, + alloc_cnt << vol->cluster_size_bits, + true); + if (err > 0) + goto out; + + if (signal_pending(current)) + goto out; + + vcn += alloc_cnt; + try_alloc_cnt -= alloc_cnt; + } + } else { + err = -EIO; + goto out; + } + } + + /* allocate clusters outside of initialized_size */ + try_alloc_cnt = vcn_end - vcn; + while (try_alloc_cnt > 0) { + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + down_write(&ni->runlist.lock); + err = ntfs_attr_map_cluster(ni, vcn, &lcn, &alloc_cnt, + try_alloc_cnt, &balloc, false, false); + up_write(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + if (err || signal_pending(current)) + goto out; + + vcn += alloc_cnt; + try_alloc_cnt -= alloc_cnt; + cond_resched(); + } + + if (NInoRunlistDirty(ni)) { + mutex_lock_nested(&ni->mrec_lock, NTFS_INODE_MUTEX_NORMAL); + down_write(&ni->runlist.lock); + err = ntfs_attr_update_mapping_pairs(ni, 0); + if (err) + ntfs_error(ni->vol->sb, "Updating mapping pairs failed"); + else + NInoClearRunlistDirty(ni); + up_write(&ni->runlist.lock); + mutex_unlock(&ni->mrec_lock); + } + return err; +out_unmap: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + mutex_unlock(&ni->mrec_lock); +out: + return err >= 0 ? 0 : err; +} diff --git a/fs/ntfs/attrlist.c b/fs/ntfs/attrlist.c new file mode 100644 index 000000000000..447e3029ba77 --- /dev/null +++ b/fs/ntfs/attrlist.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Attribute list attribute handling code. Originated from the Linux-NTFS + * project. + * Part of this file is based on code from the NTFS-3G project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2004-2005 Yura Pakhuchiy + * Copyright (c) 2006 Szabolcs Szakacsits + * Copyright (c) 2025 LG Electronics Co., Ltd. + */ + +#include "mft.h" +#include "attrib.h" +#include "malloc.h" +#include "attrlist.h" + +/** + * ntfs_attrlist_need - check whether inode need attribute list + * @ni: opened ntfs inode for which perform check + * + * Check whether all are attributes belong to one MFT record, in that case + * attribute list is not needed. 
*/
+int ntfs_attrlist_need(struct ntfs_inode *ni)
+{
+	struct attr_list_entry *ale;
+
+	if (!ni) {
+		ntfs_debug("Invalid arguments.\n");
+		return -EINVAL;
+	}
+	ntfs_debug("Entering for inode 0x%llx.\n", (long long)ni->mft_no);
+
+	if (!NInoAttrList(ni)) {
+		ntfs_debug("Inode does not have an attribute list.\n");
+		return -EINVAL;
+	}
+
+	if (!ni->attr_list) {
+		ntfs_debug("Corrupt in-memory struct.\n");
+		return -EINVAL;
+	}
+
+	ale = (struct attr_list_entry *)ni->attr_list;
+	while ((u8 *)ale < ni->attr_list + ni->attr_list_size) {
+		if (MREF_LE(ale->mft_reference) != ni->mft_no)
+			return 1;
+		ale = (struct attr_list_entry *)((u8 *)ale + le16_to_cpu(ale->length));
+	}
+	return 0;
+}
+
+int ntfs_attrlist_update(struct ntfs_inode *base_ni)
+{
+	struct inode *attr_vi;
+	struct ntfs_inode *attr_ni;
+	int err;
+
+	attr_vi = ntfs_attr_iget(VFS_I(base_ni), AT_ATTRIBUTE_LIST, AT_UNNAMED, 0);
+	if (IS_ERR(attr_vi)) {
+		err = PTR_ERR(attr_vi);
+		return err;
+	}
+	attr_ni = NTFS_I(attr_vi);
+
+	err = ntfs_attr_truncate_i(attr_ni, base_ni->attr_list_size, HOLES_NO);
+	if (err == -ENOSPC && attr_ni->mft_no == FILE_MFT) {
+		err = ntfs_attr_truncate(attr_ni, 0);
+		if (err || ntfs_attr_truncate_i(attr_ni, base_ni->attr_list_size, HOLES_NO) != 0) {
+			iput(attr_vi);
+			ntfs_error(base_ni->vol->sb,
+				"Failed to truncate attribute list of inode %#llx",
+				(long long)base_ni->mft_no);
+			return -EIO;
+		}
+	} else if (err) {
+		iput(attr_vi);
+		ntfs_error(base_ni->vol->sb,
+			"Failed to truncate attribute list of inode %#llx",
+			(long long)base_ni->mft_no);
+		return -EIO;
+	}
+
+	i_size_write(attr_vi, base_ni->attr_list_size);
+
+	if (NInoNonResident(attr_ni) && !NInoAttrListNonResident(base_ni))
+		NInoSetAttrListNonResident(base_ni);
+
+	if (ntfs_inode_attr_pwrite(attr_vi, 0, base_ni->attr_list_size,
+			base_ni->attr_list, false) !=
+			base_ni->attr_list_size) {
+		iput(attr_vi);
+		ntfs_error(base_ni->vol->sb,
+			"Failed to write attribute list of inode %#llx",
+			(long long)base_ni->mft_no);
+		return -EIO;
+	}
+
+	NInoSetAttrListDirty(base_ni);
+	iput(attr_vi);
+	return 0;
+}
+
+/**
+ * ntfs_attrlist_entry_add - add an attribute list attribute entry
+ * @ni:		opened ntfs inode, which contains that attribute
+ * @attr:	attribute record to add to attribute list
+ */
+int ntfs_attrlist_entry_add(struct ntfs_inode *ni, struct attr_record *attr)
+{
+	struct attr_list_entry *ale;
+	__le64 mref;
+	struct ntfs_attr_search_ctx *ctx;
+	u8 *new_al;
+	int entry_len, entry_offset, err;
+	struct mft_record *ni_mrec;
+	u8 *old_al;
+
+	if (!ni || !attr) {
+		ntfs_debug("Invalid arguments.\n");
+		return -EINVAL;
+	}
+
+	ntfs_debug("Entering for inode 0x%llx, attr 0x%x.\n",
+			(long long)ni->mft_no,
+			(unsigned int)le32_to_cpu(attr->type));
+
+	ni_mrec = map_mft_record(ni);
+	if (IS_ERR(ni_mrec)) {
+		ntfs_debug("Failed to map MFT record.\n");
+		return -EIO;
+	}
+
+	mref = MK_LE_MREF(ni->mft_no, le16_to_cpu(ni_mrec->sequence_number));
+	unmap_mft_record(ni);
+
+	if (ni->nr_extents == -1)
+		ni = ni->ext.base_ntfs_ino;
+
+	if (!NInoAttrList(ni)) {
+		ntfs_debug("Attribute list isn't present.\n");
+		return -ENOENT;
+	}
+
+	/* Determine size and allocate memory for new attribute list. */
+	entry_len = (sizeof(struct attr_list_entry) + sizeof(__le16) *
+			attr->name_length + 7) & ~7;
+	new_al = ntfs_malloc_nofs(ni->attr_list_size + entry_len);
+	if (!new_al)
+		return -ENOMEM;
+
+	/* Find place for the new entry.
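+	 *
+	 * (Worked example of the entry sizing above, assuming the fixed
+	 * part of struct attr_list_entry is the on-disk 0x1a = 26 bytes:
+	 * a four-character name gives (26 + 4 * 2 + 7) & ~7 = 40, so
+	 * entries always start on 8-byte boundaries.)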
*/
+	ctx = ntfs_attr_get_search_ctx(ni, NULL);
+	if (!ctx) {
+		err = -ENOMEM;
+		ntfs_error(ni->vol->sb, "Failed to get search context");
+		goto err_out;
+	}
+
+	err = ntfs_attr_lookup(attr->type, (attr->name_length) ? (__le16 *)
+			((u8 *)attr + le16_to_cpu(attr->name_offset)) :
+			AT_UNNAMED, attr->name_length, CASE_SENSITIVE,
+			(attr->non_resident) ? le64_to_cpu(attr->data.non_resident.lowest_vcn) :
+			0, (attr->non_resident) ? NULL : ((u8 *)attr +
+			le16_to_cpu(attr->data.resident.value_offset)), (attr->non_resident) ?
+			0 : le32_to_cpu(attr->data.resident.value_length), ctx);
+	if (!err) {
+		/* Found an extent; check that it comes before the new extent. */
+		if (ctx->al_entry->lowest_vcn == attr->data.non_resident.lowest_vcn) {
+			err = -EEXIST;
+			ntfs_debug("Such an attribute is already present in the attribute list.\n");
+			ntfs_attr_put_search_ctx(ctx);
+			goto err_out;
+		}
+		/* Add new entry after this extent. */
+		ale = (struct attr_list_entry *)((u8 *)ctx->al_entry +
+				le16_to_cpu(ctx->al_entry->length));
+	} else {
+		/* Check for real errors. */
+		if (err != -ENOENT) {
+			ntfs_debug("Attribute lookup failed.\n");
+			ntfs_attr_put_search_ctx(ctx);
+			goto err_out;
+		}
+		/* No previous extents found. */
+		ale = ctx->al_entry;
+	}
+	/* Don't need it anymore, @ctx->al_entry points to @ni->attr_list. */
+	ntfs_attr_put_search_ctx(ctx);
+
+	/* Determine new entry offset. */
+	entry_offset = ((u8 *)ale - ni->attr_list);
+	/* Set pointer to new entry. */
+	ale = (struct attr_list_entry *)(new_al + entry_offset);
+	memset(ale, 0, entry_len);
+	/* Form new entry. */
+	ale->type = attr->type;
+	ale->length = cpu_to_le16(entry_len);
+	ale->name_length = attr->name_length;
+	ale->name_offset = offsetof(struct attr_list_entry, name);
+	if (attr->non_resident)
+		ale->lowest_vcn = attr->data.non_resident.lowest_vcn;
+	else
+		ale->lowest_vcn = 0;
+	ale->mft_reference = mref;
+	ale->instance = attr->instance;
+	memcpy(ale->name, (u8 *)attr + le16_to_cpu(attr->name_offset),
+			attr->name_length * sizeof(__le16));
+
+	/* Copy entries from old attribute list to new. */
+	memcpy(new_al, ni->attr_list, entry_offset);
+	memcpy(new_al + entry_offset + entry_len, ni->attr_list +
+			entry_offset, ni->attr_list_size - entry_offset);
+
+	/* Install the new attribute list. */
+	old_al = ni->attr_list;
+	ni->attr_list = new_al;
+	ni->attr_list_size = ni->attr_list_size + entry_len;
+
+	err = ntfs_attrlist_update(ni);
+	if (err) {
+		ni->attr_list = old_al;
+		ni->attr_list_size -= entry_len;
+		goto err_out;
+	}
+	ntfs_free(old_al);
+	return 0;
+err_out:
+	ntfs_free(new_al);
+	return err;
+}
+
+/**
+ * ntfs_attrlist_entry_rm - remove an attribute list attribute entry
+ * @ctx:	attribute search context describing the attribute list entry
+ *
+ * Remove the attribute list entry @ctx->al_entry from the attribute list.
+ */ +int ntfs_attrlist_entry_rm(struct ntfs_attr_search_ctx *ctx) +{ + u8 *new_al; + int new_al_len; + struct ntfs_inode *base_ni; + struct attr_list_entry *ale; + + if (!ctx || !ctx->ntfs_ino || !ctx->al_entry) { + ntfs_debug("Invalid arguments.\n"); + return -EINVAL; + } + + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + ale = ctx->al_entry; + + ntfs_debug("Entering for inode 0x%llx, attr 0x%x, lowest_vcn %lld.\n", + (long long)ctx->ntfs_ino->mft_no, + (unsigned int)le32_to_cpu(ctx->al_entry->type), + (long long)le64_to_cpu(ctx->al_entry->lowest_vcn)); + + if (!NInoAttrList(base_ni)) { + ntfs_debug("Attribute list isn't present.\n"); + return -ENOENT; + } + + /* Allocate memory for new attribute list. */ + new_al_len = base_ni->attr_list_size - le16_to_cpu(ale->length); + new_al = ntfs_malloc_nofs(new_al_len); + if (!new_al) + return -ENOMEM; + + /* Copy entries from old attribute list to new. */ + memcpy(new_al, base_ni->attr_list, (u8 *)ale - base_ni->attr_list); + memcpy(new_al + ((u8 *)ale - base_ni->attr_list), (u8 *)ale + le16_to_cpu( + ale->length), new_al_len - ((u8 *)ale - base_ni->attr_list)); + + /* Set new runlist. */ + ntfs_free(base_ni->attr_list); + base_ni->attr_list = new_al; + base_ni->attr_list_size = new_al_len; + + return ntfs_attrlist_update(base_ni); +} diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index 761aaa0195d6..f81a134107b0 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -1,14 +1,22 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * compress.c - NTFS kernel compressed attributes handling. - * Part of the Linux-NTFS project. +/** + * NTFS kernel compressed attributes handling. + * Part of the Linux-NTFS project. * * Copyright (c) 2001-2004 Anton Altaparmakov * Copyright (c) 2002 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. + * + * Part of this file is based on code from the NTFS-3G project. + * and is copyrighted by the respective authors below: + * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2004-2006 Szabolcs Szakacsits + * Copyright (c) 2005 Yura Pakhuchiy + * Copyright (c) 2009-2014 Jean-Pierre Andre + * Copyright (c) 2014 Eric Biggers */ #include -#include #include #include #include @@ -17,11 +25,15 @@ #include "inode.h" #include "debug.h" #include "ntfs.h" +#include "malloc.h" +#include "aops.h" +#include "lcnalloc.h" +#include "mft.h" /** - * ntfs_compression_constants - enum of constants used in the compression code + * enum of constants used in the compression code */ -typedef enum { +enum { /* Token types and access mask. */ NTFS_SYMBOL_TOKEN = 0, NTFS_PHRASE_TOKEN = 1, @@ -39,17 +51,17 @@ typedef enum { * initializing the compression buffer. 
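+	 *
+	 * (For orientation, a worked example of the sub-block header
+	 * format used below; the value is illustrative only: a raw le16
+	 * header of 0x3fff has NTFS_SB_IS_COMPRESSED clear and a size
+	 * field of 0x3fff & NTFS_SB_SIZE_MASK = 0xfff, so the sub-block
+	 * occupies 0xfff + 3 = 4098 bytes on disk: the 2-byte header
+	 * plus 4096 bytes of uncompressed data.)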
*/ NTFS_MAX_CB_SIZE = 64 * 1024, -} ntfs_compression_constants; +}; -/* +/** * ntfs_compression_buffer - one buffer for the decompression engine */ static u8 *ntfs_compression_buffer; -/* - * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer +/** + * ntfs_cb_lock - mutex lock which protects ntfs_compression_buffer */ -static DEFINE_SPINLOCK(ntfs_cb_lock); +static DEFINE_MUTEX(ntfs_cb_lock); /** * allocate_compression_buffers - allocate the decompression buffers @@ -60,7 +72,8 @@ static DEFINE_SPINLOCK(ntfs_cb_lock); */ int allocate_compression_buffers(void) { - BUG_ON(ntfs_compression_buffer); + if (ntfs_compression_buffer) + return 0; ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); if (!ntfs_compression_buffer) @@ -75,9 +88,15 @@ int allocate_compression_buffers(void) */ void free_compression_buffers(void) { - BUG_ON(!ntfs_compression_buffer); + mutex_lock(&ntfs_cb_lock); + if (!ntfs_compression_buffer) { + mutex_unlock(&ntfs_cb_lock); + return; + } + vfree(ntfs_compression_buffer); ntfs_compression_buffer = NULL; + mutex_unlock(&ntfs_cb_lock); } /** @@ -90,13 +109,12 @@ static void zero_partial_compressed_page(struct page *page, unsigned int kp_ofs; ntfs_debug("Zeroing page region outside initialized size."); - if (((s64)page->index << PAGE_SHIFT) >= initialized_size) { + if (((s64)page->__folio_index << PAGE_SHIFT) >= initialized_size) { clear_page(kp); return; } kp_ofs = initialized_size & ~PAGE_MASK; memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs); - return; } /** @@ -105,10 +123,9 @@ static void zero_partial_compressed_page(struct page *page, static inline void handle_bounds_compressed_page(struct page *page, const loff_t i_size, const s64 initialized_size) { - if ((page->index >= (initialized_size >> PAGE_SHIFT)) && + if ((page->__folio_index >= (initialized_size >> PAGE_SHIFT)) && (initialized_size < i_size)) zero_partial_compressed_page(page, initialized_size); - return; } /** @@ -161,18 +178,16 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], */ u8 *cb_end = cb_start + cb_size; /* End of cb. */ u8 *cb = cb_start; /* Current position in cb. */ - u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ /* Variables for uncompressed data / destination. */ struct page *dp; /* Current destination page being worked on. */ u8 *dp_addr; /* Current pointer into dp. */ u8 *dp_sb_start; /* Start of current sub-block in dp. */ - u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + - NTFS_SB_SIZE). */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + NTFS_SB_SIZE). */ u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ - u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + - NTFS_SB_SIZE). */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + NTFS_SB_SIZE). */ /* Variables for tag and token parsing. */ u8 tag; /* Current tag. */ @@ -192,7 +207,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], * position in the compression block is one byte before its end so the * first two checks do not detect it. */ - if (cb == cb_end || !le16_to_cpup((le16*)cb) || + if (cb == cb_end || !le16_to_cpup((__le16 *)cb) || (*dest_index == dest_max_index && *dest_ofs == dest_max_ofs)) { int i; @@ -201,7 +216,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], err = 0; return_error: /* We can sleep from now on, so we drop lock. 
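Taken together, the hunks above change the compression buffer's life cycle: allocation becomes idempotent and freeing is serialized against users, which is only safe because ntfs_cb_lock is now a sleepable mutex rather than a spinlock. A sketch of the resulting usage pattern (ntfs_get_compression_buffer() is hypothetical; the read path later in this file open-codes the same steps):

	static u8 *ntfs_get_compression_buffer(void)
	{
		mutex_lock(&ntfs_cb_lock);
		if (!ntfs_compression_buffer)
			ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
		if (!ntfs_compression_buffer) {
			mutex_unlock(&ntfs_cb_lock);
			return NULL;	/* caller maps this to -ENOMEM */
		}
		/*
		 * The caller keeps holding ntfs_cb_lock until it is done
		 * with the buffer, and may now sleep while holding it.
		 */
		return ntfs_compression_buffer;
	}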
*/ - spin_unlock(&ntfs_cb_lock); + mutex_unlock(&ntfs_cb_lock); /* Second stage: finalize completed pages. */ if (nr_completed_pages > 0) { for (i = 0; i < nr_completed_pages; i++) { @@ -215,7 +230,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], handle_bounds_compressed_page(dp, i_size, initialized_size); flush_dcache_page(dp); - kunmap(dp); + kunmap_local(page_address(dp)); SetPageUptodate(dp); unlock_page(dp); if (di == xpage) @@ -242,7 +257,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], /* Setup the current sub-block source pointers and validate range. */ cb_sb_start = cb; - cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + cb_sb_end = cb_sb_start + (le16_to_cpup((__le16 *)cb) & NTFS_SB_SIZE_MASK) + 3; if (cb_sb_end > cb_end) goto return_overflow; @@ -261,10 +276,10 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], } /* We have a valid destination page. Setup the destination pointers. */ - dp_addr = (u8*)page_address(dp) + do_sb_start; + dp_addr = (u8 *)page_address(dp) + do_sb_start; /* Now, we are ready to process the current sub-block (sb). */ - if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + if (!(le16_to_cpup((__le16 *)cb) & NTFS_SB_IS_COMPRESSED)) { ntfs_debug("Found uncompressed sub-block."); /* This sb is not compressed, just copy it into destination. */ @@ -281,7 +296,8 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], /* Advance destination position to next sub-block. */ *dest_ofs += NTFS_SB_SIZE; - if (!(*dest_ofs &= ~PAGE_MASK)) { + *dest_ofs &= ~PAGE_MASK; + if (!(*dest_ofs)) { finalize_page: /* * First stage: add current page index to array of @@ -308,14 +324,14 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], if (dp_addr < dp_sb_end) { int nr_bytes = do_sb_end - *dest_ofs; - ntfs_debug("Filling incomplete sub-block with " - "zeroes."); + ntfs_debug("Filling incomplete sub-block with zeroes."); /* Zero remainder and update destination position. */ memset(dp_addr, 0, nr_bytes); *dest_ofs += nr_bytes; } /* We have finished the current sub-block. */ - if (!(*dest_ofs &= ~PAGE_MASK)) + *dest_ofs &= ~PAGE_MASK; + if (!(*dest_ofs)) goto finalize_page; goto do_next_sb; } @@ -329,8 +345,8 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], /* Parse the eight tokens described by the tag. */ for (token = 0; token < 8; token++, tag >>= 1) { - u16 lg, pt, length, max_non_overlap; register u16 i; + u16 lg, pt, length, max_non_overlap; u8 *dp_back_addr; /* Check if we are done / still in range. */ @@ -369,7 +385,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], lg++; /* Get the phrase token into i. */ - pt = le16_to_cpup((le16*)cb); + pt = le16_to_cpup((__le16 *)cb); /* * Calculate starting position of the byte sequence in @@ -426,7 +442,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], /** * ntfs_read_compressed_block - read a compressed block into the page cache - * @page: locked page in the compression block(s) we need to read + * @folio: locked folio in the compression block(s) we need to read * * When we are called the page has already been verified to be locked and the * attribute is known to be non-resident, not encrypted, but compressed. 
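The sub-block geometry used throughout this function: each 4096-byte sub-block is stored with a 2-byte little-endian header whose low 12 bits hold the stored payload size minus one and whose 0x8000 bit flags compression. Restated as a helper (a sketch; NTFS_SB_SIZE_MASK and NTFS_SB_IS_COMPRESSED are the driver's existing constants):

	static inline unsigned int ntfs_sb_packed_len(const u8 *hdr, bool *is_compressed)
	{
		u16 v = le16_to_cpup((const __le16 *)hdr);

		*is_compressed = !!(v & NTFS_SB_IS_COMPRESSED);
		/* (size - 1) in the low 12 bits, plus the 2 header bytes + 1. */
		return (v & NTFS_SB_SIZE_MASK) + 3;
	}

This matches the cb_sb_end computation above: the masked header value plus 3 is the offset of the next sub-block.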
@@ -441,86 +457,65 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], * Warning: We have to be careful what we do about existing pages. They might * have been written to so that we would lose data if we were to just overwrite * them with the out-of-date uncompressed data. - * - * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at - * the end of the file I think. We need to detect this case and zero the out - * of bounds remainder of the page in question and mark it as handled. At the - * moment we would just return -EIO on such a page. This bug will only become - * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte - * clusters so is probably not going to be seen by anyone. Still this should - * be fixed. (AIA) - * - * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in - * handling sparse and compressed cbs. (AIA) - * - * FIXME: At the moment we don't do any zeroing out in the case that - * initialized_size is less than data_size. This should be safe because of the - * nature of the compression algorithm used. Just in case we check and output - * an error message in read inode if the two sizes are not equal for a - * compressed file. (AIA) */ -int ntfs_read_compressed_block(struct page *page) +int ntfs_read_compressed_block(struct folio *folio) { + struct page *page = &folio->page; loff_t i_size; s64 initialized_size; struct address_space *mapping = page->mapping; - ntfs_inode *ni = NTFS_I(mapping->host); - ntfs_volume *vol = ni->vol; + struct ntfs_inode *ni = NTFS_I(mapping->host); + struct ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; - runlist_element *rl; - unsigned long flags, block_size = sb->s_blocksize; - unsigned char block_size_bits = sb->s_blocksize_bits; + struct runlist_element *rl; + unsigned long flags; u8 *cb, *cb_pos, *cb_end; - struct buffer_head **bhs; - unsigned long offset, index = page->index; + unsigned long offset, index = page->__folio_index; u32 cb_size = ni->itype.compressed.block_size; u64 cb_size_mask = cb_size - 1UL; - VCN vcn; - LCN lcn; + s64 vcn; + s64 lcn; /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ - VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> + s64 start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> vol->cluster_size_bits; /* * The first vcn after the last wanted vcn (minimum alignment is again * PAGE_SIZE. */ - VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) + s64 end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) & ~cb_size_mask) >> vol->cluster_size_bits; /* Number of compression blocks (cbs) in the wanted vcn range. */ - unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits - >> ni->itype.compressed.block_size_bits; + unsigned int nr_cbs = NTFS_CLU_TO_B(vol, end_vcn - start_vcn) >> + ni->itype.compressed.block_size_bits; /* * Number of pages required to store the uncompressed data from all * compression blocks (cbs) overlapping @page. Due to alignment * guarantees of start_vcn and end_vcn, no need to round up here. 
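The rewrite below replaces open-coded shifts with NTFS_CLU_TO_*/NTFS_B_TO_* helpers that are defined elsewhere in this series. Their assumed semantics, inferred from the expressions they replace (illustrative definitions only):

	#define NTFS_CLU_TO_B(vol, clu)	   ((s64)(clu) << (vol)->cluster_size_bits)
	#define NTFS_B_TO_CLU(vol, b)	   ((s64)(b) >> (vol)->cluster_size_bits)
	#define NTFS_CLU_TO_PIDX(vol, clu) ((pgoff_t)(NTFS_CLU_TO_B(vol, clu) >> PAGE_SHIFT))
	#define NTFS_CLU_TO_POFS(vol, clu) ((unsigned int)(NTFS_CLU_TO_B(vol, clu) & ~PAGE_MASK))
	#define NTFS_B_TO_SECTOR(vol, b)   ((sector_t)((b) >> SECTOR_SHIFT))

NTFS_B_TO_SECTOR is assumed to keep the vol argument for symmetry even though a byte-to-512-byte-sector conversion does not need it.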
*/ - unsigned int nr_pages = (end_vcn - start_vcn) << - vol->cluster_size_bits >> PAGE_SHIFT; - unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int nr_pages = NTFS_CLU_TO_PIDX(vol, end_vcn - start_vcn); + unsigned int xpage, max_page, cur_page, cur_ofs, i, page_ofs, page_index; unsigned int cb_clusters, cb_max_ofs; - int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + int cb_max_page, err = 0; struct page **pages; int *completed_pages; unsigned char xpage_done = 0; + struct page *lpage; - ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " - "%i.", index, cb_size, nr_pages); + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = %i.", + index, cb_size, nr_pages); /* * Bad things happen if we get here for anything that is not an * unnamed $DATA attribute. */ - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); + if (ni->type != AT_DATA || ni->name_len) { + unlock_page(page); + return -EIO; + } pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); - /* Allocate memory to store the buffer heads we need. */ - bhs_size = cb_size / block_size * sizeof(struct buffer_head *); - bhs = kmalloc(bhs_size, GFP_NOFS); - - if (unlikely(!pages || !bhs || !completed_pages)) { - kfree(bhs); + if (unlikely(!pages || !completed_pages)) { kfree(pages); kfree(completed_pages); unlock_page(page); @@ -532,7 +527,7 @@ int ntfs_read_compressed_block(struct page *page) * We have already been given one page, this is the one we must do. * Once again, the alignment guarantees keep it simple. */ - offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; + offset = NTFS_CLU_TO_PIDX(vol, start_vcn); xpage = index - offset; pages[xpage] = page; /* @@ -547,10 +542,9 @@ int ntfs_read_compressed_block(struct page *page) offset; /* Is the page fully outside i_size? (truncate in progress) */ if (xpage >= max_page) { - kfree(bhs); kfree(pages); kfree(completed_pages); - zero_user(page, 0, PAGE_SIZE); + zero_user_segments(page, 0, PAGE_SIZE, 0, 0); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); @@ -558,6 +552,7 @@ int ntfs_read_compressed_block(struct page *page) } if (nr_pages < max_page) max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { if (i != xpage) pages[i] = grab_cache_page_nowait(mapping, offset); @@ -568,10 +563,8 @@ int ntfs_read_compressed_block(struct page *page) * in and/or dirty or we would be losing data or at * least wasting our time. */ - if (!PageDirty(page) && (!PageUptodate(page) || - PageError(page))) { - ClearPageError(page); - kmap(page); + if (!PageDirty(page) && (!PageUptodate(page))) { + kmap_local_page(page); continue; } unlock_page(page); @@ -589,9 +582,19 @@ int ntfs_read_compressed_block(struct page *page) cb_clusters = ni->itype.compressed.block_clusters; do_next_cb: nr_cbs--; - nr_bhs = 0; - /* Read all cb buffer heads one cluster at a time. 
*/ + mutex_lock(&ntfs_cb_lock); + if (!ntfs_compression_buffer) + if (allocate_compression_buffers()) { + mutex_unlock(&ntfs_cb_lock); + goto err_out; + } + + + cb = ntfs_compression_buffer; + cb_pos = cb; + cb_end = cb + cb_size; + rl = NULL; for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; vcn++) { @@ -619,8 +622,10 @@ int ntfs_read_compressed_block(struct page *page) */ if (lcn == LCN_HOLE) break; - if (is_retry || lcn != LCN_RL_NOT_MAPPED) + if (is_retry || lcn != LCN_RL_NOT_MAPPED) { + mutex_unlock(&ntfs_cb_lock); goto rl_err; + } is_retry = true; /* * Attempt to map runlist, dropping lock for the @@ -629,88 +634,36 @@ int ntfs_read_compressed_block(struct page *page) up_read(&ni->runlist.lock); if (!ntfs_map_runlist(ni, vcn)) goto lock_retry_remap; + mutex_unlock(&ntfs_cb_lock); goto map_rl_err; } - block = lcn << vol->cluster_size_bits >> block_size_bits; - /* Read the lcn from device in chunks of block_size bytes. */ - max_block = block + (vol->cluster_size >> block_size_bits); - do { - ntfs_debug("block = 0x%x.", block); - if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) - goto getblk_err; - nr_bhs++; - } while (++block < max_block); - } - /* Release the lock if we took it. */ - if (rl) - up_read(&ni->runlist.lock); - - /* Setup and initiate io on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; + page_ofs = NTFS_CLU_TO_POFS(vol, lcn); + page_index = NTFS_CLU_TO_PIDX(vol, lcn); - if (!trylock_buffer(tbh)) - continue; - if (unlikely(buffer_uptodate(tbh))) { - unlock_buffer(tbh); - continue; + lpage = read_mapping_page(sb->s_bdev->bd_mapping, + page_index, NULL); + if (IS_ERR(lpage)) { + err = PTR_ERR(lpage); + mutex_unlock(&ntfs_cb_lock); + goto read_err; } - get_bh(tbh); - tbh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, tbh); - } - - /* Wait for io completion on all buffer heads. */ - for (i = 0; i < nr_bhs; i++) { - struct buffer_head *tbh = bhs[i]; - if (buffer_uptodate(tbh)) - continue; - wait_on_buffer(tbh); - /* - * We need an optimization barrier here, otherwise we start - * hitting the below fixup code when accessing a loopback - * mounted ntfs partition. This indicates either there is a - * race condition in the loop driver or, more likely, gcc - * overoptimises the code without the barrier and it doesn't - * do the Right Thing(TM). - */ - barrier(); - if (unlikely(!buffer_uptodate(tbh))) { - ntfs_warning(vol->sb, "Buffer is unlocked but not " - "uptodate! Unplugging the disk queue " - "and rescheduling."); - get_bh(tbh); - io_schedule(); - put_bh(tbh); - if (unlikely(!buffer_uptodate(tbh))) - goto read_err; - ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); - } + lock_page(lpage); + memcpy(cb_pos, page_address(lpage) + page_ofs, + vol->cluster_size); + unlock_page(lpage); + put_page(lpage); + cb_pos += vol->cluster_size; } - /* - * Get the compression buffer. We must not sleep any more - * until we are finished with it. - */ - spin_lock(&ntfs_cb_lock); - cb = ntfs_compression_buffer; - - BUG_ON(!cb); - - cb_pos = cb; - cb_end = cb + cb_size; - - /* Copy the buffer heads into the contiguous buffer. */ - for (i = 0; i < nr_bhs; i++) { - memcpy(cb_pos, bhs[i]->b_data, block_size); - cb_pos += block_size; - } + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); /* Just a precaution. */ if (cb_pos + 2 <= cb + cb_size) - *(u16*)cb_pos = 0; + *(u16 *)cb_pos = 0; /* Reset cb_pos back to the beginning. 
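The remap retry in the loop above, restated as a standalone pattern (a sketch, not the patch's code: the real loop keeps the runlist lock across the whole compression block, while this helper drops it before returning):

	static s64 vcn_to_lcn_with_remap(struct ntfs_inode *ni, s64 vcn)
	{
		bool is_retry = false;
		s64 lcn;

	retry:
		down_read(&ni->runlist.lock);
		lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
		if (lcn == LCN_RL_NOT_MAPPED && !is_retry) {
			is_retry = true;
			up_read(&ni->runlist.lock);
			/* ntfs_map_runlist() returns 0 on success. */
			if (!ntfs_map_runlist(ni, vcn))
				goto retry;
			return LCN_ENOENT;
		}
		up_read(&ni->runlist.lock);
		return lcn;
	}

Retrying at most once is what keeps the unmapped case from looping forever on a corrupt runlist.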
*/ cb_pos = cb; @@ -731,7 +684,7 @@ int ntfs_read_compressed_block(struct page *page) /* Sparse cb, zero out page range overlapping the cb. */ ntfs_debug("Found sparse compression block."); /* We can sleep from now on, so we drop lock. */ - spin_unlock(&ntfs_cb_lock); + mutex_unlock(&ntfs_cb_lock); if (cb_max_ofs) cb_max_page--; for (; cur_page < cb_max_page; cur_page++) { @@ -744,7 +697,7 @@ int ntfs_read_compressed_block(struct page *page) PAGE_SIZE - cur_ofs); flush_dcache_page(page); - kunmap(page); + kunmap_local(page_address(page)); SetPageUptodate(page); unlock_page(page); if (cur_page == xpage) @@ -778,16 +731,6 @@ int ntfs_read_compressed_block(struct page *page) ntfs_debug("Found uncompressed compression block."); /* Uncompressed cb, copy it to the destination pages. */ - /* - * TODO: As a big optimization, we could detect this case - * before we read all the pages and use block_read_full_folio() - * on all full pages instead (we still have to treat partial - * pages especially but at least we are getting rid of the - * synchronous io for the majority of pages. - * Or if we choose not to do the read-ahead/-behind stuff, we - * could just return block_read_full_folio(pages[xpage]) as long - * as PAGE_SIZE <= cb_size. - */ if (cb_max_ofs) cb_max_page--; /* First stage: copy data into destination pages. */ @@ -811,7 +754,7 @@ int ntfs_read_compressed_block(struct page *page) cur_ofs = cb_max_ofs; } /* We can sleep from now on, so drop lock. */ - spin_unlock(&ntfs_cb_lock); + mutex_unlock(&ntfs_cb_lock); /* Second stage: finalize pages. */ for (; cur2_page < cb_max_page; cur2_page++) { page = pages[cur2_page]; @@ -823,7 +766,7 @@ int ntfs_read_compressed_block(struct page *page) handle_bounds_compressed_page(page, i_size, initialized_size); flush_dcache_page(page); - kunmap(page); + kunmap_local(page_address(page)); SetPageUptodate(page); unlock_page(page); if (cur2_page == xpage) @@ -851,16 +794,15 @@ int ntfs_read_compressed_block(struct page *page) * ntfs_decompress(). */ if (err) { - ntfs_error(vol->sb, "ntfs_decompress() failed in inode " - "0x%lx with error code %i. Skipping " - "this compression block.", - ni->mft_no, -err); + ntfs_error(vol->sb, + "ntfs_decompress() failed in inode 0x%lx with error code %i. Skipping this compression block.", + ni->mft_no, -err); /* Release the unfinished pages. */ for (; prev_cur_page < cur_page; prev_cur_page++) { page = pages[prev_cur_page]; if (page) { flush_dcache_page(page); - kunmap(page); + kunmap_local(page_address(page)); unlock_page(page); if (prev_cur_page != xpage) put_page(page); @@ -870,27 +812,19 @@ int ntfs_read_compressed_block(struct page *page) } } - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - /* Do we have more work to do? */ if (nr_cbs) goto do_next_cb; - /* We no longer need the list of buffer heads. */ - kfree(bhs); - /* Clean up if we have any pages left. Should never happen. */ for (cur_page = 0; cur_page < max_page; cur_page++) { page = pages[cur_page]; if (page) { - ntfs_error(vol->sb, "Still have pages left! " - "Terminating them with extreme " - "prejudice. Inode 0x%lx, page index " - "0x%lx.", ni->mft_no, page->index); + ntfs_error(vol->sb, + "Still have pages left! Terminating them with extreme prejudice. 
Inode 0x%lx, page index 0x%lx.", + ni->mft_no, page->__folio_index); flush_dcache_page(page); - kunmap(page); + kunmap_local(page_address(page)); unlock_page(page); if (cur_page != xpage) put_page(page); @@ -910,35 +844,25 @@ int ntfs_read_compressed_block(struct page *page) "EOVERFLOW" : (!err ? "EIO" : "unknown error")); return err < 0 ? err : -EIO; -read_err: - ntfs_error(vol->sb, "IO error while reading compressed data."); - /* Release the buffer heads. */ - for (i = 0; i < nr_bhs; i++) - brelse(bhs[i]); - goto err_out; - map_rl_err: - ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " - "compression block."); + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read compression block."); goto err_out; rl_err: up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " - "compression block."); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read compression block."); goto err_out; -getblk_err: +read_err: up_read(&ni->runlist.lock); - ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + ntfs_error(vol->sb, "IO error while reading compressed data."); err_out: - kfree(bhs); for (i = cur_page; i < max_page; i++) { page = pages[i]; if (page) { flush_dcache_page(page); - kunmap(page); + kunmap_local(page_address(page)); unlock_page(page); if (i != xpage) put_page(page); @@ -948,3 +872,688 @@ int ntfs_read_compressed_block(struct page *page) kfree(completed_pages); return -EIO; } + +/* + * Match length at or above which ntfs_best_match() will stop searching for + * longer matches. + */ +#define NICE_MATCH_LEN 18 + +/* + * Maximum number of potential matches that ntfs_best_match() will consider at + * each position. + */ +#define MAX_SEARCH_DEPTH 24 + +/* log base 2 of the number of entries in the hash table for match-finding. */ +#define HASH_SHIFT 14 + +/* Constant for the multiplicative hash function. */ +#define HASH_MULTIPLIER 0x1E35A7BD + +struct COMPRESS_CONTEXT { + const unsigned char *inbuf; + int bufsize; + int size; + int rel; + int mxsz; + s16 head[1 << HASH_SHIFT]; + s16 prev[NTFS_SB_SIZE]; +}; + +/* + * Hash the next 3-byte sequence in the input buffer + */ +static inline unsigned int ntfs_hash(const u8 *p) +{ + u32 str; + u32 hash; + + /* + * Unaligned access allowed, and little endian CPU. + * Callers ensure that at least 4 (not 3) bytes are remaining. + */ + str = *(const u32 *)p & 0xFFFFFF; + hash = str * HASH_MULTIPLIER; + + /* High bits are more random than the low bits. */ + return hash >> (32 - HASH_SHIFT); +} + +/* + * Search for the longest sequence matching current position + * + * A hash table, each entry of which points to a chain of sequence + * positions sharing the corresponding hash code, is maintained to speed up + * searching for matches. To maintain the hash table, either + * ntfs_best_match() or ntfs_skip_position() has to be called for each + * consecutive position. + * + * This function is heavily used; it has to be optimized carefully. + * + * This function sets pctx->size and pctx->rel to the length and offset, + * respectively, of the longest match found. + * + * The minimum match length is assumed to be 3, and the maximum match + * length is assumed to be pctx->mxsz. If this function produces + * pctx->size < 3, then no match was found. 
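A worked example of the hash just defined (assuming, as the comment above states, a little-endian CPU and permitted unaligned loads): for the bytes "abc" the 4-byte load masked to 24 bits yields 0x00636261, and the chain index is the top HASH_SHIFT bits of the 32-bit product.

	static unsigned int ntfs_hash_example(void)
	{
		const u8 buf[4] = { 'a', 'b', 'c', 'd' };	/* 4 readable bytes */
		u32 str = *(const u32 *)buf & 0xFFFFFF;		/* 0x00636261 */
		u32 hash = str * HASH_MULTIPLIER;		/* truncated mod 2^32 */

		return hash >> (32 - HASH_SHIFT);		/* top 14 bits */
	}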
+ * + * Note: for the following reasons, this function is not guaranteed to find + * *the* longest match up to pctx->mxsz: + * + * (1) If this function finds a match of NICE_MATCH_LEN bytes or greater, + * it ends early because a match this long is good enough and it's not + * worth spending more time searching. + * + * (2) If this function considers MAX_SEARCH_DEPTH matches with a single + * position, it ends early and returns the longest match found so far. + * This saves a lot of time on degenerate inputs. + */ +static void ntfs_best_match(struct COMPRESS_CONTEXT *pctx, const int i, + int best_len) +{ + const u8 * const inbuf = pctx->inbuf; + const u8 * const strptr = &inbuf[i]; /* String we're matching against */ + s16 * const prev = pctx->prev; + const int max_len = min(pctx->bufsize - i, pctx->mxsz); + const int nice_len = min(NICE_MATCH_LEN, max_len); + int depth_remaining = MAX_SEARCH_DEPTH; + const u8 *best_matchptr = strptr; + unsigned int hash; + s16 cur_match; + const u8 *matchptr; + int len; + + if (max_len < 4) + goto out; + + /* Insert the current sequence into the appropriate hash chain. */ + hash = ntfs_hash(strptr); + cur_match = pctx->head[hash]; + prev[i] = cur_match; + pctx->head[hash] = i; + + if (best_len >= max_len) { + /* + * Lazy match is being attempted, but there aren't enough length + * bits remaining to code a longer match. + */ + goto out; + } + + /* Search the appropriate hash chain for matches. */ + + for (; cur_match >= 0 && depth_remaining--; cur_match = prev[cur_match]) { + matchptr = &inbuf[cur_match]; + + /* + * Considering the potential match at 'matchptr': is it longer + * than 'best_len'? + * + * The bytes at index 'best_len' are the most likely to differ, + * so check them first. + * + * The bytes at indices 'best_len - 1' and '0' are less + * important to check separately. But doing so still gives a + * slight performance improvement, at least on x86_64, probably + * because they create separate branches for the CPU to predict + * independently of the branches in the main comparison loops. + */ + if (matchptr[best_len] != strptr[best_len] || + matchptr[best_len - 1] != strptr[best_len - 1] || + matchptr[0] != strptr[0]) + goto next_match; + + for (len = 1; len < best_len - 1; len++) + if (matchptr[len] != strptr[len]) + goto next_match; + + /* + * The match is the longest found so far --- + * at least 'best_len' + 1 bytes. Continue extending it. + */ + + best_matchptr = matchptr; + + do { + if (++best_len >= nice_len) { + /* + * 'nice_len' reached; don't waste time + * searching for longer matches. Extend the + * match as far as possible and terminate the + * search. + */ + while (best_len < max_len && + (best_matchptr[best_len] == + strptr[best_len])) + best_len++; + goto out; + } + } while (best_matchptr[best_len] == strptr[best_len]); + + /* Found a longer match, but 'nice_len' not yet reached. */ + +next_match: + /* Continue to next match in the chain. */ + ; + } + + /* + * Reached end of chain, or ended early due to reaching the maximum + * search depth. + */ + +out: + /* Return the longest match we were able to find. */ + pctx->size = best_len; + pctx->rel = best_matchptr - strptr; /* given as a negative number! */ +} + +/* + * Advance the match-finder, but don't search for matches. + */ +static void ntfs_skip_position(struct COMPRESS_CONTEXT *pctx, const int i) +{ + unsigned int hash; + + if (pctx->bufsize - i < 4) + return; + + /* Insert the current sequence into the appropriate hash chain. 
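For clarity, the head[]/prev[] pair used above forms one singly linked list per hash bucket, threaded through buffer positions: head[h] is the most recent position whose 3-byte sequence hashes to h, and prev[i] links each position to the previous one in the same bucket. A minimal traversal sketch (walk_chain() is hypothetical):

	static int walk_chain(const struct COMPRESS_CONTEXT *pctx, unsigned int hash)
	{
		int depth = MAX_SEARCH_DEPTH;
		int visited = 0;
		s16 pos;

		for (pos = pctx->head[hash]; pos >= 0 && depth--; pos = pctx->prev[pos])
			visited++;	/* ntfs_best_match() compares inbuf[pos..] here */
		return visited;
	}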
*/ + hash = ntfs_hash(pctx->inbuf + i); + pctx->prev[i] = pctx->head[hash]; + pctx->head[hash] = i; +} + +/* + * Compress a 4096-byte block + * + * Returns a header of two bytes followed by the compressed data. + * If compression is not effective, the header and an uncompressed + * block is returned. + * + * Note : two bytes may be output before output buffer overflow + * is detected, so a 4100-bytes output buffer must be reserved. + * + * Returns the size of the compressed block, including the + * header (minimal size is 2, maximum size is 4098) + * 0 if an error has been met. + */ +static unsigned int ntfs_compress_block(const char *inbuf, const int bufsize, + char *outbuf) +{ + struct COMPRESS_CONTEXT *pctx; + int i; /* current position */ + int j; /* end of best match from current position */ + int k; /* end of best match from next position */ + int offs; /* offset to best match */ + int bp; /* bits to store offset */ + int bp_cur; /* saved bits to store offset at current position */ + int mxoff; /* max match offset : 1 << bp */ + unsigned int xout; + unsigned int q; /* aggregated offset and size */ + int have_match; /* do we have a match at the current position? */ + char *ptag; /* location reserved for a tag */ + int tag; /* current value of tag */ + int ntag; /* count of bits still undefined in tag */ + + pctx = ntfs_malloc_nofs(sizeof(struct COMPRESS_CONTEXT)); + if (!pctx) + return -ENOMEM; + + /* + * All hash chains start as empty. The special value '-1' indicates the + * end of each hash chain. + */ + memset(pctx->head, 0xFF, sizeof(pctx->head)); + + pctx->inbuf = (const unsigned char *)inbuf; + pctx->bufsize = bufsize; + xout = 2; + i = 0; + bp = 4; + mxoff = 1 << bp; + pctx->mxsz = (1 << (16 - bp)) + 2; + have_match = 0; + tag = 0; + ntag = 8; + ptag = &outbuf[xout++]; + + while ((i < bufsize) && (xout < (NTFS_SB_SIZE + 2))) { + + /* + * This implementation uses "lazy" parsing: it always chooses + * the longest match, unless the match at the next position is + * longer. This is the same strategy used by the high + * compression modes of zlib. + */ + if (!have_match) { + /* + * Find the longest match at the current position. But + * first adjust the maximum match length if needed. + * (This loop might need to run more than one time in + * the case that we just output a long match.) + */ + while (mxoff < i) { + bp++; + mxoff <<= 1; + pctx->mxsz = (pctx->mxsz + 2) >> 1; + } + ntfs_best_match(pctx, i, 2); + } + + if (pctx->size >= 3) { + /* Found a match at the current position. */ + j = i + pctx->size; + bp_cur = bp; + offs = pctx->rel; + + if (pctx->size >= NICE_MATCH_LEN) { + /* Choose long matches immediately. */ + q = (~offs << (16 - bp_cur)) + (j - i - 3); + outbuf[xout++] = q & 255; + outbuf[xout++] = (q >> 8) & 255; + tag |= (1 << (8 - ntag)); + + if (j == bufsize) { + /* + * Shortcut if the match extends to the + * end of the buffer. + */ + i = j; + --ntag; + break; + } + i += 1; + do { + ntfs_skip_position(pctx, i); + } while (++i != j); + have_match = 0; + } else { + /* + * Check for a longer match at the next + * position. + */ + + /* + * Doesn't need to be while() since we just + * adjusted the maximum match length at the + * previous position. + */ + if (mxoff < i + 1) { + bp++; + mxoff <<= 1; + pctx->mxsz = (pctx->mxsz + 2) >> 1; + } + ntfs_best_match(pctx, i + 1, pctx->size); + k = i + 1 + pctx->size; + + if (k > (j + 1)) { + /* + * Next match is longer. + * Output a literal. 
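The 16-bit value q built above is the phrase token: with bp bits reserved for the offset, the high bp bits store distance - 1 (hence the ~offs, since offs is negative) and the low 16 - bp bits store length - 3. An encode/decode pair for illustration; ntfs_decompress() performs the same decode with bp expressed as 4 + lg:

	static u16 phrase_encode(unsigned int distance, unsigned int length, int bp)
	{
		return (u16)(((distance - 1) << (16 - bp)) | (length - 3));
	}

	static void phrase_decode(u16 pt, int bp, unsigned int *distance,
				  unsigned int *length)
	{
		*distance = (pt >> (16 - bp)) + 1;
		*length = (pt & ((1U << (16 - bp)) - 1)) + 3;
	}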
+ */ + outbuf[xout++] = inbuf[i++]; + have_match = 1; + } else { + /* + * Next match isn't longer. + * Output the current match. + */ + q = (~offs << (16 - bp_cur)) + + (j - i - 3); + outbuf[xout++] = q & 255; + outbuf[xout++] = (q >> 8) & 255; + tag |= (1 << (8 - ntag)); + + /* + * The minimum match length is 3, and + * we've run two bytes through the + * matchfinder already. So the minimum + * number of positions we need to skip + * is 1. + */ + i += 2; + do { + ntfs_skip_position(pctx, i); + } while (++i != j); + have_match = 0; + } + } + } else { + /* No match at current position. Output a literal. */ + outbuf[xout++] = inbuf[i++]; + have_match = 0; + } + + /* Store the tag if fully used. */ + if (!--ntag) { + *ptag = tag; + ntag = 8; + ptag = &outbuf[xout++]; + tag = 0; + } + } + + /* Store the last tag if partially used. */ + if (ntag == 8) + xout--; + else + *ptag = tag; + + /* Determine whether to store the data compressed or uncompressed. */ + if ((i >= bufsize) && (xout < (NTFS_SB_SIZE + 2))) { + /* Compressed. */ + outbuf[0] = (xout - 3) & 255; + outbuf[1] = 0xb0 + (((xout - 3) >> 8) & 15); + } else { + /* Uncompressed. */ + memcpy(&outbuf[2], inbuf, bufsize); + if (bufsize < NTFS_SB_SIZE) + memset(&outbuf[bufsize + 2], 0, NTFS_SB_SIZE - bufsize); + outbuf[0] = 0xff; + outbuf[1] = 0x3f; + xout = NTFS_SB_SIZE + 2; + } + + /* + * Free the compression context and return the total number of bytes + * written to 'outbuf'. + */ + ntfs_free(pctx); + return xout; +} + +static int ntfs_write_cb(struct ntfs_inode *ni, loff_t pos, struct page **pages, + int pages_per_cb) +{ + struct ntfs_volume *vol = ni->vol; + char *outbuf = NULL, *pbuf, *inbuf; + u32 compsz, p, insz = pages_per_cb << PAGE_SHIFT; + s32 rounded, bio_size; + unsigned int sz, bsz; + bool fail = false, allzeroes; + /* a single compressed zero */ + static char onezero[] = {0x01, 0xb0, 0x00, 0x00}; + /* a couple of compressed zeroes */ + static char twozeroes[] = {0x02, 0xb0, 0x00, 0x00, 0x00}; + /* more compressed zeroes, to be followed by some count */ + static char morezeroes[] = {0x03, 0xb0, 0x02, 0x00}; + struct page **pages_disk = NULL, *pg; + s64 bio_lcn; + struct runlist_element *rlc, *rl; + int i, err; + int pages_count = (round_up(ni->itype.compressed.block_size + 2 * + (ni->itype.compressed.block_size / NTFS_SB_SIZE) + 2, PAGE_SIZE)) / PAGE_SIZE; + size_t new_rl_count; + struct bio *bio = NULL; + loff_t new_length; + s64 new_vcn; + + inbuf = vmap(pages, pages_per_cb, VM_MAP, PAGE_KERNEL_RO); + if (!inbuf) + return -ENOMEM; + + /* may need 2 extra bytes per block and 2 more bytes */ + pages_disk = kcalloc(pages_count, sizeof(struct page *), GFP_NOFS); + if (!pages_disk) { + vunmap(inbuf); + return -ENOMEM; + } + + for (i = 0; i < pages_count; i++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) { + err = -ENOMEM; + goto out; + } + pages_disk[i] = pg; + lock_page(pg); + kmap_local_page(pg); + } + + outbuf = vmap(pages_disk, pages_count, VM_MAP, PAGE_KERNEL); + if (!outbuf) { + err = -ENOMEM; + goto out; + } + + compsz = 0; + allzeroes = true; + for (p = 0; (p < insz) && !fail; p += NTFS_SB_SIZE) { + if ((p + NTFS_SB_SIZE) < insz) + bsz = NTFS_SB_SIZE; + else + bsz = insz - p; + pbuf = &outbuf[compsz]; + sz = ntfs_compress_block(&inbuf[p], bsz, pbuf); + /* fail if all the clusters (or more) are needed */ + if (!sz || ((compsz + sz + vol->cluster_size + 2) > + ni->itype.compressed.block_size)) + fail = true; + else { + if (allzeroes) { + /* check whether this is all zeroes */ + switch (sz) { + case 4: + allzeroes = 
!memcmp(pbuf, onezero, 4); + break; + case 5: + allzeroes = !memcmp(pbuf, twozeroes, 5); + break; + case 6: + allzeroes = !memcmp(pbuf, morezeroes, 4); + break; + default: + allzeroes = false; + break; + } + } + compsz += sz; + } + } + + if (!fail && !allzeroes) { + outbuf[compsz++] = 0; + outbuf[compsz++] = 0; + rounded = ((compsz - 1) | (vol->cluster_size - 1)) + 1; + memset(&outbuf[compsz], 0, rounded - compsz); + bio_size = rounded; + pages = pages_disk; + } else if (allzeroes) { + err = 0; + goto out; + } else { + bio_size = insz; + } + + new_vcn = NTFS_B_TO_CLU(vol, pos & ~(ni->itype.compressed.block_size - 1)); + new_length = NTFS_B_TO_CLU(vol, round_up(bio_size, vol->cluster_size)); + + err = ntfs_non_resident_attr_punch_hole(ni, new_vcn, ni->itype.compressed.block_clusters); + if (err < 0) + goto out; + + rlc = ntfs_cluster_alloc(vol, new_vcn, new_length, -1, DATA_ZONE, + false, true, true); + if (IS_ERR(rlc)) { + err = PTR_ERR(rlc); + goto out; + } + + bio_lcn = rlc->lcn; + down_write(&ni->runlist.lock); + rl = ntfs_runlists_merge(&ni->runlist, rlc, 0, &new_rl_count); + if (IS_ERR(rl)) { + up_write(&ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists"); + err = PTR_ERR(rl); + if (ntfs_cluster_free_from_rl(vol, rlc)) + ntfs_error(vol->sb, "Failed to free hot clusters."); + ntfs_free(rlc); + goto out; + } + + ni->runlist.count = new_rl_count; + ni->runlist.rl = rl; + + err = ntfs_attr_update_mapping_pairs(ni, 0); + up_write(&ni->runlist.lock); + if (err) { + err = -EIO; + goto out; + } + + i = 0; + while (bio_size > 0) { + int page_size; + + if (bio_size >= PAGE_SIZE) { + page_size = PAGE_SIZE; + bio_size -= PAGE_SIZE; + } else { + page_size = bio_size; + bio_size = 0; + } + +setup_bio: + if (!bio) { + bio = bio_alloc(vol->sb->s_bdev, 1, REQ_OP_WRITE, + GFP_NOIO); + bio->bi_iter.bi_sector = + NTFS_B_TO_SECTOR(vol, NTFS_CLU_TO_B(vol, bio_lcn + i)); + } + + if (!bio_add_page(bio, pages[i], page_size, 0)) { + err = submit_bio_wait(bio); + bio_put(bio); + if (err) + goto out; + bio = NULL; + goto setup_bio; + } + i++; + } + + err = submit_bio_wait(bio); + bio_put(bio); +out: + vunmap(outbuf); + for (i = 0; i < pages_count; i++) { + pg = pages_disk[i]; + if (pg) { + kunmap_local(page_address(pg)); + unlock_page(pg); + put_page(pg); + } + } + kfree(pages_disk); + vunmap(inbuf); + NInoSetFileNameDirty(ni); + mark_mft_record_dirty(ni); + + return err; +} + +int ntfs_compress_write(struct ntfs_inode *ni, loff_t pos, size_t count, + struct iov_iter *from) +{ + struct folio *folio; + struct page **pages = NULL, *page; + int pages_per_cb = ni->itype.compressed.block_size >> PAGE_SHIFT; + int cb_size = ni->itype.compressed.block_size, cb_off, err = 0; + int i, ip; + size_t written = 0; + struct address_space *mapping = VFS_I(ni)->i_mapping; + + pages = kmalloc_array(pages_per_cb, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + while (count) { + pgoff_t index; + size_t copied, bytes; + int off; + + off = pos & (cb_size - 1); + bytes = cb_size - off; + if (bytes > count) + bytes = count; + + cb_off = pos & ~(cb_size - 1); + index = cb_off >> PAGE_SHIFT; + + if (unlikely(fault_in_iov_iter_readable(from, bytes))) { + err = -EFAULT; + goto out; + } + + for (i = 0; i < pages_per_cb; i++) { + folio = read_mapping_folio(mapping, index + i, NULL); + if (IS_ERR(folio)) { + for (ip = 0; ip < i; ip++) { + folio_unlock(page_folio(pages[ip])); + folio_put(page_folio(pages[ip])); + } + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); + pages[i] = 
folio_page(folio, 0); + } + + WARN_ON(!bytes); + copied = 0; + ip = off >> PAGE_SHIFT; + off = offset_in_page(pos); + + for (;;) { + size_t cp, tail = PAGE_SIZE - off; + + page = pages[ip]; + cp = copy_folio_from_iter_atomic(page_folio(page), off, + min(tail, bytes), from); + flush_dcache_page(page); + + copied += cp; + bytes -= cp; + if (!bytes || !cp) + break; + + if (cp < tail) { + off += cp; + } else { + ip++; + off = 0; + } + } + + err = ntfs_write_cb(ni, pos, pages, pages_per_cb); + + for (i = 0; i < pages_per_cb; i++) { + folio = page_folio(pages[i]); + if (i < ip) { + folio_clear_dirty(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + folio_put(folio); + } + + if (err) + goto out; + + cond_resched(); + pos += copied; + written += copied; + count = iov_iter_count(from); + } + +out: + kfree(pages); + if (err < 0) + written = err; + + return written; +} -- 2.25.1 This updates the implementation of runlist handling and cluster allocator. Signed-off-by: Hyunchul Lee Signed-off-by: Namjae Jeon --- fs/ntfs/bitmap.c | 201 ++++++-- fs/ntfs/lcnalloc.c | 692 +++++++++++++------------ fs/ntfs/runlist.c | 1228 ++++++++++++++++++++++++-------------------- 3 files changed, 1169 insertions(+), 952 deletions(-) diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c index 0675b2400873..69e8d9727c03 100644 --- a/fs/ntfs/bitmap.c +++ b/fs/ntfs/bitmap.c @@ -1,19 +1,111 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. + * NTFS kernel bitmap handling. Part of the Linux-NTFS project. * * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2025 LG Electronics Co., Ltd. */ -#ifdef NTFS_RW - -#include +#include #include "bitmap.h" -#include "debug.h" #include "aops.h" #include "ntfs.h" +int ntfsp_trim_fs(struct ntfs_volume *vol, struct fstrim_range *range) +{ + size_t buf_clusters; + pgoff_t index, start_index, end_index; + struct file_ra_state *ra; + struct folio *folio; + unsigned long *bitmap; + char *kaddr; + u64 end, trimmed = 0, start_buf, end_buf, end_cluster; + u64 start_cluster = NTFS_B_TO_CLU(vol, range->start); + u32 dq = bdev_discard_granularity(vol->sb->s_bdev); + int ret = 0; + + if (!dq) + dq = vol->cluster_size; + + if (start_cluster >= vol->nr_clusters) + return -EINVAL; + + if (range->len == (u64)-1) + end_cluster = vol->nr_clusters; + else { + end_cluster = NTFS_B_TO_CLU(vol, + (range->start + range->len + vol->cluster_size - 1)); + if (end_cluster > vol->nr_clusters) + end_cluster = vol->nr_clusters; + } + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + buf_clusters = PAGE_SIZE * 8; + start_index = start_cluster >> 15; + end_index = (end_cluster + buf_clusters - 1) >> 15; + + for (index = start_index; index < end_index; index++) { + folio = filemap_lock_folio(vol->lcnbmp_ino->i_mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(vol->lcnbmp_ino->i_mapping, ra, NULL, + index, end_index - index); + folio = read_mapping_folio(vol->lcnbmp_ino->i_mapping, index, NULL); + if (!IS_ERR(folio)) + folio_lock(folio); + } + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_free; + } + + kaddr = kmap_local_folio(folio, 0); + bitmap = (unsigned long *)kaddr; + + start_buf = max_t(u64, index * buf_clusters, start_cluster); + end_buf = min_t(u64, (index + 1) * buf_clusters, end_cluster); + + end = start_buf; + while (end < end_buf) { + u64 aligned_start, aligned_count; + u64 start = find_next_zero_bit(bitmap, end_buf - start_buf, + end - start_buf) + start_buf; + 
if (start >= end_buf) + break; + + end = find_next_bit(bitmap, end_buf - start_buf, + start - start_buf) + start_buf; + + aligned_start = ALIGN(NTFS_CLU_TO_B(vol, start), dq); + aligned_count = ALIGN_DOWN(NTFS_CLU_TO_B(vol, end - start), dq); + if (aligned_count >= range->minlen) { + ret = blkdev_issue_discard(vol->sb->s_bdev, aligned_start >> 9, + aligned_count >> 9, GFP_NOFS); + if (ret) + goto out_unmap; + trimmed += aligned_count; + } + } + +out_unmap: + kunmap_local(kaddr); + folio_unlock(folio); + folio_put(folio); + + if (ret) + goto out_free; + } + + range->len = trimmed; + +out_free: + kfree(ra); + return ret; +} + /** * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value * @vi: vfs inode describing the bitmap @@ -27,8 +119,6 @@ * * @is_rollback should always be 'false', it is for internal use to rollback * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. - * - * Return 0 on success and -errno on error. */ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, const s64 count, const u8 value, const bool is_rollback) @@ -36,19 +126,21 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, s64 cnt = count; pgoff_t index, end_index; struct address_space *mapping; - struct page *page; + struct folio *folio; u8 *kaddr; int pos, len; u8 bit; + struct ntfs_inode *ni = NTFS_I(vi); + struct ntfs_volume *vol = ni->vol; - BUG_ON(!vi); - ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " - "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, value %u.%s", + vi->i_ino, (unsigned long long)start_bit, (unsigned long long)cnt, (unsigned int)value, is_rollback ? " (rollback)" : ""); - BUG_ON(start_bit < 0); - BUG_ON(cnt < 0); - BUG_ON(value > 1); + + if (start_bit < 0 || cnt < 0 || value > 1) + return -EINVAL; + /* * Calculate the indices for the pages containing the first and last * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. @@ -58,14 +150,17 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, /* Get the page containing the first bit (@start_bit). */ mapping = vi->i_mapping; - page = ntfs_map_page(mapping, index); - if (IS_ERR(page)) { + folio = read_mapping_folio(mapping, index, NULL); + if (IS_ERR(folio)) { if (!is_rollback) - ntfs_error(vi->i_sb, "Failed to map first page (error " - "%li), aborting.", PTR_ERR(page)); - return PTR_ERR(page); + ntfs_error(vi->i_sb, + "Failed to map first page (error %li), aborting.", + PTR_ERR(folio)); + return PTR_ERR(folio); } - kaddr = page_address(page); + + folio_lock(folio); + kaddr = kmap_local_folio(folio, 0); /* Set @pos to the position of the byte containing @start_bit. */ pos = (start_bit >> 3) & ~PAGE_MASK; @@ -76,6 +171,9 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, /* If the first byte is partial, modify the appropriate bits in it. */ if (bit) { u8 *byte = kaddr + pos; + + if (ni->mft_no == FILE_Bitmap) + ntfs_set_lcn_empty_bits(vol, index, value, min_t(s64, 8 - bit, cnt)); while ((bit & 7) && cnt) { cnt--; if (value) @@ -97,6 +195,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); memset(kaddr + pos, value ? 0xff : 0, len); cnt -= len << 3; + if (ni->mft_no == FILE_Bitmap) + ntfs_set_lcn_empty_bits(vol, index, value, len << 3); /* Update @len to point to the first not-done byte in the page. 
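Assuming the series wires ntfsp_trim_fs() up to the FITRIM ioctl in the usual way (that plumbing is not shown in this hunk), the userspace view is the standard one; minlen is rounded against the device's discard granularity by the kernel side above:

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int trim_volume(const char *mntpoint)
	{
		struct fstrim_range range = {
			.start = 0,
			.len = (__u64)-1,	/* whole volume */
			.minlen = 0,
		};
		int fd = open(mntpoint, O_RDONLY);

		if (fd < 0)
			return -1;
		if (ioctl(fd, FITRIM, &range) < 0) {
			close(fd);
			return -1;
		}
		close(fd);
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}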
*/ if (cnt < 8) @@ -104,16 +204,25 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, /* If we are not in the last page, deal with all subsequent pages. */ while (index < end_index) { - BUG_ON(cnt <= 0); - - /* Update @index and get the next page. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, ++index); - if (IS_ERR(page)) + if (cnt <= 0) + goto rollback; + + /* Update @index and get the next folio. */ + flush_dcache_folio(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + kunmap_local(kaddr); + folio_put(folio); + folio = read_mapping_folio(mapping, ++index, NULL); + if (IS_ERR(folio)) { + ntfs_error(vi->i_sb, + "Failed to map subsequent page (error %li), aborting.", + PTR_ERR(folio)); goto rollback; - kaddr = page_address(page); + } + + folio_lock(folio); + kaddr = kmap_local_folio(folio, 0); /* * Depending on @value, modify all remaining whole bytes in the * page up to @cnt. @@ -121,6 +230,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, len = min_t(s64, cnt >> 3, PAGE_SIZE); memset(kaddr, value ? 0xff : 0, len); cnt -= len << 3; + if (ni->mft_no == FILE_Bitmap) + ntfs_set_lcn_empty_bits(vol, index, value, len << 3); } /* * The currently mapped page is the last one. If the last byte is @@ -130,10 +241,12 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, if (cnt) { u8 *byte; - BUG_ON(cnt > 7); + WARN_ON(cnt > 7); bit = cnt; byte = kaddr + len; + if (ni->mft_no == FILE_Bitmap) + ntfs_set_lcn_empty_bits(vol, index, value, bit); while (bit--) { if (value) *byte |= 1 << bit; @@ -142,10 +255,12 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, } } done: - /* We are done. Unmap the page and return success. */ - flush_dcache_page(page); - set_page_dirty(page); - ntfs_unmap_page(page); + /* We are done. Unmap the folio and return success. */ + flush_dcache_folio(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + kunmap_local(kaddr); + folio_put(folio); ntfs_debug("Done."); return 0; rollback: @@ -155,7 +270,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, * - @count - @cnt is the number of bits that have been modified */ if (is_rollback) - return PTR_ERR(page); + return PTR_ERR(folio); if (count != cnt) pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, value ? 0 : 1, true); @@ -163,17 +278,15 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, pos = 0; if (!pos) { /* Rollback was successful. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li), aborting.", PTR_ERR(page)); + ntfs_error(vi->i_sb, + "Failed to map subsequent page (error %li), aborting.", + PTR_ERR(folio)); } else { /* Rollback failed. */ - ntfs_error(vi->i_sb, "Failed to map subsequent page (error " - "%li) and rollback failed (error %i). " - "Aborting and leaving inconsistent metadata. " - "Unmount and run chkdsk.", PTR_ERR(page), pos); + ntfs_error(vi->i_sb, + "Failed to map subsequent page (error %li) and rollback failed (error %i). Aborting and leaving inconsistent metadata. 
Unmount and run chkdsk.", + PTR_ERR(folio), pos); NVolSetErrors(NTFS_SB(vi->i_sb)); } - return PTR_ERR(page); + return PTR_ERR(folio); } - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c index eda9972e6159..6234f1041ba0 100644 --- a/fs/ntfs/lcnalloc.c +++ b/fs/ntfs/lcnalloc.c @@ -1,20 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * Cluster (de)allocation code. Part of the Linux-NTFS project. * * Copyright (c) 2004-2005 Anton Altaparmakov + * Copyright (c) 2025 LG Electronics Co., Ltd. + * + * Part of this file is based on code from the NTFS-3G project. + * and is copyrighted by the respective authors below: + * Copyright (c) 2002-2004 Anton Altaparmakov + * Copyright (c) 2004 Yura Pakhuchiy + * Copyright (c) 2004-2008 Szabolcs Szakacsits + * Copyright (c) 2008-2009 Jean-Pierre Andre */ -#ifdef NTFS_RW - -#include - #include "lcnalloc.h" -#include "debug.h" #include "bitmap.h" -#include "inode.h" -#include "volume.h" -#include "attrib.h" #include "malloc.h" #include "aops.h" #include "ntfs.h" @@ -33,15 +33,20 @@ * Locking: - The volume lcn bitmap must be locked for writing on entry and is * left locked on return. */ -int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, - const runlist_element *rl) +int ntfs_cluster_free_from_rl_nolock(struct ntfs_volume *vol, + const struct runlist_element *rl) { struct inode *lcnbmp_vi = vol->lcnbmp_ino; int ret = 0; + s64 nr_freed = 0; ntfs_debug("Entering."); if (!rl) return 0; + + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + for (; rl->length; rl++) { int err; @@ -50,19 +55,69 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) ret = err; + else + nr_freed += rl->length; } + ntfs_inc_free_clusters(vol, nr_freed); ntfs_debug("Done."); return ret; } +static s64 max_empty_bit_range(unsigned char *buf, int size) +{ + int i, j, run = 0; + int max_range = 0; + s64 start_pos = -1; + + ntfs_debug("Entering\n"); + + i = 0; + while (i < size) { + switch (*buf) { + case 0: + do { + buf++; + run += 8; + i++; + } while ((i < size) && !*buf); + break; + case 255: + if (run > max_range) { + max_range = run; + start_pos = (s64)i * 8 - run; + } + run = 0; + do { + buf++; + i++; + } while ((i < size) && (*buf == 255)); + break; + default: + for (j = 0; j < 8; j++) { + int bit = *buf & (1 << j); + + if (bit) { + if (run > max_range) { + max_range = run; + start_pos = (s64)i * 8 + (j - run); + } + run = 0; + } else + run++; + } + i++; + buf++; + } + } + + if (run > max_range) + start_pos = (s64)i * 8 - run; + + return start_pos; +} + /** * ntfs_cluster_alloc - allocate clusters on an ntfs volume - * @vol: mounted ntfs volume on which to allocate the clusters - * @start_vcn: vcn to use for the first allocated cluster - * @count: number of clusters to allocate - * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) - * @zone: zone from which to allocate the clusters - * @is_extension: if 'true', this is an attribute extension * * Allocate @count clusters preferably starting at cluster @start_lcn or at the * current allocator position if @start_lcn is -1, on the mounted ntfs volume @@ -109,62 +164,62 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, * for speed, but the algorithm is, so further speed improvements are probably * possible). 
* - * FIXME: We should be monitoring cluster allocation and increment the MFT zone - * size dynamically but this is something for the future. We will just cause - * heavier fragmentation by not doing it and I am not even sure Windows would - * grow the MFT zone dynamically, so it might even be correct not to do this. - * The overhead in doing dynamic MFT zone expansion would be very large and - * unlikely worth the effort. (AIA) - * - * TODO: I have added in double the required zone position pointer wrap around - * logic which can be optimized to having only one of the two logic sets. - * However, having the double logic will work fine, but if we have only one of - * the sets and we get it wrong somewhere, then we get into trouble, so - * removing the duplicate logic requires _very_ careful consideration of _all_ - * possible code paths. So at least for now, I am leaving the double logic - - * better safe than sorry... (AIA) - * * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked * on return. * - This function takes the volume lcn bitmap lock for writing and * modifies the bitmap contents. */ -runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, - const s64 count, const LCN start_lcn, - const NTFS_CLUSTER_ALLOCATION_ZONES zone, - const bool is_extension) +struct runlist_element *ntfs_cluster_alloc(struct ntfs_volume *vol, const s64 start_vcn, + const s64 count, const s64 start_lcn, + const int zone, + const bool is_extension, + const bool is_contig, + const bool is_dealloc) { - LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; - LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; - s64 clusters; + s64 zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + s64 prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters, free_clusters; loff_t i_size; struct inode *lcnbmp_vi; - runlist_element *rl = NULL; + struct runlist_element *rl = NULL; struct address_space *mapping; - struct page *page = NULL; - u8 *buf, *byte; - int err = 0, rlpos, rlsize, buf_size; + struct folio *folio = NULL; + u8 *buf = NULL, *byte; + int err = 0, rlpos, rlsize, buf_size, pg_off; u8 pass, done_zones, search_zone, need_writeback = 0, bit; + unsigned int memalloc_flags; + u8 has_guess; + pgoff_t index; - ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " - "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, - (unsigned long long)count, - (unsigned long long)start_lcn, + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn 0x%llx, zone %s_ZONE.", + start_vcn, count, start_lcn, zone == MFT_ZONE ? "MFT" : "DATA"); - BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < 0); - BUG_ON(start_lcn < -1); - BUG_ON(zone < FIRST_ZONE); - BUG_ON(zone > LAST_ZONE); + if (start_vcn < 0 || start_lcn < LCN_HOLE || + zone < FIRST_ZONE || zone > LAST_ZONE) + return ERR_PTR(-EINVAL); /* Return NULL if @count is zero. */ - if (!count) - return NULL; + if (count < 0 || !count) + return ERR_PTR(-EINVAL); + + memalloc_flags = memalloc_nofs_save(); + + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + free_clusters = atomic64_read(&vol->free_clusters); + /* Take the lcnbmp lock for writing. 
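The wait_event()/atomic64_read() pairing above relies on free-space accounting helpers defined elsewhere in the series. Their assumed shape, for reference (all three are sketches; NVolSetFreeClusterKnown() in particular is a hypothetical setter named after the existing NVol*() flag convention):

	static inline void ntfs_inc_free_clusters(struct ntfs_volume *vol, s64 nr)
	{
		atomic64_add(nr, &vol->free_clusters);
	}

	static inline void ntfs_dec_free_clusters(struct ntfs_volume *vol, s64 nr)
	{
		atomic64_sub(nr, &vol->free_clusters);
	}

	static inline void ntfs_publish_free_clusters(struct ntfs_volume *vol, s64 nr)
	{
		atomic64_set(&vol->free_clusters, nr);
		NVolSetFreeClusterKnown(vol);
		wake_up_all(&vol->free_waitq);
	}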
*/ down_write(&vol->lcnbmp_lock); + if (is_dealloc == false) + free_clusters -= atomic64_read(&vol->dirty_clusters); + + if (free_clusters < count) { + up_write(&vol->lcnbmp_lock); + return ERR_PTR(-ENOSPC); + } + /* * If no specific @start_lcn was requested, use the current data zone * position, otherwise use the requested @start_lcn but make sure it @@ -183,7 +238,9 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, * volume) and 4 for data zone 2 (start of volume till start of mft * zone). */ + has_guess = 1; zone_start = start_lcn; + if (zone_start < 0) { if (zone == DATA_ZONE) zone_start = vol->data1_zone_pos; @@ -196,39 +253,28 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, */ pass = 2; } - } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && - zone_start < vol->mft_zone_end) { - zone_start = vol->mft_zone_end; - /* - * Starting at beginning of data1_zone which means a single - * pass in this zone is sufficient. - */ - pass = 2; - } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || - zone_start >= vol->mft_zone_end)) { - zone_start = vol->mft_lcn; - if (!vol->mft_zone_end) - zone_start = 0; - /* - * Starting at beginning of volume which means a single pass - * is sufficient. - */ - pass = 2; + has_guess = 0; } - if (zone == MFT_ZONE) { + + if (!zone_start || zone_start == vol->mft_zone_start || + zone_start == vol->mft_zone_end) + pass = 2; + + if (zone_start < vol->mft_zone_start) { + zone_end = vol->mft_zone_start; + search_zone = 4; + /* Skip searching the mft zone. */ + done_zones |= 1; + } else if (zone_start < vol->mft_zone_end) { zone_end = vol->mft_zone_end; search_zone = 1; - } else /* if (zone == DATA_ZONE) */ { + } else { + zone_end = vol->nr_clusters; + search_zone = 2; /* Skip searching the mft zone. */ done_zones |= 1; - if (zone_start >= vol->mft_zone_end) { - zone_end = vol->nr_clusters; - search_zone = 2; - } else { - zone_end = vol->mft_zone_start; - search_zone = 4; - } } + /* * bmp_pos is the current bit position inside the bitmap. We use * bmp_initial_pos to determine whether or not to do a zone switch. @@ -241,75 +287,75 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, mapping = lcnbmp_vi->i_mapping; i_size = i_size_read(lcnbmp_vi); while (1) { - ntfs_debug("Start of outer while loop: done_zones 0x%x, " - "search_zone %i, pass %i, zone_start 0x%llx, " - "zone_end 0x%llx, bmp_initial_pos 0x%llx, " - "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + ntfs_debug("Start of outer while loop: done_zones 0x%x, search_zone %i, pass %i, zone_start 0x%llx, zone_end 0x%llx, bmp_initial_pos 0x%llx, bmp_pos 0x%llx, rlpos %i, rlsize %i.", done_zones, search_zone, pass, - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_initial_pos, - (unsigned long long)bmp_pos, rlpos, rlsize); + zone_start, zone_end, bmp_initial_pos, + bmp_pos, rlpos, rlsize); /* Loop until we run out of free clusters. */ last_read_pos = bmp_pos >> 3; - ntfs_debug("last_read_pos 0x%llx.", - (unsigned long long)last_read_pos); - if (last_read_pos > i_size) { - ntfs_debug("End of attribute reached. " - "Skipping to zone_pass_done."); + ntfs_debug("last_read_pos 0x%llx.", last_read_pos); + if (last_read_pos >= i_size) { + ntfs_debug("End of attribute reached. 
Skipping to zone_pass_done."); goto zone_pass_done; } - if (likely(page)) { + if (likely(folio)) { if (need_writeback) { ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); + flush_dcache_folio(folio); + folio_mark_dirty(folio); need_writeback = 0; } - ntfs_unmap_page(page); - } - page = ntfs_map_page(mapping, last_read_pos >> - PAGE_SHIFT); - if (IS_ERR(page)) { - err = PTR_ERR(page); - ntfs_error(vol->sb, "Failed to map page."); - goto out; + folio_unlock(folio); + kunmap_local(buf); + folio_put(folio); + folio = NULL; } - buf_size = last_read_pos & ~PAGE_MASK; - buf = page_address(page) + buf_size; - buf_size = PAGE_SIZE - buf_size; + + index = last_read_pos >> PAGE_SHIFT; + pg_off = last_read_pos & ~PAGE_MASK; + buf_size = PAGE_SIZE - pg_off; if (unlikely(last_read_pos + buf_size > i_size)) buf_size = i_size - last_read_pos; buf_size <<= 3; lcn = bmp_pos & 7; - bmp_pos &= ~(LCN)7; - ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " - "bmp_pos 0x%llx, need_writeback %i.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); + bmp_pos &= ~(s64)7; + + if (vol->lcn_empty_bits_per_page[index] == 0) + goto next_bmp_pos; + + folio = read_mapping_folio(mapping, index, NULL); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + + folio_lock(folio); + buf = kmap_local_folio(folio, 0) + pg_off; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, lcn, bmp_pos, need_writeback); while (lcn < buf_size && lcn + bmp_pos < zone_end) { byte = buf + (lcn >> 3); - ntfs_debug("In inner while loop: buf_size %i, " - "lcn 0x%llx, bmp_pos 0x%llx, " - "need_writeback %i, byte ofs 0x%x, " - "*byte 0x%x.", buf_size, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - need_writeback, + ntfs_debug("In inner while loop: buf_size %i, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i, byte ofs 0x%x, *byte 0x%x.", + buf_size, lcn, bmp_pos, need_writeback, (unsigned int)(lcn >> 3), (unsigned int)*byte); - /* Skip full bytes. */ - if (*byte == 0xff) { - lcn = (lcn + 8) & ~(LCN)7; - ntfs_debug("Continuing while loop 1."); - continue; - } bit = 1 << (lcn & 7); ntfs_debug("bit 0x%x.", bit); - /* If the bit is already set, go onto the next one. */ - if (*byte & bit) { - lcn++; - ntfs_debug("Continuing while loop 2."); + + if (has_guess) { + if (*byte & bit) { + if (is_contig == true && prev_run_len > 0) + goto done; + + has_guess = 0; + break; + } + } else { + lcn = max_empty_bit_range(buf, buf_size >> 3); + if (lcn < 0) + break; + has_guess = 1; continue; } /* @@ -318,19 +364,16 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, * ntfs_malloc_nofs() operates on whole pages only. 
*/ if ((rlpos + 2) * sizeof(*rl) > rlsize) { - runlist_element *rl2; + struct runlist_element *rl2; ntfs_debug("Reallocating memory."); if (!rl) - ntfs_debug("First free bit is at LCN " - "0x%llx.", - (unsigned long long) - (lcn + bmp_pos)); + ntfs_debug("First free bit is at LCN 0x%llx.", + lcn + bmp_pos); rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); if (unlikely(!rl2)) { err = -ENOMEM; - ntfs_error(vol->sb, "Failed to " - "allocate memory."); + ntfs_error(vol->sb, "Failed to allocate memory."); goto out; } memcpy(rl2, rl, rlsize); @@ -346,50 +389,33 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, need_writeback = 1; ntfs_debug("*byte 0x%x, need_writeback is set.", (unsigned int)*byte); + ntfs_dec_free_clusters(vol, 1); + ntfs_set_lcn_empty_bits(vol, index, 1, 1); + /* * Coalesce with previous run if adjacent LCNs. * Otherwise, append a new run. */ - ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " - "prev_lcn 0x%llx, lcn 0x%llx, " - "bmp_pos 0x%llx, prev_run_len 0x%llx, " - "rlpos %i.", - (unsigned long long)(lcn + bmp_pos), - 1ULL, (unsigned long long)prev_lcn, - (unsigned long long)lcn, - (unsigned long long)bmp_pos, - (unsigned long long)prev_run_len, - rlpos); + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), prev_lcn 0x%llx, lcn 0x%llx, bmp_pos 0x%llx, prev_run_len 0x%llx, rlpos %i.", + lcn + bmp_pos, 1ULL, prev_lcn, + lcn, bmp_pos, prev_run_len, rlpos); if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { - ntfs_debug("Coalescing to run (lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) + ntfs_debug("Coalescing to run (lcn 0x%llx, len 0x%llx).", rl[rlpos - 1].lcn, - (unsigned long long) rl[rlpos - 1].length); rl[rlpos - 1].length = ++prev_run_len; - ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " - "prev_run_len 0x%llx.", - (unsigned long long) + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), prev_run_len 0x%llx.", rl[rlpos - 1].lcn, - (unsigned long long) rl[rlpos - 1].length, - (unsigned long long) prev_run_len); } else { if (likely(rlpos)) { - ntfs_debug("Adding new run, (previous " - "run lcn 0x%llx, " - "len 0x%llx).", - (unsigned long long) - rl[rlpos - 1].lcn, - (unsigned long long) - rl[rlpos - 1].length); + ntfs_debug("Adding new run, (previous run lcn 0x%llx, len 0x%llx).", + rl[rlpos - 1].lcn, rl[rlpos - 1].length); rl[rlpos].vcn = rl[rlpos - 1].vcn + prev_run_len; } else { - ntfs_debug("Adding new run, is first " - "run."); + ntfs_debug("Adding new run, is first run."); rl[rlpos].vcn = start_vcn; } rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; @@ -398,24 +424,19 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, } /* Done? */ if (!--clusters) { - LCN tc; + s64 tc; +done: /* * Update the current zone position. Positions * of already scanned zones have been updated * during the respective zone switches. */ tc = lcn + bmp_pos + 1; - ntfs_debug("Done. Updating current zone " - "position, tc 0x%llx, " - "search_zone %i.", - (unsigned long long)tc, - search_zone); + ntfs_debug("Done. 
Updating current zone position, tc 0x%llx, search_zone %i.", + tc, search_zone); switch (search_zone) { case 1: - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->mft_zone_pos 0x%llx.", vol->mft_zone_pos); if (tc >= vol->mft_zone_end) { vol->mft_zone_pos = @@ -427,17 +448,11 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, tc > vol->mft_zone_pos) && tc >= vol->mft_lcn) vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->mft_zone_pos 0x%llx.", vol->mft_zone_pos); break; case 2: - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->data1_zone_pos 0x%llx.", vol->data1_zone_pos); if (tc >= vol->nr_clusters) vol->data1_zone_pos = @@ -447,17 +462,11 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, tc > vol->data1_zone_pos) && tc >= vol->mft_zone_end) vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->data1_zone_pos 0x%llx.", vol->data1_zone_pos); break; case 4: - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->data2_zone_pos 0x%llx.", vol->data2_zone_pos); if (tc >= vol->mft_zone_start) vol->data2_zone_pos = 0; @@ -465,30 +474,24 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, vol->data2_zone_pos || tc > vol->data2_zone_pos) vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->data2_zone_pos 0x%llx.", vol->data2_zone_pos); break; default: - BUG(); + WARN_ON(1); } ntfs_debug("Finished. Going to out."); goto out; } lcn++; } +next_bmp_pos: bmp_pos += buf_size; - ntfs_debug("After inner while loop: buf_size 0x%x, lcn " - "0x%llx, bmp_pos 0x%llx, need_writeback %i.", - buf_size, (unsigned long long)lcn, - (unsigned long long)bmp_pos, need_writeback); + ntfs_debug("After inner while loop: buf_size 0x%x, lcn 0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, lcn, bmp_pos, need_writeback); if (bmp_pos < zone_end) { - ntfs_debug("Continuing outer while loop, " - "bmp_pos 0x%llx, zone_end 0x%llx.", - (unsigned long long)bmp_pos, - (unsigned long long)zone_end); + ntfs_debug("Continuing outer while loop, bmp_pos 0x%llx, zone_end 0x%llx.", + bmp_pos, zone_end); continue; } zone_pass_done: /* Finished with the current zone pass. */ @@ -511,23 +514,18 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, zone_start = 0; break; default: - BUG(); + WARN_ON(1); } /* Sanity check. 
*/ if (zone_end < zone_start) zone_end = zone_start; bmp_pos = zone_start; - ntfs_debug("Continuing outer while loop, pass 2, " - "zone_start 0x%llx, zone_end 0x%llx, " - "bmp_pos 0x%llx.", - (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)bmp_pos); + ntfs_debug("Continuing outer while loop, pass 2, zone_start 0x%llx, zone_end 0x%llx, bmp_pos 0x%llx.", + zone_start, zone_end, bmp_pos); continue; } /* pass == 2 */ done_zones_check: - ntfs_debug("At done_zones_check, search_zone %i, done_zones " - "before 0x%x, done_zones after 0x%x.", + ntfs_debug("At done_zones_check, search_zone %i, done_zones before 0x%x, done_zones after 0x%x.", search_zone, done_zones, done_zones | search_zone); done_zones |= search_zone; @@ -537,16 +535,12 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, pass = 1; switch (search_zone) { case 1: - ntfs_debug("Switching from mft zone to data1 " - "zone."); + ntfs_debug("Switching from mft zone to data1 zone."); /* Update mft zone position. */ if (rlpos) { - LCN tc; + s64 tc; - ntfs_debug("Before checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->mft_zone_pos 0x%llx.", vol->mft_zone_pos); tc = rl[rlpos - 1].lcn + rl[rlpos - 1].length; @@ -560,10 +554,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, tc > vol->mft_zone_pos) && tc >= vol->mft_lcn) vol->mft_zone_pos = tc; - ntfs_debug("After checks, " - "vol->mft_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->mft_zone_pos 0x%llx.", vol->mft_zone_pos); } /* Switch from mft zone to data1 zone. */ @@ -580,16 +571,12 @@ switch_to_data1_zone: search_zone = 2; } break; case 2: - ntfs_debug("Switching from data1 zone to " - "data2 zone."); + ntfs_debug("Switching from data1 zone to data2 zone."); /* Update data1 zone position. */ if (rlpos) { - LCN tc; + s64 tc; - ntfs_debug("Before checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->data1_zone_pos 0x%llx.", vol->data1_zone_pos); tc = rl[rlpos - 1].lcn + rl[rlpos - 1].length; @@ -601,10 +588,7 @@ switch_to_data1_zone: search_zone = 2; tc > vol->data1_zone_pos) && tc >= vol->mft_zone_end) vol->data1_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data1_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->data1_zone_pos 0x%llx.", vol->data1_zone_pos); } /* Switch from data1 zone to data2 zone. */ @@ -621,16 +605,12 @@ switch_to_data1_zone: search_zone = 2; } break; case 4: - ntfs_debug("Switching from data2 zone to " - "data1 zone."); + ntfs_debug("Switching from data2 zone to data1 zone."); /* Update data2 zone position. */ if (rlpos) { - LCN tc; + s64 tc; - ntfs_debug("Before checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("Before checks, vol->data2_zone_pos 0x%llx.", vol->data2_zone_pos); tc = rl[rlpos - 1].lcn + rl[rlpos - 1].length; @@ -640,28 +620,22 @@ switch_to_data1_zone: search_zone = 2; vol->data2_zone_pos || tc > vol->data2_zone_pos) vol->data2_zone_pos = tc; - ntfs_debug("After checks, " - "vol->data2_zone_pos " - "0x%llx.", - (unsigned long long) + ntfs_debug("After checks, vol->data2_zone_pos 0x%llx.", vol->data2_zone_pos); } /* Switch from data2 zone to data1 zone. 
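Stepping back from the switch statements above: search_zone doubles as the zone's bit in done_zones (1 = MFT zone, 2 = data1 after it, 4 = data2 before it), and each zone gets at most two passes, guess position to zone end, then zone start back to the guess. A restatement of the initial zone pick, equivalent to the three-way test near the top of the function and shown only to make the [data2 | MFT zone | data1] layout explicit:

static u8 example_initial_search_zone(const struct ntfs_volume *vol,
				      s64 zone_start)
{
	if (zone_start < vol->mft_zone_start)
		return 4;	/* data2: volume start up to the MFT zone */
	if (zone_start < vol->mft_zone_end)
		return 1;	/* inside the MFT zone */
	return 2;		/* data1: MFT zone end up to nr_clusters */
}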
*/ goto switch_to_data1_zone; default: - BUG(); + WARN_ON(1); } - ntfs_debug("After zone switch, search_zone %i, " - "pass %i, bmp_initial_pos 0x%llx, " - "zone_start 0x%llx, zone_end 0x%llx.", + ntfs_debug("After zone switch, search_zone %i, pass %i, bmp_initial_pos 0x%llx, zone_start 0x%llx, zone_end 0x%llx.", search_zone, pass, - (unsigned long long)bmp_initial_pos, - (unsigned long long)zone_start, - (unsigned long long)zone_end); + bmp_initial_pos, + zone_start, + zone_end); bmp_pos = zone_start; if (zone_start == zone_end) { - ntfs_debug("Empty zone, going to " - "done_zones_check."); + ntfs_debug("Empty zone, going to done_zones_check."); /* Empty zone. Don't bother searching it. */ goto done_zones_check; } @@ -674,11 +648,9 @@ switch_to_data1_zone: search_zone = 2; * MFT_ZONE, we have really run out of space. */ mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; - ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " - "0x%llx, mft_zone_size 0x%llx.", - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)mft_zone_size); + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end 0x%llx, mft_zone_size 0x%llx.", + vol->mft_zone_start, vol->mft_zone_end, + mft_zone_size); if (zone == MFT_ZONE || mft_zone_size <= 0) { ntfs_debug("No free clusters left, going to out."); /* Really no more space left on device. */ @@ -703,20 +675,11 @@ switch_to_data1_zone: search_zone = 2; search_zone = 2; pass = 2; done_zones &= ~2; - ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " - "vol->mft_zone_start 0x%llx, " - "vol->mft_zone_end 0x%llx, " - "vol->mft_zone_pos 0x%llx, search_zone 2, " - "pass 2, dones_zones 0x%x, zone_start 0x%llx, " - "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " - "continuing outer while loop.", - (unsigned long long)mft_zone_size, - (unsigned long long)vol->mft_zone_start, - (unsigned long long)vol->mft_zone_end, - (unsigned long long)vol->mft_zone_pos, - done_zones, (unsigned long long)zone_start, - (unsigned long long)zone_end, - (unsigned long long)vol->data1_zone_pos); + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, vol->mft_zone_start 0x%llx, vol->mft_zone_end 0x%llx, vol->mft_zone_pos 0x%llx, search_zone 2, pass 2, done_zones 0x%x, zone_start 0x%llx, zone_end 0x%llx, vol->data1_zone_pos 0x%llx, continuing outer while loop.", + mft_zone_size, vol->mft_zone_start, + vol->mft_zone_end, vol->mft_zone_pos, + done_zones, zone_start, zone_end, + vol->data1_zone_pos); } ntfs_debug("After outer while loop."); out: @@ -727,48 +690,52 @@ switch_to_data1_zone: search_zone = 2; rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED; rl[rlpos].length = 0; } - if (likely(page && !IS_ERR(page))) { + if (likely(folio && !IS_ERR(folio))) { if (need_writeback) { ntfs_debug("Marking page dirty."); - flush_dcache_page(page); - set_page_dirty(page); + flush_dcache_folio(folio); + folio_mark_dirty(folio); need_writeback = 0; } - ntfs_unmap_page(page); + folio_unlock(folio); + kunmap_local(buf); + folio_put(folio); } if (likely(!err)) { + if (is_dealloc == true) + ntfs_release_dirty_clusters(vol, rl->length); up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); ntfs_debug("Done."); - return rl; + return rl == NULL ? 
ERR_PTR(-EIO) : rl; } - ntfs_error(vol->sb, "Failed to allocate clusters, aborting " - "(error %i).", err); + if (err != -ENOSPC) + ntfs_error(vol->sb, + "Failed to allocate clusters, aborting (error %i).", + err); if (rl) { int err2; if (err == -ENOSPC) - ntfs_debug("Not enough space to complete allocation, " - "err -ENOSPC, first free lcn 0x%llx, " - "could allocate up to 0x%llx " - "clusters.", - (unsigned long long)rl[0].lcn, - (unsigned long long)(count - clusters)); + ntfs_debug("Not enough space to complete allocation, err -ENOSPC, first free lcn 0x%llx, could allocate up to 0x%llx clusters.", + rl[0].lcn, count - clusters); /* Deallocate all allocated clusters. */ ntfs_debug("Attempting rollback..."); err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); if (err2) { - ntfs_error(vol->sb, "Failed to rollback (error %i). " - "Leaving inconsistent metadata! " - "Unmount and run chkdsk.", err2); + ntfs_error(vol->sb, + "Failed to rollback (error %i). Leaving inconsistent metadata! Unmount and run chkdsk.", + err2); NVolSetErrors(vol); } /* Free the runlist. */ ntfs_free(rl); } else if (err == -ENOSPC) - ntfs_debug("No space left at all, err = -ENOSPC, first free " - "lcn = 0x%llx.", - (long long)vol->data1_zone_pos); + ntfs_debug("No space left at all, err = -ENOSPC, first free lcn = 0x%llx.", + vol->data1_zone_pos); + atomic64_set(&vol->dirty_clusters, 0); up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); return ERR_PTR(err); } @@ -801,8 +768,8 @@ switch_to_data1_zone: search_zone = 2; * you will probably want to do: * m = ctx->mrec; * a = ctx->attr; - * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that - * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Assuming you cache ctx->attr in a variable @a of type attr_record * and that + * you cache ctx->mrec in a variable @m of type struct mft_record *. * * @is_rollback should always be 'false', it is for internal use to rollback * errors. You probably want to use ntfs_cluster_free() instead. @@ -832,25 +799,27 @@ switch_to_data1_zone: search_zone = 2; * - If @ctx is not NULL, the base mft record must be mapped on entry * and it will be left mapped on return. */ -s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, - ntfs_attr_search_ctx *ctx, const bool is_rollback) +s64 __ntfs_cluster_free(struct ntfs_inode *ni, const s64 start_vcn, s64 count, + struct ntfs_attr_search_ctx *ctx, const bool is_rollback) { s64 delta, to_free, total_freed, real_freed; - ntfs_volume *vol; + struct ntfs_volume *vol; struct inode *lcnbmp_vi; - runlist_element *rl; + struct runlist_element *rl; int err; + unsigned int memalloc_flags; - BUG_ON(!ni); - ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " - "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, - (unsigned long long)count, + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count 0x%llx.%s", + ni->mft_no, start_vcn, count, is_rollback ? " (rollback)" : ""); vol = ni->vol; lcnbmp_vi = vol->lcnbmp_ino; - BUG_ON(!lcnbmp_vi); - BUG_ON(start_vcn < 0); - BUG_ON(count < -1); + if (start_vcn < 0 || count < -1) + return -EINVAL; + + if (!NVolFreeClusterKnown(vol)) + wait_event(vol->free_waitq, NVolFreeClusterKnown(vol)); + /* * Lock the lcn bitmap for writing but only if not rolling back. 
We * must hold the lock all the way including through rollback otherwise @@ -858,24 +827,33 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, * dropped the lock, anyone could have set the bit again, thus * allocating the cluster for another use. */ - if (likely(!is_rollback)) + if (likely(!is_rollback)) { + memalloc_flags = memalloc_nofs_save(); down_write(&vol->lcnbmp_lock); + } total_freed = real_freed = 0; rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); if (IS_ERR(rl)) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to find first runlist " - "element (error %li), aborting.", - PTR_ERR(rl)); err = PTR_ERR(rl); + if (err == -ENOENT) { + if (likely(!is_rollback)) { + up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); + } + return 0; + } + + if (!is_rollback) + ntfs_error(vol->sb, + "Failed to find first runlist element (error %d), aborting.", + err); goto err_out; } if (unlikely(rl->lcn < LCN_HOLE)) { if (!is_rollback) - ntfs_error(vol->sb, "First runlist element has " - "invalid lcn, aborting."); + ntfs_error(vol->sb, "First runlist element has invalid lcn, aborting."); err = -EIO; goto err_out; } @@ -893,13 +871,14 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, to_free, likely(!is_rollback) ? 0 : 1); if (unlikely(err)) { if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear first run " - "(error %i), aborting.", err); + ntfs_error(vol->sb, + "Failed to clear first run (error %i), aborting.", + err); goto err_out; } /* We have freed @to_free real clusters. */ real_freed = to_free; - }; + } /* Go to the next run and adjust the number of clusters left to free. */ ++rl; if (count >= 0) @@ -913,7 +892,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, */ for (; rl->length && count != 0; ++rl) { if (unlikely(rl->lcn < LCN_HOLE)) { - VCN vcn; + s64 vcn; /* Attempt to map runlist. */ vcn = rl->vcn; @@ -921,20 +900,15 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, if (IS_ERR(rl)) { err = PTR_ERR(rl); if (!is_rollback) - ntfs_error(vol->sb, "Failed to map " - "runlist fragment or " - "failed to find " - "subsequent runlist " - "element."); + ntfs_error(vol->sb, + "Failed to map runlist fragment or failed to find subsequent runlist element."); goto err_out; } if (unlikely(rl->lcn < LCN_HOLE)) { if (!is_rollback) - ntfs_error(vol->sb, "Runlist element " - "has invalid lcn " - "(0x%llx).", - (unsigned long long) - rl->lcn); + ntfs_error(vol->sb, + "Runlist element has invalid lcn (0x%llx).", + rl->lcn); err = -EIO; goto err_out; } @@ -950,8 +924,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, to_free, likely(!is_rollback) ? 0 : 1); if (unlikely(err)) { if (!is_rollback) - ntfs_error(vol->sb, "Failed to clear " - "subsequent run."); + ntfs_error(vol->sb, "Failed to clear subsequent run."); goto err_out; } /* We have freed @to_free real clusters. */ @@ -960,14 +933,54 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, /* Adjust the number of clusters left to free. */ if (count >= 0) count -= to_free; - + /* Update the total done clusters. 
*/ total_freed += to_free; } - if (likely(!is_rollback)) + ntfs_inc_free_clusters(vol, real_freed); + if (likely(!is_rollback)) { up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); + } - BUG_ON(count > 0); + WARN_ON(count > 0); + + if (NVolDiscard(vol) && !is_rollback) { + s64 total_discarded = 0, rl_off; + u32 gran = bdev_discard_granularity(vol->sb->s_bdev); + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) + return real_freed; + rl_off = start_vcn - rl->vcn; + while (rl->length && total_discarded < total_freed) { + s64 to_discard = rl->length - rl_off; + + if (to_discard + total_discarded > total_freed) + to_discard = total_freed - total_discarded; + if (rl->lcn >= 0) { + sector_t start_sector, end_sector; + int ret; + + start_sector = ALIGN(NTFS_CLU_TO_B(vol, rl->lcn + rl_off), + gran) >> SECTOR_SHIFT; + end_sector = ALIGN_DOWN(NTFS_CLU_TO_B(vol, + rl->lcn + rl_off + to_discard), + gran) >> SECTOR_SHIFT; + if (start_sector < end_sector) { + ret = blkdev_issue_discard(vol->sb->s_bdev, start_sector, + end_sector - start_sector, + GFP_NOFS); + if (ret) + break; + } + } + + total_discarded += to_discard; + ++rl; + rl_off = 0; + } + } /* We are done. Return the number of actually freed clusters. */ ntfs_debug("Done."); @@ -978,6 +991,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, /* If no real clusters were freed, no need to rollback. */ if (!real_freed) { up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); return err; } /* @@ -987,14 +1001,14 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, */ delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); if (delta < 0) { - ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " - "inconsistent metadata! Unmount and run " - "chkdsk.", (int)delta); + ntfs_error(vol->sb, + "Failed to rollback (error %i). Leaving inconsistent metadata! Unmount and run chkdsk.", + (int)delta); NVolSetErrors(vol); } + ntfs_dec_free_clusters(vol, delta); up_write(&vol->lcnbmp_lock); + memalloc_nofs_restore(memalloc_flags); ntfs_error(vol->sb, "Aborting (error %i).", err); return err; } - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 0d448e9881f7..0b9c84489de6 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -1,24 +1,31 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. +/** + * NTFS runlist handling code. + * Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov * Copyright (c) 2002-2005 Richard Russon + * Copyright (c) 2025 LG Electronics Co., Ltd. + * + * Part of this file is based on code from the NTFS-3G project. + * and is copyrighted by the respective authors below: + * Copyright (c) 2002-2005 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + * Copyright (c) 2002-2008 Szabolcs Szakacsits + * Copyright (c) 2004 Yura Pakhuchiy + * Copyright (c) 2007-2022 Jean-Pierre Andre */ -#include "debug.h" -#include "dir.h" -#include "endian.h" #include "malloc.h" #include "ntfs.h" +#include "attrib.h" /** * ntfs_rl_mm - runlist memmove * * It is up to the caller to serialize access to the runlist @base. 
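One subtlety in the discard pass of __ntfs_cluster_free() above is the asymmetric rounding: the start is aligned up and the end aligned down to the device's discard granularity, so partial granules at either edge are skipped and a discard can never reach clusters outside the freed run. A standalone sketch of just that rounding, assuming NTFS_CLU_TO_B() converts clusters to bytes as in the patch:

static void example_discard_bounds(struct ntfs_volume *vol, s64 lcn, s64 len,
				   u32 gran, sector_t *start, sector_t *end)
{
	/* Round inward so we never discard outside [lcn, lcn + len). */
	*start = ALIGN(NTFS_CLU_TO_B(vol, lcn), gran) >> SECTOR_SHIFT;
	*end = ALIGN_DOWN(NTFS_CLU_TO_B(vol, lcn + len), gran) >> SECTOR_SHIFT;
	/* *start >= *end: the run covers no whole granule, nothing to do. */
}

With 4 KiB clusters and a 1 MiB granularity, freeing clusters 100-700 discards only the aligned middle (clusters 256-511); a run smaller than one granule discards nothing at all.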
*/ -static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, - int size) +static inline void ntfs_rl_mm(struct runlist_element *base, int dst, int src, int size) { if (likely((dst != src) && (size > 0))) memmove(base + dst, base + src, size * sizeof(*base)); @@ -30,8 +37,8 @@ static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, * It is up to the caller to serialize access to the runlists @dstbase and * @srcbase. */ -static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, - runlist_element *srcbase, int src, int size) +static inline void ntfs_rl_mc(struct runlist_element *dstbase, int dst, + struct runlist_element *srcbase, int src, int size) { if (likely(size > 0)) memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); @@ -51,16 +58,11 @@ static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, * * N.B. If the new allocation doesn't require a different number of pages in * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. */ -static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, +struct runlist_element *ntfs_rl_realloc(struct runlist_element *rl, int old_size, int new_size) { - runlist_element *new_rl; + struct runlist_element *new_rl; old_size = PAGE_ALIGN(old_size * sizeof(*rl)); new_size = PAGE_ALIGN(new_size * sizeof(*rl)); @@ -97,16 +99,11 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, * * N.B. If the new allocation doesn't require a different number of pages in * memory, the function will return the original pointer. - * - * On success, return a pointer to the newly allocated, or recycled, memory. - * On error, return -errno. The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. */ -static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, +static inline struct runlist_element *ntfs_rl_realloc_nofail(struct runlist_element *rl, int old_size, int new_size) { - runlist_element *new_rl; + struct runlist_element *new_rl; old_size = PAGE_ALIGN(old_size * sizeof(*rl)); new_size = PAGE_ALIGN(new_size * sizeof(*rl)); @@ -114,7 +111,6 @@ static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, return rl; new_rl = ntfs_malloc_nofs_nofail(new_size); - BUG_ON(!new_rl); if (likely(rl != NULL)) { if (unlikely(old_size > new_size)) @@ -138,12 +134,9 @@ static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, * Return: true Success, the runlists can be merged. * false Failure, the runlists cannot be merged. */ -static inline bool ntfs_are_rl_mergeable(runlist_element *dst, - runlist_element *src) +static inline bool ntfs_are_rl_mergeable(struct runlist_element *dst, + struct runlist_element *src) { - BUG_ON(!dst); - BUG_ON(!src); - /* We can merge unmapped regions even if they are misaligned. */ if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) return true; @@ -157,6 +150,9 @@ static inline bool ntfs_are_rl_mergeable(runlist_element *dst, /* If we are merging two holes, we can merge them. */ if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) return true; + /* If we are merging two dealloc, we can merge them. 
*/ + if ((dst->lcn == LCN_DELALLOC) && (src->lcn == LCN_DELALLOC)) + return true; /* Cannot merge. */ return false; } @@ -172,18 +168,13 @@ static inline bool ntfs_are_rl_mergeable(runlist_element *dst, * * It is up to the caller to serialize access to the runlists @dst and @src. */ -static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +static inline void __ntfs_rl_merge(struct runlist_element *dst, struct runlist_element *src) { dst->length += src->length; } /** * ntfs_rl_append - append a runlist after a given element - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: runlist to be inserted into @dst - * @ssize: number of elements in @src (excluding end marker) - * @loc: append the new runlist @src after this element in @dst * * Append the runlist @src after element @loc in @dst. Merge the right end of * the new runlist, if necessary. Adjust the size of the hole before the @@ -195,21 +186,14 @@ static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) * runlists @dst and @src are deallocated before returning so you cannot use * the pointers for anything any more. (Strictly speaking the returned runlist * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. */ -static inline runlist_element *ntfs_rl_append(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) +static inline struct runlist_element *ntfs_rl_append(struct runlist_element *dst, + int dsize, struct runlist_element *src, int ssize, int loc, + size_t *new_size) { bool right = false; /* Right end of @src needs merging. */ int marker; /* End of the inserted runs. */ - BUG_ON(!dst); - BUG_ON(!src); - /* First, check if the right hand end needs merging. */ if ((loc + 1) < dsize) right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); @@ -218,6 +202,8 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst, dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); if (IS_ERR(dst)) return dst; + + *new_size = dsize + ssize - right; /* * We are guaranteed to succeed from here so can start modifying the * original runlists. @@ -246,11 +232,6 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst, /** * ntfs_rl_insert - insert a runlist into another - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: insert the new runlist @src before this element in @dst * * Insert the runlist @src before element @loc in the runlist @dst. Merge the * left end of the new runlist, if necessary. Adjust the size of the hole @@ -262,22 +243,15 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst, * runlists @dst and @src are deallocated before returning so you cannot use * the pointers for anything any more. (Strictly speaking the returned runlist * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. 
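To make the merge predicate above concrete: apart from two unmapped runs, which merge even when misaligned, runs must be VCN-contiguous, and then either LCN-contiguous real runs or the same kind of non-real run; this patch adds delayed-allocation runs as a third such kind. A few worked cases, written as assertions against the helper itself:

static void example_merge_rules(void)
{
	struct runlist_element a = { .vcn = 0, .lcn = 100, .length = 4 };
	struct runlist_element b = { .vcn = 4, .lcn = 104, .length = 2 };
	struct runlist_element c = { .vcn = 4, .lcn = 200, .length = 2 };
	struct runlist_element d = { .vcn = 0, .lcn = LCN_DELALLOC, .length = 4 };
	struct runlist_element e = { .vcn = 4, .lcn = LCN_DELALLOC, .length = 2 };

	/* VCN-contiguous and LCN-contiguous: coalesces into one run of 6. */
	WARN_ON(!ntfs_are_rl_mergeable(&a, &b));
	/* VCN-contiguous but a gap in LCNs: must stay two runs. */
	WARN_ON(ntfs_are_rl_mergeable(&a, &c));
	/* New here: two delayed-allocation runs merge like two holes. */
	WARN_ON(!ntfs_are_rl_mergeable(&d, &e));
}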
*/ -static inline runlist_element *ntfs_rl_insert(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) +static inline struct runlist_element *ntfs_rl_insert(struct runlist_element *dst, + int dsize, struct runlist_element *src, int ssize, int loc, + size_t *new_size) { bool left = false; /* Left end of @src needs merging. */ bool disc = false; /* Discontinuity between @dst and @src. */ int marker; /* End of the inserted runs. */ - BUG_ON(!dst); - BUG_ON(!src); - /* * disc => Discontinuity between the end of @dst and the start of @src. * This means we might need to insert a "not mapped" run. @@ -302,6 +276,8 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst, dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); if (IS_ERR(dst)) return dst; + + *new_size = dsize + ssize - left + disc; /* * We are guaranteed to succeed from here so can start modifying the * original runlist. @@ -324,7 +300,8 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst, /* Adjust the VCN of the first run after the insertion... */ dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; /* ... and the length. */ - if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED || + dst[marker].lcn == LCN_DELALLOC) dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; /* Writing beyond the end of the file and there is a discontinuity. */ @@ -343,11 +320,6 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst, /** * ntfs_rl_replace - overwrite a runlist element with another runlist - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst to overwrite with @src * * Replace the runlist element @dst at @loc with @src. Merge the left and * right ends of the inserted runlist, if necessary. @@ -358,24 +330,17 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst, * runlists @dst and @src are deallocated before returning so you cannot use * the pointers for anything any more. (Strictly speaking the returned runlist * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. */ -static inline runlist_element *ntfs_rl_replace(runlist_element *dst, - int dsize, runlist_element *src, int ssize, int loc) +static inline struct runlist_element *ntfs_rl_replace(struct runlist_element *dst, + int dsize, struct runlist_element *src, int ssize, int loc, + size_t *new_size) { - signed delta; + int delta; bool left = false; /* Left end of @src needs merging. */ bool right = false; /* Right end of @src needs merging. */ int tail; /* Start of tail of @dst. */ int marker; /* End of the inserted runs. */ - BUG_ON(!dst); - BUG_ON(!src); - /* First, see if the left and right ends need merging. */ if ((loc + 1) < dsize) right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); @@ -391,6 +356,8 @@ static inline runlist_element *ntfs_rl_replace(runlist_element *dst, if (IS_ERR(dst)) return dst; } + + *new_size = dsize + delta; /* * We are guaranteed to succeed from here so can start modifying the * original runlists. 
@@ -431,11 +398,6 @@ static inline runlist_element *ntfs_rl_replace(runlist_element *dst, /** * ntfs_rl_split - insert a runlist into the centre of a hole - * @dst: original runlist to be worked on - * @dsize: number of elements in @dst (including end marker) - * @src: new runlist to be inserted - * @ssize: number of elements in @src (excluding end marker) - * @loc: index in runlist @dst at which to split and insert @src * * Split the runlist @dst at @loc into two and insert @new in between the two * fragments. No merging of runlists is necessary. Adjust the size of the @@ -447,22 +409,17 @@ static inline runlist_element *ntfs_rl_replace(runlist_element *dst, * runlists @dst and @src are deallocated before returning so you cannot use * the pointers for anything any more. (Strictly speaking the returned runlist * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. */ -static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, - runlist_element *src, int ssize, int loc) +static inline struct runlist_element *ntfs_rl_split(struct runlist_element *dst, int dsize, + struct runlist_element *src, int ssize, int loc, + size_t *new_size) { - BUG_ON(!dst); - BUG_ON(!src); - /* Space required: @dst size + @src size + one new hole. */ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); if (IS_ERR(dst)) return dst; + + *new_size = dsize + ssize + 1; /* * We are guaranteed to succeed from here so can start modifying the * original runlists. @@ -482,8 +439,6 @@ static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, /** * ntfs_runlists_merge - merge two runlists into one - * @drl: original runlist to be worked on - * @srl: new runlist to be merged into @drl * * First we sanity check the two runlists @srl and @drl to make sure that they * are sensible and can be merged. The runlist @srl must be either after the @@ -507,24 +462,19 @@ static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, * runlists @drl and @srl are deallocated before returning so you cannot use * the pointers for anything any more. (Strictly speaking the returned runlist * may be the same as @dst but this is irrelevant.) - * - * On error, return -errno. Both runlists are left unmodified. The following - * error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The runlists overlap and cannot be merged. */ -runlist_element *ntfs_runlists_merge(runlist_element *drl, - runlist_element *srl) +struct runlist_element *ntfs_runlists_merge(struct runlist *d_runlist, + struct runlist_element *srl, size_t s_rl_count, + size_t *new_rl_count) { int di, si; /* Current index into @[ds]rl. */ int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ int dins; /* Index into @drl at which to insert @srl. */ int dend, send; /* Last index into @[ds]rl. */ - int dfinal, sfinal; /* The last index into @[ds]rl with - lcn >= LCN_HOLE. */ + int dfinal, sfinal; /* The last index into @[ds]rl with lcn >= LCN_HOLE. 
*/ int marker = 0; - VCN marker_vcn = 0; + s64 marker_vcn = 0; + struct runlist_element *drl = d_runlist->rl, *rl; #ifdef DEBUG ntfs_debug("dst:"); @@ -539,27 +489,36 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, if (IS_ERR(srl) || IS_ERR(drl)) return ERR_PTR(-EINVAL); + if (s_rl_count == 0) { + for (; srl[s_rl_count].length; s_rl_count++) + ; + s_rl_count++; + } + /* Check for the case where the first mapping is being done now. */ if (unlikely(!drl)) { drl = srl; /* Complete the source runlist if necessary. */ if (unlikely(drl[0].vcn)) { /* Scan to the end of the source runlist. */ - for (dend = 0; likely(drl[dend].length); dend++) - ; - dend++; - drl = ntfs_rl_realloc(drl, dend, dend + 1); + drl = ntfs_rl_realloc(drl, s_rl_count, s_rl_count + 1); if (IS_ERR(drl)) return drl; /* Insert start element at the front of the runlist. */ - ntfs_rl_mm(drl, 1, 0, dend); + ntfs_rl_mm(drl, 1, 0, s_rl_count); drl[0].vcn = 0; drl[0].lcn = LCN_RL_NOT_MAPPED; drl[0].length = drl[1].vcn; + s_rl_count++; } + + *new_rl_count = s_rl_count; goto finished; } + if (d_runlist->count < 1 || s_rl_count < 2) + return ERR_PTR(-EINVAL); + si = di = 0; /* Skip any unmapped start element(s) in the source runlist. */ @@ -567,7 +526,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, si++; /* Can't have an entirely unmapped source runlist. */ - BUG_ON(!srl[si].length); + WARN_ON(!srl[si].length); /* Record the starting points. */ sstart = si; @@ -577,10 +536,11 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, * be inserted. If we reach the end of @drl, @srl just needs to be * appended to @drl. */ - for (; drl[di].length; di++) { - if (drl[di].vcn + drl[di].length > srl[sstart].vcn) - break; - } + rl = __ntfs_attr_find_vcn_nolock(d_runlist, srl[sstart].vcn); + if (IS_ERR(rl)) + di = (int)d_runlist->count - 1; + else + di = (int)(rl - d_runlist->rl); dins = di; /* Sanity check for illegal overlaps. */ @@ -591,10 +551,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, } /* Scan to the end of both runlists in order to know their sizes. 
*/ - for (send = si; srl[send].length; send++) - ; - for (dend = di; drl[dend].length; dend++) - ; + send = (int)s_rl_count - 1; + dend = (int)d_runlist->count - 1; if (srl[send].lcn == LCN_ENOENT) marker_vcn = srl[marker = send].vcn; @@ -622,22 +580,17 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, ss++; if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) finish = false; -#if 0 - ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); - ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); - ntfs_debug("start = %i, finish = %i", start, finish); - ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); -#endif + if (start) { if (finish) - drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins, new_rl_count); else - drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins, new_rl_count); } else { if (finish) - drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins, new_rl_count); else - drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins, new_rl_count); } if (IS_ERR(drl)) { ntfs_error(NULL, "Merge failed."); @@ -653,9 +606,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, int slots = 0; if (drl[ds].vcn == marker_vcn) { - ntfs_debug("Old marker = 0x%llx, replacing " - "with LCN_ENOENT.", - (unsigned long long) + ntfs_debug("Old marker = 0x%llx, replacing with LCN_ENOENT.", drl[ds].lcn); drl[ds].lcn = LCN_ENOENT; goto finished; @@ -675,6 +626,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, drl = ntfs_rl_realloc_nofail(drl, ds, ds + 2); slots = 2; + *new_rl_count += 2; } ds++; /* Need to set vcn if it isn't set already. */ @@ -688,8 +640,10 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, drl[ds].length = marker_vcn - drl[ds].vcn; /* Finally add the ENOENT terminator. */ ds++; - if (!slots) + if (!slots) { drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + *new_rl_count += 1; + } drl[ds].vcn = marker_vcn; drl[ds].lcn = LCN_ENOENT; drl[ds].length = (s64)0; @@ -706,9 +660,6 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, /** * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist - * @vol: ntfs volume on which the attribute resides - * @attr: attribute record whose mapping pairs array to decompress - * @old_rl: optional runlist in which to insert @attr's runlist * * It is up to the caller to serialize access to the runlist @old_rl. * @@ -720,54 +671,41 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl, * returned. The original @old_rl is deallocated. * * On error, return -errno. @old_rl is left unmodified in that case. - * - * The following error codes are defined: - * -ENOMEM - Not enough memory to allocate runlist array. - * -EIO - Corrupt runlist. - * -EINVAL - Invalid parameters were passed in. - * -ERANGE - The two runlists overlap. - * - * FIXME: For now we take the conceptionally simplest approach of creating the - * new runlist disregarding the already existing one and then splicing the - * two into one, if that is possible (we check for overlap and discard the new - * runlist if overlap present before returning ERR_PTR(-ERANGE)). 
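The four-way dispatch above is the heart of the merge and reads easier with the two booleans spelled out: "start" means @srl begins exactly where drl[dins] begins (no destination run survives in front of the insertion), "finish" means the insertion also reaches the end of drl[dins] (nothing of it survives behind). A reviewer's summary in comment form:

/*
 * start  finish  action           effect on drl[dins]
 * -----  ------  ---------------  ------------------------------------
 * true   true    ntfs_rl_replace  fully covered, overwritten
 * true   false   ntfs_rl_insert   front covered, tail of hole survives
 * false  true    ntfs_rl_append   tail covered, head of hole survives
 * false  false   ntfs_rl_split    landed inside, hole cut in two
 *
 * E.g. mapping VCNs 10-19 into a destination whose dins element is an
 * unmapped run over VCNs 0-99 is the (false, false) case: the result is
 * [0-9 unmapped][10-19 mapped][20-99 unmapped].
 */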
*/ -runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, - const ATTR_RECORD *attr, runlist_element *old_rl) +struct runlist_element *ntfs_mapping_pairs_decompress(const struct ntfs_volume *vol, + const struct attr_record *attr, struct runlist *old_runlist, + size_t *new_rl_count) { - VCN vcn; /* Current vcn. */ - LCN lcn; /* Current lcn. */ + s64 vcn; /* Current vcn. */ + s64 lcn; /* Current lcn. */ s64 deltaxcn; /* Change in [vl]cn. */ - runlist_element *rl; /* The output runlist. */ + struct runlist_element *rl, *new_rl; /* The output runlist. */ u8 *buf; /* Current position in mapping pairs array. */ u8 *attr_end; /* End of attribute. */ int rlsize; /* Size of runlist buffer. */ - u16 rlpos; /* Current runlist position in units of - runlist_elements. */ + u16 rlpos; /* Current runlist position in units of struct runlist_elements. */ u8 b; /* Current byte offset in buf. */ #ifdef DEBUG /* Make sure attr exists and is non-resident. */ - if (!attr || !attr->non_resident || sle64_to_cpu( - attr->data.non_resident.lowest_vcn) < (VCN)0) { + if (!attr || !attr->non_resident || + le64_to_cpu(attr->data.non_resident.lowest_vcn) < 0) { ntfs_error(vol->sb, "Invalid arguments."); return ERR_PTR(-EINVAL); } #endif /* Start at vcn = lowest_vcn and lcn 0. */ - vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + vcn = le64_to_cpu(attr->data.non_resident.lowest_vcn); lcn = 0; /* Get start of the mapping pairs array. */ - buf = (u8*)attr + le16_to_cpu( - attr->data.non_resident.mapping_pairs_offset); - attr_end = (u8*)attr + le32_to_cpu(attr->length); - if (unlikely(buf < (u8*)attr || buf > attr_end)) { + buf = (u8 *)attr + + le16_to_cpu(attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8 *)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8 *)attr || buf > attr_end)) { ntfs_error(vol->sb, "Corrupt attribute."); return ERR_PTR(-EIO); } - /* If the mapping pairs array is valid but empty, nothing to do. */ - if (!vcn && !*buf) - return old_rl; + /* Current position in runlist array. */ rlpos = 0; /* Allocate first page and set current runlist size to one page. */ @@ -787,8 +725,8 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, * not-mapped and terminator elements. ntfs_malloc_nofs() * operates on whole pages only. */ - if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { - runlist_element *rl2; + if (((rlpos + 3) * sizeof(*rl)) > rlsize) { + struct runlist_element *rl2; rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); if (unlikely(!rl2)) { @@ -816,8 +754,7 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, for (deltaxcn = (s8)buf[b--]; b; b--) deltaxcn = (deltaxcn << 8) + buf[b]; } else { /* The length entry is compulsory. */ - ntfs_error(vol->sb, "Missing length entry in mapping " - "pairs array."); + ntfs_error(vol->sb, "Missing length entry in mapping pairs array."); deltaxcn = (s64)-1; } /* @@ -825,8 +762,7 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, * hence clean-up and return NULL. */ if (unlikely(deltaxcn < 0)) { - ntfs_error(vol->sb, "Invalid length in mapping pairs " - "array."); + ntfs_error(vol->sb, "Invalid length in mapping pairs array."); goto err_out; } /* @@ -846,6 +782,7 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, else { /* Get the lcn change which really can be negative. 
*/ u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); if (buf + b > attr_end) goto io_error; for (deltaxcn = (s8)buf[b--]; b > b2; b--) deltaxcn = (deltaxcn << 8) + buf[b]; /* Change the current lcn to its new value. */ lcn += deltaxcn; #ifdef DEBUG /* * On NTFS 1.2-, apparently can have lcn == -1 to * indicate a hole. But we haven't verified ourselves * whether it is really the lcn or the deltaxcn that is * -1. So if either is found give us a message so we * can investigate it further! */ if (vol->major_ver < 3) { - if (unlikely(deltaxcn == (LCN)-1)) + if (unlikely(deltaxcn == -1)) ntfs_error(vol->sb, "lcn delta == -1"); - if (unlikely(lcn == (LCN)-1)) + if (unlikely(lcn == -1)) ntfs_error(vol->sb, "lcn == -1"); } #endif /* Check lcn is not below -1. */ - if (unlikely(lcn < (LCN)-1)) { - ntfs_error(vol->sb, "Invalid LCN < -1 in " - "mapping pairs array."); + if (unlikely(lcn < -1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in mapping pairs array."); + goto err_out; + } + + /* chkdsk accepts zero-sized runs only for holes */ + if ((lcn != -1) && !rl[rlpos].length) { + ntfs_error(vol->sb, "Invalid zero-sized data run."); goto err_out; } + /* Enter the current lcn into the runlist element. */ rl[rlpos].lcn = lcn; } - /* Get to the next runlist element. */ - rlpos++; + /* Get to the next runlist element, skipping zero-sized holes */ + if (rl[rlpos].length) + rlpos++; /* Increment the buffer position to the next mapping pair. */ buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; } @@ -888,19 +832,17 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, * If there is a highest_vcn specified, it must be equal to the final * vcn in the runlist - 1, or something has gone badly wrong. */ - deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + deltaxcn = le64_to_cpu(attr->data.non_resident.highest_vcn); if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { mpa_err: - ntfs_error(vol->sb, "Corrupt mapping pairs array in " - "non-resident attribute."); + ntfs_error(vol->sb, "Corrupt mapping pairs array in non-resident attribute."); goto err_out; } /* Setup not mapped runlist element if this is the base extent. */ if (!attr->data.non_resident.lowest_vcn) { - VCN max_cluster; + s64 max_cluster; - max_cluster = ((sle64_to_cpu( - attr->data.non_resident.allocated_size) + + max_cluster = ((le64_to_cpu(attr->data.non_resident.allocated_size) + vol->cluster_size - 1) >> vol->cluster_size_bits) - 1; /* @@ -915,24 +857,17 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, * this one. */ if (deltaxcn < max_cluster) { - ntfs_debug("More extents to follow; deltaxcn " - "= 0x%llx, max_cluster = " - "0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); + ntfs_debug("More extents to follow; deltaxcn = 0x%llx, max_cluster = 0x%llx", + deltaxcn, max_cluster); rl[rlpos].vcn = vcn; vcn += rl[rlpos].length = max_cluster - deltaxcn; rl[rlpos].lcn = LCN_RL_NOT_MAPPED; rlpos++; } else if (unlikely(deltaxcn > max_cluster)) { - ntfs_error(vol->sb, "Corrupt attribute. " - "deltaxcn = 0x%llx, " - "max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long) - max_cluster); + ntfs_error(vol->sb, + "Corrupt attribute. deltaxcn = 0x%llx, max_cluster = 0x%llx", + deltaxcn, max_cluster); goto mpa_err; } } @@ -944,18 +879,19 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, rl[rlpos].vcn = vcn; rl[rlpos].length = (s64)0; /* If no existing runlist was specified, we are done. */ - if (!old_rl) { + if (!old_runlist || !old_runlist->rl) { + *new_rl_count = rlpos + 1; ntfs_debug("Mapping pairs array successfully decompressed:"); ntfs_debug_dump_runlist(rl); return rl; } /* Now combine the new and old runlists checking for overlaps. 
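For readers new to the on-disk encoding the loop above parses: every mapping pair opens with a header byte whose low nibble gives the byte count of the run length and whose high nibble the byte count of the signed delta from the previous run's LCN; both fields are little-endian, and seeding the accumulator with the most significant byte cast to s8 is exactly what performs the sign extension. A self-contained sketch of decoding one pair (a hypothetical helper, not part of the patch):

static const u8 *example_decode_pair(const u8 *p, s64 *length, s64 *prev_lcn,
				     s64 *run_lcn)
{
	int len_bytes = *p & 0xf;		/* low nibble: length field size */
	int lcn_bytes = (*p >> 4) & 0xf;	/* high nibble: delta field size */
	s64 v;
	int i;

	if (!len_bytes)
		return NULL;			/* corrupt: the length is compulsory */
	/* Run length: seed with the MSB as s8, shift lower bytes in beneath. */
	for (v = (s8)p[len_bytes], i = len_bytes - 1; i >= 1; i--)
		v = (v << 8) + p[i];
	*length = v;

	if (!lcn_bytes) {
		*run_lcn = LCN_HOLE;		/* sparse run: no LCN stored at all */
	} else {
		for (v = (s8)p[len_bytes + lcn_bytes], i = lcn_bytes - 1; i >= 1; i--)
			v = (v << 8) + p[len_bytes + i];
		*run_lcn = *prev_lcn += v;	/* deltas chain off the previous LCN */
	}
	return p + 1 + len_bytes + lcn_bytes;	/* next pair's header byte */
}

For example, the pair 21 04 34 12 (hex) decodes to a four-cluster run whose LCN is the previous LCN plus 0x1234.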
*/ - old_rl = ntfs_runlists_merge(old_rl, rl); - if (!IS_ERR(old_rl)) - return old_rl; + new_rl = ntfs_runlists_merge(old_runlist, rl, rlpos + 1, new_rl_count); + if (!IS_ERR(new_rl)) + return new_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); - return old_rl; + return new_rl; io_error: ntfs_error(vol->sb, "Corrupt attribute."); err_out: @@ -987,11 +923,10 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, * - This function does not touch the lock, nor does it modify the * runlist. */ -LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +s64 ntfs_rl_vcn_to_lcn(const struct runlist_element *rl, const s64 vcn) { int i; - BUG_ON(vcn < 0); /* * If rl is NULL, assume that we have found an unmapped runlist. The * caller can then attempt to map it and fail appropriately if @@ -1005,8 +940,8 @@ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) return LCN_ENOENT; for (i = 0; likely(rl[i].length); i++) { - if (unlikely(vcn < rl[i+1].vcn)) { - if (likely(rl[i].lcn >= (LCN)0)) + if (vcn < rl[i+1].vcn) { + if (likely(rl[i].lcn >= 0)) return rl[i].lcn + (vcn - rl[i].vcn); return rl[i].lcn; } @@ -1015,14 +950,12 @@ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) * The terminator element is setup to the correct value, i.e. one of * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. */ - if (likely(rl[i].lcn < (LCN)0)) + if (likely(rl[i].lcn < 0)) return rl[i].lcn; /* Just in case... We could replace this with BUG() some day. */ return LCN_ENOENT; } -#ifdef NTFS_RW - /** * ntfs_rl_find_vcn_nolock - find a vcn in a runlist * @rl: runlist to search @@ -1036,9 +969,8 @@ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) * * Locking: The runlist must be locked on entry. */ -runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +struct runlist_element *ntfs_rl_find_vcn_nolock(struct runlist_element *rl, const s64 vcn) { - BUG_ON(vcn < 0); if (unlikely(!rl || vcn < rl[0].vcn)) return NULL; while (likely(rl->length)) { @@ -1087,10 +1019,6 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n) /** * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array - * @vol: ntfs volume (needed for the ntfs version) - * @rl: locked runlist to determine the size of the mapping pairs of - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array * * Walk the locked runlist @rl and calculate the size in bytes of the mapping * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and @@ -1106,30 +1034,28 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n) * If @rl is NULL, just return 1 (for the single terminator byte). * * Return the calculated size in bytes on success. On error, return -errno. - * The following error codes are defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. 
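A small usage sketch of ntfs_rl_vcn_to_lcn() above: negative results are the LCN_* sentinels rather than error pointers, so callers branch on the value itself.

static void example_vcn_to_lcn(void)
{
	static struct runlist_element rl[] = {
		{ .vcn = 0, .lcn = 100,        .length = 4 },
		{ .vcn = 4, .lcn = LCN_HOLE,   .length = 2 },
		{ .vcn = 6, .lcn = LCN_ENOENT, .length = 0 },	/* terminator */
	};

	WARN_ON(ntfs_rl_vcn_to_lcn(rl, 2) != 102);		/* 100 + (2 - 0) */
	WARN_ON(ntfs_rl_vcn_to_lcn(rl, 5) != LCN_HOLE);		/* sparse run */
	WARN_ON(ntfs_rl_vcn_to_lcn(rl, 9) != LCN_ENOENT);	/* past the end */
	WARN_ON(ntfs_rl_vcn_to_lcn(NULL, 0) != LCN_RL_NOT_MAPPED); /* map first */
}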
*/ -int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN first_vcn, - const VCN last_vcn) +int ntfs_get_size_for_mapping_pairs(const struct ntfs_volume *vol, + const struct runlist_element *rl, const s64 first_vcn, + const s64 last_vcn, int max_mp_size) { - LCN prev_lcn; + s64 prev_lcn; int rls; bool the_end = false; - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (first_vcn < 0 || last_vcn < -1) + return -EINVAL; + + if (last_vcn >= 0 && first_vcn > last_vcn) + return -EINVAL; + if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); + WARN_ON(first_vcn); + WARN_ON(last_vcn > 0); return 1; } + if (max_mp_size <= 0) + max_mp_size = INT_MAX; /* Skip to runlist element containing @first_vcn. */ while (rl->length && first_vcn >= rl[1].vcn) rl++; @@ -1152,6 +1078,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, */ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) length = s1 - rl->vcn; the_end = true; @@ -1188,6 +1115,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, */ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) length = s1 - rl->vcn; the_end = true; @@ -1207,6 +1135,9 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, prev_lcn); prev_lcn = rl->lcn; } + + if (rls > max_mp_size) + break; } return rls; err_out: @@ -1270,13 +1201,6 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, /** * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist - * @vol: ntfs volume (needed for the ntfs version) - * @dst: destination buffer to which to write the mapping pairs array - * @dst_len: size of destination buffer @dst in bytes - * @rl: locked runlist for which to build the mapping pairs array - * @first_vcn: first vcn which to include in the mapping pairs array - * @last_vcn: last vcn which to include in the mapping pairs array - * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC * * Create the mapping pairs array from the locked runlist @rl, starting at vcn * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. @@ -1295,34 +1219,26 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, * as partial success, in that a new attribute extent needs to be created or * the next extent has to be used and the mapping pairs build has to be * continued with @first_vcn set to *@stop_vcn. - * - * Return 0 on success and -errno on error. The following error codes are - * defined: - * -EINVAL - Run list contains unmapped elements. Make sure to only pass - * fully mapped runlists to this function. - * -EIO - The runlist is corrupt. - * -ENOSPC - The destination buffer is too small. - * - * Locking: @rl must be locked on entry (either for reading or writing), it - * remains locked throughout, and is left locked upon return. 
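The size computed above charges each run one header byte plus ntfs_get_nr_significant_bytes() for the length and for the LCN delta, where "significant bytes" is the smallest width whose little-endian sign extension reproduces the value (one byte minimum). A sketch of that rule as a hypothetical rewrite, not the helper's actual body:

static int example_nr_significant_bytes(s64 n)
{
	int i = 1;

	/* Grow until the bytes we would drop are pure sign extension. */
	while (n != (s64)(s8)n) {
		n >>= 8;	/* arithmetic shift preserves the sign */
		i++;
	}
	return i;
}

So 0x50 needs one byte, 0x80 needs two (a lone 0x80 byte would sign-extend to -128), and -2 fits in the single byte 0xfe.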
*/ -int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, - const int dst_len, const runlist_element *rl, - const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +int ntfs_mapping_pairs_build(const struct ntfs_volume *vol, s8 *dst, + const int dst_len, const struct runlist_element *rl, + const s64 first_vcn, const s64 last_vcn, s64 *const stop_vcn, + struct runlist_element **stop_rl, unsigned int *de_cluster_count) { - LCN prev_lcn; + s64 prev_lcn; s8 *dst_max, *dst_next; int err = -ENOSPC; bool the_end = false; s8 len_len, lcn_len; + unsigned int de_cnt = 0; + + if (first_vcn < 0 || last_vcn < -1 || dst_len < 1) + return -EINVAL; + if (last_vcn >= 0 && first_vcn > last_vcn) + return -EINVAL; - BUG_ON(first_vcn < 0); - BUG_ON(last_vcn < -1); - BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); - BUG_ON(dst_len < 1); if (!rl) { - BUG_ON(first_vcn); - BUG_ON(last_vcn > 0); + WARN_ON(first_vcn || last_vcn > 0); if (stop_vcn) *stop_vcn = 0; /* Terminator byte. */ @@ -1354,6 +1270,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, */ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) length = s1 - rl->vcn; the_end = true; @@ -1368,10 +1285,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, * If the logical cluster number (lcn) denotes a hole and we * are on NTFS 3.0+, we don't store it at all, i.e. we need * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) + * change. */ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { prev_lcn = rl->lcn; @@ -1406,6 +1320,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, */ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) length = s1 - rl->vcn; the_end = true; @@ -1419,10 +1334,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, * If the logical cluster number (lcn) denotes a hole and we * are on NTFS 3.0+, we don't store it at all, i.e. we need * zero space. On earlier NTFS versions we just write the lcn - * change. FIXME: Do we need to write the lcn change or just - * the lcn in that case? Not sure as I have never seen this - * case on NT4. - We assume that we just need to write the lcn - * change until someone tells us otherwise... (AIA) + * change. */ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { /* Write change in lcn. */ @@ -1431,8 +1343,11 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, if (unlikely(lcn_len < 0)) goto size_err; prev_lcn = rl->lcn; - } else + } else { + if (rl->lcn == LCN_DELALLOC) + de_cnt += rl->length; lcn_len = 0; + } dst_next = dst + len_len + lcn_len + 1; if (unlikely(dst_next > dst_max)) goto size_err; @@ -1442,11 +1357,15 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, dst = dst_next; } /* Success. */ + if (de_cluster_count) + *de_cluster_count = de_cnt; err = 0; size_err: /* Set stop vcn. */ if (stop_vcn) *stop_vcn = rl->vcn; + if (stop_rl) + *stop_rl = (struct runlist_element *)rl; /* Add terminator byte. */ *dst = 0; return err; @@ -1479,45 +1398,22 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, * the caller has mapped any elements that need to be mapped already. * * Return 0 on success and -errno on error. 
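On disk, each mapping pair is then one header byte, low nibble = byte count of the length field, high nibble = byte count of the (delta) LCN field, followed by those two little-endian values. A self-contained encoder sketch for a single pair; illustrative only, since the function above additionally handles holes, @first_vcn/@last_vcn clipping and buffer limits:

#include <stdint.h>
#include <stdio.h>

/* Store the low @len bytes of @v little-endian, as the on-disk format does. */
static void put_sle(uint8_t *dst, int64_t v, int len)
{
	int i;

	for (i = 0; i < len; i++)
		dst[i] = (uint8_t)(v >> (8 * i));
}

/* Encode one mapping pair; returns the number of bytes written. */
static int encode_pair(uint8_t *dst, int64_t length, int64_t delta_lcn,
		       int len_len, int lcn_len)
{
	dst[0] = (uint8_t)(len_len | (lcn_len << 4));
	put_sle(dst + 1, length, len_len);
	put_sle(dst + 1 + len_len, delta_lcn, lcn_len);
	return 1 + len_len + lcn_len;
}

int main(void)
{
	uint8_t buf[16];
	/* Run of 0x30 clusters whose LCN is 300 below the previous run. */
	int i, n = encode_pair(buf, 0x30, -300, 1, 2);

	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);	/* 21 30 d4 fe */
	printf("\n");
	return 0;
}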
- * - * Locking: The caller must hold @runlist->lock for writing. */ -int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, +int ntfs_rl_truncate_nolock(const struct ntfs_volume *vol, struct runlist *const runlist, const s64 new_length) { - runlist_element *rl; + struct runlist_element *rl; int old_size; ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); - BUG_ON(!runlist); - BUG_ON(new_length < 0); + + if (!runlist || new_length < 0) + return -EINVAL; + rl = runlist->rl; - if (!new_length) { - ntfs_debug("Freeing runlist."); - runlist->rl = NULL; - if (rl) - ntfs_free(rl); - return 0; - } - if (unlikely(!rl)) { - /* - * Create a runlist consisting of a sparse runlist element of - * length @new_length followed by a terminator runlist element. - */ - rl = ntfs_malloc_nofs(PAGE_SIZE); - if (unlikely(!rl)) { - ntfs_error(vol->sb, "Not enough memory to allocate " - "runlist element buffer."); - return -ENOMEM; - } - runlist->rl = rl; - rl[1].length = rl->vcn = 0; - rl->lcn = LCN_HOLE; - rl[1].vcn = rl->length = new_length; - rl[1].lcn = LCN_ENOENT; - return 0; - } - BUG_ON(new_length < rl->vcn); + if (new_length < rl->vcn) + return -EINVAL; + /* Find @new_length in the runlist. */ while (likely(rl->length && new_length >= rl[1].vcn)) rl++; @@ -1526,7 +1422,7 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, * If at the end of the runlist we need to expand it. */ if (rl->length) { - runlist_element *trl; + struct runlist_element *trl; bool is_end; ntfs_debug("Shrinking runlist."); @@ -1550,16 +1446,15 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, rl->length = 0; } rl->lcn = LCN_ENOENT; + runlist->count = rl - runlist->rl + 1; /* Reallocate memory if necessary. */ if (!is_end) { int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); + ntfs_warning(vol->sb, + "Failed to shrink runlist buffer. This just wastes a bit of memory temporarily so we ignore it and return success."); else runlist->rl = rl; } @@ -1579,8 +1474,7 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, rl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); if (IS_ERR(rl)) { - ntfs_error(vol->sb, "Failed to expand runlist " - "buffer, aborting."); + ntfs_error(vol->sb, "Failed to expand runlist buffer, aborting."); return PTR_ERR(rl); } runlist->rl = rl; @@ -1595,6 +1489,7 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, /* Add a new terminator runlist element. */ rl++; rl->length = 0; + runlist->count = old_size + 1; } rl->vcn = new_length; rl->lcn = LCN_ENOENT; @@ -1607,287 +1502,482 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, } /** - * ntfs_rl_punch_nolock - punch a hole into a runlist - * @vol: ntfs volume (needed for error output) - * @runlist: runlist to punch a hole into - * @start: starting VCN of the hole to be created - * @length: size of the hole to be created in units of clusters - * - * Punch a hole into the runlist @runlist starting at VCN @start and of size - * @length clusters. - * - * Return 0 on success and -errno on error, in which case @runlist has not been - * modified. - * - * If @start and/or @start + @length are outside the runlist return error code - * -ENOENT. 
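A worked illustration of ntfs_rl_truncate_nolock() above, in the same {vcn, lcn, length} notation used earlier (values traced by hand for the shrink and extend cases):

/*
 * Before:  { 0, 100, 10 } { 10, HOLE, 5 } { 15, 50, 5 } { 20, ENOENT, 0 }
 *
 * Truncate to new_length 12: vcn 12 falls in the hole, whose length is
 * cut to 12 - 10 = 2, and the terminator is rewritten right behind it:
 *
 * After:   { 0, 100, 10 } { 10, HOLE, 2 } { 12, ENOENT, 0 }
 *          runlist->count = 3
 *
 * Truncating the original runlist up to new_length 25 instead: the old
 * terminator's slot becomes a sparse run and a new terminator follows:
 *
 * After:   ... { 15, 50, 5 } { 20, HOLE, 5 } { 25, ENOENT, 0 }
 */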
+ * ntfs_rl_sparse - check whether runlist have sparse regions or not. + * @rl: runlist to check * - * If the runlist contains unmapped or error elements between @start and @start - * + @length return error code -EINVAL. + * Return 1 if have, 0 if not, -errno on error. + */ +int ntfs_rl_sparse(struct runlist_element *rl) +{ + struct runlist_element *rlc; + + if (!rl) + return -EINVAL; + + for (rlc = rl; rlc->length; rlc++) + if (rlc->lcn < 0) { + if (rlc->lcn != LCN_HOLE && rlc->lcn != LCN_DELALLOC) { + pr_err("%s: bad runlist", __func__); + return -EINVAL; + } + return 1; + } + return 0; +} + +/** + * ntfs_rl_get_compressed_size - calculate length of non sparse regions + * @vol: ntfs volume (need for cluster size) + * @rl: runlist to calculate for * - * Locking: The caller must hold @runlist->lock for writing. + * Return compressed size or -errno on error. */ -int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, - const VCN start, const s64 length) +s64 ntfs_rl_get_compressed_size(struct ntfs_volume *vol, struct runlist_element *rl) { - const VCN end = start + length; - s64 delta; - runlist_element *rl, *rl_end, *rl_real_end, *trl; - int old_size; - bool lcn_fixup = false; - - ntfs_debug("Entering for start 0x%llx, length 0x%llx.", - (long long)start, (long long)length); - BUG_ON(!runlist); - BUG_ON(start < 0); - BUG_ON(length < 0); - BUG_ON(end < 0); - rl = runlist->rl; - if (unlikely(!rl)) { - if (likely(!start && !length)) - return 0; - return -ENOENT; + struct runlist_element *rlc; + s64 ret = 0; + + if (!rl) + return -EINVAL; + + for (rlc = rl; rlc->length; rlc++) { + if (rlc->lcn < 0) { + if (rlc->lcn != LCN_HOLE && rlc->lcn != LCN_DELALLOC) { + ntfs_error(vol->sb, "%s: bad runlist, rlc->lcn : %lld", + __func__, rlc->lcn); + return -EINVAL; + } + } else + ret += rlc->length; } - /* Find @start in the runlist. */ - while (likely(rl->length && start >= rl[1].vcn)) - rl++; - rl_end = rl; - /* Find @end in the runlist. */ - while (likely(rl_end->length && end >= rl_end[1].vcn)) { - /* Verify there are no unmapped or error elements. 
*/ - if (unlikely(rl_end->lcn < LCN_HOLE)) - return -EINVAL; - rl_end++; + return NTFS_CLU_TO_B(vol, ret); +} + +static inline bool ntfs_rle_lcn_contiguous(struct runlist_element *left_rle, + struct runlist_element *right_rle) +{ + if (left_rle->lcn > LCN_HOLE && + left_rle->lcn + left_rle->length == right_rle->lcn) + return true; + else if (left_rle->lcn == LCN_HOLE && right_rle->lcn == LCN_HOLE) + return true; + else + return false; +} + +static inline bool ntfs_rle_contain(struct runlist_element *rle, s64 vcn) +{ + if (rle->length > 0 && + vcn >= rle->vcn && vcn < rle->vcn + rle->length) + return true; + else + return false; +} + +struct runlist_element *ntfs_rl_insert_range(struct runlist_element *dst_rl, int dst_cnt, + struct runlist_element *src_rl, int src_cnt, + size_t *new_rl_cnt) +{ + struct runlist_element *i_rl, *new_rl, *src_rl_origin = src_rl; + struct runlist_element dst_rl_split; + s64 start_vcn = src_rl[0].vcn; + int new_1st_cnt, new_2nd_cnt, new_3rd_cnt, new_cnt; + + if (!dst_rl || !src_rl || !new_rl_cnt) + return ERR_PTR(-EINVAL); + if (dst_cnt <= 0 || src_cnt <= 0) + return ERR_PTR(-EINVAL); + if (!(dst_rl[dst_cnt - 1].lcn == LCN_ENOENT && + dst_rl[dst_cnt - 1].length == 0) || + src_rl[src_cnt - 1].lcn < LCN_HOLE) + return ERR_PTR(-EINVAL); + + start_vcn = src_rl[0].vcn; + + i_rl = ntfs_rl_find_vcn_nolock(dst_rl, start_vcn); + if (!i_rl || + (i_rl->lcn == LCN_ENOENT && i_rl->vcn != start_vcn) || + (i_rl->lcn != LCN_ENOENT && !ntfs_rle_contain(i_rl, start_vcn))) + return ERR_PTR(-EINVAL); + + new_1st_cnt = (int)(i_rl - dst_rl); + if (new_1st_cnt > dst_cnt) + return ERR_PTR(-EINVAL); + new_3rd_cnt = dst_cnt - new_1st_cnt; + if (new_3rd_cnt < 1) + return ERR_PTR(-EINVAL); + + if (i_rl[0].vcn != start_vcn) { + if (i_rl[0].lcn == LCN_HOLE && src_rl[0].lcn == LCN_HOLE) + goto merge_src_rle; + + /* split @i_rl[0] and create @dst_rl_split */ + dst_rl_split.vcn = i_rl[0].vcn; + dst_rl_split.length = start_vcn - i_rl[0].vcn; + dst_rl_split.lcn = i_rl[0].lcn; + + i_rl[0].vcn = start_vcn; + i_rl[0].length -= dst_rl_split.length; + i_rl[0].lcn += dst_rl_split.length; + } else { + struct runlist_element *dst_rle, *src_rle; +merge_src_rle: + + /* not split @i_rl[0] */ + dst_rl_split.lcn = LCN_ENOENT; + + /* merge @src_rl's first run and @i_rl[0]'s left run if possible */ + dst_rle = &dst_rl[new_1st_cnt - 1]; + src_rle = &src_rl[0]; + if (new_1st_cnt > 0 && ntfs_rle_lcn_contiguous(dst_rle, src_rle)) { + WARN_ON(dst_rle->vcn + dst_rle->length != src_rle->vcn); + dst_rle->length += src_rle->length; + src_rl++; + src_cnt--; + } else { + /* merge @src_rl's last run and @i_rl[0]'s right if possible */ + dst_rle = &dst_rl[new_1st_cnt]; + src_rle = &src_rl[src_cnt - 1]; + + if (ntfs_rle_lcn_contiguous(dst_rle, src_rle)) { + dst_rle->length += src_rle->length; + src_cnt--; + } + } } - /* Check the last element. */ - if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) - return -EINVAL; - /* This covers @start being out of bounds, too. */ - if (!rl_end->length && end > rl_end->vcn) - return -ENOENT; - if (!length) - return 0; - if (!rl->length) - return -ENOENT; - rl_real_end = rl_end; - /* Determine the runlist size. */ - while (likely(rl_real_end->length)) - rl_real_end++; - old_size = rl_real_end - runlist->rl + 1; - /* If @start is in a hole simply extend the hole. */ - if (rl->lcn == LCN_HOLE) { - /* - * If both @start and @end are in the same sparse run, we are - * done. 
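Both new helpers above are single passes over the runs. On the sample runlist from earlier (hedged: NTFS_CLU_TO_B is taken to be the volume's clusters-to-bytes conversion, a shift by cluster_size_bits):

/*
 * rl = { 0, 100, 10 } { 10, HOLE, 6 } { 16, 50, 4 } { 20, ENOENT, 0 }
 *
 * ntfs_rl_sparse(rl)                   -> 1 (the hole at vcn 10)
 * ntfs_rl_get_compressed_size(vol, rl) -> (10 + 4) << cluster_size_bits,
 *     i.e. only the allocated runs count: 14 * 4096 = 57344 bytes on a
 *     volume with 4 KiB clusters.
 */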
+ + new_2nd_cnt = src_cnt; + new_cnt = new_1st_cnt + new_2nd_cnt + new_3rd_cnt; + new_cnt += dst_rl_split.lcn >= LCN_HOLE ? 1 : 0; + new_rl = ntfs_malloc_nofs(new_cnt * sizeof(*new_rl)); + if (!new_rl) + return ERR_PTR(-ENOMEM); + + /* Copy the @dst_rl's first half to @new_rl */ + ntfs_rl_mc(new_rl, 0, dst_rl, 0, new_1st_cnt); + if (dst_rl_split.lcn >= LCN_HOLE) { + ntfs_rl_mc(new_rl, new_1st_cnt, &dst_rl_split, 0, 1); + new_1st_cnt++; + } + /* Copy the @src_rl to @new_rl */ + ntfs_rl_mc(new_rl, new_1st_cnt, src_rl, 0, new_2nd_cnt); + /* Copy the @dst_rl's second half to @new_rl */ + if (new_3rd_cnt >= 1) { + struct runlist_element *rl, *rl_3rd; + int dst_1st_cnt = dst_rl_split.lcn >= LCN_HOLE ? + new_1st_cnt - 1 : new_1st_cnt; + + ntfs_rl_mc(new_rl, new_1st_cnt + new_2nd_cnt, + dst_rl, dst_1st_cnt, new_3rd_cnt); + /* Update vcn of the @dst_rl's second half runs to reflect + * appended @src_rl. */ - if (end <= rl[1].vcn) { - ntfs_debug("Done (requested hole is already sparse)."); - return 0; - } -extend_hole: - /* Extend the hole. */ - rl->length = end - rl->vcn; - /* If @end is in a hole, merge it with the current one. */ - if (rl_end->lcn == LCN_HOLE) { - rl_end++; - rl->length = rl_end->vcn - rl->vcn; - } - /* We have done the hole. Now deal with the remaining tail. */ - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Adjust the beginning of the tail if necessary. */ - if (end > rl->vcn) { - delta = end - rl->vcn; - rl->vcn = end; - rl->length -= delta; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0) - rl->lcn += delta; - } -shrink_allocation: - /* Reallocate memory if the allocation changed. */ - if (rl < rl_end) { - rl = ntfs_rl_realloc(runlist->rl, old_size, - old_size - (rl_end - rl)); - if (IS_ERR(rl)) - ntfs_warning(vol->sb, "Failed to shrink " - "runlist buffer. This just " - "wastes a bit of memory " - "temporarily so we ignore it " - "and return success."); - else - runlist->rl = rl; + if (new_1st_cnt + new_2nd_cnt == 0) { + rl_3rd = &new_rl[new_1st_cnt + new_2nd_cnt + 1]; + rl = &new_rl[new_1st_cnt + new_2nd_cnt]; + } else { + rl_3rd = &new_rl[new_1st_cnt + new_2nd_cnt]; + rl = &new_rl[new_1st_cnt + new_2nd_cnt - 1]; } - ntfs_debug("Done (extend hole)."); - return 0; + do { + rl_3rd->vcn = rl->vcn + rl->length; + if (rl_3rd->length <= 0) + break; + rl = rl_3rd; + rl_3rd++; + } while (1); } - /* - * If @start is at the beginning of a run things are easier as there is - * no need to split the first run. - */ - if (start == rl->vcn) { - /* - * @start is at the beginning of a run. - * - * If the previous run is sparse, extend its hole. - * - * If @end is not in the same run, switch the run to be sparse - * and extend the newly created hole. - * - * Thus both of these cases reduce the problem to the above - * case of "@start is in a hole". 
+ *new_rl_cnt = new_1st_cnt + new_2nd_cnt + new_3rd_cnt; + + ntfs_free(dst_rl); + ntfs_free(src_rl_origin); + return new_rl; +} + +struct runlist_element *ntfs_rl_punch_hole(struct runlist_element *dst_rl, int dst_cnt, + s64 start_vcn, s64 len, + struct runlist_element **punch_rl, + size_t *new_rl_cnt) +{ + struct runlist_element *s_rl, *e_rl, *new_rl, *dst_3rd_rl, hole_rl[1]; + s64 end_vcn; + int new_1st_cnt, dst_3rd_cnt, new_cnt, punch_cnt, merge_cnt; + bool begin_split, end_split, one_split_3; + + if (dst_cnt < 2 || + !(dst_rl[dst_cnt - 1].lcn == LCN_ENOENT && + dst_rl[dst_cnt - 1].length == 0)) + return ERR_PTR(-EINVAL); + + end_vcn = min(start_vcn + len - 1, + dst_rl[dst_cnt - 2].vcn + dst_rl[dst_cnt - 2].length - 1); + + s_rl = ntfs_rl_find_vcn_nolock(dst_rl, start_vcn); + if (!s_rl || + s_rl->lcn <= LCN_ENOENT || + !ntfs_rle_contain(s_rl, start_vcn)) + return ERR_PTR(-EINVAL); + + begin_split = s_rl->vcn != start_vcn ? true : false; + + e_rl = ntfs_rl_find_vcn_nolock(dst_rl, end_vcn); + if (!e_rl || + e_rl->lcn <= LCN_ENOENT || + !ntfs_rle_contain(e_rl, end_vcn)) + return ERR_PTR(-EINVAL); + + end_split = e_rl->vcn + e_rl->length - 1 != end_vcn ? true : false; + + /* @s_rl has to be split into left, punched hole, and right */ + one_split_3 = e_rl == s_rl && begin_split && end_split ? true : false; + + punch_cnt = (int)(e_rl - s_rl) + 1; + + *punch_rl = ntfs_malloc_nofs((punch_cnt + 1) * sizeof(struct runlist_element)); + if (!*punch_rl) + return ERR_PTR(-ENOMEM); + + new_cnt = dst_cnt - (int)(e_rl - s_rl + 1) + 3; + new_rl = ntfs_malloc_nofs(new_cnt * sizeof(struct runlist_element)); + if (!new_rl) { + ntfs_free(*punch_rl); + *punch_rl = NULL; + return ERR_PTR(-ENOMEM); + } + + new_1st_cnt = (int)(s_rl - dst_rl) + 1; + ntfs_rl_mc(*punch_rl, 0, dst_rl, new_1st_cnt - 1, punch_cnt); + + (*punch_rl)[punch_cnt].lcn = LCN_ENOENT; + (*punch_rl)[punch_cnt].length = 0; + + if (!begin_split) + new_1st_cnt--; + dst_3rd_rl = e_rl; + dst_3rd_cnt = (int)(&dst_rl[dst_cnt - 1] - e_rl) + 1; + if (!end_split) { + dst_3rd_rl++; + dst_3rd_cnt--; + } + + /* Copy the 1st part of @dst_rl into @new_rl */ + ntfs_rl_mc(new_rl, 0, dst_rl, 0, new_1st_cnt); + if (begin_split) { + /* the @e_rl has to be splited and copied into the last of @new_rl + * and the first of @punch_rl */ - if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { - rl--; - goto extend_hole; - } - if (end >= rl[1].vcn) { - rl->lcn = LCN_HOLE; - goto extend_hole; - } - /* - * The final case is when @end is in the same run as @start. - * For this need to split the run into two. One run for the - * sparse region between the beginning of the old run, i.e. - * @start, and @end and one for the remaining non-sparse - * region, i.e. between @end and the end of the old run. 
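To make ntfs_rl_insert_range() above concrete, here is a hand-traced call that splices one 5-cluster source run in at vcn 12, in the middle of a single 20-cluster destination run:

/*
 * dst (dst_cnt = 2):  { 0, 100, 20 } { 20, ENOENT, 0 }
 * src (src_cnt = 1):  { 12, 500, 5 }
 *
 * vcn 12 splits the destination run, the source list is spliced in, and
 * the tail vcns are renumbered past the inserted clusters:
 *
 * new_rl: { 0, 100, 12 } { 12, 500, 5 } { 17, 112, 8 } { 25, ENOENT, 0 }
 *         (*new_rl_cnt = 4; dst_rl and src_rl have been freed)
 */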
+ s64 first_cnt = start_vcn - dst_rl[new_1st_cnt - 1].vcn; + + if (new_1st_cnt) + new_rl[new_1st_cnt - 1].length = first_cnt; + + (*punch_rl)[0].vcn = start_vcn; + (*punch_rl)[0].length -= first_cnt; + if ((*punch_rl)[0].lcn > LCN_HOLE) + (*punch_rl)[0].lcn += first_cnt; + } + + /* Copy a hole into @new_rl */ + hole_rl[0].vcn = start_vcn; + hole_rl[0].length = (s64)len; + hole_rl[0].lcn = LCN_HOLE; + ntfs_rl_mc(new_rl, new_1st_cnt, hole_rl, 0, 1); + + /* Copy the 3rd part of @dst_rl into @new_rl */ + ntfs_rl_mc(new_rl, new_1st_cnt + 1, dst_3rd_rl, 0, dst_3rd_cnt); + if (end_split) { + /* the @e_rl has to be splited and copied into the first of + * @new_rl and the last of @punch_rl */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } -split_end: - /* Shift all the runs up by one. */ - memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the two split runs. */ - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - rl->vcn += length; - /* Only adjust the lcn if it is real. */ - if (rl->lcn >= 0 || lcn_fixup) - rl->lcn += length; - rl->length -= length; - ntfs_debug("Done (split one)."); - return 0; + s64 first_cnt = end_vcn - dst_3rd_rl[0].vcn + 1; + + new_rl[new_1st_cnt + 1].vcn = end_vcn + 1; + new_rl[new_1st_cnt + 1].length -= first_cnt; + if (new_rl[new_1st_cnt + 1].lcn > LCN_HOLE) + new_rl[new_1st_cnt + 1].lcn += first_cnt; + + if (one_split_3) + (*punch_rl)[punch_cnt - 1].length -= + new_rl[new_1st_cnt + 1].length; + else + (*punch_rl)[punch_cnt - 1].length = first_cnt; } - /* - * @start is neither in a hole nor at the beginning of a run. - * - * If @end is in a hole, things are easier as simply truncating the run - * @start is in to end at @start - 1, deleting all runs after that up - * to @end, and finally extending the beginning of the run @end is in - * to be @start is all that is needed. + + /* Merge left and hole, or hole and right in @new_rl, if left or right + * consists of holes. */ - if (rl_end->lcn == LCN_HOLE) { - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* Cut out all runlist elements up to @end. */ - if (rl < rl_end) - memmove(rl, rl_end, (rl_real_end - rl_end + 1) * - sizeof(*rl)); - /* Extend the beginning of the run @end is in to be @start. */ - rl->vcn = start; - rl->length = rl[1].vcn - start; - goto shrink_allocation; + merge_cnt = 0; + if (new_1st_cnt > 0 && new_rl[new_1st_cnt - 1].lcn == LCN_HOLE) { + /* Merge right and hole */ + s_rl = &new_rl[new_1st_cnt - 1]; + s_rl->length += s_rl[1].length; + merge_cnt = 1; + /* Merge left and right */ + if (new_1st_cnt + 1 < new_cnt && + new_rl[new_1st_cnt + 1].lcn == LCN_HOLE) { + s_rl->length += s_rl[2].length; + merge_cnt++; + } + } else if (new_1st_cnt + 1 < new_cnt && + new_rl[new_1st_cnt + 1].lcn == LCN_HOLE) { + /* Merge left and hole */ + s_rl = &new_rl[new_1st_cnt]; + s_rl->length += s_rl[1].length; + merge_cnt = 1; } - /* - * If @end is not in a hole there are still two cases to distinguish. - * Either @end is or is not in the same run as @start. - * - * The second case is easier as it can be reduced to an already solved - * problem by truncating the run @start is in to end at @start - 1. 
- * Then, if @end is in the next run need to split the run into a sparse - * run followed by a non-sparse run (already covered above) and if @end - * is not in the next run switching it to be sparse, again reduces the - * problem to the already covered case of "@start is in a hole". - */ - if (end >= rl[1].vcn) { - /* - * If @end is not in the next run, reduce the problem to the - * case of "@start is in a hole". + if (merge_cnt) { + struct runlist_element *d_rl, *src_rl; + + d_rl = s_rl + 1; + src_rl = s_rl + 1 + merge_cnt; + ntfs_rl_mm(new_rl, (int)(d_rl - new_rl), (int)(src_rl - new_rl), + (int)(&new_rl[new_cnt - 1] - src_rl) + 1); + } + + (*punch_rl)[punch_cnt].vcn = (*punch_rl)[punch_cnt - 1].vcn + + (*punch_rl)[punch_cnt - 1].length; + + /* punch_cnt elements of dst are replaced with one hole */ + *new_rl_cnt = dst_cnt - (punch_cnt - (int)begin_split - (int)end_split) + + 1 - merge_cnt; + ntfs_free(dst_rl); + return new_rl; +} + +struct runlist_element *ntfs_rl_collapse_range(struct runlist_element *dst_rl, int dst_cnt, + s64 start_vcn, s64 len, + struct runlist_element **punch_rl, + size_t *new_rl_cnt) +{ + struct runlist_element *s_rl, *e_rl, *new_rl, *dst_3rd_rl; + s64 end_vcn; + int new_1st_cnt, dst_3rd_cnt, new_cnt, punch_cnt, merge_cnt, i; + bool begin_split, end_split, one_split_3; + + if (dst_cnt < 2 || + !(dst_rl[dst_cnt - 1].lcn == LCN_ENOENT && + dst_rl[dst_cnt - 1].length == 0)) + return ERR_PTR(-EINVAL); + + end_vcn = min(start_vcn + len - 1, + dst_rl[dst_cnt - 1].vcn - 1); + + s_rl = ntfs_rl_find_vcn_nolock(dst_rl, start_vcn); + if (!s_rl || + s_rl->lcn <= LCN_ENOENT || + !ntfs_rle_contain(s_rl, start_vcn)) + return ERR_PTR(-EINVAL); + + begin_split = s_rl->vcn != start_vcn ? true : false; + + e_rl = ntfs_rl_find_vcn_nolock(dst_rl, end_vcn); + if (!e_rl || + e_rl->lcn <= LCN_ENOENT || + !ntfs_rle_contain(e_rl, end_vcn)) + return ERR_PTR(-EINVAL); + + end_split = e_rl->vcn + e_rl->length - 1 != end_vcn ? true : false; + + /* @s_rl has to be split into left, collapsed, and right */ + one_split_3 = e_rl == s_rl && begin_split && end_split ? true : false; + + punch_cnt = (int)(e_rl - s_rl) + 1; + *punch_rl = ntfs_malloc_nofs((punch_cnt + 1) * sizeof(struct runlist_element)); + if (!*punch_rl) + return ERR_PTR(-ENOMEM); + + new_cnt = dst_cnt - (int)(e_rl - s_rl + 1) + 3; + new_rl = ntfs_malloc_nofs(new_cnt * sizeof(struct runlist_element)); + if (!new_rl) { + ntfs_free(*punch_rl); + *punch_rl = NULL; + return ERR_PTR(-ENOMEM); + } + + new_1st_cnt = (int)(s_rl - dst_rl) + 1; + ntfs_rl_mc(*punch_rl, 0, dst_rl, new_1st_cnt - 1, punch_cnt); + (*punch_rl)[punch_cnt].lcn = LCN_ENOENT; + (*punch_rl)[punch_cnt].length = 0; + + if (!begin_split) + new_1st_cnt--; + dst_3rd_rl = e_rl; + dst_3rd_cnt = (int)(&dst_rl[dst_cnt - 1] - e_rl) + 1; + if (!end_split) { + dst_3rd_rl++; + dst_3rd_cnt--; + } + + /* Copy the 1st part of @dst_rl into @new_rl */ + ntfs_rl_mc(new_rl, 0, dst_rl, 0, new_1st_cnt); + if (begin_split) { + /* the @e_rl has to be splited and copied into the last of @new_rl + * and the first of @punch_rl */ - if (rl[1].length && end >= rl[2].vcn) { - /* Truncate the run containing @start. 
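And a matching trace of ntfs_rl_punch_hole() above, punching 5 clusters at vcn 8 out of a single 20-cluster run; note that, unlike the collapse case below, vcns after the hole keep their values:

/*
 * dst (dst_cnt = 2): { 0, 100, 20 } { 20, ENOENT, 0 }
 * punch start_vcn 8, len 5:
 *
 * new_rl:   { 0, 100, 8 } { 8, HOLE, 5 } { 13, 113, 7 } { 20, ENOENT, 0 }
 *           (*new_rl_cnt = 4)
 * punch_rl: { 8, 108, 5 } { 13, ENOENT, 0 }
 *           (the carved-out run, handed back so the caller can release
 *           the underlying clusters)
 */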
*/ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - goto extend_hole; - } - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); - if (IS_ERR(trl)) - goto enomem_out; - old_size++; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; - } - /* Truncate the run containing @start. */ - rl->length = start - rl->vcn; - rl++; - /* - * @end is in the next run, reduce the problem to the case - * where "@start is at the beginning of a run and @end is in - * the same run as @start". + s64 first_cnt = start_vcn - dst_rl[new_1st_cnt - 1].vcn; + + new_rl[new_1st_cnt - 1].length = first_cnt; + + (*punch_rl)[0].vcn = start_vcn; + (*punch_rl)[0].length -= first_cnt; + if ((*punch_rl)[0].lcn > LCN_HOLE) + (*punch_rl)[0].lcn += first_cnt; + } + + /* Copy the 3rd part of @dst_rl into @new_rl */ + ntfs_rl_mc(new_rl, new_1st_cnt, dst_3rd_rl, 0, dst_3rd_cnt); + if (end_split) { + /* the @e_rl has to be splited and copied into the first of + * @new_rl and the last of @punch_rl */ - delta = rl->vcn - start; - rl->vcn = start; - if (rl->lcn >= 0) { - rl->lcn -= delta; - /* Need this in case the lcn just became negative. */ - lcn_fixup = true; - } - rl->length += delta; - goto split_end; + s64 first_cnt = end_vcn - dst_3rd_rl[0].vcn + 1; + + new_rl[new_1st_cnt].vcn = end_vcn + 1; + new_rl[new_1st_cnt].length -= first_cnt; + if (new_rl[new_1st_cnt].lcn > LCN_HOLE) + new_rl[new_1st_cnt].lcn += first_cnt; + + if (one_split_3) + (*punch_rl)[punch_cnt - 1].length -= + new_rl[new_1st_cnt].length; + else + (*punch_rl)[punch_cnt - 1].length = first_cnt; } - /* - * The first case from above, i.e. @end is in the same run as @start. - * We need to split the run into three. One run for the non-sparse - * region between the beginning of the old run and @start, one for the - * sparse region between @start and @end, and one for the remaining - * non-sparse region, i.e. between @end and the end of the old run. + + /* Adjust vcn */ + if (new_1st_cnt == 0) + new_rl[new_1st_cnt].vcn = 0; + for (i = new_1st_cnt == 0 ? 1 : new_1st_cnt; new_rl[i].length; i++) + new_rl[i].vcn = new_rl[i - 1].vcn + new_rl[i - 1].length; + new_rl[i].vcn = new_rl[i - 1].vcn + new_rl[i - 1].length; + + /* Merge left and hole, or hole and right in @new_rl, if left or right + * consists of holes. */ - trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); - if (IS_ERR(trl)) - goto enomem_out; - old_size += 2; - if (runlist->rl != trl) { - rl = trl + (rl - runlist->rl); - rl_end = trl + (rl_end - runlist->rl); - rl_real_end = trl + (rl_real_end - runlist->rl); - runlist->rl = trl; + merge_cnt = 0; + i = new_1st_cnt == 0 ? 1 : new_1st_cnt; + if (ntfs_rle_lcn_contiguous(&new_rl[i - 1], &new_rl[i])) { + /* Merge right and left */ + s_rl = &new_rl[new_1st_cnt - 1]; + s_rl->length += s_rl[1].length; + merge_cnt = 1; } - /* Shift all the runs up by two. */ - memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); - /* Finally, setup the three split runs. 
*/ - rl->length = start - rl->vcn; - rl++; - rl->vcn = start; - rl->lcn = LCN_HOLE; - rl->length = length; - rl++; - delta = end - rl->vcn; - rl->vcn = end; - rl->lcn += delta; - rl->length -= delta; - ntfs_debug("Done (split both)."); - return 0; -enomem_out: - ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); - return -ENOMEM; -} + if (merge_cnt) { + struct runlist_element *d_rl, *src_rl; + + d_rl = s_rl + 1; + src_rl = s_rl + 1 + merge_cnt; + ntfs_rl_mm(new_rl, (int)(d_rl - new_rl), (int)(src_rl - new_rl), + (int)(&new_rl[new_cnt - 1] - src_rl) + 1); + } + + (*punch_rl)[punch_cnt].vcn = (*punch_rl)[punch_cnt - 1].vcn + + (*punch_rl)[punch_cnt - 1].length; -#endif /* NTFS_RW */ + /* punch_cnt elements of dst are extracted */ + *new_rl_cnt = dst_cnt - (punch_cnt - (int)begin_split - (int)end_split) - + merge_cnt; + + ntfs_free(dst_rl); + return new_rl; +} -- 2.25.1 This adds the implementation of reparse and ea operations Signed-off-by: Namjae Jeon --- fs/ntfs/ea.c | 933 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ntfs/reparse.c | 550 +++++++++++++++++++++++++++ 2 files changed, 1483 insertions(+) create mode 100644 fs/ntfs/ea.c create mode 100644 fs/ntfs/reparse.c diff --git a/fs/ntfs/ea.c b/fs/ntfs/ea.c new file mode 100644 index 000000000000..11efffaed366 --- /dev/null +++ b/fs/ntfs/ea.c @@ -0,0 +1,933 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * Pocessing of EA's + * + * Part of this file is based on code from the NTFS-3G project. + * + * Copyright (c) 2014-2021 Jean-Pierre Andre + * Copyright (c) 2025 LG Electronics Co., Ltd. + */ + +#include +#include +#include +#include + +#include "layout.h" +#include "attrib.h" +#include "index.h" +#include "dir.h" +#include "ea.h" +#include "malloc.h" + +static int ntfs_write_ea(struct ntfs_inode *ni, int type, char *value, s64 ea_off, + s64 ea_size, bool need_truncate) +{ + struct inode *ea_vi; + int err = 0; + s64 written; + + ea_vi = ntfs_attr_iget(VFS_I(ni), type, AT_UNNAMED, 0); + if (IS_ERR(ea_vi)) + return PTR_ERR(ea_vi); + + written = ntfs_inode_attr_pwrite(ea_vi, ea_off, ea_size, value, false); + if (written != ea_size) + err = -EIO; + else { + struct ntfs_inode *ea_ni = NTFS_I(ea_vi); + + if (need_truncate && ea_ni->data_size > ea_off + ea_size) + ntfs_attr_truncate(ea_ni, ea_off + ea_size); + mark_mft_record_dirty(ni); + } + + iput(ea_vi); + return err; +} + +static int ntfs_ea_lookup(char *ea_buf, s64 ea_buf_size, const char *name, + int name_len, s64 *ea_offset, s64 *ea_size) +{ + const struct ea_attr *p_ea; + s64 offset; + unsigned int next; + + if (ea_buf_size < sizeof(struct ea_attr)) + goto out; + + offset = 0; + do { + p_ea = (const struct ea_attr *)&ea_buf[offset]; + next = le32_to_cpu(p_ea->next_entry_offset); + + if (offset + next > ea_buf_size || + ((1 + p_ea->ea_name_length) > (ea_buf_size - offset))) + break; + + if (p_ea->ea_name_length == name_len && + !memcmp(p_ea->ea_name, name, name_len)) { + *ea_offset = offset; + if (next) + *ea_size = next; + else { + unsigned int ea_len = 1 + p_ea->ea_name_length + + le16_to_cpu(p_ea->ea_value_length); + + if ((ea_buf_size - offset) < ea_len) + goto out; + + *ea_size = ALIGN(struct_size(p_ea, ea_name, + 1 + p_ea->ea_name_length + + le16_to_cpu(p_ea->ea_value_length)), 4); + } + + if (ea_buf_size < *ea_offset + *ea_size) + goto out; + + return 0; + } + offset += next; + } while (next > 0 && offset < ea_buf_size && + sizeof(struct ea_attr) < (ea_buf_size - offset)); + +out: + return -ENOENT; +} + +/* + * Return the existing EA + * + * The 
EA_INFORMATION is not examined and the consistency of the + * existing EA is not checked. + * + * If successful, the full attribute is returned unchanged + * and its size is returned. + * If the designated buffer is too small, the needed size is + * returned, and the buffer is left unchanged. + * If there is an error, a negative value is returned and errno + * is set according to the error. + */ +static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len, + void *buffer, size_t size) +{ + struct ntfs_inode *ni = NTFS_I(inode); + const struct ea_attr *p_ea; + char *ea_buf; + s64 ea_off, ea_size, all_ea_size, ea_info_size; + int err; + unsigned short int ea_value_len, ea_info_qlen; + struct ea_information *p_ea_info; + + if (!NInoHasEA(ni)) + return -ENODATA; + + p_ea_info = ntfs_attr_readall(ni, AT_EA_INFORMATION, NULL, 0, + &ea_info_size); + if (!p_ea_info || ea_info_size != sizeof(struct ea_information)) { + ntfs_free(p_ea_info); + return -ENODATA; + } + + ea_info_qlen = le16_to_cpu(p_ea_info->ea_query_length); + ntfs_free(p_ea_info); + + ea_buf = ntfs_attr_readall(ni, AT_EA, NULL, 0, &all_ea_size); + if (!ea_buf) + return -ENODATA; + + err = ntfs_ea_lookup(ea_buf, ea_info_qlen, name, name_len, &ea_off, + &ea_size); + if (!err) { + p_ea = (struct ea_attr *)&ea_buf[ea_off]; + ea_value_len = le16_to_cpu(p_ea->ea_value_length); + if (!buffer) { + ntfs_free(ea_buf); + return ea_value_len; + } + + if (ea_value_len > size) { + err = -ERANGE; + goto free_ea_buf; + } + + memcpy(buffer, &p_ea->ea_name[p_ea->ea_name_length + 1], + ea_value_len); + ntfs_free(ea_buf); + return ea_value_len; + } + + err = -ENODATA; +free_ea_buf: + ntfs_free(ea_buf); + return err; +} + +static inline int ea_packed_size(const struct ea_attr *p_ea) +{ + /* + * 4 bytes for header (flags and lengths) + name length + 1 + + * value length. + */ + return 5 + p_ea->ea_name_length + le16_to_cpu(p_ea->ea_value_length); +} + +/* + * Set a new EA, and set EA_INFORMATION accordingly + * + * This is roughly the same as ZwSetEaFile() on Windows, however + * the "offset to next" of the last EA should not be cleared. + * + * Consistency of the new EA is first checked. + * + * EA_INFORMATION is set first, and it is restored to its former + * state if setting EA fails. 
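For reference while reading the lookup above and the set path below: the code implies an on-disk EA record of a fixed 8-byte header (next_entry_offset, flags, ea_name_length, ea_value_length), then the NUL-terminated name, then the raw value, padded to a 4-byte boundary; the "packed" size tracked in EA_INFORMATION drops the padding and next_entry_offset. A hedged arithmetic check, assuming exactly that layout:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct ea_attr {			/* on-disk EA record (assumed layout) */
	uint32_t next_entry_offset;	/* aligned size incl. this header */
	uint8_t  flags;			/* e.g. NEED_EA */
	uint8_t  ea_name_length;	/* name length, without the NUL */
	uint16_t ea_value_length;
	char     ea_name[];		/* name, NUL, then value bytes */
};

int main(void)
{
	/* "$LXUID" (6 chars) with a 4-byte value, as set by the WSL EAs: */
	unsigned name_len = 6, val_len = 4;

	/* Stored size: header + name + NUL + value, rounded up to 4. */
	unsigned stored = (offsetof(struct ea_attr, ea_name) +
			   name_len + 1 + val_len + 3) & ~3u;

	/* Packed size per ea_packed_size(): 4-byte header + name + 1 + value. */
	unsigned packed = 5 + name_len + val_len;

	assert(stored == 20 && packed == 15);
	return 0;
}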
+ */ +static int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, + const void *value, size_t val_size, int flags, + __le16 *packed_ea_size) +{ + struct ntfs_inode *ni = NTFS_I(inode); + struct ea_information *p_ea_info = NULL; + int ea_packed, err = 0; + struct ea_attr *p_ea; + unsigned short int ea_info_qsize = 0; + char *ea_buf = NULL; + size_t new_ea_size = ALIGN(struct_size(p_ea, ea_name, 1 + name_len + val_size), 4); + s64 ea_off, ea_info_size, all_ea_size, ea_size; + + if (name_len > 255) + return -ENAMETOOLONG; + + if (ntfs_attr_exist(ni, AT_EA_INFORMATION, AT_UNNAMED, 0)) { + p_ea_info = ntfs_attr_readall(ni, AT_EA_INFORMATION, NULL, 0, + &ea_info_size); + if (!p_ea_info || ea_info_size != sizeof(struct ea_information)) + goto out; + + ea_buf = ntfs_attr_readall(ni, AT_EA, NULL, 0, &all_ea_size); + if (!ea_buf) { + ea_info_qsize = 0; + ntfs_free(p_ea_info); + goto create_ea_info; + } + + ea_info_qsize = le32_to_cpu(p_ea_info->ea_query_length); + } else { +create_ea_info: + p_ea_info = ntfs_malloc_nofs(sizeof(struct ea_information)); + if (!p_ea_info) + return -ENOMEM; + + ea_info_qsize = 0; + err = ntfs_attr_add(ni, AT_EA_INFORMATION, AT_UNNAMED, 0, + (char *)p_ea_info, sizeof(struct ea_information)); + if (err) + goto out; + + if (ntfs_attr_exist(ni, AT_EA, AT_UNNAMED, 0)) { + err = ntfs_attr_remove(ni, AT_EA, AT_UNNAMED, 0); + if (err) + goto out; + } + + goto alloc_new_ea; + } + + if (ea_info_qsize > all_ea_size) { + err = -EIO; + goto out; + } + + err = ntfs_ea_lookup(ea_buf, ea_info_qsize, name, name_len, &ea_off, + &ea_size); + if (ea_info_qsize && !err) { + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out; + } + + p_ea = (struct ea_attr *)(ea_buf + ea_off); + + if (val_size && + le16_to_cpu(p_ea->ea_value_length) == val_size && + !memcmp(p_ea->ea_name + p_ea->ea_name_length + 1, value, + val_size)) + goto out; + + le16_add_cpu(&p_ea_info->ea_length, 0 - ea_packed_size(p_ea)); + + if (p_ea->flags & NEED_EA) + le16_add_cpu(&p_ea_info->need_ea_count, -1); + + memmove((char *)p_ea, (char *)p_ea + ea_size, ea_info_qsize - (ea_off + ea_size)); + ea_info_qsize -= ea_size; + p_ea_info->ea_query_length = cpu_to_le16(ea_info_qsize); + + err = ntfs_write_ea(ni, AT_EA_INFORMATION, (char *)p_ea_info, 0, + sizeof(struct ea_information), false); + if (err) + goto out; + + err = ntfs_write_ea(ni, AT_EA, ea_buf, 0, ea_info_qsize, true); + if (err) + goto out; + + if ((flags & XATTR_REPLACE) && !val_size) { + /* Remove xattr. */ + goto out; + } + } else { + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out; + } + } + ntfs_free(ea_buf); + +alloc_new_ea: + ea_buf = kzalloc(new_ea_size, GFP_NOFS); + if (!ea_buf) { + err = -ENOMEM; + goto out; + } + + /* + * EA and REPARSE_POINT compatibility not checked any more, + * required by Windows 10, but having both may lead to + * problems with earlier versions. 
+ */ + p_ea = (struct ea_attr *)ea_buf; + memcpy(p_ea->ea_name, name, name_len); + p_ea->ea_name_length = name_len; + p_ea->ea_name[name_len] = 0; + memcpy(p_ea->ea_name + name_len + 1, value, val_size); + p_ea->ea_value_length = cpu_to_le16(val_size); + p_ea->next_entry_offset = cpu_to_le32(new_ea_size); + + ea_packed = le16_to_cpu(p_ea_info->ea_length) + ea_packed_size(p_ea); + p_ea_info->ea_length = cpu_to_le16(ea_packed); + p_ea_info->ea_query_length = cpu_to_le32(ea_info_qsize + new_ea_size); + + if (ea_packed > 0xffff || + ntfs_attr_size_bounds_check(ni->vol, AT_EA, new_ea_size)) { + err = -EFBIG; + goto out; + } + + /* + * no EA or EA_INFORMATION : add them + */ + if (!ntfs_attr_exist(ni, AT_EA, AT_UNNAMED, 0)) { + err = ntfs_attr_add(ni, AT_EA, AT_UNNAMED, 0, (char *)p_ea, + new_ea_size); + if (err) + goto out; + } else { + err = ntfs_write_ea(ni, AT_EA, (char *)p_ea, ea_info_qsize, + new_ea_size, false); + if (err) + goto out; + } + + err = ntfs_write_ea(ni, AT_EA_INFORMATION, (char *)p_ea_info, 0, + sizeof(struct ea_information), false); + if (err) + goto out; + + if (packed_ea_size) + *packed_ea_size = p_ea_info->ea_length; + mark_mft_record_dirty(ni); +out: + if (ea_info_qsize > 0) + NInoSetHasEA(ni); + else + NInoClearHasEA(ni); + + ntfs_free(ea_buf); + ntfs_free(p_ea_info); + + return err; +} + +/* + * Check for the presence of an EA "$LXDEV" (used by WSL) + * and return its value as a device address + */ +int ntfs_ea_get_wsl_inode(struct inode *inode, dev_t *rdevp, unsigned int flags) +{ + int err; + __le32 v; + + if (!(flags & NTFS_VOL_UID)) { + /* Load uid to lxuid EA */ + err = ntfs_get_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &v, + sizeof(v)); + if (err < 0) + return err; + i_uid_write(inode, le32_to_cpu(v)); + } + + if (!(flags & NTFS_VOL_UID)) { + /* Load gid to lxgid EA */ + err = ntfs_get_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &v, + sizeof(v)); + if (err < 0) + return err; + i_gid_write(inode, le32_to_cpu(v)); + } + + /* Load mode to lxmod EA */ + err = ntfs_get_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &v, sizeof(v)); + if (err > 0) { + inode->i_mode = le32_to_cpu(v); + } else { + /* Everyone gets all permissions. 
*/ + inode->i_mode |= 0777; + } + + /* Load mode to lxdev EA */ + err = ntfs_get_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &v, sizeof(v)); + if (err > 0) + *rdevp = le32_to_cpu(v); + err = 0; + + return err; +} + +int ntfs_ea_set_wsl_inode(struct inode *inode, dev_t rdev, __le16 *ea_size, + unsigned int flags) +{ + __le32 v; + int err; + + if (flags & NTFS_EA_UID) { + /* Store uid to lxuid EA */ + v = cpu_to_le32(i_uid_read(inode)); + err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &v, + sizeof(v), 0, ea_size); + if (err) + return err; + } + + if (flags & NTFS_EA_GID) { + /* Store gid to lxgid EA */ + v = cpu_to_le32(i_gid_read(inode)); + err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &v, + sizeof(v), 0, ea_size); + if (err) + return err; + } + + if (flags & NTFS_EA_MODE) { + /* Store mode to lxmod EA */ + v = cpu_to_le32(inode->i_mode); + err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &v, + sizeof(v), 0, ea_size); + if (err) + return err; + } + + if (rdev) { + v = cpu_to_le32(rdev); + err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &v, sizeof(v), + 0, ea_size); + } + + return err; +} + +ssize_t ntfsp_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = NTFS_I(inode); + const struct ea_attr *p_ea; + s64 offset, ea_buf_size, ea_info_size; + int next, err = 0, ea_size; + unsigned int ea_info_qsize; + char *ea_buf = NULL; + ssize_t ret = 0; + struct ea_information *ea_info; + + if (!NInoHasEA(ni)) + return 0; + + mutex_lock(&NTFS_I(inode)->mrec_lock); + ea_info = ntfs_attr_readall(ni, AT_EA_INFORMATION, NULL, 0, + &ea_info_size); + if (!ea_info || ea_info_size != sizeof(struct ea_information)) + goto out; + + ea_info_qsize = le16_to_cpu(ea_info->ea_query_length); + + ea_buf = ntfs_attr_readall(ni, AT_EA, NULL, 0, &ea_buf_size); + if (!ea_buf) + goto out; + + if (ea_info_qsize > ea_buf_size) + goto out; + + if (ea_buf_size < sizeof(struct ea_attr)) + goto out; + + offset = 0; + do { + p_ea = (const struct ea_attr *)&ea_buf[offset]; + next = le32_to_cpu(p_ea->next_entry_offset); + if (next) + ea_size = next; + else + ea_size = ALIGN(struct_size(p_ea, ea_name, + 1 + p_ea->ea_name_length + + le16_to_cpu(p_ea->ea_value_length)), + 4); + if (buffer) { + if (offset + ea_size > ea_info_qsize) + break; + + if (ret + p_ea->ea_name_length + 1 > size) { + err = -ERANGE; + goto out; + } + + if (p_ea->ea_name_length + 1 > (ea_info_qsize - offset)) + break; + + memcpy(buffer + ret, p_ea->ea_name, p_ea->ea_name_length); + buffer[ret + p_ea->ea_name_length] = 0; + } + + ret += p_ea->ea_name_length + 1; + offset += ea_size; + } while (next > 0 && offset < ea_info_qsize && + sizeof(struct ea_attr) < (ea_info_qsize - offset)); + +out: + mutex_unlock(&NTFS_I(inode)->mrec_lock); + ntfs_free(ea_info); + ntfs_free(ea_buf); + + return err ? 
err : ret; +} + +// clang-format off +#define SYSTEM_DOS_ATTRIB "system.dos_attrib" +#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib" +#define SYSTEM_NTFS_ATTRIB_BE "system.ntfs_attrib_be" +// clang-format on + +static int ntfs_getxattr(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct ntfs_inode *ni = NTFS_I(inode); + int err; + + if (NVolShutdown(ni->vol)) + return -EIO; + + if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { + if (!buffer) { + err = sizeof(u8); + } else if (size < sizeof(u8)) { + err = -ENODATA; + } else { + err = sizeof(u8); + *(u8 *)buffer = ni->flags; + } + goto out; + } + + if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || + !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { + if (!buffer) { + err = sizeof(u32); + } else if (size < sizeof(u32)) { + err = -ENODATA; + } else { + err = sizeof(u32); + *(u32 *)buffer = le32_to_cpu(ni->flags); + if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) + *(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer); + } + goto out; + } + + mutex_lock(&ni->mrec_lock); + err = ntfs_get_ea(inode, name, strlen(name), buffer, size); + mutex_unlock(&ni->mrec_lock); + +out: + return err; +} + +static int ntfs_new_attr_flags(struct ntfs_inode *ni, __le32 fattr) +{ + struct ntfs_attr_search_ctx *ctx; + struct mft_record *m; + struct attr_record *a; + __le16 new_aflags; + int mp_size, mp_ofs, name_ofs, arec_size, err; + + m = map_mft_record(ni); + if (IS_ERR(m)) + return PTR_ERR(m); + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (err) { + err = -EINVAL; + goto err_out; + } + + a = ctx->attr; + new_aflags = ctx->attr->flags; + + if (fattr & FILE_ATTR_SPARSE_FILE) + new_aflags |= ATTR_IS_SPARSE; + else + new_aflags &= ~ATTR_IS_SPARSE; + + if (fattr & FILE_ATTR_COMPRESSED) + new_aflags |= ATTR_IS_COMPRESSED; + else + new_aflags &= ~ATTR_IS_COMPRESSED; + + if (new_aflags == a->flags) + return 0; + + if ((new_aflags & (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED)) == + (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED)) { + pr_err("file can't be sparsed and compressed\n"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (!a->non_resident) + goto out; + + if (a->data.non_resident.data_size) { + pr_err("Can't change sparsed/compressed for non-empty file"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (new_aflags & (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED)) + name_ofs = (offsetof(struct attr_record, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + 7) & ~7; + else + name_ofs = (offsetof(struct attr_record, + data.non_resident.compressed_size) + 7) & ~7; + + mp_size = ntfs_get_size_for_mapping_pairs(ni->vol, ni->runlist.rl, 0, -1, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error code %i.\n", err); + goto err_out; + } + + mp_ofs = (name_ofs + a->name_length * sizeof(__le16) + 7) & ~7; + arec_size = (mp_ofs + mp_size + 7) & ~7; + + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + + if (new_aflags & (ATTR_IS_SPARSE | ATTR_IS_COMPRESSED)) { + a->data.non_resident.compression_unit = 0; + if (new_aflags & ATTR_IS_COMPRESSED || ni->vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = 0; + ni->itype.compressed.size = 0; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 
1U << + (a->data.non_resident.compression_unit + + ni->vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + + if (new_aflags & ATTR_IS_SPARSE) { + NInoSetSparse(ni); + ni->flags |= FILE_ATTR_SPARSE_FILE; + } + + if (new_aflags & ATTR_IS_COMPRESSED) { + NInoSetCompressed(ni); + ni->flags |= FILE_ATTR_COMPRESSED; + } + } else { + ni->flags &= ~(FILE_ATTR_SPARSE_FILE | FILE_ATTR_COMPRESSED); + a->data.non_resident.compression_unit = 0; + NInoClearSparse(ni); + NInoClearCompressed(ni); + } + + a->name_offset = cpu_to_le16(name_ofs); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + +out: + a->flags = new_aflags; + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + return err; +} + +static int ntfs_setxattr(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + struct ntfs_inode *ni = NTFS_I(inode); + int err; + __le32 fattr; + + if (NVolShutdown(ni->vol)) + return -EIO; + + if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { + if (sizeof(u8) != size) { + err = -EINVAL; + goto out; + } + fattr = cpu_to_le32(*(u8 *)value); + goto set_fattr; + } + + if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || + !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { + if (size != sizeof(u32)) { + err = -EINVAL; + goto out; + } + if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) + fattr = cpu_to_le32(be32_to_cpu(*(__be32 *)value)); + else + fattr = cpu_to_le32(*(u32 *)value); + + if (S_ISREG(inode->i_mode)) { + mutex_lock(&ni->mrec_lock); + err = ntfs_new_attr_flags(ni, fattr); + mutex_unlock(&ni->mrec_lock); + if (err) + goto out; + } + +set_fattr: + if (S_ISDIR(inode->i_mode)) + fattr |= FILE_ATTR_DIRECTORY; + else + fattr &= ~FILE_ATTR_DIRECTORY; + + if (ni->flags != fattr) { + ni->flags = fattr; + if (fattr & FILE_ATTR_READONLY) + inode->i_mode &= ~0222; + else + inode->i_mode |= 0222; + NInoSetFileNameDirty(ni); + mark_inode_dirty(inode); + } + err = 0; + goto out; + } + + mutex_lock(&ni->mrec_lock); + err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, NULL); + mutex_unlock(&ni->mrec_lock); + +out: + inode_set_ctime_current(inode); + mark_inode_dirty(inode); + return err; +} + +static bool ntfs_xattr_user_list(struct dentry *dentry) +{ + return true; +} + +// clang-format off +static const struct xattr_handler ntfs_other_xattr_handler = { + .prefix = "", + .get = ntfs_getxattr, + .set = ntfs_setxattr, + .list = ntfs_xattr_user_list, +}; + +const struct xattr_handler * const ntfsp_xattr_handlers[] = { + &ntfs_other_xattr_handler, + NULL, +}; +// clang-format on + +#ifdef CONFIG_NTFS_FS_POSIX_ACL +struct posix_acl *ntfsp_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + int type) +{ + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = NTFS_I(inode); + const char *name; + size_t name_len; + struct posix_acl *acl; + int err; + void *buf; + + /* Allocate PATH_MAX bytes. */ + buf = __getname(); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* Possible values of 'type' was already checked above. 
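Userspace reaches the handlers above through the ordinary xattr calls; a hypothetical usage sketch (the mount point, file name, and the FILE_ATTR_READONLY value 0x01 are illustrative assumptions):

#include <stdint.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	uint32_t attrib;
	uint8_t dos = 0x01;	/* FILE_ATTR_READONLY, assumed value */

	/* Read the raw NTFS attribute flags (little-endian variant). */
	if (getxattr("/mnt/ntfs/file", "system.ntfs_attrib",
		     &attrib, sizeof(attrib)) == sizeof(attrib))
		printf("flags: 0x%08x\n", attrib);

	/* Set the DOS read-only bit through the one-byte DOS view. */
	if (setxattr("/mnt/ntfs/file", "system.dos_attrib",
		     &dos, sizeof(dos), 0) != 0)
		perror("setxattr");
	return 0;
}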
*/ + if (type == ACL_TYPE_ACCESS) { + name = XATTR_NAME_POSIX_ACL_ACCESS; + name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; + } else { + name = XATTR_NAME_POSIX_ACL_DEFAULT; + name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; + } + + mutex_lock(&ni->mrec_lock); + err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX); + mutex_unlock(&ni->mrec_lock); + + /* Translate extended attribute to acl. */ + if (err >= 0) + acl = posix_acl_from_xattr(&init_user_ns, buf, err); + else if (err == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(err); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + __putname(buf); + + return acl; +} + +static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, + struct inode *inode, struct posix_acl *acl, + int type, bool init_acl) +{ + const char *name; + size_t size, name_len; + void *value; + int err; + int flags; + umode_t mode; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + mode = inode->i_mode; + switch (type) { + case ACL_TYPE_ACCESS: + /* Do not change i_mode if we are in init_acl */ + if (acl && !init_acl) { + err = posix_acl_update_mode(idmap, inode, &mode, &acl); + if (err) + return err; + } + name = XATTR_NAME_POSIX_ACL_ACCESS; + name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; + break; + + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + name = XATTR_NAME_POSIX_ACL_DEFAULT; + name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; + break; + + default: + return -EINVAL; + } + + if (!acl) { + /* Remove xattr if it can be presented via mode. */ + size = 0; + value = NULL; + flags = XATTR_REPLACE; + } else { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (err < 0) + goto out; + flags = 0; + } + + mutex_lock(&NTFS_I(inode)->mrec_lock); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, NULL); + mutex_unlock(&NTFS_I(inode)->mrec_lock); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ + if (!err) { + set_cached_acl(inode, type, acl); + inode->i_mode = mode; + inode_set_ctime_current(inode); + mark_inode_dirty(inode); + } + +out: + kfree(value); + + return err; +} + +int ntfsp_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + return ntfs_set_acl_ex(idmap, d_inode(dentry), acl, type, false); +} + +int ntfsp_init_acl(struct mnt_idmap *idmap, struct inode *inode, + struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int err; + + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + + if (default_acl) { + err = ntfs_set_acl_ex(idmap, inode, default_acl, + ACL_TYPE_DEFAULT, true); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + + if (acl) { + if (!err) + err = ntfs_set_acl_ex(idmap, inode, acl, + ACL_TYPE_ACCESS, true); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + + return err; +} +#endif diff --git a/fs/ntfs/reparse.c b/fs/ntfs/reparse.c new file mode 100644 index 000000000000..2cd03e68ed6c --- /dev/null +++ b/fs/ntfs/reparse.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * Processing of reparse points + * + * Part of this file is based on code from the NTFS-3G project. + * + * Copyright (c) 2008-2021 Jean-Pierre Andre + * Copyright (c) 2025 LG Electronics Co., Ltd. 
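One note on sizes before the code below: a reparse point attribute is a 4-byte tag, a 2-byte data length and 2 reserved bytes, followed by reparse_data_length bytes of tag-specific data; non-Microsoft tags (bit 31 clear) carry an additional 16-byte GUID, hence the 8-byte versus 24-byte headers mentioned in valid_reparse_data(). A worked check, assuming those sizes:

/*
 * IO_REPARSE_TAG_LX_SYMLINK (0xa000001d): bit 31 set -> Microsoft tag,
 * 8-byte header, no GUID.
 *
 * A WSL symlink to "some/path" (9 bytes):
 *   reparse_data_length = sizeof(type) + 9 = 4 + 9 = 13
 *   attribute size      = 8 + 13           = 21 bytes
 *
 * valid_reparse_data() recomputes exactly this:
 *   data_length + sizeof(struct reparse_point)
 *               + (Microsoft tag ? 0 : sizeof(struct guid)) == size
 */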
+ */ + +#include "ntfs.h" +#include "layout.h" +#include "attrib.h" +#include "inode.h" +#include "dir.h" +#include "volume.h" +#include "mft.h" +#include "index.h" +#include "lcnalloc.h" +#include "reparse.h" +#include "malloc.h" + +struct WSL_LINK_REPARSE_DATA { + __le32 type; + char link[]; +}; + +struct REPARSE_INDEX { /* index entry in $Extend/$Reparse */ + struct index_entry_header header; + struct reparse_index_key key; + __le32 filling; +}; + +__le16 reparse_index_name[] = { cpu_to_le16('$'), + cpu_to_le16('R') }; + +/* + * Do some sanity checks on reparse data + * + * Microsoft reparse points have an 8-byte header whereas + * non-Microsoft reparse points have a 24-byte header. In each case, + * 'reparse_data_length' must equal the number of non-header bytes. + * + * If the reparse data looks like a junction point or symbolic + * link, more checks can be done. + */ +static bool valid_reparse_data(struct ntfs_inode *ni, + const struct reparse_point *reparse_attr, size_t size) +{ + bool ok; + const struct WSL_LINK_REPARSE_DATA *wsl_reparse_data; + + ok = ni && reparse_attr && (size >= sizeof(struct reparse_point)) && + (reparse_attr->reparse_tag != IO_REPARSE_TAG_RESERVED_ZERO) && + (((size_t)le16_to_cpu(reparse_attr->reparse_data_length) + + sizeof(struct reparse_point) + + ((reparse_attr->reparse_tag & IO_REPARSE_TAG_IS_MICROSOFT) ? + 0 : sizeof(struct guid))) == size); + if (ok) { + switch (reparse_attr->reparse_tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + wsl_reparse_data = (const struct WSL_LINK_REPARSE_DATA *) + reparse_attr->reparse_data; + if ((le16_to_cpu(reparse_attr->reparse_data_length) <= + sizeof(wsl_reparse_data->type)) || + (wsl_reparse_data->type != cpu_to_le32(2))) + ok = false; + break; + case IO_REPARSE_TAG_AF_UNIX: + case IO_REPARSE_TAG_LX_FIFO: + case IO_REPARSE_TAG_LX_CHR: + case IO_REPARSE_TAG_LX_BLK: + if (reparse_attr->reparse_data_length || + !(ni->flags & FILE_ATTRIBUTE_RECALL_ON_OPEN)) + ok = false; + break; + default: + break; + } + } + return ok; +} + +static unsigned int ntfs_reparse_tag_mode(struct reparse_point *reparse_attr) +{ + unsigned int mode = 0; + + switch (reparse_attr->reparse_tag) { + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_LX_SYMLINK: + mode = S_IFLNK; + break; + case IO_REPARSE_TAG_AF_UNIX: + mode = S_IFSOCK; + break; + case IO_REPARSE_TAG_LX_FIFO: + mode = S_IFIFO; + break; + case IO_REPARSE_TAG_LX_CHR: + mode = S_IFCHR; + break; + case IO_REPARSE_TAG_LX_BLK: + mode = S_IFBLK; + } + + return mode; +} + +/* + * Get the target for symbolic link + */ +unsigned int ntfs_make_symlink(struct ntfs_inode *ni) +{ + s64 attr_size = 0; + unsigned int lth; + struct reparse_point *reparse_attr; + struct WSL_LINK_REPARSE_DATA *wsl_link_data; + unsigned int mode = 0; + + reparse_attr = ntfs_attr_readall(ni, AT_REPARSE_POINT, NULL, 0, + &attr_size); + if (reparse_attr && attr_size && + valid_reparse_data(ni, reparse_attr, attr_size)) { + switch (reparse_attr->reparse_tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + wsl_link_data = (struct WSL_LINK_REPARSE_DATA *)reparse_attr->reparse_data; + if (wsl_link_data->type == cpu_to_le32(2)) { + lth = le16_to_cpu(reparse_attr->reparse_data_length) - + sizeof(wsl_link_data->type); + ni->target = ntfs_malloc_nofs(lth + 1); + if (ni->target) { + memcpy(ni->target, wsl_link_data->link, lth); + ni->target[lth] = 0; + mode = ntfs_reparse_tag_mode(reparse_attr); + } + } + break; + default: + mode = ntfs_reparse_tag_mode(reparse_attr); + } + } else + ni->flags &= ~FILE_ATTR_REPARSE_POINT; + + if (reparse_attr) + 
ntfs_free(reparse_attr); + + return mode; +} + +unsigned int ntfs_reparse_tag_dt_types(struct ntfs_volume *vol, unsigned long mref) +{ + s64 attr_size = 0; + struct reparse_point *reparse_attr; + unsigned int dt_type = DT_UNKNOWN; + struct inode *vi; + + vi = ntfs_iget(vol->sb, mref); + if (IS_ERR(vi)) + return PTR_ERR(vi); + + reparse_attr = (struct reparse_point *)ntfs_attr_readall(NTFS_I(vi), + AT_REPARSE_POINT, NULL, 0, &attr_size); + + if (reparse_attr && attr_size) { + switch (reparse_attr->reparse_tag) { + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_LX_SYMLINK: + dt_type = DT_LNK; + break; + case IO_REPARSE_TAG_AF_UNIX: + dt_type = DT_SOCK; + break; + case IO_REPARSE_TAG_LX_FIFO: + dt_type = DT_FIFO; + break; + case IO_REPARSE_TAG_LX_CHR: + dt_type = DT_CHR; + break; + case IO_REPARSE_TAG_LX_BLK: + dt_type = DT_BLK; + } + } + + if (reparse_attr) + ntfs_free(reparse_attr); + + iput(vi); + return dt_type; +} + +/* + * Set the index for new reparse data + */ +static int set_reparse_index(struct ntfs_inode *ni, struct ntfs_index_context *xr, + __le32 reparse_tag) +{ + struct REPARSE_INDEX indx; + u64 file_id_cpu; + __le64 file_id; + + file_id_cpu = MK_MREF(ni->mft_no, ni->seq_no); + file_id = cpu_to_le64(file_id_cpu); + indx.header.data.vi.data_offset = + cpu_to_le16(sizeof(struct index_entry_header) + sizeof(struct reparse_index_key)); + indx.header.data.vi.data_length = 0; + indx.header.data.vi.reservedV = 0; + indx.header.length = cpu_to_le16(sizeof(struct REPARSE_INDEX)); + indx.header.key_length = cpu_to_le16(sizeof(struct reparse_index_key)); + indx.header.flags = 0; + indx.header.reserved = 0; + indx.key.reparse_tag = reparse_tag; + /* danger on processors which require proper alignment! */ + memcpy(&indx.key.file_id, &file_id, 8); + indx.filling = 0; + ntfs_index_ctx_reinit(xr); + + return ntfs_ie_add(xr, (struct index_entry *)&indx); +} + +/* + * Remove a reparse data index entry if attribute present + */ +static int remove_reparse_index(struct inode *rp, struct ntfs_index_context *xr, + __le32 *preparse_tag) +{ + struct reparse_index_key key; + u64 file_id_cpu; + __le64 file_id; + s64 size; + struct ntfs_inode *ni = NTFS_I(rp); + int err = 0, ret = ni->data_size; + + if (ni->data_size == 0) + return 0; + + /* read the existing reparse_tag */ + size = ntfs_inode_attr_pread(rp, 0, 4, (char *)preparse_tag); + if (size != 4) + return -ENODATA; + + file_id_cpu = MK_MREF(ni->mft_no, ni->seq_no); + file_id = cpu_to_le64(file_id_cpu); + key.reparse_tag = *preparse_tag; + /* danger on processors which require proper alignment! 
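The memcpy below (and the matching one in set_reparse_index() above) is the portable way to store a 64-bit file_id at a possibly misaligned offset inside the index key. For reference, the kernel's unaligned helpers express the same thing explicitly; a sketch, assuming the key layout used here (the header is <asm/unaligned.h> on older kernels, <linux/unaligned.h> on recent ones):

#include <asm/unaligned.h>

/* Equivalent to the memcpy: little-endian store, no alignment required. */
static void set_key_file_id(struct reparse_index_key *key, u64 mref)
{
	put_unaligned_le64(mref, &key->file_id);
}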
*/ + memcpy(&key.file_id, &file_id, 8); + if (!ntfs_index_lookup(&key, sizeof(struct reparse_index_key), xr)) { + err = ntfs_index_rm(xr); + if (err) + ret = err; + } + return ret; +} + +/* + * Open the $Extend/$Reparse file and its index + */ +static struct ntfs_index_context *open_reparse_index(struct ntfs_volume *vol) +{ + struct ntfs_index_context *xr = NULL; + u64 mref; + __le16 *uname; + struct ntfs_name *name = NULL; + int uname_len; + struct inode *vi, *dir_vi; + + /* do not use path_name_to inode - could reopen root */ + dir_vi = ntfs_iget(vol->sb, FILE_Extend); + if (IS_ERR(dir_vi)) + return NULL; + + uname_len = ntfs_nlstoucs(vol, "$Reparse", 8, &uname, + NTFS_MAX_NAME_LEN); + if (uname_len < 0) { + iput(dir_vi); + return NULL; + } + + mutex_lock_nested(&NTFS_I(dir_vi)->mrec_lock, NTFS_REPARSE_MUTEX_PARENT); + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_vi), uname, uname_len, + &name); + mutex_unlock(&NTFS_I(dir_vi)->mrec_lock); + kfree(name); + kmem_cache_free(ntfs_name_cache, uname); + if (IS_ERR_MREF(mref)) + goto put_dir_vi; + + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi)) + goto put_dir_vi; + + xr = ntfs_index_ctx_get(NTFS_I(vi), reparse_index_name, 2); + if (!xr) + iput(vi); +put_dir_vi: + iput(dir_vi); + return xr; +} + + +/* + * Update the reparse data and index + * + * The reparse data attribute should have been created, and + * an existing index is expected if there is an existing value. + * + */ +static int update_reparse_data(struct ntfs_inode *ni, struct ntfs_index_context *xr, + char *value, size_t size) +{ + struct inode *rp_inode; + int err = 0; + s64 written; + int oldsize; + __le32 reparse_tag; + struct ntfs_inode *rp_ni; + + rp_inode = ntfs_attr_iget(VFS_I(ni), AT_REPARSE_POINT, AT_UNNAMED, 0); + if (IS_ERR(rp_inode)) + return -EINVAL; + rp_ni = NTFS_I(rp_inode); + + /* remove the existing reparse data */ + oldsize = remove_reparse_index(rp_inode, xr, &reparse_tag); + if (oldsize < 0) { + err = oldsize; + goto put_rp_inode; + } + + /* overwrite value if any */ + written = ntfs_inode_attr_pwrite(rp_inode, 0, size, value, false); + if (written != size) { + ntfs_error(ni->vol->sb, "Failed to update reparse data\n"); + err = -EIO; + goto put_rp_inode; + } + + if (set_reparse_index(ni, xr, ((const struct reparse_point *)value)->reparse_tag) && + oldsize > 0) { + /* + * If cannot index, try to remove the reparse + * data and log the error. There will be an + * inconsistency if removal fails. + */ + ntfs_attr_rm(rp_ni); + ntfs_error(ni->vol->sb, + "Failed to index reparse data. 
 Possible corruption.\n"); + } + + mark_mft_record_dirty(ni); +put_rp_inode: + iput(rp_inode); + + return err; +} + +/* + * Delete a reparse index entry + */ +int ntfs_delete_reparse_index(struct ntfs_inode *ni) +{ + struct inode *vi; + struct ntfs_index_context *xr; + struct ntfs_inode *xrni; + __le32 reparse_tag; + int err = 0; + + if (!(ni->flags & FILE_ATTR_REPARSE_POINT)) + return 0; + + vi = ntfs_attr_iget(VFS_I(ni), AT_REPARSE_POINT, AT_UNNAMED, 0); + if (IS_ERR(vi)) + return PTR_ERR(vi); + + /* + * read the existing reparse data (the tag is enough) + * and un-index it + */ + xr = open_reparse_index(ni->vol); + if (xr) { + xrni = xr->idx_ni; + mutex_lock_nested(&xrni->mrec_lock, NTFS_REPARSE_MUTEX_PARENT); + err = remove_reparse_index(vi, xr, &reparse_tag); + if (err < 0) { + ntfs_index_ctx_put(xr); + mutex_unlock(&xrni->mrec_lock); + iput(VFS_I(xrni)); + goto out; + } + mark_mft_record_dirty(xrni); + ntfs_index_ctx_put(xr); + mutex_unlock(&xrni->mrec_lock); + iput(VFS_I(xrni)); + } + + ni->flags &= ~FILE_ATTR_REPARSE_POINT; + NInoSetFileNameDirty(ni); + mark_mft_record_dirty(ni); + +out: + iput(vi); + return err; +} + +/* + * Set the reparse data from an extended attribute + */ +static int ntfs_set_ntfs_reparse_data(struct ntfs_inode *ni, char *value, size_t size) +{ + int err = 0; + struct ntfs_inode *xrni; + struct ntfs_index_context *xr; + + if (!ni) + return -EINVAL; + + /* + * reparse data compatibility with EA is no longer checked; + * it is required by Windows 10, but may + * lead to problems with earlier versions. + */ + if (!valid_reparse_data(ni, (const struct reparse_point *)value, size)) + return -EINVAL; + + xr = open_reparse_index(ni->vol); + if (!xr) + return -EINVAL; + xrni = xr->idx_ni; + + if (!ntfs_attr_exist(ni, AT_REPARSE_POINT, AT_UNNAMED, 0)) { + u8 dummy = 0; + + /* + * No reparse data attribute yet: add an empty one + * (this does not feed the new value in). + * Note: the NTFS version must be >= 3. + */ + if (ni->vol->major_ver < 3) { + err = -EOPNOTSUPP; + ntfs_index_ctx_put(xr); + goto out; + } + + err = ntfs_attr_add(ni, AT_REPARSE_POINT, AT_UNNAMED, 0, &dummy, 0); + if (err) { + ntfs_index_ctx_put(xr); + goto out; + } + ni->flags |= FILE_ATTR_REPARSE_POINT; + NInoSetFileNameDirty(ni); + mark_mft_record_dirty(ni); + } + + /* update value and index */ + mutex_lock_nested(&xrni->mrec_lock, NTFS_REPARSE_MUTEX_PARENT); + err = update_reparse_data(ni, xr, value, size); + if (err) { + ni->flags &= ~FILE_ATTR_REPARSE_POINT; + NInoSetFileNameDirty(ni); + mark_mft_record_dirty(ni); + } + ntfs_index_ctx_put(xr); + mutex_unlock(&xrni->mrec_lock); + +out: + if (!err) + mark_mft_record_dirty(xrni); + iput(VFS_I(xrni)); + + return err; +} + +/* + * Set reparse data for a WSL type symlink + */ +int ntfs_reparse_set_wsl_symlink(struct ntfs_inode *ni, + const __le16 *target, int target_len) +{ + int err = 0; + int len; + int reparse_len; + unsigned char *utarget = NULL; + struct reparse_point *reparse; + struct WSL_LINK_REPARSE_DATA *data; + + len = ntfs_ucstonls(ni->vol, target, target_len, &utarget, 0); + if (len <= 0) + return -EINVAL; + + reparse_len = sizeof(struct reparse_point) + sizeof(data->type) + len; + reparse = (struct reparse_point *)ntfs_malloc_nofs(reparse_len); + if (!reparse) { + err = -ENOMEM; + ntfs_free(utarget); + } else { + data = (struct WSL_LINK_REPARSE_DATA *)reparse->reparse_data; + reparse->reparse_tag = IO_REPARSE_TAG_LX_SYMLINK; + reparse->reparse_data_length = + cpu_to_le16(sizeof(data->type) + len); + 
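/* Buffer layout built here: the reparse_point header (reparse_tag, reparse_data_length, reserved) followed by the WSL data, a __le32 type equal to 2 and then the converted target bytes with no terminating NUL. */ + 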
 reparse->reserved = 0; + data->type = cpu_to_le32(2); + memcpy(data->link, utarget, len); + err = ntfs_set_ntfs_reparse_data(ni, + (char *)reparse, reparse_len); + ntfs_free(reparse); + if (!err) + ni->target = utarget; + } + return err; +} + +/* + * Set reparse data for a WSL special file other than a symlink + * (socket, fifo, character or block device) + */ +int ntfs_reparse_set_wsl_not_symlink(struct ntfs_inode *ni, mode_t mode) +{ + int err; + int len; + int reparse_len; + __le32 reparse_tag; + struct reparse_point *reparse; + + len = 0; + if (S_ISSOCK(mode)) + reparse_tag = IO_REPARSE_TAG_AF_UNIX; + else if (S_ISFIFO(mode)) + reparse_tag = IO_REPARSE_TAG_LX_FIFO; + else if (S_ISCHR(mode)) + reparse_tag = IO_REPARSE_TAG_LX_CHR; + else if (S_ISBLK(mode)) + reparse_tag = IO_REPARSE_TAG_LX_BLK; + else + return -EOPNOTSUPP; + + reparse_len = sizeof(struct reparse_point) + len; + reparse = (struct reparse_point *)ntfs_malloc_nofs(reparse_len); + if (!reparse) + err = -ENOMEM; + else { + reparse->reparse_tag = reparse_tag; + reparse->reparse_data_length = cpu_to_le16(len); + reparse->reserved = cpu_to_le16(0); + err = ntfs_set_ntfs_reparse_data(ni, (char *)reparse, + reparse_len); + ntfs_free(reparse); + } + + return err; +} -- 2.25.1 This updates the implementation of miscellaneous operations: collation, debug output, $LogFile journal handling, quota, sysctl and Unicode string handling. Signed-off-by: Namjae Jeon --- fs/ntfs/collate.c | 106 +++++++++--- fs/ntfs/debug.c | 44 +++-- fs/ntfs/logfile.c | 402 +++++++++++++++++++--------------------------- fs/ntfs/quota.c | 36 ++--- fs/ntfs/sysctl.c | 12 +- fs/ntfs/unistr.c | 243 +++++++++++++++++++--------- fs/ntfs/upcase.c | 12 +- 7 files changed, 471 insertions(+), 384 deletions(-) diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c index 3ab6ec96abfe..2bde1ddceff1 100644 --- a/fs/ntfs/collate.c +++ b/fs/ntfs/collate.c @@ -1,15 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. + * NTFS kernel collation handling. Part of the Linux-NTFS project. * * Copyright (c) 2004 Anton Altaparmakov + * + * Part of this file is based on code from the NTFS-3G project + * and is copyrighted by the respective authors below: + * Copyright (c) 2004 Anton Altaparmakov + * Copyright (c) 2005 Yura Pakhuchiy */ #include "collate.h" #include "debug.h" #include "ntfs.h" -static int ntfs_collate_binary(ntfs_volume *vol, +static int ntfs_collate_binary(struct ntfs_volume *vol, const void *data1, const int data1_len, const void *data2, const int data2_len) { @@ -27,7 +32,7 @@ static int ntfs_collate_binary(ntfs_volume *vol, return rc; } -static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, +static int ntfs_collate_ntofs_ulong(struct ntfs_volume *vol, const void *data1, const int data1_len, const void *data2, const int data2_len) { @@ -35,9 +40,10 @@ static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, u32 d1, d2; ntfs_debug("Entering."); - // FIXME: We don't really want to bug here. 
 - BUG_ON(data1_len != data2_len); - BUG_ON(data1_len != 4); + + if (data1_len != data2_len || data1_len != 4) + return -EINVAL; + d1 = le32_to_cpup(data1); d2 = le32_to_cpup(data2); if (d1 < d2) @@ -52,12 +58,72 @@ static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, return rc; } -typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, +/** + * ntfs_collate_ntofs_ulongs - Which of two le32 arrays should be listed first + * + * Returns: -1, 0 or 1 depending on how the arrays compare + */ +static int ntfs_collate_ntofs_ulongs(struct ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + int len; + const __le32 *p1, *p2; + u32 d1, d2; + + ntfs_debug("Entering."); + if ((data1_len != data2_len) || (data1_len <= 0) || (data1_len & 3)) { + ntfs_error(vol->sb, "data1_len or data2_len not valid\n"); + return -1; + } + + p1 = (const __le32 *)data1; + p2 = (const __le32 *)data2; + len = data1_len; + do { + d1 = le32_to_cpup(p1); + p1++; + d2 = le32_to_cpup(p2); + p2++; + } while ((d1 == d2) && ((len -= 4) > 0)); + if (d1 < d2) + rc = -1; + else { + if (d1 == d2) + rc = 0; + else + rc = 1; + } + ntfs_debug("Done, returning %i.", rc); + return rc; +} + +/** + * ntfs_collate_file_name - Which of two filenames should be listed first + */ +static int ntfs_collate_file_name(struct ntfs_volume *vol, + const void *data1, const int __always_unused data1_len, + const void *data2, const int __always_unused data2_len) +{ + int rc; + + ntfs_debug("Entering.\n"); + rc = ntfs_file_compare_values(data1, data2, -2, + IGNORE_CASE, vol->upcase, vol->upcase_len); + if (!rc) + rc = ntfs_file_compare_values(data1, data2, + -2, CASE_SENSITIVE, vol->upcase, vol->upcase_len); + ntfs_debug("Done, returning %i.\n", rc); + return rc; +} + +typedef int (*ntfs_collate_func_t)(struct ntfs_volume *, const void *, const int, + const void *, const int); static ntfs_collate_func_t ntfs_do_collate0x0[3] = { ntfs_collate_binary, - NULL/*ntfs_collate_file_name*/, + ntfs_collate_file_name, NULL/*ntfs_collate_unicode_string*/, }; @@ -65,7 +131,7 @@ static ntfs_collate_func_t ntfs_do_collate0x1[4] = { ntfs_collate_ntofs_ulong, NULL/*ntfs_collate_ntofs_sid*/, NULL/*ntfs_collate_ntofs_security_hash*/, - NULL/*ntfs_collate_ntofs_ulongs*/, + ntfs_collate_ntofs_ulongs, }; /** @@ -84,27 +150,29 @@ static ntfs_collate_func_t ntfs_do_collate0x1[4] = { * For speed we use the collation rule @cr as an index into two tables of * function pointers to call the appropriate collation function. */ -int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, +int ntfs_collate(struct ntfs_volume *vol, __le32 cr, const void *data1, const int data1_len, - const void *data2, const int data2_len) { + const void *data2, const int data2_len) +{ int i; ntfs_debug("Entering."); - /* - * FIXME: At the moment we only support COLLATION_BINARY and - * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. 
- */ - BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); + + if (cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG && + cr != COLLATION_FILE_NAME && cr != COLLATION_NTOFS_ULONGS) + return -EINVAL; + i = le32_to_cpu(cr); - BUG_ON(i < 0); + if (i < 0) + return -1; if (i <= 0x02) return ntfs_do_collate0x0[i](vol, data1, data1_len, data2, data2_len); - BUG_ON(i < 0x10); + if (i < 0x10) + return -1; i -= 0x10; if (likely(i <= 3)) return ntfs_do_collate0x1[i](vol, data1, data1_len, data2, data2_len); - BUG(); return 0; } diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index a3c1c5656f8f..5c63d22c2b98 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * NTFS kernel debug support. Part of the Linux-NTFS project. * * Copyright (c) 2001-2004 Anton Altaparmakov */ @@ -33,20 +33,24 @@ void __ntfs_warning(const char *function, const struct super_block *sb, va_list args; int flen = 0; -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif if (function) flen = strlen(function); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; +#ifndef DEBUG + if (sb) + pr_warn_ratelimited("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_warn_ratelimited("%s(): %pV\n", flen ? function : "", &vaf); +#else if (sb) pr_warn("(device %s): %s(): %pV\n", sb->s_id, flen ? function : "", &vaf); else pr_warn("%s(): %pV\n", flen ? function : "", &vaf); +#endif va_end(args); } @@ -69,34 +73,41 @@ void __ntfs_warning(const char *function, const struct super_block *sb, * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead * as this provides the @function parameter automatically. */ -void __ntfs_error(const char *function, const struct super_block *sb, +void __ntfs_error(const char *function, struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int flen = 0; -#ifndef DEBUG - if (!printk_ratelimit()) - return; -#endif if (function) flen = strlen(function); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; +#ifndef DEBUG + if (sb) + pr_err_ratelimited("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_err_ratelimited("%s(): %pV\n", flen ? function : "", &vaf); +#else if (sb) pr_err("(device %s): %s(): %pV\n", sb->s_id, flen ? function : "", &vaf); else pr_err("%s(): %pV\n", flen ? function : "", &vaf); +#endif va_end(args); + + if (sb) + ntfs_handle_error(sb); } #ifdef DEBUG /* If 1, output debug messages, and if 0, don't. */ -int debug_msgs = 0; +int debug_msgs; void __ntfs_debug(const char *file, int line, const char *function, const char *fmt, ...) @@ -117,11 +128,12 @@ void __ntfs_debug(const char *file, int line, const char *function, } /* Dump a runlist. Caller has to provide synchronisation for @rl. 
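Negative lcn values are pseudo-LCNs (LCN_HOLE, LCN_RL_NOT_MAPPED, ...); the lcn_str table below maps them to printable names via index = -lcn - 1.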
*/ -void ntfs_debug_dump_runlist(const runlist_element *rl) +void ntfs_debug_dump_runlist(const struct runlist_element *rl) { int i; - const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", - "LCN_ENOENT ", "LCN_unknown " }; + const char *lcn_str[5] = { "LCN_DELALLOC ", "LCN_HOLE ", + "LCN_RL_NOT_MAPPED", "LCN_ENOENT ", + "LCN_unknown " }; if (!debug_msgs) return; @@ -132,9 +144,9 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) } pr_debug("VCN LCN Run length\n"); for (i = 0; ; i++) { - LCN lcn = (rl + i)->lcn; + s64 lcn = (rl + i)->lcn; - if (lcn < (LCN)0) { + if (lcn < 0) { int index = -lcn - 1; if (index > -LCN_ENOENT - 1) diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 6ce60ffc6ac0..31b3de595459 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -1,31 +1,21 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * NTFS kernel journal handling. Part of the Linux-NTFS project. * * Copyright (c) 2002-2007 Anton Altaparmakov */ -#ifdef NTFS_RW - -#include -#include -#include -#include -#include -#include #include #include "attrib.h" #include "aops.h" -#include "debug.h" #include "logfile.h" #include "malloc.h" -#include "volume.h" #include "ntfs.h" /** * ntfs_check_restart_page_header - check the page header for consistency - * @vi: $LogFile inode to which the restart page header belongs + * @vi: LogFile inode to which the restart page header belongs * @rp: restart page header to check * @pos: position in @vi at which the restart page header resides * @@ -36,7 +26,7 @@ * require the full restart page. */ static bool ntfs_check_restart_page_header(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos) + struct restart_page_header *rp, s64 pos) { u32 logfile_system_page_size, logfile_log_page_size; u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; @@ -54,7 +44,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi, logfile_system_page_size & (logfile_system_page_size - 1) || !is_power_of_2(logfile_log_page_size)) { - ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + ntfs_error(vi->i_sb, "LogFile uses unsupported page size."); return false; } /* @@ -62,17 +52,16 @@ static bool ntfs_check_restart_page_header(struct inode *vi, * size (2nd restart page). */ if (pos && pos != logfile_system_page_size) { - ntfs_error(vi->i_sb, "Found restart area in incorrect " - "position in $LogFile."); + ntfs_error(vi->i_sb, "Found restart area in incorrect position in LogFile."); return false; } /* We only know how to handle version 1.1. */ - if (sle16_to_cpu(rp->major_ver) != 1 || - sle16_to_cpu(rp->minor_ver) != 1) { - ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " - "supported. (This driver supports version " - "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), - (int)sle16_to_cpu(rp->minor_ver)); + if (le16_to_cpu(rp->major_ver) != 1 || + le16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, + "LogFile version %i.%i is not supported. (This driver supports version 1.1 only.)", + (int)le16_to_cpu(rp->major_ver), + (int)le16_to_cpu(rp->minor_ver)); return false; } /* @@ -86,17 +75,17 @@ static bool ntfs_check_restart_page_header(struct inode *vi, /* Verify the size of the update sequence array. 
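One le16 slot per 512-byte sector of the system page plus one for the update sequence number itself, e.g. nine slots for a 4096-byte system page.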
*/ usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); if (usa_count != le16_to_cpu(rp->usa_count)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array count."); + ntfs_error(vi->i_sb, + "LogFile restart page specifies inconsistent update sequence array count."); return false; } /* Verify the position of the update sequence array. */ usa_ofs = le16_to_cpu(rp->usa_ofs); usa_end = usa_ofs + usa_count * sizeof(u16); - if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + if (usa_ofs < sizeof(struct restart_page_header) || usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent update sequence array offset."); + ntfs_error(vi->i_sb, + "LogFile restart page specifies inconsistent update sequence array offset."); return false; } skip_usa_checks: @@ -108,19 +97,19 @@ static bool ntfs_check_restart_page_header(struct inode *vi, */ ra_ofs = le16_to_cpu(rp->restart_area_offset); if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : - ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs < sizeof(struct restart_page_header)) || ra_ofs > logfile_system_page_size) { - ntfs_error(vi->i_sb, "$LogFile restart page specifies " - "inconsistent restart area offset."); + ntfs_error(vi->i_sb, + "LogFile restart page specifies inconsistent restart area offset."); return false; } /* * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn * set. */ - if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { - ntfs_error(vi->i_sb, "$LogFile restart page is not modified " - "by chkdsk but a chkdsk LSN is specified."); + if (!ntfs_is_chkd_record(rp->magic) && le64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, + "LogFile restart page is not modified by chkdsk but a chkdsk LSN is specified."); return false; } ntfs_debug("Done."); @@ -129,7 +118,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi, /** * ntfs_check_restart_area - check the restart area for consistency - * @vi: $LogFile inode to which the restart page belongs + * @vi: LogFile inode to which the restart page belongs * @rp: restart page whose restart area to check * * Check the restart area of the restart page @rp for consistency and return @@ -141,25 +130,25 @@ static bool ntfs_check_restart_page_header(struct inode *vi, * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not * require the full restart page. */ -static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +static bool ntfs_check_restart_area(struct inode *vi, struct restart_page_header *rp) { u64 file_size; - RESTART_AREA *ra; + struct restart_area *ra; u16 ra_ofs, ra_len, ca_ofs; u8 fs_bits; ntfs_debug("Entering."); ra_ofs = le16_to_cpu(rp->restart_area_offset); - ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + ra = (struct restart_area *)((u8 *)rp + ra_ofs); /* * Everything before ra->file_size must be before the first word * protected by an update sequence number. This ensures that it is * safe to access ra->client_array_offset. 
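* Before the update sequence fixups are undone, only the first * NTFS_BLOCK_SIZE - sizeof(u16) bytes of the page can be trusted, which is * the bound used by the checks below.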
 */ - if (ra_ofs + offsetof(RESTART_AREA, file_size) > + if (ra_ofs + offsetof(struct restart_area, file_size) > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent file offset."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies inconsistent file offset."); return false; } /* @@ -172,8 +161,8 @@ static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) ca_ofs = le16_to_cpu(ra->client_array_offset); if (((ca_ofs + 7) & ~7) != ca_ofs || ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent client array offset."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies inconsistent client array offset."); return false; } /* @@ -182,15 +171,13 @@ static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) * Also, the calculated length must not exceed the specified length. */ ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * - sizeof(LOG_CLIENT_RECORD); + sizeof(struct log_client_record); if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || ra_ofs + le16_to_cpu(ra->restart_area_length) > le32_to_cpu(rp->system_page_size) || ra_len > le16_to_cpu(ra->restart_area_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " - "of the system page size specified by the " - "restart page header and/or the specified " - "restart area length is inconsistent."); + ntfs_error(vi->i_sb, + "LogFile restart area is out of bounds of the system page size specified by the restart page header and/or the specified restart area length is inconsistent."); return false; } /* @@ -204,37 +191,37 @@ static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) (ra->client_in_use_list != LOGFILE_NO_CLIENT && le16_to_cpu(ra->client_in_use_list) >= le16_to_cpu(ra->log_clients))) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "overflowing client free and/or in use lists."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies overflowing client free and/or in use lists."); return false; } /* * Check ra->seq_number_bits against ra->file_size for consistency. * We cannot just use ffs() because the file size is not a power of 2. * (An LSN stores the file byte offset divided by 8 in its low bits, so * 64 - (fs_bits - 3) = 67 - fs_bits bits remain for the sequence number.) */ - file_size = (u64)sle64_to_cpu(ra->file_size); + file_size = le64_to_cpu(ra->file_size); fs_bits = 0; while (file_size) { file_size >>= 1; fs_bits++; } if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent sequence number bits."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies inconsistent sequence number bits."); return false; } /* The log record header length must be a multiple of 8. */ if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != le16_to_cpu(ra->log_record_header_length)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log record header length."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies inconsistent log record header length."); return false; } /* Ditto for the log page data offset. 
 */ if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != le16_to_cpu(ra->log_page_data_offset)) { - ntfs_error(vi->i_sb, "$LogFile restart area specifies " - "inconsistent log page data offset."); + ntfs_error(vi->i_sb, + "LogFile restart area specifies inconsistent log page data offset."); return false; } ntfs_debug("Done."); @@ -243,7 +230,7 @@ static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) /** * ntfs_check_log_client_array - check the log client array for consistency - * @vi: $LogFile inode to which the restart page belongs + * @vi: LogFile inode to which the restart page belongs * @rp: restart page whose log client array to check * * Check the log client array of the restart page @rp for consistency and @@ -257,16 +244,16 @@ static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) * restart page and the page must be multi sector transfer deprotected. */ static bool ntfs_check_log_client_array(struct inode *vi, - RESTART_PAGE_HEADER *rp) + struct restart_page_header *rp) { - RESTART_AREA *ra; - LOG_CLIENT_RECORD *ca, *cr; + struct restart_area *ra; + struct log_client_record *ca, *cr; u16 nr_clients, idx; bool in_free_list, idx_is_first; ntfs_debug("Entering."); - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - ca = (LOG_CLIENT_RECORD*)((u8*)ra + + ra = (struct restart_area *)((u8 *)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (struct log_client_record *)((u8 *)ra + le16_to_cpu(ra->client_array_offset)); /* * Check the ra->client_free_list first and then check the @@ -302,13 +289,13 @@ static bool ntfs_check_log_client_array(struct inode *vi, ntfs_debug("Done."); return true; err_out: - ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + ntfs_error(vi->i_sb, "LogFile log client array is corrupt."); return false; } /** * ntfs_check_and_load_restart_page - check the restart page for consistency - * @vi: $LogFile inode to which the restart page belongs + * @vi: LogFile inode to which the restart page belongs * @rp: restart page to check * @pos: position in @vi at which the restart page resides * @wrp: [OUT] copy of the multi sector transfer deprotected restart page @@ -331,14 +318,14 @@ static bool ntfs_check_log_client_array(struct inode *vi, * The following error codes are defined: * -EINVAL - The restart page is inconsistent. * -ENOMEM - Not enough memory to load the restart page. - * -EIO - Failed to reading from $LogFile. + * -EIO - Failed to read from LogFile. */ static int ntfs_check_and_load_restart_page(struct inode *vi, - RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, - LSN *lsn) + struct restart_page_header *rp, s64 pos, struct restart_page_header **wrp, + s64 *lsn) { - RESTART_AREA *ra; - RESTART_PAGE_HEADER *trp; + struct restart_area *ra; + struct restart_page_header *trp; int size, err; ntfs_debug("Entering."); @@ -352,15 +339,14 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, /* Error output already done inside the function. */ return -EINVAL; } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ra = (struct restart_area *)((u8 *)rp + le16_to_cpu(rp->restart_area_offset)); /* * Allocate a buffer to store the whole restart page so we can multi * sector transfer deprotect it. 
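* (Deprotection undoes the update sequence fixups, restoring the last two * bytes of each 512-byte sector from the update sequence array.)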
*/ trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); if (!trp) { - ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " - "restart page buffer."); + ntfs_error(vi->i_sb, "Failed to allocate memory for LogFile restart page buffer."); return -ENOMEM; } /* @@ -373,7 +359,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); } else { pgoff_t idx; - struct page *page; + struct folio *folio; int have_read, to_read; /* First copy what we already have in @rp. */ @@ -382,20 +368,19 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, have_read = size; to_read = le32_to_cpu(rp->system_page_size) - size; idx = (pos + size) >> PAGE_SHIFT; - BUG_ON((pos + size) & ~PAGE_MASK); do { - page = ntfs_map_page(vi->i_mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vi->i_sb, "Error mapping $LogFile " - "page (index %lu).", idx); - err = PTR_ERR(page); + folio = read_mapping_folio(vi->i_mapping, idx, NULL); + if (IS_ERR(folio)) { + ntfs_error(vi->i_sb, "Error mapping LogFile page (index %lu).", + idx); + err = PTR_ERR(folio); if (err != -EIO && err != -ENOMEM) err = -EIO; goto err_out; } size = min_t(int, to_read, PAGE_SIZE); - memcpy((u8*)trp + have_read, page_address(page), size); - ntfs_unmap_page(page); + memcpy((u8 *)trp + have_read, folio_address(folio), size); + folio_put(folio); have_read += size; to_read -= size; idx++; @@ -405,19 +390,18 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, * Perform the multi sector transfer deprotection on the buffer if the * restart page is protected. */ - if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) - && post_read_mst_fixup((NTFS_RECORD*)trp, - le32_to_cpu(rp->system_page_size))) { + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) && + post_read_mst_fixup((struct ntfs_record *)trp, le32_to_cpu(rp->system_page_size))) { /* - * A multi sector tranfer error was detected. We only need to + * A multi sector transfer error was detected. We only need to * abort if the restart page contents exceed the multi sector * transfer fixup of the first sector. */ if (le16_to_cpu(rp->restart_area_offset) + le16_to_cpu(ra->restart_area_length) > NTFS_BLOCK_SIZE - sizeof(u16)) { - ntfs_error(vi->i_sb, "Multi sector transfer error " - "detected in $LogFile restart page."); + ntfs_error(vi->i_sb, + "Multi sector transfer error detected in LogFile restart page."); err = -EINVAL; goto err_out; } @@ -437,9 +421,9 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, } if (lsn) { if (ntfs_is_rstr_record(rp->magic)) - *lsn = sle64_to_cpu(ra->current_lsn); + *lsn = le64_to_cpu(ra->current_lsn); else /* if (ntfs_is_chkd_record(rp->magic)) */ - *lsn = sle64_to_cpu(rp->chkdsk_lsn); + *lsn = le64_to_cpu(rp->chkdsk_lsn); } ntfs_debug("Done."); if (wrp) @@ -453,37 +437,37 @@ static int ntfs_check_and_load_restart_page(struct inode *vi, /** * ntfs_check_logfile - check the journal for consistency - * @log_vi: struct inode of loaded journal $LogFile to check + * @log_vi: struct inode of loaded journal LogFile to check * @rp: [OUT] on success this is a copy of the current restart page * - * Check the $LogFile journal for consistency and return 'true' if it is + * Check the LogFile journal for consistency and return 'true' if it is * consistent and 'false' if not. On success, the current restart page is * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. 
 * * At present we only check the two restart pages and ignore the log record * pages. * - * Note that the MstProtected flag is not set on the $LogFile inode and hence + * Note that the MstProtected flag is not set on the LogFile inode and hence * when reading pages they are not deprotected. This is because we do not know - * if the $LogFile was created on a system with a different page size to ours + * if the LogFile was created on a system with a different page size to ours * yet and mst deprotection would fail if our page size is smaller. */ -bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +bool ntfs_check_logfile(struct inode *log_vi, struct restart_page_header **rp) { s64 size, pos; - LSN rstr1_lsn, rstr2_lsn; - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + s64 rstr1_lsn, rstr2_lsn; + struct ntfs_volume *vol = NTFS_SB(log_vi->i_sb); struct address_space *mapping = log_vi->i_mapping; - struct page *page = NULL; + struct folio *folio = NULL; u8 *kaddr = NULL; - RESTART_PAGE_HEADER *rstr1_ph = NULL; - RESTART_PAGE_HEADER *rstr2_ph = NULL; + struct restart_page_header *rstr1_ph = NULL; + struct restart_page_header *rstr2_ph = NULL; int log_page_size, err; bool logfile_is_empty = true; u8 log_page_bits; ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. */ + /* An empty LogFile must have been clean before it got emptied. */ if (NVolLogFileEmpty(vol)) goto is_empty; size = i_size_read(log_vi); @@ -496,8 +480,8 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * log page size if the page cache size is between the default log page * size and twice that. */ - if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= - DefaultLogPageSize * 2) + if (DefaultLogPageSize <= PAGE_SIZE && + PAGE_SIZE <= DefaultLogPageSize * 2) log_page_size = DefaultLogPageSize; else log_page_size = PAGE_SIZE; @@ -513,7 +497,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) */ if (size < log_page_size * 2 || (size - log_page_size * 2) >> log_page_bits < MinLogRecordPages) { - ntfs_error(vol->sb, "$LogFile is too small."); + ntfs_error(vol->sb, "LogFile is too small."); return false; } /* @@ -526,23 +510,26 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) */ for (pos = 0; pos < size; pos <<= 1) { pgoff_t idx = pos >> PAGE_SHIFT; - if (!page || page->index != idx) { - if (page) - ntfs_unmap_page(page); - page = ntfs_map_page(mapping, idx); - if (IS_ERR(page)) { - ntfs_error(vol->sb, "Error mapping $LogFile " - "page (index %lu).", idx); + + if (!folio || folio->index != idx) { + if (folio) { + kunmap_local(kaddr); + folio_put(folio); + } + folio = read_mapping_folio(mapping, idx, NULL); + if (IS_ERR(folio)) { + ntfs_error(vol->sb, "Error mapping LogFile page (index %lu).", + idx); goto err_out; } } - kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); + kaddr = (u8 *)kmap_local_folio(folio, 0) + (pos & ~PAGE_MASK); /* * A non-empty block means the logfile is not empty while an * empty block after a non-empty block has been encountered * means we are done. */ - if (!ntfs_is_empty_recordp((le32*)kaddr)) + if (!ntfs_is_empty_recordp((__le32 *)kaddr)) logfile_is_empty = false; else if (!logfile_is_empty) break; @@ -550,11 +537,11 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * A log record page means there cannot be a restart page after * this so no need to continue searching. 
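* (The probe offsets are 0 then 512, 1024, 2048 and so on, doubling each * time, so a restart page at any power-of-two system page size is found.)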
*/ - if (ntfs_is_rcrd_recordp((le32*)kaddr)) + if (ntfs_is_rcrd_recordp((__le32 *)kaddr)) break; /* If not a (modified by chkdsk) restart page, continue. */ - if (!ntfs_is_rstr_recordp((le32*)kaddr) && - !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!ntfs_is_rstr_recordp((__le32 *)kaddr) && + !ntfs_is_chkd_recordp((__le32 *)kaddr)) { if (!pos) pos = NTFS_BLOCK_SIZE >> 1; continue; @@ -565,7 +552,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * deprotected restart page. */ err = ntfs_check_and_load_restart_page(log_vi, - (RESTART_PAGE_HEADER*)kaddr, pos, + (struct restart_page_header *)kaddr, pos, !rstr1_ph ? &rstr1_ph : &rstr2_ph, !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); if (!err) { @@ -589,25 +576,27 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * find a valid one further in the file. */ if (err != -EINVAL) { - ntfs_unmap_page(page); + kunmap_local(kaddr); + folio_put(folio); goto err_out; } /* Continue looking. */ if (!pos) pos = NTFS_BLOCK_SIZE >> 1; } - if (page) - ntfs_unmap_page(page); + if (folio) { + kunmap_local(kaddr); + folio_put(folio); + } if (logfile_is_empty) { NVolSetLogFileEmpty(vol); is_empty: - ntfs_debug("Done. ($LogFile is empty.)"); + ntfs_debug("Done. (LogFile is empty.)"); return true; } if (!rstr1_ph) { - BUG_ON(rstr2_ph); - ntfs_error(vol->sb, "Did not find any restart pages in " - "$LogFile and it was not empty."); + ntfs_error(vol->sb, + "Did not find any restart pages in LogFile and it was not empty."); return false; } /* If both restart pages were found, use the more recent one. */ @@ -617,14 +606,12 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) * Otherwise just throw it away. */ if (rstr2_lsn > rstr1_lsn) { - ntfs_debug("Using second restart page as it is more " - "recent."); + ntfs_debug("Using second restart page as it is more recent."); ntfs_free(rstr1_ph); rstr1_ph = rstr2_ph; /* rstr1_lsn = rstr2_lsn; */ } else { - ntfs_debug("Using first restart page as it is more " - "recent."); + ntfs_debug("Using first restart page as it is more recent."); ntfs_free(rstr2_ph); } rstr2_ph = NULL; @@ -643,98 +630,42 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) } /** - * ntfs_is_logfile_clean - check in the journal if the volume is clean - * @log_vi: struct inode of loaded journal $LogFile to check - * @rp: copy of the current restart page - * - * Analyze the $LogFile journal and return 'true' if it indicates the volume was - * shutdown cleanly and 'false' if not. - * - * At present we only look at the two restart pages and ignore the log record - * pages. This is a little bit crude in that there will be a very small number - * of cases where we think that a volume is dirty when in fact it is clean. - * This should only affect volumes that have not been shutdown cleanly but did - * not have any pending, non-check-pointed i/o, i.e. they were completely idle - * at least for the five seconds preceding the unclean shutdown. - * - * This function assumes that the $LogFile journal has already been consistency - * checked by a call to ntfs_check_logfile() and in particular if the $LogFile - * is empty this function requires that NVolLogFileEmpty() is true otherwise an - * empty volume will be reported as dirty. - */ -bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) -{ - ntfs_volume *vol = NTFS_SB(log_vi->i_sb); - RESTART_AREA *ra; - - ntfs_debug("Entering."); - /* An empty $LogFile must have been clean before it got emptied. 
*/ - if (NVolLogFileEmpty(vol)) { - ntfs_debug("Done. ($LogFile is empty.)"); - return true; - } - BUG_ON(!rp); - if (!ntfs_is_rstr_record(rp->magic) && - !ntfs_is_chkd_record(rp->magic)) { - ntfs_error(vol->sb, "Restart page buffer is invalid. This is " - "probably a bug in that the $LogFile should " - "have been consistency checked before calling " - "this function."); - return false; - } - ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); - /* - * If the $LogFile has active clients, i.e. it is open, and we do not - * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, - * we assume there was an unclean shutdown. - */ - if (ra->client_in_use_list != LOGFILE_NO_CLIENT && - !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { - ntfs_debug("Done. $LogFile indicates a dirty shutdown."); - return false; - } - /* $LogFile indicates a clean shutdown. */ - ntfs_debug("Done. $LogFile indicates a clean shutdown."); - return true; -} - -/** - * ntfs_empty_logfile - empty the contents of the $LogFile journal - * @log_vi: struct inode of loaded journal $LogFile to empty + * ntfs_empty_logfile - empty the contents of the LogFile journal + * @log_vi: struct inode of loaded journal LogFile to empty * - * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * Empty the contents of the LogFile journal @log_vi and return 'true' on * success and 'false' on error. * - * This function assumes that the $LogFile journal has already been consistency + * This function assumes that the LogFile journal has already been consistency * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() - * has been used to ensure that the $LogFile is clean. + * has been used to ensure that the LogFile is clean. */ bool ntfs_empty_logfile(struct inode *log_vi) { - VCN vcn, end_vcn; - ntfs_inode *log_ni = NTFS_I(log_vi); - ntfs_volume *vol = log_ni->vol; + s64 vcn, end_vcn; + struct ntfs_inode *log_ni = NTFS_I(log_vi); + struct ntfs_volume *vol = log_ni->vol; struct super_block *sb = vol->sb; - runlist_element *rl; + struct runlist_element *rl; unsigned long flags; - unsigned block_size, block_size_bits; int err; bool should_wait = true; + char *empty_buf = NULL; + struct file_ra_state *ra = NULL; ntfs_debug("Entering."); if (NVolLogFileEmpty(vol)) { ntfs_debug("Done."); return true; } + /* * We cannot use ntfs_attr_set() because we may be still in the middle * of a mount operation. Thus we do the emptying by hand by first - * zapping the page cache pages for the $LogFile/$DATA attribute and + * zapping the page cache pages for the LogFile/DATA attribute and * then emptying each of the buffers in each of the clusters specified * by the runlist by hand. */ - block_size = sb->s_blocksize; - block_size_bits = sb->s_blocksize_bits; vcn = 0; read_lock_irqsave(&log_ni->size_lock, flags); end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> @@ -747,19 +678,30 @@ bool ntfs_empty_logfile(struct inode *log_vi) map_vcn: err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); if (err) { - ntfs_error(sb, "Failed to map runlist fragment (error " - "%d).", -err); + ntfs_error(sb, "Failed to map runlist fragment (error %d).", -err); goto err; } rl = log_ni->runlist.rl; - BUG_ON(!rl || vcn < rl->vcn || !rl->length); } /* Seek to the runlist element containing @vcn. 
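rl[1].vcn is the start of the next run, so the loop stops at the run whose VCN range contains @vcn; a zero length element terminates the list.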
 */ while (rl->length && vcn >= rl[1].vcn) rl++; + + err = -ENOMEM; + empty_buf = ntfs_malloc_nofs(vol->cluster_size); + if (!empty_buf) + goto err; + + memset(empty_buf, 0xff, vol->cluster_size); + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + goto err; + + file_ra_state_init(ra, sb->s_bdev->bd_mapping); do { - LCN lcn; - sector_t block, end_block; + s64 lcn; + loff_t start, end; s64 len; /* @@ -769,6 +711,10 @@ bool ntfs_empty_logfile(struct inode *log_vi) lcn = rl->lcn; if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { vcn = rl->vcn; + ntfs_free(empty_buf); + empty_buf = NULL; + kfree(ra); + ra = NULL; goto map_vcn; } /* If this run is not valid abort with an error. */ @@ -777,29 +720,23 @@ bool ntfs_empty_logfile(struct inode *log_vi) /* Skip holes. */ if (lcn == LCN_HOLE) continue; - block = lcn << vol->cluster_size_bits >> block_size_bits; + start = NTFS_CLU_TO_B(vol, lcn); len = rl->length; if (rl[1].vcn > end_vcn) len = end_vcn - rl->vcn; - end_block = (lcn + len) << vol->cluster_size_bits >> - block_size_bits; - /* Iterate over the blocks in the run and empty them. */ + end = NTFS_CLU_TO_B(vol, lcn + len); + + page_cache_sync_readahead(sb->s_bdev->bd_mapping, ra, NULL, + start >> PAGE_SHIFT, (end - start) >> PAGE_SHIFT); + do { - struct buffer_head *bh; + err = ntfs_dev_write(sb, empty_buf, start, + vol->cluster_size, should_wait); + if (err) { + ntfs_error(sb, "ntfs_dev_write failed, err : %d\n", err); + goto io_err; + } - /* Obtain the buffer, possibly not uptodate. */ - bh = sb_getblk(sb, block); - BUG_ON(!bh); - /* Setup buffer i/o submission. */ - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - /* Set the entire contents of the buffer to 0xff. */ - memset(bh->b_data, -1, block_size); - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); - if (buffer_dirty(bh)) - clear_buffer_dirty(bh); /* * Submit the buffer and wait for i/o to complete but * only for the first buffer so we do not miss really @@ -807,25 +744,14 @@ bool ntfs_empty_logfile(struct inode *log_vi) * completed ignore errors afterwards as we can assume * that if one buffer worked all of them will work. */ - submit_bh(REQ_OP_WRITE, bh); - if (should_wait) { + if (should_wait) should_wait = false; - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - goto io_err; - } - brelse(bh); - } while (++block < end_block); + start += vol->cluster_size; + } while (start < end); } while ((++rl)->vcn < end_vcn); up_write(&log_ni->runlist.lock); - /* - * Zap the pages again just in case any got instantiated whilst we were - * emptying the blocks by hand. FIXME: We may not have completed - * writing to all the buffer heads yet so this may happen too early. - * We really should use a kernel thread to do the emptying - * asynchronously and then we can also set the volume dirty and output - * an error message if emptying should fail. - */ + ntfs_free(empty_buf); + kfree(ra); truncate_inode_pages(log_vi->i_mapping, 0); /* Set the flag so we do not have to do it again on remount. 
*/ NVolSetLogFileEmpty(vol); @@ -840,10 +766,10 @@ bool ntfs_empty_logfile(struct inode *log_vi) NVolSetErrors(vol); err = -EIO; err: + ntfs_free(empty_buf); + kfree(ra); up_write(&log_ni->runlist.lock); - ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + ntfs_error(sb, "Failed to fill LogFile with 0xff bytes (error %d).", -err); return false; } - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index 9160480222fd..6f370184aafe 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -1,13 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS - * project. + * NTFS kernel quota ($Quota) handling. + * Part of the Linux-NTFS project. * * Copyright (c) 2004 Anton Altaparmakov */ -#ifdef NTFS_RW - #include "index.h" #include "quota.h" #include "debug.h" @@ -20,11 +18,11 @@ * Mark the quotas out of date on the ntfs volume @vol and return 'true' on * success and 'false' on error. */ -bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +bool ntfs_mark_quotas_out_of_date(struct ntfs_volume *vol) { - ntfs_index_context *ictx; - QUOTA_CONTROL_ENTRY *qce; - const le32 qid = QUOTA_DEFAULTS_ID; + struct ntfs_index_context *ictx; + struct quota_control_entry *qce; + const __le32 qid = QUOTA_DEFAULTS_ID; int err; ntfs_debug("Entering."); @@ -35,7 +33,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) return false; } inode_lock(vol->quota_q_ino); - ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino), I30, 4); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); goto err_out; @@ -43,22 +41,20 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) err = ntfs_index_lookup(&qid, sizeof(qid), ictx); if (err) { if (err == -ENOENT) - ntfs_error(vol->sb, "Quota defaults entry is not " - "present."); + ntfs_error(vol->sb, "Quota defaults entry is not present."); else - ntfs_error(vol->sb, "Lookup of quota defaults entry " - "failed."); + ntfs_error(vol->sb, "Lookup of quota defaults entry failed."); goto err_out; } - if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { - ntfs_error(vol->sb, "Quota defaults entry size is invalid. " - "Run chkdsk."); + if (ictx->data_len < offsetof(struct quota_control_entry, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. Run chkdsk."); goto err_out; } - qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + qce = (struct quota_control_entry *)ictx->data; if (le32_to_cpu(qce->version) != QUOTA_VERSION) { - ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " - "supported.", le32_to_cpu(qce->version)); + ntfs_error(vol->sb, + "Quota defaults entry version 0x%x is not supported.", + le32_to_cpu(qce->version)); goto err_out; } ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); @@ -99,5 +95,3 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) inode_unlock(vol->quota_q_ino); return false; } - -#endif /* NTFS_RW */ diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 4e980170d86a..695c8a998c56 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of - * the Linux-NTFS project. Adapted from the old NTFS driver, - * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. 
Adapted from the old NTFS driver, * + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne * Copyright (c) 2002-2005 Anton Altaparmakov */ @@ -20,7 +20,7 @@ #include "debug.h" /* Definition of the ntfs sysctl. */ -static struct ctl_table ntfs_sysctls[] = { +static const struct ctl_table ntfs_sysctls[] = { { .procname = "ntfs-debug", .data = &debug_msgs, /* Data pointer and size. */ @@ -28,6 +28,7 @@ static struct ctl_table ntfs_sysctls[] = { .mode = 0644, /* Mode, proc handler. */ .proc_handler = proc_dointvec }, + {} }; /* Storage for the sysctls header. */ @@ -42,17 +43,14 @@ static struct ctl_table_header *sysctls_root_table; int ntfs_sysctl(int add) { if (add) { - BUG_ON(sysctls_root_table); sysctls_root_table = register_sysctl("fs", ntfs_sysctls); if (!sysctls_root_table) return -ENOMEM; } else { - BUG_ON(!sysctls_root_table); unregister_sysctl_table(sysctls_root_table); sysctls_root_table = NULL; } return 0; } - #endif /* CONFIG_SYSCTL */ #endif /* DEBUG */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c index a6b6c64f14a9..b4424297bacf 100644 --- a/fs/ntfs/unistr.c +++ b/fs/ntfs/unistr.c @@ -1,15 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. + * NTFS Unicode string handling. Part of the Linux-NTFS project. * * Copyright (c) 2001-2006 Anton Altaparmakov */ -#include - -#include "types.h" -#include "debug.h" #include "ntfs.h" +#include "malloc.h" /* * IMPORTANT @@ -51,9 +48,9 @@ static const u8 legal_ansi_char_array[0x40] = { * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, * the @upcase table is used to performa a case insensitive comparison. */ -bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, - const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_size) +bool ntfs_are_names_equal(const __le16 *s1, size_t s1_len, + const __le16 *s2, size_t s2_len, const u32 ic, + const __le16 *upcase, const u32 upcase_size) { if (s1_len != s2_len) return false; @@ -65,7 +62,9 @@ bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, /** * ntfs_collate_names - collate two Unicode names * @name1: first Unicode name to compare + * @name1_len: first Unicode name length * @name2: second Unicode name to compare + * @name2_len: second Unicode name length * @err_val: if @name1 contains an invalid character return this value * @ic: either CASE_SENSITIVE or IGNORE_CASE * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) @@ -80,10 +79,10 @@ bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, * * The following characters are considered invalid: '"', '*', '<', '>' and '?'. */ -int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, - const ntfschar *name2, const u32 name2_len, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) +int ntfs_collate_names(const __le16 *name1, const u32 name1_len, + const __le16 *name2, const u32 name2_len, + const int err_val, const u32 ic, + const __le16 *upcase, const u32 upcase_len) { u32 cnt, min_len; u16 c1, c2; @@ -132,7 +131,7 @@ int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, * if @s1 (or the first @n Unicode characters thereof) is found, respectively, * to be less than, to match, or be greater than @s2. 
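* The comparison is a plain binary one on the 16-bit code units after * le16_to_cpu(); no case folding or locale rules are applied here.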
*/ -int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +int ntfs_ucsncmp(const __le16 *s1, const __le16 *s2, size_t n) { u16 c1, c2; size_t i; @@ -168,16 +167,18 @@ int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) * if @s1 (or the first @n Unicode characters thereof) is found, respectively, * to be less than, to match, or be greater than @s2. */ -int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, - const ntfschar *upcase, const u32 upcase_size) +int ntfs_ucsncasecmp(const __le16 *s1, const __le16 *s2, size_t n, + const __le16 *upcase, const u32 upcase_size) { size_t i; u16 c1, c2; for (i = 0; i < n; ++i) { - if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(s1[i]); + if (c1 < upcase_size) c1 = le16_to_cpu(upcase[c1]); - if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(s2[i]); + if (c2 < upcase_size) c2 = le16_to_cpu(upcase[c2]); if (c1 < c2) return -1; @@ -189,42 +190,20 @@ int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, return 0; } -void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, - const u32 upcase_len) +int ntfs_file_compare_values(const struct file_name_attr *file_name_attr1, + const struct file_name_attr *file_name_attr2, + const int err_val, const u32 ic, + const __le16 *upcase, const u32 upcase_len) { - u32 i; - u16 u; - - for (i = 0; i < name_len; i++) - if ((u = le16_to_cpu(name[i])) < upcase_len) - name[i] = upcase[u]; -} - -void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, - const ntfschar *upcase, const u32 upcase_len) -{ - ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, - file_name_attr->file_name_length, upcase, upcase_len); -} - -int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, - FILE_NAME_ATTR *file_name_attr2, - const int err_val, const IGNORE_CASE_BOOL ic, - const ntfschar *upcase, const u32 upcase_len) -{ - return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + return ntfs_collate_names((__le16 *)&file_name_attr1->file_name, file_name_attr1->file_name_length, - (ntfschar*)&file_name_attr2->file_name, + (__le16 *)&file_name_attr2->file_name, file_name_attr2->file_name_length, err_val, ic, upcase, upcase_len); } /** * ntfs_nlstoucs - convert NLS string to little endian Unicode string - * @vol: ntfs volume which we are working with - * @ins: input NLS string buffer - * @ins_len: length of input string in bytes - * @outs: on return contains the allocated output Unicode string buffer * * Convert the input string @ins, which is in whatever format the loaded NLS * map dictates, into a little endian, 2-byte Unicode string. @@ -242,53 +221,68 @@ int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, * * This might look a bit odd due to fast path optimization... */ -int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, - const int ins_len, ntfschar **outs) +int ntfs_nlstoucs(const struct ntfs_volume *vol, const char *ins, + const int ins_len, __le16 **outs, int max_name_len) { struct nls_table *nls = vol->nls_map; - ntfschar *ucs; + __le16 *ucs; wchar_t wc; int i, o, wc_len; /* We do not trust outside sources. 
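On nls=utf8 mounts the conversion below short-circuits to utf8s_to_utf16s(); otherwise the loaded NLS table converts one character at a time via char2uni().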
*/ if (likely(ins)) { - ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (max_name_len > NTFS_MAX_NAME_LEN) + ucs = kvmalloc((max_name_len + 2) * sizeof(__le16), + GFP_NOFS | __GFP_ZERO); + else + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); if (likely(ucs)) { - for (i = o = 0; i < ins_len; i += wc_len) { - wc_len = nls->char2uni(ins + i, ins_len - i, - &wc); - if (likely(wc_len >= 0 && - o < NTFS_MAX_NAME_LEN)) { - if (likely(wc)) { - ucs[o++] = cpu_to_le16(wc); - continue; - } /* else if (!wc) */ - break; - } /* else if (wc_len < 0 || - o >= NTFS_MAX_NAME_LEN) */ - goto name_err; + if (vol->nls_utf8) { + o = utf8s_to_utf16s(ins, ins_len, + UTF16_LITTLE_ENDIAN, + ucs, + max_name_len + 2); + if (o < 0 || o > max_name_len) { + wc_len = o; + goto name_err; + } + } else { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < max_name_len)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } + + goto name_err; + } } ucs[o] = 0; *outs = ucs; return o; } /* else if (!ucs) */ - ntfs_error(vol->sb, "Failed to allocate buffer for converted " - "name from ntfs_name_cache."); + ntfs_debug("Failed to allocate buffer for converted name from ntfs_name_cache."); return -ENOMEM; } /* else if (!ins) */ ntfs_error(vol->sb, "Received NULL pointer."); return -EINVAL; name_err: - kmem_cache_free(ntfs_name_cache, ucs); + if (max_name_len > NTFS_MAX_NAME_LEN) + kvfree(ucs); + else + kmem_cache_free(ntfs_name_cache, ucs); if (wc_len < 0) { - ntfs_error(vol->sb, "Name using character set %s contains " - "characters that cannot be converted to " - "Unicode.", nls->charset); + ntfs_debug("Name using character set %s contains characters that cannot be converted to Unicode.", + nls->charset); i = -EILSEQ; - } else /* if (o >= NTFS_MAX_NAME_LEN) */ { - ntfs_error(vol->sb, "Name is too long (maximum length for a " - "name on NTFS is %d Unicode characters.", - NTFS_MAX_NAME_LEN); + } else { + ntfs_debug("Name is too long (maximum length for a name on NTFS is %d Unicode characters.", + max_name_len); i = -ENAMETOOLONG; } return i; @@ -319,7 +313,7 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, * * This might look a bit odd due to fast path optimization... */ -int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, +int ntfs_ucstonls(const struct ntfs_volume *vol, const __le16 *ins, const int ins_len, unsigned char **outs, int outs_len) { struct nls_table *nls = vol->nls_map; @@ -340,8 +334,20 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, if (!ns) goto mem_err_out; } + + if (vol->nls_utf8) { + o = utf16s_to_utf8s((const wchar_t *)ins, ins_len, + UTF16_LITTLE_ENDIAN, ns, ns_len); + if (o >= ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + goto done; + } + for (i = o = 0; i < ins_len; i++) { -retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, +retry: + wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, ns_len - o); if (wc > 0) { o += wc; @@ -363,6 +369,7 @@ retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, } /* wc < 0, real error. */ goto conversion_err; } +done: ns[o] = 0; *outs = ns; return o; @@ -370,9 +377,9 @@ retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, ntfs_error(vol->sb, "Received NULL pointer."); return -EINVAL; conversion_err: - ntfs_error(vol->sb, "Unicode name contains characters that cannot be " - "converted to character set %s. 
You might want to " - "try to use the mount option nls=utf8.", nls->charset); + ntfs_error(vol->sb, + "Unicode name contains characters that cannot be converted to character set %s. You might want to try to use the mount option nls=utf8.", + nls->charset); if (ns != *outs) kfree(ns); if (wc != -ENAMETOOLONG) @@ -382,3 +389,85 @@ retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, ntfs_error(vol->sb, "Failed to allocate name!"); return -ENOMEM; } + +/** + * ntfs_ucsnlen - determine the length of a little endian Unicode string + * @s: pointer to Unicode string + * @maxlen: maximum length of string @s + * + * Return the number of Unicode characters in the little endian Unicode + * string @s up to a maximum of maxlen Unicode characters, not including + * the terminating (__le16)'\0'. If there is no (__le16)'\0' between @s + * and @s + @maxlen, @maxlen is returned. + * + * This function never looks beyond @s + @maxlen. + */ +static u32 ntfs_ucsnlen(const __le16 *s, u32 maxlen) +{ + u32 i; + + for (i = 0; i < maxlen; i++) { + if (!le16_to_cpu(s[i])) + break; + } + return i; +} + +/** + * ntfs_ucsndup - duplicate little endian Unicode string + * @s: pointer to Unicode string + * @maxlen: maximum length of string @s + * + * Return a pointer to a new little endian Unicode string which is a duplicate + * of the string s. Memory for the new string is obtained with ntfs_malloc(3), + * and can be freed with free(3). + * + * A maximum of @maxlen Unicode characters are copied and a terminating + * (__le16)'\0' little endian Unicode character is added. + * + * This function never looks beyond @s + @maxlen. + * + * Return a pointer to the new little endian Unicode string on success and NULL + * on failure with errno set to the error code. + */ +__le16 *ntfs_ucsndup(const __le16 *s, u32 maxlen) +{ + __le16 *dst; + u32 len; + + len = ntfs_ucsnlen(s, maxlen); + dst = ntfs_malloc_nofs((len + 1) * sizeof(__le16)); + if (dst) { + memcpy(dst, s, len * sizeof(__le16)); + dst[len] = cpu_to_le16(L'\0'); + } + return dst; +} + +/** + * ntfs_names_are_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return TRUE (1) if the names are + * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to perform a case insensitive comparison. + */ +bool ntfs_names_are_equal(const __le16 *s1, size_t s1_len, + const __le16 *s2, size_t s2_len, + const u32 ic, + const __le16 *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (!s1_len) + return true; + if (ic == CASE_SENSITIVE) + return ntfs_ucsncmp(s1, s2, s1_len) ? false : true; + return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? false : true; +} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c index 4ebe84a78dea..21afd7e92428 100644 --- a/fs/ntfs/upcase.c +++ b/fs/ntfs/upcase.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * upcase.c - Generate the full NTFS Unicode upcase table in little endian. - * Part of the Linux-NTFS project. + * Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. 
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 4ebe84a78dea..21afd7e92428 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
- *	      Part of the Linux-NTFS project.
+ * Generate the full NTFS Unicode upcase table in little endian.
+ * Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001 Richard Russon
  * Copyright (c) 2001-2006 Anton Altaparmakov
@@ -10,7 +10,7 @@
 #include "malloc.h"
 #include "ntfs.h"
 
-ntfschar *generate_default_upcase(void)
+__le16 *generate_default_upcase(void)
 {
 	static const int uc_run_table[][3] = { /* Start, End, Add */
 	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
@@ -52,12 +52,12 @@
 	};
 
 	int i, r;
-	ntfschar *uc;
+	__le16 *uc;
 
-	uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
+	uc = ntfs_malloc_nofs(default_upcase_len * sizeof(__le16));
 	if (!uc)
 		return uc;
-	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+	memset(uc, 0, default_upcase_len * sizeof(__le16));
 	/* Generate the little endian Unicode upcase table used by ntfs. */
 	for (i = 0; i < default_upcase_len; i++)
 		uc[i] = cpu_to_le16(i);
-- 
2.25.1
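Each run in uc_run_table above is encoded as { first, last + 1, delta }; for
example, { 0x0061, 0x007B, -32 } folds 'a'..'z' onto 'A'..'Z'. A minimal
standalone sketch of applying such runs to an identity table (table size and
run list are truncated here purely for illustration):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		static const int uc_run_table[][3] = { /* Start, End, Add */
			{0x0061, 0x007B, -32},	/* 'a'..'z' -> 'A'..'Z' */
		};
		static uint16_t uc[0x100];
		int i, r;

		/* Start from the identity mapping... */
		for (i = 0; i < 0x100; i++)
			uc[i] = i;
		/* ...then shift every code point covered by a run. */
		for (r = 0; r < 1; r++)
			for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
				uc[i] += uc_run_table[r][2];

		printf("upcase('a') = '%c'\n", uc['a']);	/* prints 'A' */
		return 0;
	}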
ntfs filesystem has been remade and is returning as a new implementation.
ntfs3 no longer needs to be an alias for ntfs.

Signed-off-by: Namjae Jeon
---
 fs/ntfs3/Kconfig   |  9 --------
 fs/ntfs3/dir.c     |  9 --------
 fs/ntfs3/file.c    | 10 ---------
 fs/ntfs3/inode.c   | 16 ++++----------
 fs/ntfs3/ntfs_fs.h | 11 ----------
 fs/ntfs3/super.c   | 52 ----------------------------------------------
 6 files changed, 4 insertions(+), 103 deletions(-)

diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index 7bc31d69f680..cdfdf51e55d7 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -46,12 +46,3 @@ config NTFS3_FS_POSIX_ACL
 	  NOTE: this is linux only feature. Windows will ignore these ACLs.
 
 	  If you don't know what Access Control Lists are, say N.
-
-config NTFS_FS
-	tristate "NTFS file system support"
-	select NTFS3_FS
-	select BUFFER_HEAD
-	select NLS
-	help
-	  This config option is here only for backward compatibility. NTFS
-	  filesystem is now handled by the NTFS3 driver.
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index b98e95d6b4d9..fc39e7330365 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -631,13 +631,4 @@ const struct file_operations ntfs_dir_operations = {
 	.compat_ioctl	= ntfs_compat_ioctl,
 #endif
 };
-
-#if IS_ENABLED(CONFIG_NTFS_FS)
-const struct file_operations ntfs_legacy_dir_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.iterate_shared	= ntfs_readdir,
-	.open		= ntfs_file_open,
-};
-#endif
 // clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 2e7b2e566ebe..0faa856fc470 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -1478,14 +1478,4 @@ const struct file_operations ntfs_file_operations = {
 	.fallocate	= ntfs_fallocate,
 	.release	= ntfs_file_release,
 };
-
-#if IS_ENABLED(CONFIG_NTFS_FS)
-const struct file_operations ntfs_legacy_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read_iter	= ntfs_file_read_iter,
-	.splice_read	= ntfs_file_splice_read,
-	.open		= ntfs_file_open,
-	.release	= ntfs_file_release,
-};
-#endif
 // clang-format on
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 0a9ac5efeb67..826840c257d3 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -444,9 +444,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
 		 * Usually a hard links to directories are disabled.
 		 */
 		inode->i_op = &ntfs_dir_inode_operations;
-		inode->i_fop = unlikely(is_legacy_ntfs(sb)) ?
-				       &ntfs_legacy_dir_operations :
-				       &ntfs_dir_operations;
+		inode->i_fop = &ntfs_dir_operations;
 		ni->i_valid = 0;
 	} else if (S_ISLNK(mode)) {
 		ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
@@ -456,9 +454,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
 	} else if (S_ISREG(mode)) {
 		ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
 		inode->i_op = &ntfs_file_inode_operations;
-		inode->i_fop = unlikely(is_legacy_ntfs(sb)) ?
-				       &ntfs_legacy_file_operations :
-				       &ntfs_file_operations;
+		inode->i_fop = &ntfs_file_operations;
 		inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
 							      &ntfs_aops;
 		if (ino != MFT_REC_MFT)
@@ -1590,9 +1586,7 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
 
 	if (S_ISDIR(mode)) {
 		inode->i_op = &ntfs_dir_inode_operations;
-		inode->i_fop = unlikely(is_legacy_ntfs(sb)) ?
-				       &ntfs_legacy_dir_operations :
-				       &ntfs_dir_operations;
+		inode->i_fop = &ntfs_dir_operations;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &ntfs_link_inode_operations;
 		inode->i_fop = NULL;
@@ -1601,9 +1595,7 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
 		inode_nohighmem(inode);
 	} else if (S_ISREG(mode)) {
 		inode->i_op = &ntfs_file_inode_operations;
-		inode->i_fop = unlikely(is_legacy_ntfs(sb)) ?
-				       &ntfs_legacy_file_operations :
-				       &ntfs_file_operations;
+		inode->i_fop = &ntfs_file_operations;
 		inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
 							      &ntfs_aops;
 		init_rwsem(&ni->file.run_lock);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index a4559c9f64e6..326644d23110 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -501,7 +501,6 @@ struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
 			   struct ntfs_fnd *fnd);
 bool dir_is_empty(struct inode *dir);
 extern const struct file_operations ntfs_dir_operations;
-extern const struct file_operations ntfs_legacy_dir_operations;
 
 /* Globals from file.c */
 int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -516,7 +515,6 @@ long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
 extern const struct inode_operations ntfs_special_inode_operations;
 extern const struct inode_operations ntfs_file_inode_operations;
 extern const struct file_operations ntfs_file_operations;
-extern const struct file_operations ntfs_legacy_file_operations;
 
 /* Globals from frecord.c */
 void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi);
@@ -1160,13 +1158,4 @@ static inline void le64_sub_cpu(__le64 *var, u64 val)
 	*var = cpu_to_le64(le64_to_cpu(*var) - val);
 }
 
-#if IS_ENABLED(CONFIG_NTFS_FS)
-bool is_legacy_ntfs(struct super_block *sb);
-#else
-static inline bool is_legacy_ntfs(struct super_block *sb)
-{
-	return false;
-}
-#endif
-
 #endif /* _LINUX_NTFS3_NTFS_FS_H */
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 8b0cf0ed4f72..d6fd14c191a9 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -415,12 +415,6 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
 	struct ntfs_mount_options *new_opts = fc->fs_private;
 	int ro_rw;
 
-	/* If ntfs3 is used as legacy ntfs enforce read-only mode. */
-	if (is_legacy_ntfs(sb)) {
-		fc->sb_flags |= SB_RDONLY;
-		goto out;
-	}
-
 	ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
 	if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
 		errorf(fc,
@@ -447,7 +441,6 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
 		return -EINVAL;
 	}
 
-out:
 	sync_filesystem(sb);
 	swap(sbi->options, fc->fs_private);
 
@@ -1670,8 +1663,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	ntfs_create_procdir(sb);
 
-	if (is_legacy_ntfs(sb))
-		sb->s_flags |= SB_RDONLY;
 	return 0;
 
 put_inode_out:
@@ -1876,47 +1867,6 @@ static struct file_system_type ntfs_fs_type = {
 	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
 };
 
-#if IS_ENABLED(CONFIG_NTFS_FS)
-static int ntfs_legacy_init_fs_context(struct fs_context *fc)
-{
-	int ret;
-
-	ret = __ntfs_init_fs_context(fc);
-	/* If ntfs3 is used as legacy ntfs enforce read-only mode. */
-	fc->sb_flags |= SB_RDONLY;
-	return ret;
-}
-
-static struct file_system_type ntfs_legacy_fs_type = {
-	.owner			= THIS_MODULE,
-	.name			= "ntfs",
-	.init_fs_context	= ntfs_legacy_init_fs_context,
-	.parameters		= ntfs_fs_parameters,
-	.kill_sb		= ntfs3_kill_sb,
-	.fs_flags		= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
-};
-MODULE_ALIAS_FS("ntfs");
-
-static inline void register_as_ntfs_legacy(void)
-{
-	int err = register_filesystem(&ntfs_legacy_fs_type);
-
-	if (err)
-		pr_warn("ntfs3: Failed to register legacy ntfs filesystem driver: %d\n", err);
-}
-
-static inline void unregister_as_ntfs_legacy(void)
-{
-	unregister_filesystem(&ntfs_legacy_fs_type);
-}
-bool is_legacy_ntfs(struct super_block *sb)
-{
-	return sb->s_type == &ntfs_legacy_fs_type;
-}
-#else
-static inline void register_as_ntfs_legacy(void) {}
-static inline void unregister_as_ntfs_legacy(void) {}
-#endif
-
 // clang-format on
 
 static int __init init_ntfs_fs(void)
@@ -1945,7 +1895,6 @@ static int __init init_ntfs_fs(void)
 		goto out1;
 	}
 
-	register_as_ntfs_legacy();
 	err = register_filesystem(&ntfs_fs_type);
 	if (err)
 		goto out;
@@ -1965,7 +1914,6 @@ static void __exit exit_ntfs_fs(void)
 	rcu_barrier();
 	kmem_cache_destroy(ntfs_inode_cachep);
 	unregister_filesystem(&ntfs_fs_type);
-	unregister_as_ntfs_legacy();
 	ntfs3_exit_bitmap();
 	ntfs_remove_proc_root();
 }
-- 
2.25.1
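With MODULE_ALIAS_FS("ntfs") removed from ntfs3, a mount request for the
"ntfs" filesystem type is served by whichever driver registers that name,
i.e. the revived fs/ntfs. For illustration, a minimal userspace sketch of
such a mount; the device and mount point are hypothetical placeholders and
the program needs root:

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Hypothetical device and mount point. */
		if (mount("/dev/sdb1", "/mnt/win", "ntfs", MS_RDONLY, NULL)) {
			perror("mount");
			return 1;
		}
		puts("mounted via the driver registered as \"ntfs\"");
		return 0;
	}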
This adds the Kconfig and Makefile for the ntfs filesystem.

Signed-off-by: Namjae Jeon
---
 fs/Kconfig       | 18 ++++++++++++++++++
 fs/Makefile      |  1 +
 fs/ntfs/Kconfig  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ntfs/Makefile | 18 ++++++++++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 fs/ntfs/Kconfig
 create mode 100644 fs/ntfs/Makefile

diff --git a/fs/Kconfig b/fs/Kconfig
index 0bfdaecaa877..c57cb6a53baf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -152,8 +152,26 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
 source "fs/exfat/Kconfig"
+source "fs/ntfs/Kconfig"
 source "fs/ntfs3/Kconfig"
 
+choice
+	prompt "Select built-in NTFS filesystem (only one can be built-in)"
+	default DEFAULT_NTFS
+	help
+	  Only one NTFS driver can be built into the kernel (y) when selecting
+	  a specific default. Both can still be built as modules (m).
+
+	config DEFAULT_NTFS_NONE
+		bool "No built-in restriction (allows both drivers as 'y')"
+
+	config DEFAULT_NTFS
+		bool "NTFS"
+
+	config DEFAULT_NTFS3
+		bool "NTFS3"
+endchoice
+
 endmenu
 
 endif # BLOCK
diff --git a/fs/Makefile b/fs/Makefile
index a04274a3c854..6893496697c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_NLS)		+= nls/
 obj-y				+= unicode/
 obj-$(CONFIG_SMBFS)		+= smb/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
+obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_NTFS3_FS)		+= ntfs3/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000000..ef14c68ed36c
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NTFS_FS
+	tristate "NTFS file system support"
+	depends on !DEFAULT_NTFS3 || m
+	select NLS
+	help
+	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+	  This allows you to mount devices formatted with the ntfs file system.
+
+	  To compile this as a module, choose M here: the module will be called
+	  ntfs.
+
+config NTFS_DEBUG
+	bool "NTFS debugging support"
+	depends on NTFS_FS
+	help
+	  If you are experiencing any problems with the NTFS file system, say
+	  Y here. This will result in additional consistency checks being
+	  performed by the driver as well as additional debugging messages
+	  being written to the system log. Note that debugging messages are
+	  disabled by default. To enable them, supply the option debug_msgs=1
+	  at the kernel command line when booting the kernel or as an option
+	  to insmod when loading the ntfs module. Once the driver is active,
+	  you can enable debugging messages by doing (as root):
+	  echo 1 > /proc/sys/fs/ntfs-debug
+	  Replacing the "1" with "0" would disable debug messages.
+
+	  If you leave debugging messages disabled, this results in little
+	  overhead, but enabling debug messages results in very significant
+	  slowdown of the system.
+
+	  When reporting bugs, please try to have available a full dump of
+	  debugging messages while the misbehaviour was occurring.
+
+config NTFS_FS_POSIX_ACL
+	bool "NTFS POSIX Access Control Lists"
+	depends on NTFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for ntfs
+	  filesystems.
+	  NOTE: this is a Linux-only feature. Windows will ignore these ACLs.
+
+	  If you don't know what Access Control Lists are, say N.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
new file mode 100644
index 000000000000..01faad8cbbc9
--- /dev/null
+++ b/fs/ntfs/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the ntfs filesystem support.
+#
+
+# to check robot warnings
+ccflags-y += -Wint-to-pointer-cast \
+	$(call cc-option,-Wunused-but-set-variable,-Wunused-const-variable) \
+	$(call cc-option,-Wold-style-declaration,-Wout-of-line-declaration)
+
+obj-$(CONFIG_NTFS_FS) += ntfs.o
+
+ntfs-y := aops.o attrib.o collate.o dir.o file.o index.o inode.o \
+	  mft.o mst.o namei.o runlist.o super.o unistr.o attrlist.o ea.o \
+	  upcase.o bitmap.o lcnalloc.o logfile.o reparse.o compress.o \
+	  iomap.o debug.o sysctl.o quota.o
+
+ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
-- 
2.25.1
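The NTFS_DEBUG help text above toggles debug messages at runtime through
/proc/sys/fs/ntfs-debug. The same can be done from C instead of echo; a
minimal sketch (requires root and a kernel built with CONFIG_NTFS_DEBUG=y):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/fs/ntfs-debug", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("1\n", f);	/* writing "0" disables them again */
		fclose(f);
		return 0;
	}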
Add myself and Hyunchul Lee as ntfs maintainers.

Signed-off-by: Namjae Jeon
---
 MAINTAINERS | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 12f49de7fe03..adf80c8207f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18646,6 +18646,15 @@ W:	https://github.com/davejiang/linux/wiki
 T:	git https://github.com/davejiang/linux.git
 F:	drivers/ntb/hw/intel/
 
+NTFS FILESYSTEM
+M:	Namjae Jeon
+M:	Hyunchul Lee
+L:	linux-fsdevel@vger.kernel.org
+S:	Maintained
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/linkinjeon/ntfs.git
+F:	Documentation/filesystems/ntfs.rst
+F:	fs/ntfs/
+
 NTFS3 FILESYSTEM
 M:	Konstantin Komarov
 L:	ntfs3@lists.linux.dev
-- 
2.25.1