Skip to content

Commit 56d1115

Browse files
Brian Foster authored and dchinner committed
xfs: allocate sparse inode chunks on full chunk allocation failure
xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode chunk. If all else fails, reduce the allocation to the sparse length and alignment and attempt to allocate a sparse inode chunk.

If sparse chunk allocation succeeds, check whether an inobt record already exists that can track the chunk. If so, inherit and update the existing record. Otherwise, insert a new record for the sparse chunk.

Create helpers to align sparse chunk inode records and insert or update existing records in the inode btrees. The xfs_inobt_insert_sprec() helper implements the merge or update semantics required for sparse inode records with respect to both the inobt and finobt. To update the inobt, either insert a new record or merge with an existing record. To update the finobt, use the updated inobt record to either insert or replace an existing record.

Signed-off-by: Brian Foster <[email protected]>
Reviewed-by: Dave Chinner <[email protected]>
Signed-off-by: Dave Chinner <[email protected]>
1 parent 4148c34 commit 56d1115

File tree

4 files changed

+401
-14
lines changed

4 files changed

+401
-14
lines changed

fs/xfs/libxfs/xfs_ialloc.c

Lines changed: 316 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,214 @@ xfs_ialloc_inode_init(
377377
return 0;
378378
}
379379

380+
/*
381+
* Align startino and allocmask for a recently allocated sparse chunk such that
382+
* they are fit for insertion (or merge) into the on-disk inode btrees.
383+
*
384+
* Background:
385+
*
386+
* When enabled, sparse inode support increases the inode alignment from cluster
387+
* size to inode chunk size. This means that the minimum range between two
388+
* non-adjacent inode records in the inobt is large enough for a full inode
389+
* record. This allows for cluster sized, cluster aligned block allocation
390+
* without need to worry about whether the resulting inode record overlaps with
391+
* another record in the tree. Without this basic rule, we would have to deal
392+
* with the consequences of overlap by potentially undoing recent allocations in
393+
* the inode allocation codepath.
394+
*
395+
* Because of this alignment rule (which is enforced on mount), there are two
396+
* inobt possibilities for newly allocated sparse chunks. One is that the
397+
* aligned inode record for the chunk covers a range of inodes not already
398+
* covered in the inobt (i.e., it is safe to insert a new sparse record). The
399+
* other is that a record already exists at the aligned startino that considers
400+
* the newly allocated range as sparse. In the latter case, record content is
401+
* merged in hope that sparse inode chunks fill to full chunks over time.
402+
*/
403+
STATIC void
404+
xfs_align_sparse_ino(
405+
struct xfs_mount *mp,
406+
xfs_agino_t *startino,
407+
uint16_t *allocmask)
408+
{
409+
xfs_agblock_t agbno;
410+
xfs_agblock_t mod;
411+
int offset;
412+
413+
agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
414+
mod = agbno % mp->m_sb.sb_inoalignmt;
415+
if (!mod)
416+
return;
417+
418+
/* calculate the inode offset and align startino */
419+
offset = mod << mp->m_sb.sb_inopblog;
420+
*startino -= offset;
421+
422+
/*
423+
* Since startino has been aligned down, left shift allocmask such that
424+
* it continues to represent the same physical inodes relative to the
425+
* new startino.
426+
*/
427+
*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
428+
}
429+
430+
/*
431+
* Determine whether the source inode record can merge into the target. Both
432+
* records must be sparse, the inode ranges must match and there must be no
433+
* allocation overlap between the records.
434+
*/
435+
STATIC bool
436+
__xfs_inobt_can_merge(
437+
struct xfs_inobt_rec_incore *trec, /* tgt record */
438+
struct xfs_inobt_rec_incore *srec) /* src record */
439+
{
440+
uint64_t talloc;
441+
uint64_t salloc;
442+
443+
/* records must cover the same inode range */
444+
if (trec->ir_startino != srec->ir_startino)
445+
return false;
446+
447+
/* both records must be sparse */
448+
if (!xfs_inobt_issparse(trec->ir_holemask) ||
449+
!xfs_inobt_issparse(srec->ir_holemask))
450+
return false;
451+
452+
/* both records must track some inodes */
453+
if (!trec->ir_count || !srec->ir_count)
454+
return false;
455+
456+
/* can't exceed capacity of a full record */
457+
if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
458+
return false;
459+
460+
/* verify there is no allocation overlap */
461+
talloc = xfs_inobt_irec_to_allocmask(trec);
462+
salloc = xfs_inobt_irec_to_allocmask(srec);
463+
if (talloc & salloc)
464+
return false;
465+
466+
return true;
467+
}
468+
469+
/*
470+
* Merge the source inode record into the target. The caller must call
471+
* __xfs_inobt_can_merge() to ensure the merge is valid.
472+
*/
473+
STATIC void
474+
__xfs_inobt_rec_merge(
475+
struct xfs_inobt_rec_incore *trec, /* target */
476+
struct xfs_inobt_rec_incore *srec) /* src */
477+
{
478+
ASSERT(trec->ir_startino == srec->ir_startino);
479+
480+
/* combine the counts */
481+
trec->ir_count += srec->ir_count;
482+
trec->ir_freecount += srec->ir_freecount;
483+
484+
/*
485+
* Merge the holemask and free mask. For both fields, 0 bits refer to
486+
* allocated inodes. We combine the allocated ranges with bitwise AND.
487+
*/
488+
trec->ir_holemask &= srec->ir_holemask;
489+
trec->ir_free &= srec->ir_free;
490+
}
491+
492+
/*
493+
* Insert a new sparse inode chunk into the associated inode btree. The inode
494+
* record for the sparse chunk is pre-aligned to a startino that should match
495+
* any pre-existing sparse inode record in the tree. This allows sparse chunks
496+
* to fill over time.
497+
*
498+
* This function supports two modes of handling preexisting records depending on
499+
* the merge flag. If merge is true, the provided record is merged with the
500+
* existing record and updated in place. The merged record is returned in nrec.
501+
* If merge is false, an existing record is replaced with the provided record.
502+
* If no preexisting record exists, the provided record is always inserted.
503+
*
504+
* It is considered corruption if a merge is requested and not possible. Given
505+
* the sparse inode alignment constraints, this should never happen.
506+
*/
507+
STATIC int
508+
xfs_inobt_insert_sprec(
509+
struct xfs_mount *mp,
510+
struct xfs_trans *tp,
511+
struct xfs_buf *agbp,
512+
int btnum,
513+
struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
514+
bool merge) /* merge or replace */
515+
{
516+
struct xfs_btree_cur *cur;
517+
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
518+
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
519+
int error;
520+
int i;
521+
struct xfs_inobt_rec_incore rec;
522+
523+
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
524+
525+
/* the new record is pre-aligned so we know where to look */
526+
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
527+
if (error)
528+
goto error;
529+
/* if nothing there, insert a new record and return */
530+
if (i == 0) {
531+
error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
532+
nrec->ir_count, nrec->ir_freecount,
533+
nrec->ir_free, &i);
534+
if (error)
535+
goto error;
536+
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
537+
538+
goto out;
539+
}
540+
541+
/*
542+
* A record exists at this startino. Merge or replace the record
543+
* depending on what we've been asked to do.
544+
*/
545+
if (merge) {
546+
error = xfs_inobt_get_rec(cur, &rec, &i);
547+
if (error)
548+
goto error;
549+
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
550+
XFS_WANT_CORRUPTED_GOTO(mp,
551+
rec.ir_startino == nrec->ir_startino,
552+
error);
553+
554+
/*
555+
* This should never fail. If we have coexisting records that
556+
* cannot merge, something is seriously wrong.
557+
*/
558+
XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
559+
error);
560+
561+
trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
562+
rec.ir_holemask, nrec->ir_startino,
563+
nrec->ir_holemask);
564+
565+
/* merge to nrec to output the updated record */
566+
__xfs_inobt_rec_merge(nrec, &rec);
567+
568+
trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
569+
nrec->ir_holemask);
570+
571+
error = xfs_inobt_rec_check_count(mp, nrec);
572+
if (error)
573+
goto error;
574+
}
575+
576+
error = xfs_inobt_update(cur, nrec);
577+
if (error)
578+
goto error;
579+
580+
out:
581+
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
582+
return 0;
583+
error:
584+
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
585+
return error;
586+
}
587+
380588
/*
381589
* Allocate new inodes in the allocation group specified by agbp.
382590
* Return 0 for success, else error code.
@@ -395,6 +603,8 @@ xfs_ialloc_ag_alloc(
395603
xfs_agino_t newlen; /* new number of inodes */
396604
int isaligned = 0; /* inode allocation at stripe unit */
397605
/* boundary */
606+
uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
607+
struct xfs_inobt_rec_incore rec;
398608
struct xfs_perag *pag;
399609

400610
memset(&args, 0, sizeof(args));
@@ -511,6 +721,45 @@ xfs_ialloc_ag_alloc(
511721
return error;
512722
}
513723

724+
/*
725+
* Finally, try a sparse allocation if the filesystem supports it and
726+
* the sparse allocation length is smaller than a full chunk.
727+
*/
728+
if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
729+
args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
730+
args.fsbno == NULLFSBLOCK) {
731+
args.type = XFS_ALLOCTYPE_NEAR_BNO;
732+
args.agbno = be32_to_cpu(agi->agi_root);
733+
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
734+
args.alignment = args.mp->m_sb.sb_spino_align;
735+
args.prod = 1;
736+
737+
args.minlen = args.mp->m_ialloc_min_blks;
738+
args.maxlen = args.minlen;
739+
740+
/*
741+
* The inode record will be aligned to full chunk size. We must
742+
* prevent sparse allocation from AG boundaries that result in
743+
* invalid inode records, such as records that start at agbno 0
744+
* or extend beyond the AG.
745+
*
746+
* Set min agbno to the first aligned, non-zero agbno and max to
747+
* the last aligned agbno that is at least one full chunk from
748+
* the end of the AG.
749+
*/
750+
args.min_agbno = args.mp->m_sb.sb_inoalignmt;
751+
args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
752+
args.mp->m_sb.sb_inoalignmt) -
753+
args.mp->m_ialloc_blks;
754+
755+
error = xfs_alloc_vextent(&args);
756+
if (error)
757+
return error;
758+
759+
newlen = args.len << args.mp->m_sb.sb_inopblog;
760+
allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
761+
}
762+
514763
if (args.fsbno == NULLFSBLOCK) {
515764
*alloc = 0;
516765
return 0;
@@ -535,27 +784,80 @@ xfs_ialloc_ag_alloc(
535784
* Convert the results.
536785
*/
537786
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
787+
788+
if (xfs_inobt_issparse(~allocmask)) {
789+
/*
790+
* We've allocated a sparse chunk. Align the startino and mask.
791+
*/
792+
xfs_align_sparse_ino(args.mp, &newino, &allocmask);
793+
794+
rec.ir_startino = newino;
795+
rec.ir_holemask = ~allocmask;
796+
rec.ir_count = newlen;
797+
rec.ir_freecount = newlen;
798+
rec.ir_free = XFS_INOBT_ALL_FREE;
799+
800+
/*
801+
* Insert the sparse record into the inobt and allow for a merge
802+
* if necessary. If a merge does occur, rec is updated to the
803+
* merged record.
804+
*/
805+
error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
806+
&rec, true);
807+
if (error == -EFSCORRUPTED) {
808+
xfs_alert(args.mp,
809+
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
810+
XFS_AGINO_TO_INO(args.mp, agno,
811+
rec.ir_startino),
812+
rec.ir_holemask, rec.ir_count);
813+
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
814+
}
815+
if (error)
816+
return error;
817+
818+
/*
819+
* We can't merge the part we've just allocated as for the inobt
820+
* due to finobt semantics. The original record may or may not
821+
* exist independent of whether physical inodes exist in this
822+
* sparse chunk.
823+
*
824+
* We must update the finobt record based on the inobt record.
825+
* rec contains the fully merged and up to date inobt record
826+
* from the previous call. Set merge false to replace any
827+
* existing record with this one.
828+
*/
829+
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
830+
error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
831+
XFS_BTNUM_FINO, &rec,
832+
false);
833+
if (error)
834+
return error;
835+
}
836+
} else {
837+
/* full chunk - insert new records to both btrees */
838+
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
839+
XFS_BTNUM_INO);
840+
if (error)
841+
return error;
842+
843+
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
844+
error = xfs_inobt_insert(args.mp, tp, agbp, newino,
845+
newlen, XFS_BTNUM_FINO);
846+
if (error)
847+
return error;
848+
}
849+
}
850+
851+
/*
852+
* Update AGI counts and newino.
853+
*/
538854
be32_add_cpu(&agi->agi_count, newlen);
539855
be32_add_cpu(&agi->agi_freecount, newlen);
540856
pag = xfs_perag_get(args.mp, agno);
541857
pag->pagi_freecount += newlen;
542858
xfs_perag_put(pag);
543859
agi->agi_newino = cpu_to_be32(newino);
544860

545-
/*
546-
* Insert records describing the new inode chunk into the btrees.
547-
*/
548-
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
549-
XFS_BTNUM_INO);
550-
if (error)
551-
return error;
552-
553-
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
554-
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
555-
XFS_BTNUM_FINO);
556-
if (error)
557-
return error;
558-
}
559861
/*
560862
* Log allocation group header fields
561863
*/

fs/xfs/libxfs/xfs_ialloc_btree.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,3 +478,34 @@ xfs_inobt_irec_to_allocmask(
478478

479479
return bitmap;
480480
}
481+
482+
#if defined(DEBUG) || defined(XFS_WARN)
/*
 * Verify that an in-core inode record has a valid inode count: the number of
 * set bits in the record's expanded allocation bitmap must equal ir_count.
 *
 * Returns 0 if the count is consistent, -EFSCORRUPTED otherwise.
 */
int
xfs_inobt_rec_check_count(
	struct xfs_mount		*mp,
	struct xfs_inobt_rec_incore	*rec)
{
	int				inocount = 0;
	int				nextbit = 0;
	uint64_t			allocbmap;
	int				wordsz;

	/* xfs_next_bit() walks uint-sized words, so size the map in uints */
	wordsz = sizeof(allocbmap) / sizeof(unsigned int);
	allocbmap = xfs_inobt_irec_to_allocmask(rec);

	/* count the set bits; each one represents an allocated inode */
	nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
	while (nextbit != -1) {
		inocount++;
		nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
				       nextbit + 1);
	}

	if (inocount != rec->ir_count)
		return -EFSCORRUPTED;

	return 0;
}
#endif	/* DEBUG || XFS_WARN */

0 commit comments

Comments
 (0)