@@ -377,6 +377,214 @@ xfs_ialloc_inode_init(
 	return 0;
 }
 
+/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+	struct xfs_mount	*mp,
+	xfs_agino_t		*startino,
+	uint16_t		*allocmask)
+{
+	xfs_agblock_t		agbno;
+	xfs_agblock_t		mod;
+	int			offset;
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+	mod = agbno % mp->m_sb.sb_inoalignmt;
+	if (!mod)
+		return;
+
+	/* calculate the inode offset and align startino */
+	offset = mod << mp->m_sb.sb_inopblog;
+	*startino -= offset;
+
+	/*
+	 * Since startino has been aligned down, left shift allocmask such that
+	 * it continues to represent the same physical inodes relative to the
+	 * new startino.
+	 */
+	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
+	struct xfs_inobt_rec_incore	*srec)	/* src record */
+{
+	uint64_t			talloc;
+	uint64_t			salloc;
+
+	/* records must cover the same inode range */
+	if (trec->ir_startino != srec->ir_startino)
+		return false;
+
+	/* both records must be sparse */
+	if (!xfs_inobt_issparse(trec->ir_holemask) ||
+	    !xfs_inobt_issparse(srec->ir_holemask))
+		return false;
+
+	/* both records must track some inodes */
+	if (!trec->ir_count || !srec->ir_count)
+		return false;
+
+	/* can't exceed capacity of a full record */
+	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+		return false;
+
+	/* verify there is no allocation overlap */
+	talloc = xfs_inobt_irec_to_allocmask(trec);
+	salloc = xfs_inobt_irec_to_allocmask(srec);
+	if (talloc & salloc)
+		return false;
+
+	return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* target */
+	struct xfs_inobt_rec_incore	*srec)	/* src */
+{
+	ASSERT(trec->ir_startino == srec->ir_startino);
+
+	/* combine the counts */
+	trec->ir_count += srec->ir_count;
+	trec->ir_freecount += srec->ir_freecount;
+
+	/*
+	 * Merge the holemask and free mask. For both fields, 0 bits refer to
+	 * allocated inodes. We combine the allocated ranges with bitwise AND.
+	 */
+	trec->ir_holemask &= srec->ir_holemask;
+	trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	int				btnum,
+	struct xfs_inobt_rec_incore	*nrec,	/* in/out: new/merged rec. */
+	bool				merge)	/* merge or replace */
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	int				error;
+	int				i;
+	struct xfs_inobt_rec_incore	rec;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, insert a new record and return */
+	if (i == 0) {
+		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+					     nrec->ir_count, nrec->ir_freecount,
+					     nrec->ir_free, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+		goto out;
+	}
+
+	/*
+	 * A record exists at this startino. Merge or replace the record
+	 * depending on what we've been asked to do.
+	 */
+	if (merge) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+					rec.ir_startino == nrec->ir_startino,
+					error);
+
+		/*
+		 * This should never fail. If we have coexisting records that
+		 * cannot merge, something is seriously wrong.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+					error);
+
+		trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+					 rec.ir_holemask, nrec->ir_startino,
+					 nrec->ir_holemask);
+
+		/* merge to nrec to output the updated record */
+		__xfs_inobt_rec_merge(nrec, &rec);
+
+		trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+					  nrec->ir_holemask);
+
+		error = xfs_inobt_rec_check_count(mp, nrec);
+		if (error)
+			goto error;
+	}
+
+	error = xfs_inobt_update(cur, nrec);
+	if (error)
+		goto error;
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
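The alignment helper above is pure arithmetic, so a worked example may help. The sketch below replays the xfs_align_sparse_ino() computation in standalone userspace C, under an assumed geometry of 8 inodes per block (sb_inopblog = 3), an 8-block full-chunk alignment, and the usual 4 inodes per holemask bit; none of these constants come from the patch itself.

/*
 * Illustrative only: userspace sketch of the xfs_align_sparse_ino()
 * arithmetic. The geometry values are assumptions, not patch constants.
 */
#include <stdint.h>
#include <stdio.h>

#define INOPBLOG		3	/* log2(8 inodes per block), assumed */
#define INOALIGNMT		8	/* full chunk alignment in blocks, assumed */
#define INODES_PER_HOLEMASK_BIT	4	/* 64 inodes / 16 holemask bits */

static void align_sparse_ino(uint32_t *startino, uint16_t *allocmask)
{
	uint32_t agbno = *startino >> INOPBLOG; /* stand-in for XFS_AGINO_TO_AGBNO() */
	uint32_t mod = agbno % INOALIGNMT;
	int offset;

	if (!mod)
		return;

	offset = mod << INOPBLOG;	/* misalignment measured in inodes */
	*startino -= offset;		/* align down to the chunk's startino */
	*allocmask <<= offset / INODES_PER_HOLEMASK_BIT;
}

int main(void)
{
	/* a 2-block (16-inode) sparse chunk allocated at agbno 14 */
	uint32_t startino = 14 << INOPBLOG;	/* agino 112 */
	uint16_t allocmask = (1 << (16 / INODES_PER_HOLEMASK_BIT)) - 1; /* 0x000f */

	align_sparse_ino(&startino, &allocmask);

	/* prints: startino 64, allocmask 0xf000 (so ir_holemask = 0x0fff) */
	printf("startino %u, allocmask 0x%04x\n", startino, allocmask);
	return 0;
}

The shift of the allocmask is the subtle step: aligning startino down by 48 inodes means the allocated inodes now sit 12 holemask granules (48 / 4) into the record, so the mask moves from 0x000f to 0xf000.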
@@ -395,6 +603,8 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	newlen;		/* new number of inodes */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_inobt_rec_incore rec;
 	struct xfs_perag *pag;
 
 	memset(&args, 0, sizeof(args));
@@ -511,6 +721,45 @@ xfs_ialloc_ag_alloc(
 		return error;
 	}
 
+	/*
+	 * Finally, try a sparse allocation if the filesystem supports it and
+	 * the sparse allocation length is smaller than a full chunk.
+	 */
+	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = args.mp->m_sb.sb_spino_align;
+		args.prod = 1;
+
+		args.minlen = args.mp->m_ialloc_min_blks;
+		args.maxlen = args.minlen;
+
+		/*
+		 * The inode record will be aligned to full chunk size. We must
+		 * prevent sparse allocation from AG boundaries that result in
+		 * invalid inode records, such as records that start at agbno 0
+		 * or extend beyond the AG.
+		 *
+		 * Set min agbno to the first aligned, non-zero agbno and max to
+		 * the last aligned agbno that is at least one full chunk from
+		 * the end of the AG.
+		 */
+		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+					    args.mp->m_sb.sb_inoalignmt) -
+				 args.mp->m_ialloc_blks;
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+
+		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+	}
+
 	if (args.fsbno == NULLFSBLOCK) {
 		*alloc = 0;
 		return 0;
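The min_agbno/max_agbno clamping and the allocmask derivation above are the kind of arithmetic that is easy to get wrong by one chunk, so here is a hedged standalone sketch with assumed geometry values (they are illustrative, not taken from any real superblock); round_down() mirrors the kernel macro of the same name.

/*
 * Illustrative only: the AG boundary clamp and allocmask math with
 * assumed geometry. Not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define round_down(x, y)	(((x) / (y)) * (y))

#define INOPBLOG		3	/* 8 inodes per block, assumed */
#define INODES_PER_HOLEMASK_BIT	4

int main(void)
{
	uint32_t sb_agblocks = 1003;	/* AG size in blocks, assumed */
	uint32_t sb_inoalignmt = 8;	/* full chunk alignment in blocks */
	uint32_t ialloc_blks = 8;	/* blocks per full inode chunk */
	uint32_t ialloc_min_blks = 2;	/* minimum sparse allocation */

	/* first aligned, non-zero agbno */
	uint32_t min_agbno = sb_inoalignmt;			/* 8 */
	/* last aligned agbno at least one full chunk from the AG end */
	uint32_t max_agbno = round_down(sb_agblocks, sb_inoalignmt)
				- ialloc_blks;			/* 1000 - 8 = 992 */

	/* a minimum-length sparse allocation covers 16 inodes... */
	uint32_t newlen = ialloc_min_blks << INOPBLOG;		/* 16 */
	/* ...i.e. the low 4 of the 16 holemask granules */
	uint16_t allocmask = (1 << (newlen / INODES_PER_HOLEMASK_BIT)) - 1;

	printf("agbno range [%u, %u], newlen %u, allocmask 0x%04x\n",
	       min_agbno, max_agbno, newlen, allocmask);	/* 0x000f */
	return 0;
}

Rejecting agbno 0 matters because a chunk-aligned record starting at block 0 would imply agino 0, and clamping max_agbno keeps the chunk-aligned record from running past the end of the AG.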
@@ -535,27 +784,80 @@ xfs_ialloc_ag_alloc(
 	 * Convert the results.
 	 */
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+	if (xfs_inobt_issparse(~allocmask)) {
+		/*
+		 * We've allocated a sparse chunk. Align the startino and mask.
+		 */
+		xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+		rec.ir_startino = newino;
+		rec.ir_holemask = ~allocmask;
+		rec.ir_count = newlen;
+		rec.ir_freecount = newlen;
+		rec.ir_free = XFS_INOBT_ALL_FREE;
+
+		/*
+		 * Insert the sparse record into the inobt and allow for a merge
+		 * if necessary. If a merge does occur, rec is updated to the
+		 * merged record.
+		 */
+		error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+					       &rec, true);
+		if (error == -EFSCORRUPTED) {
+			xfs_alert(args.mp,
+	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+				  XFS_AGINO_TO_INO(args.mp, agno,
+						   rec.ir_startino),
+				  rec.ir_holemask, rec.ir_count);
+			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+		if (error)
+			return error;
+
+		/*
+		 * We can't merge the part we've just allocated as for the inobt
+		 * due to finobt semantics. The original record may or may not
+		 * exist independent of whether physical inodes exist in this
+		 * sparse chunk.
+		 *
+		 * We must update the finobt record based on the inobt record.
+		 * rec contains the fully merged and up to date inobt record
+		 * from the previous call. Set merge false to replace any
+		 * existing record with this one.
+		 */
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+						       XFS_BTNUM_FINO, &rec,
+						       false);
+			if (error)
+				return error;
+		}
+	} else {
+		/* full chunk - insert new records to both btrees */
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+						 newlen, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	}
+
+	/*
+	 * Update AGI counts and newino.
+	 */
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	pag = xfs_perag_get(args.mp, agno);
 	pag->pagi_freecount += newlen;
 	xfs_perag_put(pag);
 	agi->agi_newino = cpu_to_be32(newino);
 
-	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
 	/*
 	 * Log allocation group header fields
 	 */
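For readers following the merge path, the sketch below replays the __xfs_inobt_can_merge() and __xfs_inobt_rec_merge() rules on a simplified userspace record. The struct and field names mirror xfs_inobt_rec_incore, but this is an illustration under the standard geometry (16 holemask granules of 4 inodes each), not kernel code.

/*
 * Illustrative only: sparse inode record merge rules on a simplified
 * in-memory record. In both masks below, set holemask bits are holes
 * and set free bits are free inodes, matching the on-disk convention.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK	64

struct irec {
	uint32_t	ir_startino;
	uint16_t	ir_holemask;	/* 1 bit per 4-inode granule */
	uint8_t		ir_count;
	uint8_t		ir_freecount;
	uint64_t	ir_free;	/* 1 bit per inode */
};

/* expand the 16-bit holemask into a 64-bit allocation bitmap */
static uint64_t irec_to_allocmask(const struct irec *r)
{
	uint64_t bitmap = 0;

	for (int i = 0; i < 16; i++)
		if (!(r->ir_holemask & (1u << i)))
			bitmap |= 0xfULL << (i * 4);
	return bitmap;
}

static bool can_merge(const struct irec *t, const struct irec *s)
{
	if (t->ir_startino != s->ir_startino)
		return false;
	/* both must be sparse and non-empty, with no count overflow */
	if (!t->ir_holemask || !s->ir_holemask)
		return false;
	if (!t->ir_count || !s->ir_count)
		return false;
	if (t->ir_count + s->ir_count > INODES_PER_CHUNK)
		return false;
	/* no allocation overlap between the two records */
	return !(irec_to_allocmask(t) & irec_to_allocmask(s));
}

static void rec_merge(struct irec *t, const struct irec *s)
{
	t->ir_count += s->ir_count;
	t->ir_freecount += s->ir_freecount;
	t->ir_holemask &= s->ir_holemask;	/* 0 bits = allocated */
	t->ir_free &= s->ir_free;
}

int main(void)
{
	/* existing record: inodes 48-63 allocated (granules 12-15) */
	struct irec t = { 0, 0x0fff, 16, 16, ~0ULL };
	/* new sparse chunk: inodes 0-47 allocated (granules 0-11) */
	struct irec s = { 0, 0xf000, 48, 48, ~0ULL };

	if (can_merge(&t, &s))
		rec_merge(&t, &s);

	/* prints: holemask 0x0000 count 64 - the chunk has filled to full */
	printf("holemask 0x%04x count %u\n", t.ir_holemask, t.ir_count);
	return 0;
}

This is the "sparse chunks fill to full chunks over time" behavior the helper comments describe: two records with complementary holemasks AND together to a holemask of zero, i.e. an ordinary full-chunk record.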