Specialized pure Order/OrderDescending Distinct #120131

henriquewr · 2025-09-26T02:06:51Z

Fixes #120125
Changed to use specialized PureOrderedDistinct
Benchmark:


BenchmarkDotNet v0.15.4, Windows 10 (10.0.19045.6332/22H2/2022Update)
AMD Ryzen 7 5700X 3.40GHz, 1 CPU, 16 logical and 8 physical cores
.NET SDK 10.0.100-rc.1.25451.107
  [Host]     : .NET 10.0.0 (10.0.0-rc.1.25451.107, 10.0.25.45207), X64 RyuJIT x86-64-v3
  Job-UXDFNM : .NET 10.0.0 (10.0.0-dev, 42.42.42.42424), X64 RyuJIT x86-64-v3
  DefaultJob : .NET 10.0.0 (10.0.0-rc.1.25451.107, 10.0.25.45207), X64 RyuJIT x86-64-v3

Method	Toolchain	Mean	Error	StdDev	Gen0	Gen1	Gen2	Allocated
OrderDistinctToList	PureOrderedDistinct	334.7 us	1.16 us	0.97 us	3.9063	-	-	63.99 KB
OrderByDistinctConsumer	PureOrderedDistinct	337.7 us	3.30 us	3.09 us	1.9531	-	-	39.2 KB
OrderDistinctToArray	PureOrderedDistinct	338.6 us	0.69 us	0.61 us	3.9063	-	-	63.96 KB
OrderDescendingDistinctToArray	PureOrderedDistinct	396.0 us	0.94 us	0.88 us	3.9063	-	-	63.96 KB
OrderDescendingDistinctConsumer	PureOrderedDistinct	396.7 us	3.33 us	3.11 us	1.9531	-	-	39.2 KB
OrderDescendingDistinctToList	PureOrderedDistinct	399.0 us	2.26 us	2.01 us	3.9063	-	-	63.99 KB
OrderByDistinctConsumer	Default	414.2 us	0.71 us	0.60 us	30.7617	30.7617	30.7617	291.32 KB
OrderDistinctToArray	Default	424.7 us	3.30 us	3.09 us	30.7617	30.7617	30.7617	316.19 KB
OrderDistinctToList	Default	429.7 us	3.54 us	3.31 us	30.7617	30.7617	30.7617	316.22 KB
OrderDescendingDistinctToArray	Default	485.1 us	3.32 us	3.11 us	30.2734	30.2734	30.2734	316.19 KB
OrderDescendingDistinctConsumer	Default	486.6 us	0.91 us	0.76 us	30.7617	30.7617	30.7617	291.32 KB
OrderDescendingDistinctToList	Default	503.3 us	6.22 us	5.82 us	30.2734	30.2734	30.2734	316.22 KB
DateTimeOrderDescendingDistinctToArray	PureOrderedDistinct	579.0 us	5.18 us	4.59 us	9.7656	-	-	166.98 KB
DateTimeOrderDescendingDistinctConsumer	PureOrderedDistinct	579.9 us	0.25 us	0.19 us	6.8359	-	-	117.47 KB
DateTimeOrderDescendingDistinctToList	PureOrderedDistinct	582.7 us	0.99 us	0.92 us	9.7656	-	-	167.01 KB
DateTimeOrderDistinctToList	PureOrderedDistinct	587.8 us	0.80 us	0.75 us	9.7656	-	-	167.01 KB
DateTimeOrderDistinctToArray	PureOrderedDistinct	588.0 us	1.00 us	0.78 us	9.7656	-	-	166.98 KB
DateTimeOrderByDistinctConsumer	PureOrderedDistinct	592.8 us	2.51 us	2.23 us	6.8359	-	-	117.47 KB
DateTimeOrderDescendingDistinctConsumer	Default	680.0 us	2.25 us	2.00 us	41.0156	41.0156	41.0156	432.46 KB
DateTimeOrderDistinctToList	Default	681.3 us	6.31 us	5.90 us	41.0156	41.0156	41.0156	482.11 KB
DateTimeOrderDistinctToArray	Default	682.6 us	3.52 us	3.29 us	41.0156	41.0156	41.0156	482.08 KB
DateTimeOrderDescendingDistinctToArray	Default	685.9 us	5.00 us	4.67 us	41.0156	41.0156	41.0156	482.08 KB
DateTimeOrderByDistinctConsumer	Default	697.2 us	4.00 us	3.55 us	41.0156	41.0156	41.0156	432.46 KB
DateTimeOrderDescendingDistinctToList	Default	707.4 us	8.56 us	7.59 us	41.0156	41.0156	41.0156	482.11 KB


using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Engines;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using BenchmarkDotNet.Toolchains.CoreRun;

namespace Benchmarks
{
    /// <summary>
    /// Entry point that runs the <c>Benchmarks</c> class under two jobs: the default toolchain and
    /// a CoreRun-based toolchain (displayed as "PureOrderedDistinct") that points at a custom runtime build.
    /// </summary>
    internal class Program
    {
        /// <summary>Environment variable that must contain the full path to the CoreRun executable of the custom runtime build.</summary>
        private const string CoreRunPathVariable = "CORERUN_PATH";

        /// <summary>Builds the two-job configuration and hands control to BenchmarkDotNet.</summary>
        /// <param name="args">Command-line arguments, forwarded unchanged to <c>BenchmarkRunner</c>.</param>
        /// <exception cref="InvalidOperationException">The CoreRun path environment variable is not set.</exception>
        /// <exception cref="FileNotFoundException">The configured CoreRun path does not point to an existing file.</exception>
        static void Main(string[] args)
        {
            // The CoreRun location is machine-specific, so it is read from the environment
            // instead of being referenced as an undeclared identifier.
            var coreRunPath = Environment.GetEnvironmentVariable(CoreRunPathVariable);
            if (string.IsNullOrWhiteSpace(coreRunPath))
            {
                throw new InvalidOperationException(
                    $"Set the {CoreRunPathVariable} environment variable to the path of the CoreRun executable to benchmark.");
            }

            var coreRunFile = new FileInfo(coreRunPath);
            if (!coreRunFile.Exists)
            {
                // Fail before BenchmarkDotNet starts rather than midway through the run.
                throw new FileNotFoundException("The CoreRun executable was not found.", coreRunFile.FullName);
            }

            var customRuntimeToolchain = new CoreRunToolchain(
                coreRun: coreRunFile,
                targetFrameworkMoniker: "net10.0",
                createCopy: true,
                displayName: "PureOrderedDistinct"
            );

            // One job per toolchain so both appear side by side in the summary.
            var config = ManualConfig.CreateMinimumViable()
                .AddJob(Job.Default)
                .AddJob(Job.Default.WithToolchain(customRuntimeToolchain));

            BenchmarkRunner.Run<Benchmarks>(config: config, args: args);
        }
    }

    /// <summary>
    /// Benchmarks for <c>Order()</c>/<c>OrderDescending()</c> followed by <c>Distinct()</c>,
    /// over <see cref="int"/> and <see cref="DateTime"/> inputs, materialized with
    /// <c>ToArray</c>, <c>ToList</c>, or a plain <c>foreach</c> into a consumer.
    /// </summary>
    [Orderer(BenchmarkDotNet.Order.SummaryOrderPolicy.FastestToSlowest)]
    [MemoryDiagnoser(true)]
    [HideColumns("Job")]
    public class Benchmarks
    {
        // Constant seed: the generated input is the same on every run.
        private static readonly Random _rnd = new(4);

        // 10000 random values below 10000. Initialized after _rnd because static
        // field initializers run in textual order.
        private static readonly List<int> _list =
            Enumerable.Range(0, 10000)
                .Select(_ => _rnd.Next(10000))
                .ToList();

        // The same values reinterpreted as DateTime tick counts.
        private static readonly List<DateTime> _listDateTime =
            _list.Select(ticks => new DateTime(ticks)).ToList();

        private static readonly Consumer _consumer = new();

        // ---- int, ascending ----

        [Benchmark]
        public int[] OrderDistinctToArray() => _list.Order().Distinct().ToArray();

        [Benchmark]
        public List<int> OrderDistinctToList() => _list.Order().Distinct().ToList();

        [Benchmark]
        public void OrderByDistinctConsumer()
        {
            foreach (var value in _list.Order().Distinct())
            {
                _consumer.Consume(value);
            }
        }

        // ---- int, descending ----

        [Benchmark]
        public int[] OrderDescendingDistinctToArray() => _list.OrderDescending().Distinct().ToArray();

        [Benchmark]
        public List<int> OrderDescendingDistinctToList() => _list.OrderDescending().Distinct().ToList();

        [Benchmark]
        public void OrderDescendingDistinctConsumer()
        {
            foreach (var value in _list.OrderDescending().Distinct())
            {
                _consumer.Consume(value);
            }
        }

        // ---- DateTime, ascending ----

        [Benchmark]
        public DateTime[] DateTimeOrderDistinctToArray() => _listDateTime.Order().Distinct().ToArray();

        [Benchmark]
        public List<DateTime> DateTimeOrderDistinctToList() => _listDateTime.Order().Distinct().ToList();

        [Benchmark]
        public void DateTimeOrderByDistinctConsumer()
        {
            foreach (var value in _listDateTime.Order().Distinct())
            {
                _consumer.Consume(value);
            }
        }

        // ---- DateTime, descending ----

        [Benchmark]
        public DateTime[] DateTimeOrderDescendingDistinctToArray() => _listDateTime.OrderDescending().Distinct().ToArray();

        [Benchmark]
        public List<DateTime> DateTimeOrderDescendingDistinctToList() => _listDateTime.OrderDescending().Distinct().ToList();

        [Benchmark]
        public void DateTimeOrderDescendingDistinctConsumer()
        {
            foreach (var value in _listDateTime.OrderDescending().Distinct())
            {
                _consumer.Consume(value);
            }
        }
    }
}

henriquewr · 2025-09-26T02:13:31Z

src/libraries/System.Linq/src/System/Linq/OrderBy.cs

+
+        /// <summary>A type can be pure ordered when every single time equal elements is side by side: [ (someVal), (someVal),  (otherVal), (otherVal) ]</summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static bool TypeCanBePureOrdered<T>()


A type can be pure ordered when its equality methods are pure, meaning that the same input always produces the same output.

I don't think this is a correct application of the term "pure". A struct with two fields implementing IEquatable that only projects to one field is also pure in that sense but it is invalid from the perspective of this optimization. The defining property you seem to be testing for is implicit stability, so this is merely an extension of the existing TypeIsImplicitlyStable to more types.

Could !typeof(IEquatable<T>).IsAssignableFrom(typeof(T)) && !RuntimeHelpers.IsReferenceOrContainsReferences<T>() be an acceptable proxy for widening this test to more types?

Could !typeof(IEquatable<T>).IsAssignableFrom(typeof(T)) && !RuntimeHelpers.IsReferenceOrContainsReferences<T>() be an acceptable proxy for widening this test to more types?

Answering my own question, probably not because T could contain fields that themselves implement IEquatable<T> in an unstable manner. I don't think we need this method, TypeIsImplicitlyStable is probably good enough, although we could have conversation about how much we can extend its scope.

I agree that "Pure" is not the best word to describe that, but that's the best name that I came up with

Could !typeof(IEquatable<T>).IsAssignableFrom(typeof(T)) && !RuntimeHelpers.IsReferenceOrContainsReferences<T>() be an acceptable proxy for widening this test to more types?

Answering my own question, probably not because T could contain fields that themselves implement IEquatable<T> in an unstable manner. I don't think we need this method, TypeIsImplicitlyStable is probably good enough, although we could have conversation about how much we can extend its scope.

The method TypeIsImplicitlyStable covers only a subset of what the optimization can handle.

Example:

// Reference type whose equality and hash code are defined solely by Value.
class PureOrderableType : IEquatable<PureOrderableType>
{
    public int Value { get; set; }

    // A null argument yields false: `other?.Value` is then a null int?, which never equals Value.
    public bool Equals(PureOrderableType? other) => other?.Value == Value;

    public override bool Equals(object? obj) => Equals(obj as PureOrderableType);

    public override int GetHashCode() => Value;
}

// Two distinct instances that compare equal.
var firstElement = new PureOrderableType { Value = 1 };
var secondElement = new PureOrderableType { Value = 1 };

List<PureOrderableType> list = [firstElement, secondElement];

Although

firstElement.Equals(secondElement) == true

it needs to be in that order (stable)

Why is implicit stability important here? If you have a sorted enumerable and equality is compatible with comparison, it follows that equal elements should appear sequentially. By applying your algorithm where the first element from each group of equal values is being yielded, you should obtain behavior that is equivalent to Distinct even when applied to DateTimeOffset.

Implicit stability itself is not what matters.
Consider this code:

// Three values that denote the same instant (12:00 UTC) written with different offsets.
DateTimeOffset[] sameInstant =
[
    DateTimeOffset.Parse("2025-09-29T12:00:00+00:00"),
    DateTimeOffset.Parse("2025-09-29T08:00:00-04:00"),
    DateTimeOffset.Parse("2025-09-29T07:00:00-05:00"),
];

List<DateTimeOffset> dateTimeOffsets = sameInstant.Order().ToList();

dateTimeOffsets.Distinct() will return DateTimeOffset.Parse("2025-09-29T12:00:00+00:00") (the first item).

If the sorting weren't stable, Distinct() could return a different item.

Linq sorting is stable. I guess what I'm trying to hint at is that your change looks promising, but you need to better clarify the conditions for when the optimization kicks in.

Linq sorting is stable. I guess what I'm trying to hint at is that your change looks promising, but you need to better clarify the conditions for when the optimization kicks in.

I think these are the conditions for this optimization:

If the type is not implicitly stable, a stable sort must be used; otherwise an unstable sort can be used (it doesn't matter).

When looping over the ordered collection, the current element (a run of equal elements such as [1,1,1] is treated as a single element [1]) must not be equal to any earlier element.

To achieve that, the methods GetHashCode, CompareTo, and Equals must be properly implemented.

"properly implemented" means that the methods must agree with each other

Like this example:

// Valid example: CompareTo, Equals and GetHashCode all agree because each is based on the parity (Value % 2).
class SomeType : IEquatable<SomeType>, IComparable<SomeType>
{
    public int Value { get; set; }

    // By IComparable<T> convention any instance sorts after null; comparing to null must not throw.
    public int CompareTo(SomeType? other) =>
        other is null ? 1 : (Value % 2).CompareTo(other.Value % 2);

    public override int GetHashCode() => Value % 2;

    // A null argument yields false: the lifted comparison of a null int? with an int is never true.
    public bool Equals(SomeType? other) => (other?.Value % 2) == (Value % 2);

    // Keeps object-based equality consistent with the typed overload and with GetHashCode.
    public override bool Equals(object? obj) => Equals(obj as SomeType);
}

Another good example:

// Valid example: CompareTo, Equals and GetHashCode all agree because each is based on Value itself.
class OtherGoodType : IEquatable<OtherGoodType>, IComparable<OtherGoodType>
{
    public int Value { get; set; }

    // By IComparable<T> convention any instance sorts after null; comparing to null must not throw.
    public int CompareTo(OtherGoodType? other) =>
        other is null ? 1 : Value.CompareTo(other.Value);

    public override int GetHashCode() => Value;

    // A null argument yields false: `other?.Value` is then a null int?, which never equals Value.
    public bool Equals(OtherGoodType? other) => other?.Value == Value;

    // Keeps object-based equality consistent with the typed overload and with GetHashCode.
    public override bool Equals(object? obj) => Equals(obj as OtherGoodType);
}

This would be invalid:

// Deliberately INVALID example: CompareTo and GetHashCode use the parity (Value % 2),
// while Equals uses Value itself, so two elements can compare as 0 and still be unequal.
class InvalidType : IEquatable<InvalidType>, IComparable<InvalidType>
{
    public int Value { get; set; }

    // By IComparable<T> convention any instance sorts after null; comparing to null must not throw.
    public int CompareTo(InvalidType? other) =>
        other is null ? 1 : (Value % 2).CompareTo(other.Value % 2);

    public override int GetHashCode() => Value % 2;

    public bool Equals(InvalidType? other)
    {
        // Does not follow CompareTo
        return other?.Value == Value;
    }
}

We're in agreement, but I would like you to update the name of your test method to better reflect what is being tested and potentially include even more types.

We're in agreement, but I would like you to update the name of your test method to better reflect what is being tested and potentially include even more types.

I added more tests and tried to improve the names. They are better, but I agree that they still aren't the best names.
I also remembered two other types to add to this optimization: DateOnly and TimeOnly.

As for the failing tests in CI, I believe they are not related to these changes; all the logs say:
unable to pull image...

dotnet-policy-service · 2025-09-26T03:26:04Z

Tagging subscribers to this area: @dotnet/area-system-linq
See info in area-owners.md if you want to be subscribed.

huoyaoyuan · 2025-09-26T03:43:30Z

src/libraries/System.Linq/src/System/Linq/OrderBy.cs

+            Type? nullableUnderlyingType = Nullable.GetUnderlyingType(t);
+            if (nullableUnderlyingType != null)
+            {
+                t = nullableUnderlyingType;
+            }
+
+            if (typeof(T).IsEnum)
+            {
+                t = typeof(T).GetEnumUnderlyingType();
+            }


This can have negative impact. It may break constant folding for typeof(T) == typeof(X). What does this path handle?

The equality methods in Nullable<T> only add the null check, which is pure; if the value is not null, they fall back to the equality methods of the underlying type.

basically only the internal type of nullable matters

… TimeOnly

henriquewr · 2025-09-30T23:13:41Z

src/libraries/System.Linq/tests/DistinctTests.cs

            Assert.Equal(expected, source.Order().Distinct());
        }

        [Fact]


I added tests only for the iterator part, because the other methods, like ToArray, ToList, and Count, don't add much logic and basically just consume the iterator; they are covered in the tests, but only for int.

I skipped tests for some types such as long, short, Half, etc., because the tests already cover a numeric type (int) and floating-point types (float, double, decimal).

eiriktsarpalis · 2025-10-01T07:51:52Z

src/libraries/System.Linq/src/System/Linq/OrderedEnumerable.cs

+            }
+        }
+
+        private sealed partial class PureOrderedIteratorImpl<TElement> : PureOrderedIterator<TElement>


I think I got too distracted by the naming (FWIW I still think "pure" is inappropriate terminology and should be renamed) and didn't take a closer look at the value prop of the optimization: why would somebody want to chain Order() with Distinct(), a.k.a. impose a $\mathcal{O}(n \log n)$ operation just to plop away duplicates afterwards? Just calling Distinct() on the source enumerable is going to be much more efficient regardless of this optimization, and if sorted outputs are still a prerequisite, Order() should be called after Distinct() so that it handles a potentially smaller number of elements.

Optimizations like this don't come free: besides the added type tests that it imposes, each generic type being added to Linq can substantially grow the static footprint or memory usage of an application.

So while the optimization is a clever one, the fact that it is being applied to a pattern that is inefficient by definition makes me think that we shouldn't take this.

That's true, the path Order().Distinct() is not that common, my idea was using that with disconnected functions like

// Illustrative sketch: the caller does not know whether the incoming sequence is already ordered.
// UseThisSomeHow() is a placeholder for whatever consumes the distinct result.
void SomeFunction(IEnumerable<SomeType> enumerable)
{
    // if it's ordered, great, if it's not use the hashset
    enumerable.Distinct().UseThisSomeHow();
}

But given the limitations of the optimization, it would probably be better to simply swap the order of the operators:
instead of Order().Distinct(), use Distinct().Order(), which is faster, although in terms of memory it is not as good.

/// <summary>Compares sorting before de-duplicating against de-duplicating before sorting.</summary>
public class Benchmarks
{
    // Constant seed: the generated input is the same on every run.
    private static readonly Random _rnd = new(4);

    // 10000 random values below 10000, exposed only as IEnumerable<int>.
    private static readonly IEnumerable<int> _list =
        Enumerable.Range(0, 10000)
            .Select(_ => _rnd.Next(10000))
            .ToList();

    [Benchmark]
    public List<int> OrderDistinct() => _list.Order().Distinct().ToList();

    [Benchmark]
    public List<int> DistinctOrder() => _list.Distinct().Order().ToList();
}

| Method        | Toolchain           |     Mean |   Error |  StdDev |    Gen0 |    Gen1 |    Gen2 | Allocated |
|-------------- |-------------------- |---------:|--------:|--------:|--------:|--------:|--------:|----------:|
| DistinctOrder | PureOrderedDistinct | 255.7 us | 0.74 us | 0.69 us | 38.0859 | 38.0859 | 38.0859 | 182.94 KB |
| DistinctOrder | Default             | 271.0 us | 1.48 us | 1.23 us | 38.0859 | 38.0859 | 38.0859 | 182.94 KB |
| OrderDistinct | PureOrderedDistinct | 344.2 us | 3.32 us | 3.10 us |  3.9063 |       - |       - |  63.99 KB |
| OrderDistinct | Default             | 426.6 us | 1.82 us | 1.70 us | 30.7617 | 30.7617 | 30.7617 | 316.22 KB |

imposes each generic type being added to Linq can substantially grow the static footprint or memory usage of an application

The creation of the new type can be useful for other (still really uncommon) optimizations like Order().Except(), etc.
I'm not saying that this would be a game changer but probably it would be used in more places, not just Order().Distinct()

I think I got too distracted by the naming (FWIW I still think "pure" is inappropriate terminology and should be renamed)

I agree with that, maybe AdjacentOrdered would be a better name

The creation of the new type can be useful for other (still really uncommon) optimizations like Order().Except(), etc.
I'm not saying that this would be a game changer but probably it would be used in more places, not just Order().Distinct()

The problem is that they all appear to be equally unlikely. As such, I expect this would be introducing marginal perf regressions in the 99% of uses of either Order or Distinct where this pattern does not hold.

Thanks for submitting this PR, it is legitimately clever and thought provoking even though unfortunately it doesn't meet the bar for inclusion into LINQ.

PureOrdered Distinct

af3deba

github-actions bot added the needs-area-label An area label is needed to ensure this gets routed to the appropriate area owners label Sep 26, 2025

dotnet-policy-service bot added the community-contribution Indicates that the PR has been added by a community member label Sep 26, 2025

henriquewr commented Sep 26, 2025

View reviewed changes

Removed trailing whitespace

e809cc0

huoyaoyuan added area-System.Linq and removed needs-area-label An area label is needed to ensure this gets routed to the appropriate area owners labels Sep 26, 2025

huoyaoyuan reviewed Sep 26, 2025

View reviewed changes

build-analysis bot mentioned this pull request Sep 26, 2025

Unable to pull image from mcr.microsoft.com #117164

Open

henriquewr mentioned this pull request Sep 26, 2025

Specialize Order Distinct Iterator in LINQ #120125

Closed

Adjusted names in tests, tests for more types, and included DateOnly,…

bdab472

… TimeOnly

henriquewr commented Sep 30, 2025

View reviewed changes

Merge branch 'main' into pureOrderedDistinct120125

147fb53

eiriktsarpalis reviewed Oct 1, 2025

View reviewed changes

build-analysis bot mentioned this pull request Oct 1, 2025

AppHost tests fail with "Failure extracting contents of the application bundle." #119249

Open

eiriktsarpalis closed this Oct 2, 2025

Specialized pure Order/OrderDescending Distinct #120131

Specialized pure Order/OrderDescending Distinct #120131

Uh oh!

Conversation

henriquewr commented Sep 26, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

eiriktsarpalis Sep 26, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

henriquewr Sep 29, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

henriquewr Sep 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

henriquewr Sep 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

dotnet-policy-service bot commented Sep 26, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

henriquewr commented Sep 26, 2025 •

edited

Loading

eiriktsarpalis Sep 26, 2025 •

edited

Loading

henriquewr Sep 29, 2025 •

edited

Loading

henriquewr Sep 30, 2025 •

edited

Loading

henriquewr Sep 30, 2025 •

edited

Loading