From c57a652d2c1c0961963b3b8c1ca4d7200dcd4f9c Mon Sep 17 00:00:00 2001 From: Jean Helou Date: Fri, 18 May 2018 18:21:52 +0200 Subject: [PATCH] adds splitBy extension method to scala collections `Iterator#splitBy` constructs an iterator where consecutive elements of the original iterator are accumulated as long as the output of a key function for each element doesn't change. This operation makes sense as soon as you are trying to process an iterator where you know the elements will be sorted in a certain way and you need to group them without loading all the data in memory. For instance * processing a file where the ordering is guaranteed but the file doesn't fit in the heap, * processing a streaming resultset where the underlying database guarantees the ordering because of a sort clause. The same operation is added to `Iterable` with the difference that the specific container type of the input is preserved for both collection levels of the output, thus * `Set(1,2,3).splitBy(identity)` returns `Set(Set(1), Set(2), Set(3))` * `Vector(1,2,3).splitBy(identity)` returns `Vector(Vector(1), Vector(2), Vector(3))` * etc. 
--- .../decorators/IterableDecorator.scala | 24 +++++++ .../decorators/IteratorDecorator.scala | 64 +++++++++++++++++++ .../decorators/IterableDecoratorTest.scala | 43 ++++++++++++- .../decorators/IteratorDecoratorTest.scala | 41 ++++++++++++ 4 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 src/test/scala/scala/collection/decorators/IteratorDecoratorTest.scala diff --git a/src/main/scala/scala/collection/decorators/IterableDecorator.scala b/src/main/scala/scala/collection/decorators/IterableDecorator.scala index 579943b..3cf0bb4 100644 --- a/src/main/scala/scala/collection/decorators/IterableDecorator.scala +++ b/src/main/scala/scala/collection/decorators/IterableDecorator.scala @@ -33,4 +33,28 @@ class IterableDecorator[C, I <: IsIterable[C]](coll: C)(implicit val it: I) { def lazyFoldRight[B](z: B)(op: it.A => Either[B, B => B]): B = it(coll).iterator.lazyFoldRight(z)(op) + + /** + * Constructs a collection where consecutive elements are accumulated as + * long as the output of f for each element doesn't change. + *
+    * Vector(1,2,2,3,3,3,2,2)
+    * .splitBy(identity)
+    * 
+ * produces + *
+    * Vector(Vector(1),
+    * Vector(2,2),
+    * Vector(3,3,3),
+    * Vector(2,2))
+    * 
+ * + * @param f the function to compute a key for an element + * @tparam K the type of the computed key + * @return a collection of collections of the consecutive elements with the + * same key in the original collection + */ + def splitBy[K, CC1, CC2](f: it.A => K)(implicit bf: BuildFrom[C, it.A, CC1], bff: BuildFrom[C, CC1, CC2]): CC2 = { + bff.fromSpecific(coll)(it(coll).iterator.splitBy(f).map(bf.fromSpecific(coll))) + } } diff --git a/src/main/scala/scala/collection/decorators/IteratorDecorator.scala b/src/main/scala/scala/collection/decorators/IteratorDecorator.scala index 02b5af4..b806636 100644 --- a/src/main/scala/scala/collection/decorators/IteratorDecorator.scala +++ b/src/main/scala/scala/collection/decorators/IteratorDecorator.scala @@ -72,4 +72,68 @@ class IteratorDecorator[A](val `this`: Iterator[A]) extends AnyVal { loop(immutable.List.empty) } + /** + * Constructs an iterator where consecutive elements are accumulated as + * long as the output of f for each element doesn't change. + *
+    * Vector(1,2,2,3,3,3,2,2)
+    * .iterator
+    * .splitBy(identity)
+    * .toList
+    * 
+ * produces + *
+    * List(Seq(1),
+    * Seq(2,2),
+    * Seq(3,3,3),
+    * Seq(2,2))
+    * 
+ * + * @param f the function to compute a key for an element + * @tparam K the type of the computed key + * @return an iterator of sequences of the consecutive elements with the + * same key in the original iterator + */ + def splitBy[K](f: A => K): Iterator[immutable.Seq[A]] = + new AbstractIterator[immutable.Seq[A]] { + private var hd: A = _ + private var hdKey: K = _ + private var hdDefined: Boolean = false + + override def hasNext: Boolean = hdDefined || `this`.hasNext + + override def next(): immutable.Seq[A] = { + if (hasNext) { + val seq = Vector.newBuilder[A] + if (hdDefined) { + seq += hd + } else { + val init = `this`.next() + hd = init + hdKey = f(init) + hdDefined = true + seq += init + } + var hadSameKey = true + while (`this`.hasNext && hadSameKey) { + val el = `this`.next() + hdDefined = true + val key = f(el) + if (key == hdKey) { + seq += el + } else { + hadSameKey = false + hdKey = key + hd = el + } + } + if (hadSameKey) { + hdDefined = false + } + seq.result() + } else { + Iterator.empty.next() + } + } + } } diff --git a/src/test/scala/scala/collection/decorators/IterableDecoratorTest.scala b/src/test/scala/scala/collection/decorators/IterableDecoratorTest.scala index a3f6208..6395eef 100644 --- a/src/test/scala/scala/collection/decorators/IterableDecoratorTest.scala +++ b/src/test/scala/scala/collection/decorators/IterableDecoratorTest.scala @@ -2,7 +2,8 @@ package scala.collection package decorators import org.junit.{Assert, Test} -import scala.collection.immutable.{LazyList, List, Range, Map} + +import scala.collection.immutable.{LazyList, List, Map, Range} class IterableDecoratorTest { @@ -36,4 +37,44 @@ class IterableDecoratorTest { Assert.assertEquals(2, result2) } + @Test + def splitByShouldHonorEmptyIterator(): Unit = { + val split = Vector.empty[Int].splitBy(identity) + Assert.assertEquals(Vector.empty, split) + } + + @Test + def splitByShouldReturnSingleSeqWhenSingleElement(): Unit = { + val value = Vector("1") + val split = 
value.splitBy(identity) + Assert.assertEquals(Vector(value), split) + } + + @Test + def splitByShouldReturnSingleSeqWhenAllElHaveTheSameKey(): Unit = { + val value = Vector("1", "1", "1") + val split = value.splitBy(identity) + Assert.assertEquals(Vector(value), split) + } + + @Test + def splitByShouldReturnVectorOfVectorOrConsecutiveElementsWithTheSameKey(): Unit = { + val value = Vector("1", "2", "2", "3", "3", "3", "2", "2") + val split: Vector[Vector[String]] = value.splitBy(identity) + Assert.assertEquals(Vector(Vector("1"), Vector("2", "2"), Vector("3", "3", "3"), Vector("2", "2")), split) + } + + @Test + def splitByShouldReturnListOfListOfConsecutiveElementsWithTheSameKey(): Unit = { + val value = List("1", "2", "2", "3", "3", "3", "2", "2") + val split: List[List[String]] = value.splitBy(identity) + Assert.assertEquals(List(List("1"), List("2", "2"), List("3", "3", "3"), List("2", "2")), split) + } + + @Test + def splitByShouldReturnSetOfSetOfConsecutiveElementsWithTheSameKey(): Unit = { + val value = Set("1", "2", "2", "3", "3", "3", "2", "2") + val split: Set[Set[String]] = value.splitBy(identity) + Assert.assertEquals(Set(Set("1"), Set("2"), Set("3")), split) + } } diff --git a/src/test/scala/scala/collection/decorators/IteratorDecoratorTest.scala b/src/test/scala/scala/collection/decorators/IteratorDecoratorTest.scala new file mode 100644 index 0000000..08d93fc --- /dev/null +++ b/src/test/scala/scala/collection/decorators/IteratorDecoratorTest.scala @@ -0,0 +1,41 @@ +package scala.collection +package decorators + +import org.junit.{Assert, Test} + +import scala.util.Try + +class IteratorDecoratorTest { + @Test + def splitByShouldHonorEmptyIterator(): Unit = { + val groupedIterator = Iterator.empty.splitBy(identity) + Assert.assertFalse(groupedIterator.hasNext) + Assert.assertEquals(Try(groupedIterator.next).toString, Try(Iterator.empty.next()).toString) + } + + @Test + def splitByShouldReturnIteratorOfSingleSeqWhenAllElHaveTheSameKey(): Unit = { + val 
value = Vector("1", "1", "1") + val groupedIterator = value.iterator.splitBy(identity) + Assert.assertTrue(groupedIterator.hasNext) + Assert.assertEquals(groupedIterator.next.toVector, value) + Assert.assertFalse(groupedIterator.hasNext) + Assert.assertEquals(Try(groupedIterator.next).toString, Try(Iterator.empty.next()).toString) + } + + @Test + def splitByShouldReturnIteratorOfSeqOfConsecutiveElementsWithTheSameKey(): Unit = { + val value = Vector("1", "2", "2", "3", "3", "3", "2", "2") + val groupedIterator = value.iterator.splitBy(identity) + Assert.assertTrue(groupedIterator.hasNext) + Assert.assertEquals(groupedIterator.next.toVector, Vector("1")) + Assert.assertTrue(groupedIterator.hasNext) + Assert.assertEquals(groupedIterator.next.toVector, Vector("2", "2")) + Assert.assertTrue(groupedIterator.hasNext) + Assert.assertEquals(groupedIterator.next.toVector, Vector("3", "3", "3")) + Assert.assertTrue(groupedIterator.hasNext) + Assert.assertEquals(groupedIterator.next.toVector, Vector("2", "2")) + Assert.assertFalse(groupedIterator.hasNext) + Assert.assertEquals(Try(groupedIterator.next).toString, Try(Iterator.empty.next()).toString) + } +}