From 80eae961607c0f09965127fc2e2312f7ea137445 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Sat, 28 Oct 2023 08:17:11 -0500 Subject: [PATCH] [SPARKNLP-937] Fixing chunk construction when an entity is found --- .../nlp/annotators/er/AhoCorasickAutomaton.scala | 8 +++++--- .../johnsnowlabs/nlp/annotators/er/EntityRulerTest.scala | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/er/AhoCorasickAutomaton.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/er/AhoCorasickAutomaton.scala index 18b93185a38904..c4f2fe05b61b0a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/er/AhoCorasickAutomaton.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/er/AhoCorasickAutomaton.scala @@ -123,7 +123,7 @@ class AhoCorasickAutomaton( if (state == 0 && previousState > 0) { val node = nodes(previousState).get - if (node.isLeaf) { + if (node.isLeaf && node.entity.nonEmpty) { val chunkAnnotation = buildAnnotation(chunk, node.entity, node.id, sentence) chunkAnnotations.append(chunkAnnotation) chunk.clear() @@ -135,8 +135,10 @@ class AhoCorasickAutomaton( if (chunk.nonEmpty) { val node = nodes(previousState).get - val chunkAnnotation = buildAnnotation(chunk, node.entity, node.id, sentence) - chunkAnnotations.append(chunkAnnotation) + if (node.entity.nonEmpty) { + val chunkAnnotation = buildAnnotation(chunk, node.entity, node.id, sentence) + chunkAnnotations.append(chunkAnnotation) + } chunk.clear() } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerTest.scala index 11ee7ed42eddec..28819bac14c443 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/er/EntityRulerTest.scala @@ -791,7 +791,7 @@ class EntityRulerTest extends AnyFlatSpec with SparkSessionTest { AssertAnnotations.assertFields(expectedEntitiesFromText6, actualEntities) } - it should "work with LightPipeline" in { + it should "work with LightPipeline" taggedAs FastTest in { val externalResource = ExternalResource(s"$testPath/keywords_only.json", ReadAs.TEXT, Map("format" -> "json")) val entityRulerPipeline = getEntityRulerKeywordsPipeline(externalResource, useStorage = false)