From 96e341497838d5dd19e87fb7a703dc9d33e7f6f6 Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Thu, 15 Jan 2026 22:05:29 +0530 Subject: [PATCH 1/3] leading zero and formal/informal year fixes Signed-off-by: shreeshd-tn --- .../hi/data/measure/unit_year_formal.tsv | 1 + .../text_normalization/hi/taggers/cardinal.py | 32 +++++--- .../text_normalization/hi/taggers/decimal.py | 2 +- .../text_normalization/hi/taggers/measure.py | 80 ++++++++++++++++++- .../test_cases_cardinal.txt | 5 +- .../test_cases_measure.txt | 7 ++ 6 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv new file mode 100644 index 000000000..a3c7b2162 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv @@ -0,0 +1 @@ +yr वर्ष diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index f361416f4..3777f4987 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,7 +15,11 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import ( + GraphFst, + NEMO_HI_DIGIT, + insert_space, +) from nemo_text_processing.text_normalization.hi.utils import get_abs_path @@ -41,6 +45,11 @@ def __init__(self, deterministic: bool = True, lm: bool = False): self.zero = zero self.teens_and_ties = teens_and_ties + # Single digit graph for digit-by-digit reading + # e.g., "०७३" -> "शून्य सात तीन" + single_digit_graph = digit | zero + self.single_digits_graph = single_digit_graph + pynini.closure(insert_space + single_digit_graph) + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: @@ -298,13 +307,8 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) graph_ten_shankhs.optimize() - # Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc. - # e.g., "०५" -> "शून्य पाँच" - single_digit = digit | zero - graph_leading_zero = zero + insert_space + single_digit - graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5) - - final_graph = ( + # Graph without leading zeros - used by other taggers like ordinal, decimal and measure + graph_without_leading_zeros = ( digit | zero | teens_and_ties @@ -325,8 +329,18 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): | graph_ten_padmas | graph_shankhs | graph_ten_shankhs - | graph_leading_zero ) + self.graph_without_leading_zeros = graph_without_leading_zeros.optimize() + + # Handle numbers with leading zeros by reading digit-by-digit + # e.g., "०७३" -> "शून्य सात तीन", "००५" -> "शून्य शून्य पाँच" + cardinal_with_leading_zeros = pynini.compose( + pynini.accep("०") + pynini.closure(NEMO_HI_DIGIT), self.single_digits_graph + ) + cardinal_with_leading_zeros = pynutil.add_weight(cardinal_with_leading_zeros, 0.5) + + # Full graph including leading zeros - for standalone cardinal matching + final_graph = graph_without_leading_zeros | cardinal_with_leading_zeros optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index cb21d85b1..7522de2bb 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -59,7 +59,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) graph_digit = cardinal.digit | cardinal.zero - cardinal_graph = cardinal.final_graph + cardinal_graph = cardinal.graph_without_leading_zeros self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 31ae54dc0..97e52eba0 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -218,6 +218,17 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + # Year unit variants for formal/informal handling + year_informal = pynini.string_map([("yr", "साल")]) + year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv")) + + # All units EXCEPT year + unit_inputs_except_yr = pynini.difference( + pynini.project(unit_graph, "input"), + pynini.accep("yr") + ) + unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph) + # Load quarterly units from separate files: map (FST) and list (FSA) quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv")) quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv")) @@ -243,7 +254,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp unit = ( pynutil.insert(NEMO_SPACE) + pynutil.insert("units: \"") - + unit_graph + + unit_graph_no_year + pynutil.insert("\"") + pynutil.insert(NEMO_SPACE) ) @@ -255,6 +266,34 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp + pynutil.insert(NEMO_SPACE) ) + # Year-specific unit wrappers + unit_year_informal = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_informal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + unit_year_formal = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_formal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + # Cardinal >= 1000 -> formal year (वर्ष) + # Use graph_without_leading_zeros which covers all number ranges (thousands to shankhs) + cardinal_large = cardinal.graph_without_leading_zeros + + # Cardinal < 1000 -> informal year (साल) + cardinal_small = ( + cardinal.zero + | cardinal.digit + | cardinal.teens_and_ties + | cardinal.graph_hundreds + ) + symbol_graph = pynini.string_map( [ (LOWERCASE_X, HI_BY), @@ -354,6 +393,42 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp + unit ) + # Large numbers (>=1000) + yr -> formal (वर्ष) + graph_cardinal_year_formal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_large + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_formal + ) + + # Small numbers (<1000) + yr -> informal (साल) + graph_cardinal_year_informal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_small + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_informal + ) + + # Regular decimals (e.g., 16.07) + yr -> formal (वर्ष) + graph_decimal_year_formal = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal_graph + + pynutil.insert(" }") + + delete_space + + unit_year_formal + ) + # Handling cardinal clubbed with symbol as single token graph_exceptions = ( pynutil.insert("cardinal { ") @@ -381,7 +456,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp graph = ( pynutil.add_weight(graph_decimal, 0.1) + | pynutil.add_weight(graph_decimal_year_formal, 0.1) | pynutil.add_weight(graph_cardinal, 0.1) + | pynutil.add_weight(graph_cardinal_year_formal, 0.1) + | pynutil.add_weight(graph_cardinal_year_informal, -0.1) # Higher priority for small numbers | pynutil.add_weight(graph_exceptions, 0.1) | pynutil.add_weight(graph_dedh_dhai, -0.2) | pynutil.add_weight(graph_savva, -0.1) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 2a52b2a20..46f981a88 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -144,4 +144,7 @@ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ २ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल ०५~शून्य पाँच -०१~शून्य एक \ No newline at end of file +०१~शून्य एक +०७३~शून्य सात तीन +०००१~शून्य शून्य शून्य एक +०००~शून्य शून्य शून्य \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 86a824f72..95186a60d 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -64,3 +64,10 @@ ५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा २x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब १३x१३ का घर~तेरह बाई तेरह का घर +१००० yr~एक हज़ार वर्ष +९९९९ yr~नौ हज़ार नौ सौ निन्यानबे वर्ष +१६.०७ yr~सोलह दशमलव शून्य सात वर्ष +५ yr~पाँच साल +१.५ yr~डेढ़ साल +२.५ yr~ढाई साल +३.५ yr~साढ़े तीन साल From 40b221d454704d489caeffc9800a93f8f95da5f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Jan 2026 16:47:19 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/cardinal.py | 6 +----- .../text_normalization/hi/taggers/measure.py | 12 ++---------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index 3777f4987..eb4feaef1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import ( - GraphFst, - NEMO_HI_DIGIT, - insert_space, -) +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_HI_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.hi.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 97e52eba0..04d509559 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -223,10 +223,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv")) # All units EXCEPT year - unit_inputs_except_yr = pynini.difference( - pynini.project(unit_graph, "input"), - pynini.accep("yr") - ) + unit_inputs_except_yr = pynini.difference(pynini.project(unit_graph, "input"), pynini.accep("yr")) unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph) # Load quarterly units from separate files: map (FST) and list (FSA) @@ -287,12 +284,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp cardinal_large = cardinal.graph_without_leading_zeros # Cardinal < 1000 -> informal year (साल) - cardinal_small = ( - cardinal.zero - | cardinal.digit - | cardinal.teens_and_ties - | cardinal.graph_hundreds - ) + cardinal_small = cardinal.zero | cardinal.digit | cardinal.teens_and_ties | cardinal.graph_hundreds symbol_graph = pynini.string_map( [ From 2cc2b46e73a1c15e66d03df0eeda9d127624c707 Mon Sep 17 00:00:00 2001 From: shreeshd-tn Date: Thu, 15 Jan 2026 22:17:49 +0530 Subject: [PATCH 3/3] Jenkins date update Signed-off-by: shreeshd-tn --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ea9ba0384..81a259a32 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-12-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-16-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages {