diff --git a/Jenkinsfile b/Jenkinsfile index ea9ba0384..81a259a32 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,7 +26,7 @@ pipeline { HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-12-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/01-16-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv new file mode 100644 index 000000000..a3c7b2162 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit_year_formal.tsv @@ -0,0 +1 @@ +yr वर्ष diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index f361416f4..eb4feaef1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_HI_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.hi.utils import get_abs_path @@ -41,6 +41,11 @@ def __init__(self, deterministic: bool = True, lm: bool = False): self.zero = zero self.teens_and_ties = teens_and_ties + # Single digit graph for digit-by-digit reading + # e.g., "०७३" -> "शून्य सात तीन" + single_digit_graph = digit | zero + self.single_digits_graph = single_digit_graph + pynini.closure(insert_space + single_digit_graph) + def create_graph_suffix(digit_graph, suffix, zeros_counts): zero = 
pynutil.add_weight(pynutil.delete("०"), -0.1) if zeros_counts == 0: @@ -298,13 +303,8 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_shankhs |= create_larger_number_graph(teens_and_ties, suffix_shankhs, 0, graph_ten_padmas) graph_ten_shankhs.optimize() - # Only match exactly 2 digits to avoid interfering with telephone numbers, decimals, etc. - # e.g., "०५" -> "शून्य पाँच" - single_digit = digit | zero - graph_leading_zero = zero + insert_space + single_digit - graph_leading_zero = pynutil.add_weight(graph_leading_zero, 0.5) - - final_graph = ( + # Graph without leading zeros - used by other taggers like ordinal, decimal and measure + graph_without_leading_zeros = ( digit | zero | teens_and_ties @@ -325,8 +325,18 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): | graph_ten_padmas | graph_shankhs | graph_ten_shankhs - | graph_leading_zero ) + self.graph_without_leading_zeros = graph_without_leading_zeros.optimize() + + # Handle numbers with leading zeros by reading digit-by-digit + # e.g., "०७३" -> "शून्य सात तीन", "००५" -> "शून्य शून्य पाँच" + cardinal_with_leading_zeros = pynini.compose( + pynini.accep("०") + pynini.closure(NEMO_HI_DIGIT), self.single_digits_graph + ) + cardinal_with_leading_zeros = pynutil.add_weight(cardinal_with_leading_zeros, 0.5) + + # Full graph including leading zeros - for standalone cardinal matching + final_graph = graph_without_leading_zeros | cardinal_with_leading_zeros optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) diff --git a/nemo_text_processing/text_normalization/hi/taggers/decimal.py b/nemo_text_processing/text_normalization/hi/taggers/decimal.py index cb21d85b1..7522de2bb 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/decimal.py @@ -59,7 +59,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = 
True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) graph_digit = cardinal.digit | cardinal.zero - cardinal_graph = cardinal.final_graph + cardinal_graph = cardinal.graph_without_leading_zeros self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 31ae54dc0..04d509559 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -218,6 +218,14 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + # Year unit variants for formal/informal handling + year_informal = pynini.string_map([("yr", "साल")]) + year_formal = pynini.string_file(get_abs_path("data/measure/unit_year_formal.tsv")) + + # All units EXCEPT year + unit_inputs_except_yr = pynini.difference(pynini.project(unit_graph, "input"), pynini.accep("yr")) + unit_graph_no_year = pynini.compose(unit_inputs_except_yr, unit_graph) + # Load quarterly units from separate files: map (FST) and list (FSA) quarterly_units_map = pynini.string_file(get_abs_path("data/measure/quarterly_units_map.tsv")) quarterly_units_list = pynini.string_file(get_abs_path("data/measure/quarterly_units_list.tsv")) @@ -243,7 +251,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp unit = ( pynutil.insert(NEMO_SPACE) + pynutil.insert("units: \"") - + unit_graph + + unit_graph_no_year + pynutil.insert("\"") + pynutil.insert(NEMO_SPACE) ) @@ -255,6 +263,29 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp + pynutil.insert(NEMO_SPACE) ) + # Year-specific unit wrappers + unit_year_informal = ( + 
pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_informal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + unit_year_formal = ( + pynutil.insert(NEMO_SPACE) + + pynutil.insert("units: \"") + + year_formal + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + ) + + # Formal year (वर्ष) path: graph_without_leading_zeros accepts every cardinal, including values below 1000 + # Numbers below 1000 are steered to the informal path (साल) by its lower weight (-0.1 vs 0.1) in the final graph union + cardinal_large = cardinal.graph_without_leading_zeros + + # Cardinal < 1000 -> informal year (साल) + cardinal_small = cardinal.zero | cardinal.digit | cardinal.teens_and_ties | cardinal.graph_hundreds + symbol_graph = pynini.string_map( [ (LOWERCASE_X, HI_BY), @@ -354,6 +385,42 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp + unit ) + # Large numbers (>=1000) + yr -> formal (वर्ष) + graph_cardinal_year_formal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_large + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_formal + ) + + # Small numbers (<1000) + yr -> informal (साल) + graph_cardinal_year_informal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_small + + pynutil.insert("\"") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert("}") + + delete_space + + unit_year_informal + ) + + # Regular decimals (e.g., 16.07) + yr -> formal (वर्ष) + graph_decimal_year_formal = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal_graph + + pynutil.insert(" }") + + delete_space + + unit_year_formal + ) + # Handling cardinal clubbed with symbol as single token graph_exceptions = ( pynutil.insert("cardinal { ") @@ -381,7 +448,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, ordinal: GraphFst, inp graph = ( pynutil.add_weight(graph_decimal, 0.1) + | pynutil.add_weight(graph_decimal_year_formal, 0.1) | 
pynutil.add_weight(graph_cardinal, 0.1) + | pynutil.add_weight(graph_cardinal_year_formal, 0.1) + | pynutil.add_weight(graph_cardinal_year_informal, -0.1) # Higher priority for small numbers | pynutil.add_weight(graph_exceptions, 0.1) | pynutil.add_weight(graph_dedh_dhai, -0.2) | pynutil.add_weight(graph_savva, -0.1) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt index 2a52b2a20..46f981a88 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_cardinal.txt @@ -144,4 +144,7 @@ ५१०२२३४५५६७~इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ २ पॉइंट्स १२ गोल~दो पॉइंट्स बारह गोल ०५~शून्य पाँच -०१~शून्य एक \ No newline at end of file +०१~शून्य एक +०७३~शून्य सात तीन +०००१~शून्य शून्य शून्य एक +०००~शून्य शून्य शून्य \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 86a824f72..95186a60d 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -64,3 +64,10 @@ ५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा २x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब १३x१३ का घर~तेरह बाई तेरह का घर +१००० yr~एक हज़ार वर्ष +९९९९ yr~नौ हज़ार नौ सौ निन्यानबे वर्ष +१६.०७ yr~सोलह दशमलव शून्य सात वर्ष +५ yr~पाँच साल +१.५ yr~डेढ़ साल +२.५ yr~ढाई साल +३.५ yr~साढ़े तीन साल