diff --git a/tn/english/data/money/__init__.py b/tn/english/data/money/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tn/english/data/money/currency_major.tsv b/tn/english/data/money/currency_major.tsv new file mode 100644 index 00000000..5201efd8 --- /dev/null +++ b/tn/english/data/money/currency_major.tsv @@ -0,0 +1,39 @@ +$ dollar +$ us dollar +US$ us dollar +฿ Thai Baht +£ pound +€ euro +₩ won +nzd new zealand dollar +rs rupee +chf swiss franc +dkk danish kroner +fim finnish markka +aed arab emirates dirham +¥ yen +czk czech koruna +mro mauritanian ouguiya +pkr pakistani rupee +crc costa rican colon +hk$ hong kong dollar +npr nepalese rupee +awg aruban florin +nok norwegian kroner +tzs tanzanian shilling +sek swedish kronor +cyp cypriot pound +r real +sar saudi riyal +cve cape verde escudo +rsd serbian dinar +dm german mark +shp saint helena pounds +php philippine peso +cad canadian dollar +ssp south sudanese pound +scr seychelles rupee +mvr maldivian rufiyaa +DH dirham +Dh dirham +Dhs. dirham diff --git a/tn/english/data/money/currency_minor_plural.tsv b/tn/english/data/money/currency_minor_plural.tsv new file mode 100644 index 00000000..a2f67b3a --- /dev/null +++ b/tn/english/data/money/currency_minor_plural.tsv @@ -0,0 +1,4 @@ +$ cents +US$ cents +€ cents +£ pence \ No newline at end of file diff --git a/tn/english/data/money/currency_minor_singular.tsv b/tn/english/data/money/currency_minor_singular.tsv new file mode 100644 index 00000000..fe629d87 --- /dev/null +++ b/tn/english/data/money/currency_minor_singular.tsv @@ -0,0 +1,3 @@ +$ cent +€ cent +£ penny \ No newline at end of file diff --git a/tn/english/data/money/per_unit.tsv b/tn/english/data/money/per_unit.tsv new file mode 100644 index 00000000..65480668 --- /dev/null +++ b/tn/english/data/money/per_unit.tsv @@ -0,0 +1,2 @@ +/ea each +/dozen \ No newline at end of file diff --git a/tn/english/normalizer.py b/tn/english/normalizer.py index 6db9823a..5d83b112 100644 --- a/tn/english/normalizer.py +++ b/tn/english/normalizer.py @@ -22,6 +22,7 @@ from tn.english.rules.date import Date from tn.english.rules.time import Time from tn.english.rules.measure import Measure +from tn.english.rules.money import Money from pynini.lib.pynutil import add_weight, delete from importlib_resources import files @@ -43,10 +44,11 @@ def build_tagger(self): date = add_weight(Date().tagger, 0.99) time = add_weight(Time().tagger, 1.00) measure = add_weight(Measure().tagger, 1.00) + money = add_weight(Money().tagger, 1.00) word = add_weight(Word().tagger, 100) tagger = (cardinal | ordinal | word | date | decimal | fraction - | time | measure).optimize() + self.DELETE_SPACE + | time | measure | money).optimize() + self.DELETE_SPACE # delete the last space self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]') @@ -59,8 +61,9 @@ def build_verbalizer(self): date = Date().verbalizer time = Time().verbalizer measure = Measure().verbalizer + money = Money().verbalizer verbalizer = (cardinal | ordinal | word | date | decimal | fraction | time - | measure).optimize() + self.INSERT_SPACE + | measure | money).optimize() + self.INSERT_SPACE self.verbalizer = verbalizer.star diff --git a/tn/english/rules/money.py b/tn/english/rules/money.py new file mode 100644 index 00000000..eca5d001 --- /dev/null +++ b/tn/english/rules/money.py @@ -0,0 +1,258 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from tn.processor import Processor +from tn.utils import get_abs_path, load_labels +from tn.english.rules.cardinal import Cardinal +from tn.english.rules.decimal import Decimal +from tn.english.rules.measure import SINGULAR_TO_PLURAL + +min_singular = pynini.string_file( + get_abs_path("english/data/money/currency_minor_singular.tsv")) +min_plural = pynini.string_file( + get_abs_path("english/data/money/currency_minor_plural.tsv")) +maj_singular = pynini.string_file( + (get_abs_path("english/data/money/currency_major.tsv"))) + + +class Money(Processor): + + def __init__(self, deterministic: bool = False): + """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + super().__init__('money', ordertype="en_tn") + self.deterministic = deterministic + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + """ + Finite state transducer for classifying money, suppletive aware, e.g. + $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: \"true\" } + $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: \"true\" } + $1 -> money { currency_maj: "dollar" integer_part: "one" } + $1.00 -> money { currency_maj: "dollar" integer_part: "one" } + $0.05 -> money { fractional_part: "five" currency_min: "cents" preserve_order: \"true\" } + $1 million -> money { currency_maj: "dollars" integer_part: "one" quantity: "million" } + $1.2 million -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two" quantity: "million" } + $1.2320 -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two three two" } + """ + cardinal = Cardinal(self.deterministic) + decimal = Decimal(self.deterministic) + cardinal_graph = cardinal.graph_with_and + graph_decimal_final = decimal.final_graph_wo_negative_w_abbr + + maj_singular_labels = load_labels( + get_abs_path("english/data/money/currency_major.tsv")) + maj_unit_plural = maj_singular @ SINGULAR_TO_PLURAL + maj_unit_singular = maj_singular + + graph_maj_singular = pynutil.insert( + "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"") + graph_maj_plural = pynutil.insert( + "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"") + + optional_delete_fractional_zeros = pynini.closure( + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1) + + graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross( + "1", "one") + pynutil.insert("\"") + # only for decimals where third decimal after comma is non-zero or with quantity + decimal_delete_last_zeros = ( + pynini.closure(self.DIGIT | pynutil.delete(",")) + + pynini.accep(".") + pynini.closure(self.DIGIT, 2) + + (self.DIGIT - "0") + pynini.closure(pynutil.delete("0"))) + decimal_with_quantity = pynini.closure(self.VCHAR) + self.ALPHA + + graph_decimal = (graph_maj_plural + self.INSERT_SPACE + + (decimal_delete_last_zeros | decimal_with_quantity) + @ graph_decimal_final) + + graph_integer = ( + pynutil.insert("integer_part: \"") + + ((pynini.closure(self.VCHAR) - "1") @ cardinal_graph) + + pynutil.insert("\"")) # noqa + + graph_integer_only = graph_maj_singular + self.INSERT_SPACE + graph_integer_one + graph_integer_only |= graph_maj_plural + self.INSERT_SPACE + graph_integer + + final_graph = (graph_integer_only + + optional_delete_fractional_zeros) | graph_decimal + + # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits + # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10 + # not accepted: 002, 00, 0, + two_digits_fractional_part = ( + pynini.closure(self.DIGIT) + + (self.DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ ( + (pynutil.delete("0") + (self.DIGIT - "0")) + | ((self.DIGIT - "0") + pynutil.insert("0")) + | ((self.DIGIT - "0") + self.DIGIT)) + + graph_min_singular = pynutil.insert( + "currency_min: \"") + min_singular + pynutil.insert("\"") + graph_min_plural = pynutil.insert( + "currency_min: \"") + min_plural + pynutil.insert("\"") + # format ** dollars ** cent + decimal_graph_with_minor = None + integer_graph_reordered = None + decimal_default_reordered = None + for curr_symbol, _ in maj_singular_labels: + preserve_order = pynutil.insert(" preserve_order: \"true\"") + integer_plus_maj = graph_integer + self.INSERT_SPACE + pynutil.insert( + curr_symbol) @ graph_maj_plural + integer_plus_maj |= graph_integer_one + self.INSERT_SPACE + pynutil.insert( + curr_symbol) @ graph_maj_singular + + integer_plus_maj_with_comma = pynini.compose( + self.DIGIT - "0" + + pynini.closure(self.DIGIT | pynutil.delete(",")), + integer_plus_maj) + integer_plus_maj = pynini.compose( + pynini.closure(self.DIGIT) - "0", integer_plus_maj) + integer_plus_maj |= integer_plus_maj_with_comma + + graph_fractional_one = two_digits_fractional_part @ pynini.cross( + "1", "one") + graph_fractional_one = pynutil.insert( + "fractional_part: \"") + graph_fractional_one + pynutil.insert( + "\"") + graph_fractional = (two_digits_fractional_part @ ( + pynini.closure(self.DIGIT, 1, 2) - "1" + ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit) + graph_fractional = pynutil.insert( + "fractional_part: \"") + graph_fractional + pynutil.insert( + "\"") + + fractional_plus_min = graph_fractional + self.INSERT_SPACE + pynutil.insert( + curr_symbol) @ graph_min_plural + fractional_plus_min |= ( + graph_fractional_one + self.INSERT_SPACE + + pynutil.insert(curr_symbol) @ graph_min_singular) + + decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross( + ".", " ") + fractional_plus_min + + if not self.deterministic: + decimal_graph_with_minor_curr |= pynutil.add_weight( + integer_plus_maj + pynini.cross(".", " ") + + pynutil.insert("fractional_part: \"") + + two_digits_fractional_part @ cardinal. + graph_hundred_component_at_least_one_none_zero_digit + + pynutil.insert("\""), + weight=0.0001, + ) + default_fraction_graph = ( + decimal_delete_last_zeros + | decimal_with_quantity) @ graph_decimal_final + decimal_graph_with_minor_curr |= ( + pynini.closure(pynutil.delete("0"), 0, 1) + + pynutil.delete(".") + fractional_plus_min) + decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) + + decimal_graph_with_minor_curr + + preserve_order) + + decimal_graph_with_minor = ( + decimal_graph_with_minor_curr + if decimal_graph_with_minor is None else pynini.union( + decimal_graph_with_minor, + decimal_graph_with_minor_curr).optimize()) + + if not self.deterministic: + integer_graph_reordered_curr = (pynutil.delete(curr_symbol) + + integer_plus_maj + + preserve_order).optimize() + + integer_graph_reordered = ( + integer_graph_reordered_curr + if integer_graph_reordered is None else pynini.union( + integer_graph_reordered, + integer_graph_reordered_curr).optimize()) + decimal_default_reordered_curr = ( + pynutil.delete(curr_symbol) + default_fraction_graph + + self.INSERT_SPACE + + pynutil.insert(curr_symbol) @ graph_maj_plural) + + decimal_default_reordered = ( + decimal_default_reordered_curr + if decimal_default_reordered is None else pynini.union( + decimal_default_reordered, + decimal_default_reordered_curr)).optimize() + + # weight for SH + final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001) + + if not self.deterministic: + final_graph |= pynutil.add_weight( + integer_graph_reordered | decimal_default_reordered, -0.1) + # to handle "$2.00" cases + final_graph |= pynini.compose( + pynini.closure(self.VCHAR) + pynutil.delete(".") + + pynini.closure(pynutil.delete("0"), 1), + integer_graph_reordered) + final_graph = self.add_tokens(final_graph.optimize()) + self.tagger = final_graph.optimize() + + def build_verbalizer(self): + """ + Finite state transducer for verbalizing money, e.g. + money { integer_part: "twelve" fractional_part: "o five" currency: "dollars" } -> twelve o five dollars + """ + decimal = Decimal(self.deterministic) + delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: \"true\"") + | (pynutil.delete(" field_order: \"") + self.NOT_QUOTE + + pynutil.delete("\""))) + keep_space = pynini.accep(" ") + maj = pynutil.delete("currency_maj: \"") + pynini.closure( + self.NOT_QUOTE, 1) + pynutil.delete("\"") + min = pynutil.delete("currency_min: \"") + pynini.closure( + self.NOT_QUOTE, 1) + pynutil.delete("\"") + + fractional_part = (pynutil.delete("fractional_part: \"") + + pynini.closure(self.NOT_QUOTE, 1) + + pynutil.delete("\"")) + + integer_part = decimal.integer + + # *** currency_maj + graph_integer = integer_part + keep_space + maj + + # *** currency_maj + (***) | ((and) *** current_min) + fractional = fractional_part + self.DELETE_EXTRA_SPACE + min + + if not self.deterministic: + fractional |= pynutil.insert("and ") + fractional + + graph_integer_with_minor = integer_part + keep_space + maj + keep_space + fractional + delete_preserve_order + + # *** point *** currency_maj + graph_decimal = decimal.numbers + keep_space + maj + + # *** current_min + graph_minor = fractional_part + self.DELETE_EXTRA_SPACE + min + delete_preserve_order + + graph = graph_integer | graph_integer_with_minor | graph_decimal | graph_minor + + if not self.deterministic: + graph |= graph_integer + delete_preserve_order + + delete_tokens = self.delete_tokens(graph) + self.verbalizer = delete_tokens.optimize() diff --git a/tn/english/test/data/money.txt b/tn/english/test/data/money.txt new file mode 100644 index 00000000..6d330797 --- /dev/null +++ b/tn/english/test/data/money.txt @@ -0,0 +1,8 @@ +$12.05 => twelve dollars five cents +$12.0500 => twelve dollars five cents +$1 => one dollar +$1.00 => one dollar +$0.05 => five cents +$1 million => one million dollars +$1.2 million => one point two million dollars +$1.2320 => one point two three two dollars diff --git a/tn/english/test/money_test.py b/tn/english/test/money_test.py new file mode 100644 index 00000000..8f432eb6 --- /dev/null +++ b/tn/english/test/money_test.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from tn.english.rules.money import Money +from tn.english.test.utils import parse_test_case + + +class TestMoney: + + money = Money(deterministic=False) + money_cases = parse_test_case('data/money.txt') + + @pytest.mark.parametrize("written, spoken", money_cases) + def test_money(self, written, spoken): + assert self.money.normalize(written) == spoken