From 9cce9efdc55f0297c7a553a683f4127f2a1cb622 Mon Sep 17 00:00:00 2001 From: xingchensong Date: Sun, 2 Jun 2024 18:00:37 +0800 Subject: [PATCH] [tn] english, support telephone --- tn/english/data/telephone/__init__.py | 0 tn/english/data/telephone/ip_prompt.tsv | 2 + tn/english/data/telephone/ssn_prompt.tsv | 4 + .../data/telephone/telephone_prompt.tsv | 5 + tn/english/normalizer.py | 9 +- tn/english/rules/telephone.py | 179 ++++++++++++++++++ tn/english/test/data/telephone.txt | 2 + tn/english/test/telephone_test.py | 28 +++ 8 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 tn/english/data/telephone/__init__.py create mode 100644 tn/english/data/telephone/ip_prompt.tsv create mode 100644 tn/english/data/telephone/ssn_prompt.tsv create mode 100644 tn/english/data/telephone/telephone_prompt.tsv create mode 100644 tn/english/rules/telephone.py create mode 100644 tn/english/test/data/telephone.txt create mode 100644 tn/english/test/telephone_test.py diff --git a/tn/english/data/telephone/__init__.py b/tn/english/data/telephone/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tn/english/data/telephone/ip_prompt.tsv b/tn/english/data/telephone/ip_prompt.tsv new file mode 100644 index 00000000..03e25298 --- /dev/null +++ b/tn/english/data/telephone/ip_prompt.tsv @@ -0,0 +1,2 @@ +IP address is +IP is \ No newline at end of file diff --git a/tn/english/data/telephone/ssn_prompt.tsv b/tn/english/data/telephone/ssn_prompt.tsv new file mode 100644 index 00000000..8bbdb9f7 --- /dev/null +++ b/tn/english/data/telephone/ssn_prompt.tsv @@ -0,0 +1,4 @@ +ssn is SSN is +ssn is SSN is +SSN is +SSN \ No newline at end of file diff --git a/tn/english/data/telephone/telephone_prompt.tsv b/tn/english/data/telephone/telephone_prompt.tsv new file mode 100644 index 00000000..6dcfb6ca --- /dev/null +++ b/tn/english/data/telephone/telephone_prompt.tsv @@ -0,0 +1,5 @@ +call me at +reach at +reached at +my number is +hit me up at \ No newline at end of file diff --git a/tn/english/normalizer.py b/tn/english/normalizer.py index 5d83b112..4784d052 100644 --- a/tn/english/normalizer.py +++ b/tn/english/normalizer.py @@ -23,6 +23,7 @@ from tn.english.rules.time import Time from tn.english.rules.measure import Measure from tn.english.rules.money import Money +from tn.english.rules.telephone import Telephone from pynini.lib.pynutil import add_weight, delete from importlib_resources import files @@ -45,10 +46,12 @@ def build_tagger(self): time = add_weight(Time().tagger, 1.00) measure = add_weight(Measure().tagger, 1.00) money = add_weight(Money().tagger, 1.00) + telephone = add_weight(Telephone().tagger, 1.00) word = add_weight(Word().tagger, 100) tagger = (cardinal | ordinal | word | date | decimal | fraction - | time | measure | money).optimize() + self.DELETE_SPACE + | time | measure | money + | telephone).optimize() + self.DELETE_SPACE # delete the last space self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]') @@ -62,8 +65,10 @@ def build_verbalizer(self): time = Time().verbalizer measure = Measure().verbalizer money = Money().verbalizer + telephone = Telephone().verbalizer verbalizer = (cardinal | ordinal | word | date | decimal | fraction | time - | measure | money).optimize() + self.INSERT_SPACE + | measure | money + | telephone).optimize() + self.INSERT_SPACE self.verbalizer = verbalizer.star diff --git a/tn/english/rules/telephone.py b/tn/english/rules/telephone.py new file mode 100644 index 00000000..2f2fd699 --- /dev/null +++ b/tn/english/rules/telephone.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil +from pynini.examples import plurals + +from tn.processor import Processor +from tn.utils import get_abs_path + + +class Telephone(Processor): + + def __init__(self, deterministic: bool = True): + """ + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + super().__init__('telephone', ordertype="en_tn") + self.deterministic = deterministic + self.build_tagger() + self.build_verbalizer() + + def build_tagger(self): + """ + Finite state transducer for classifying telephone, and IP, and SSN which includes country code, number part and extension + country code optional: +*** + number part: ***-***-****, or (***) ***-**** + extension optional: 1-9999 + E.g + +1 123-123-5678-1 -> telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" } + 1-800-GO-U-HAUL -> telephone { country_code: "one" number_part: "one, eight hundred GO U HAUL" } + """ + add_separator = pynutil.insert(", ") # between components + zero = pynini.cross("0", "zero") + if not self.deterministic: + zero |= pynini.cross("0", pynini.union("o", "oh")) + digit = pynini.invert( + pynini.string_file(get_abs_path( + "english/data/number/digit.tsv"))).optimize() | zero + + telephone_prompts = pynini.string_file( + get_abs_path("english/data/telephone/telephone_prompt.tsv")) + country_code = ( + pynini.closure(telephone_prompts + self.DELETE_EXTRA_SPACE, 0, 1) + + pynini.closure(pynini.cross("+", "plus "), 0, 1) + + pynini.closure(digit + self.INSERT_SPACE, 0, 2) + digit + + pynutil.insert(",")) + country_code |= telephone_prompts + country_code = pynutil.insert( + "country_code: \"") + country_code + pynutil.insert("\"") + country_code = country_code + pynini.closure( + pynutil.delete("-"), 0, 1) + self.DELETE_SPACE + self.INSERT_SPACE + + area_part_default = pynini.closure(digit + self.INSERT_SPACE, 2, + 2) + digit + area_part = pynini.cross("800", "eight hundred") | pynini.compose( + pynini.difference(pynini.closure(self.VCHAR), "800"), + area_part_default) + + area_part = ( + (area_part + (pynutil.delete("-") | pynutil.delete("."))) + | + (pynutil.delete("(") + area_part + + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) + | pynutil.delete(")-")))) + add_separator + + del_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1) + number_length = ((self.DIGIT + del_separator) | + (self.ALPHA + del_separator))**7 + number_words = pynini.closure((self.DIGIT @ digit) + + (self.INSERT_SPACE | + (pynini.cross("-", ', '))) + | self.ALPHA + | (self.ALPHA + pynini.cross("-", ' '))) + number_words |= pynini.closure((self.DIGIT @ digit) + + (self.INSERT_SPACE | + (pynini.cross(".", ', '))) + | self.ALPHA + | (self.ALPHA + pynini.cross(".", ' '))) + number_words = pynini.compose(number_length, number_words) + number_part = area_part + number_words + number_part = pynutil.insert( + "number_part: \"") + number_part + pynutil.insert("\"") + extension = (pynutil.insert("extension: \"") + + pynini.closure(digit + self.INSERT_SPACE, 0, 3) + digit + + pynutil.insert("\"")) + extension = pynini.closure(self.INSERT_SPACE + extension, 0, 1) + + graph = plurals._priority_union(country_code + number_part, + number_part, + pynini.closure(self.VCHAR)).optimize() + graph = plurals._priority_union(country_code + number_part + extension, + graph, + pynini.closure(self.VCHAR)).optimize() + graph = plurals._priority_union(number_part + extension, graph, + pynini.closure(self.VCHAR)).optimize() + + # ip + ip_prompts = pynini.string_file( + get_abs_path("english/data/telephone/ip_prompt.tsv")) + digit_to_str_graph = digit + pynini.closure( + pynutil.insert(" ") + digit, 0, 2) + ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + + digit_to_str_graph)**3 + graph |= ( + pynini.closure( # noqa + pynutil.insert("country_code: \"") + ip_prompts + # noqa + pynutil.insert("\"") + self.DELETE_EXTRA_SPACE, + 0, + 1) # noqa + + pynutil.insert("number_part: \"") # noqa + + ip_graph.optimize() + pynutil.insert("\"") # noqa + ) + # ssn + ssn_prompts = pynini.string_file( + get_abs_path("english/data/telephone/ssn_prompt.tsv")) + three_digit_part = digit + (pynutil.insert(" ") + digit)**2 + two_digit_part = digit + pynutil.insert(" ") + digit + four_digit_part = digit + (pynutil.insert(" ") + digit)**3 + ssn_separator = pynini.cross("-", ", ") + ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part + + graph |= ( + pynini.closure( # noqa + pynutil.insert("country_code: \"") + ssn_prompts + # noqa + pynutil.insert("\"") + self.DELETE_EXTRA_SPACE, # noqa + 0, + 1) + pynutil.insert("number_part: \"") # noqa + + ssn_graph.optimize() # noqa + + pynutil.insert("\"") # noqa + ) + + final_graph = self.add_tokens(graph) + self.tagger = final_graph.optimize() + + def build_verbalizer(self): + """ + Finite state transducer for verbalizing telephone numbers, e.g. + telephone { country_code: "one" number_part: "one two three, one two three, five six seven eight" extension: "one" } + -> one, one two three, one two three, five six seven eight, one + """ + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynini.closure(self.NOT_QUOTE, 1) + pynutil.delete("\"") + + self.DELETE_SPACE + self.INSERT_SPACE, + 0, + 1, + ) + + number_part = ( + pynutil.delete("number_part: \"") + + pynini.closure(self.NOT_QUOTE, 1) + pynini.closure( + pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1) + + pynutil.delete("\"")) + + optional_extension = pynini.closure( + self.DELETE_SPACE + self.INSERT_SPACE + + pynutil.delete("extension: \"") + + pynini.closure(self.NOT_QUOTE, 1) + pynutil.delete("\""), + 0, + 1, + ) + + graph = optional_country_code + number_part + optional_extension + delete_tokens = self.delete_tokens(graph) + self.verbalizer = delete_tokens.optimize() diff --git a/tn/english/test/data/telephone.txt b/tn/english/test/data/telephone.txt new file mode 100644 index 00000000..01639701 --- /dev/null +++ b/tn/english/test/data/telephone.txt @@ -0,0 +1,2 @@ ++1 123-123-5678-1 => plus one, one two three, one two three, five six seven eight, one +1-800-GO-U-HAUL => one, eight hundred, GO U HAUL diff --git a/tn/english/test/telephone_test.py b/tn/english/test/telephone_test.py new file mode 100644 index 00000000..dcf8f879 --- /dev/null +++ b/tn/english/test/telephone_test.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from tn.english.rules.telephone import Telephone +from tn.english.test.utils import parse_test_case + + +class TestTelephone: + + telephone = Telephone(deterministic=False) + telephone_cases = parse_test_case('data/telephone.txt') + + @pytest.mark.parametrize("written, spoken", telephone_cases) + def test_telephone(self, written, spoken): + assert self.telephone.normalize(written) == spoken